# Explore different strategies to join peptide and HLA encodings

- TEIM VAE encodes each peptide in a 32-dimensional vector
- HLA fingerprints extracted from MaSIF are encoded in a 36x80 matrix (36 points taken from the NetMHCpan pseudosequence, each described by the 80 descriptors that MaSIF outputs per residue)

**/!\ WARNING:** This notebook is no longer used. The final strategy is to use the peptides as they are, without any transformation. The final notebook is `3.2-amdr-join-peptide-hla-data.ipynb`.

In [1]:
import os
import numpy as np
import pandas as pd
from typing import Tuple

# Root of the data tree, relative to the directory the notebook runs from
DATA_FOLDER = os.path.join('..', 'data')

# Interim inputs: HLA fingerprints and peptide-HLA binding data
INTERIM_DATA_FOLDER = os.path.join(DATA_FOLDER, 'interim')
INTERIM_HLA_FP_FOLDER = os.path.join(INTERIM_DATA_FOLDER, 'hla_fingerprints')
INTERIM_pHLA_BINDING_DATA_FOLDER = os.path.join(INTERIM_DATA_FOLDER, 'pHLA_binding')

# Processed outputs written by this notebook
PROCESS_DATA_FOLDER = os.path.join(DATA_FOLDER, 'processed')
PROCESS_pHLA_BINDING_FOLDER = os.path.join(PROCESS_DATA_FOLDER, 'pHLA_binding')

## Flatten HLA fingerprints and concatenate with peptide encodings

This is a simple strategy that concatenates the peptide encoding with the flattened HLA fingerprint. The resulting vector is 32+36x80=2912-dimensional.

For now, we will focus on the monoallelic dataset from NetMHCpan.

In [9]:
# Dataset folders: interim inputs are read from the first, processed outputs go to the second
interim_netmhcpan_folder = os.path.join(INTERIM_pHLA_BINDING_DATA_FOLDER, 'NetMHCpan_dataset')
process_netmhcpan_folder = os.path.join(PROCESS_pHLA_BINDING_FOLDER, 'NetMHCpan_dataset')

# HLA fingerprints: the embedding array and the CSV that maps HLA names to positions in it
hla_fp_file = os.path.join(INTERIM_HLA_FP_FOLDER, 'hla_af_patch_emb_netMHCpan_pseudoseq_res_scaled0-1.npy')
hla_fp_data_file = os.path.join(INTERIM_HLA_FP_FOLDER, 'hla_index_in_emb_netMHCpan_pseudoseq_res.csv')

# Training split (binding affinity): peptide encodings and the matching peptide metadata
# NOTE: the eluted-ligand training encodings ('train_eluted_peptides_encodings.npy') are not used here
train_peptide_encoding_ba_file = os.path.join(interim_netmhcpan_folder, 'train_binding_affinity_peptides_encodings.npy')
train_peptide_data_ba_file = os.path.join(interim_netmhcpan_folder, 'train_binding_affinity_peptides_data.csv.gz')

# Test split: peptide encodings and the matching peptide metadata
test_peptide_encoding_file = os.path.join(interim_netmhcpan_folder, 'test_set_peptides_encodings.npy')
test_peptide_data_file = os.path.join(interim_netmhcpan_folder, 'test_set_peptides_data.csv.gz')

In [3]:
# Array holding the fingerprints of all HLA alleles (file name suggests values scaled to 0-1)
hla_norm_fp = np.load(hla_fp_file)

# Build the {HLA name: position in hla_norm_fp} lookup. The CSV header row is skipped and
# replaced by explicit column names; the 'hla' column becomes the index of the frame, so the
# remaining 'index' column converts directly into the name -> position dictionary.
hla_fp_data = pd.read_csv(
    hla_fp_data_file,
    index_col=1,
    names=['index', 'hla'],
    skiprows=1,
)['index'].to_dict()

In [4]:
# Training split (binding affinity): peptide encodings and the per-peptide metadata table.
# The merge step below pairs them by position (row i of the table <-> encoding i).
train_ba_peptide_encodings = np.load(train_peptide_encoding_ba_file)
train_ba_peptide_data = pd.read_csv(train_peptide_data_ba_file)

# Test split: same pair of inputs
test_peptide_encodings = np.load(test_peptide_encoding_file)
test_peptide_data = pd.read_csv(test_peptide_data_file)

In [5]:
def get_flatten_hla_fp(hla_fp: np.ndarray, hla_name: str, hla_fp_data: dict) -> np.ndarray:
    """
    Get the flattened HLA fingerprint for a given HLA name
    :param hla_fp: np.ndarray
        Array containing all HLA fingerprints in the same order as the hla_fp_data
    :param hla_name: str
        HLA name to get the fingerprint. It MUST follow the format 'HLA-A01-01'
    :param hla_fp_data: dict
        Dictionary with the HLA names as keys and the index in the hla_fp as values
    :return: np.ndarray
        HLA fingerprint in a 1D array (a copy, so modifying it does not alter hla_fp)
    :raises KeyError:
        If hla_name is not a key of hla_fp_data
    """
    try:
        hla_index = hla_fp_data[hla_name]
    except KeyError as err:
        # Re-raise with a more helpful message, explicitly chained to the original lookup error
        raise KeyError(f'{hla_name} not found in the hla_fp_data. Make sure that the HLA name is in the format HLA-A01-01') from err
    return hla_fp[hla_index].flatten()

def get_tested_hla_for_peptide_idx(peptide_idx: int, peptide_data: pd.DataFrame) -> str:
    """
    Look up which HLA allele was tested for the peptide at a given position
    :param peptide_idx: int
        Position (0-based row number, not index label) of the peptide in the peptide_data
    :param peptide_data: pd.DataFrame
        DataFrame with the peptide data; must contain an 'hla_allele' column
    :return: str
        Value of the 'hla_allele' column for that row (expected format: 'HLA-A01-01')
    """
    # Select the row by position first, then read the allele from it
    peptide_row = peptide_data.iloc[peptide_idx]
    return peptide_row['hla_allele']

def get_label_for_peptide_idx(peptide_idx: int, peptide_data: pd.DataFrame) -> float:
    """
    Look up the label of the peptide at a given position
    :param peptide_idx: int
        Position (0-based row number, not index label) of the peptide in the peptide_data
    :param peptide_data: pd.DataFrame
        DataFrame with the peptide data; must contain a 'label' column
    :return: float
        Value of the 'label' column for that row
    """
    # Select the row by position first, then read the label from it
    peptide_row = peptide_data.iloc[peptide_idx]
    return peptide_row['label']

def get_merged_peptide_hla_encodings(peptide_encoding: np.ndarray,
                                     peptide_data: pd.DataFrame,
                                     scaled_hla_fp: np.ndarray,
                                     hla_fp_data: dict,
                                     scale_peptide_encoding: bool = True) -> Tuple[np.ndarray, list]:
    """
    Merge the peptide encoding with the HLA fingerprint

    Each peptide encoding is concatenated with the flattened fingerprint of the HLA allele found
    at the same position in peptide_data. Peptides whose allele name does not have 'A', 'B' or 'C'
    as its fifth character are skipped, and a warning is printed for each of them.
    :param peptide_encoding: np.ndarray
        Array with the peptide encoding (one row per peptide)
    :param peptide_data: pd.DataFrame
        DataFrame with the peptide data with the same index as the peptide_encoding.
        Must contain the 'hla_allele' and 'label' columns
    :param scaled_hla_fp: np.ndarray
        Array containing all HLA fingerprints in the same order as the hla_fp_data
    :param hla_fp_data: dict
        Dictionary with the HLA names as keys and the index in the hla_fp as values
    :param scale_peptide_encoding: bool
        If True (default), min-max scale the peptide encodings before merging. The scaler is
        fitted on the peptide_encoding given to this call, so separate calls (e.g. train and
        test) are scaled independently of each other
    :return: Tuple[np.ndarray, list]
        Merged array with the peptide encoding and the HLA fingerprint (one row per kept peptide)
        Labels for the merged array, in the same order, taken as-is from the 'label' column
    :raises KeyError:
        If a kept peptide has an HLA allele that is not a key of hla_fp_data
    """

    if scale_peptide_encoding:
        from sklearn.preprocessing import MinMaxScaler
        peptide_encoding = MinMaxScaler().fit_transform(peptide_encoding)

    # Extract the two needed columns once: a per-row `.iloc[idx][column]` lookup builds a whole
    # row Series on every iteration, which is needlessly slow for large datasets
    hla_alleles = peptide_data['hla_allele'].to_numpy()
    peptide_labels = peptide_data['label'].to_numpy()

    merged_peptide_hla_list = []
    labels = []
    for idx, pe in enumerate(peptide_encoding):
        tested_hla = hla_alleles[idx]
        # Fifth character of the allele name is the locus letter (e.g. 'A' in 'HLA-A01-01')
        if tested_hla[4] not in 'ABC':
            # Skip peptides with HLA alleles that are not A, B or C
            print(f'WARNING /!\\: Peptide {idx} has an HLA allele {tested_hla} that is not A, B or C, skipping it')
            continue
        hla_fp = get_flatten_hla_fp(scaled_hla_fp, tested_hla, hla_fp_data)
        merged_peptide_hla_list.append(np.concatenate([pe, hla_fp]))
        labels.append(peptide_labels[idx])

    merged_peptide_hla = np.array(merged_peptide_hla_list)

    assert merged_peptide_hla.shape[0] == len(labels), 'Number of labels does not match the number of merged peptide-HLA encodings'

    return merged_peptide_hla, labels
    

In [6]:
# Merge peptide encodings with HLA fingerprints for the training (binding affinity) split
train_ba_merged_enc, train_ba_labels = get_merged_peptide_hla_encodings(train_ba_peptide_encodings, train_ba_peptide_data, hla_norm_fp, hla_fp_data)
# Same for the test split: it must be built from the test encodings/data, not the training ones,
# otherwise the "test" arrays are just a copy of the training set.
# NOTE(review): with the default scale_peptide_encoding=True each call fits its own MinMaxScaler,
# so train and test peptide encodings are scaled independently -- confirm this is intended.
test_merged_enc, test_labels = get_merged_peptide_hla_encodings(test_peptide_encodings, test_peptide_data, hla_norm_fp, hla_fp_data)



In [12]:
# Write the merged encodings and their labels for both splits to the processed NetMHCpan folder
for output_name, output_array in (
    ('train_ba_merged_enc.npy', train_ba_merged_enc),
    ('train_ba_labels.npy', train_ba_labels),
    ('test_merged_enc.npy', test_merged_enc),
    ('test_labels.npy', test_labels),
):
    np.save(os.path.join(process_netmhcpan_folder, output_name), output_array)

In [11]:
# Sanity check of the merged training matrix: (number of kept peptides, merged vector length)
train_ba_merged_enc.shape

(169500, 2912)