# Prepare pHLA binding data

In this notebook we will prepare the pHLA binding data for the training and test sets. 
We will process peptide sequences with their correspongind HLA allele and label.
## Load useful functions

In [1]:
import os
import glob
import numpy as np
import pandas as pd
from typing import Tuple
from sklearn.preprocessing import MinMaxScaler

In [2]:
def is_valid_peptide(peptide: str) -> bool:
    """
    Check if the peptide has valid characters
    :param peptide: str
        Peptide to check
    :return: bool
        True if peptide is valid, False otherwise
    """
    peptide = peptide.upper()
    valid_aa = 'ACDEFGHIKLMNPQRSTVWY'
    return all([aa in valid_aa for aa in peptide])

def filter_peptides_by_len(peptides_list: list,
                           min_len: int = 8, 
                           max_len: int = 12) -> Tuple[list, list]:
    """
    Get list of peptides that have min_len or more AA and
    less or fewer max_len AA (min_len <= len(peptide) <= max_len)
    :param peptides_list: list
        List of peptides (as str) to filter
    :param min_len: int
        Minimum length of peptides to keep
    :param max_len: int
        Maximum length of peptides to keep
    :return: Tuple[list, list]
        * List of peptides that have min_len or more AA and
        less or fewer max_len AA
        * List of indices of the peptides in the original list
        that passed the filter
    """
    # Filter epitopes to 8 to 12 AA
    filt_peptides = []
    index_mask = []
    n_invalid = 0
    for i, p in enumerate(peptides_list):
        if max_len >= len(p) >= min_len and is_valid_peptide(p):
            filt_peptides.append(p)
            index_mask.append(i)
        else:
            n_invalid +=1
    print(f'\t {n_invalid} peptides out of {len(seqs_epi_raw)} were not betweem {min_len} and {max_len} AA.')
    return filt_peptides, index_mask

def scale_peptide_encodings(peptide_encodings: np.ndarray) -> np.ndarray:
    """
    Scale each feature (col) of the peptide encodings to be between 0 and 1
    :param peptide_encodings: np.ndarray
        Encodings to scale
    :return: np.ndarray
        Scaled encodings
    """
    scaler = MinMaxScaler()
    return scaler.fit_transform(peptide_encodings)


# Globals

In [3]:
DATA_FOLDER = os.path.join('..', 'data')
RAW_DATA_FOLDER = os.path.join(DATA_FOLDER, 'raw')
RAW_pHLA_BINDING_DATA_FOLDER = os.path.join(RAW_DATA_FOLDER, 'pHLA_binding')
INTERIM_DATA_FOLDER = os.path.join(DATA_FOLDER, 'interim')
INTERIM_pHLA_BINDING_DATA_FOLDER = os.path.join(INTERIM_DATA_FOLDER, 'pHLA_binding')
PROCESS_DATA_FOLDER = os.path.join(DATA_FOLDER, 'processed')
PROCESS_pHLA_BINDING_DATA_FOLDER = os.path.join(PROCESS_DATA_FOLDER, 'pHLA_binding')

## Process NetMHCpan data

We will generate the train and test sets using the paper original split.

### Warnings

- Only the binding affinity trainig set data is upload to the repository. The EL and test data are too large. Therefore, the used will need to executed it locally!

In [5]:
netmhcpan_raw_train_folder = os.path.join(RAW_pHLA_BINDING_DATA_FOLDER, 'NetMHCpan_train')
alleles_list_file = os.path.join(netmhcpan_raw_train_folder, 'allelelist')
netmhcpan_raw_test_folder = os.path.join(RAW_pHLA_BINDING_DATA_FOLDER, 'CD8_benchmark_filtered')
netmhcpan_interim_folder = os.path.join(INTERIM_pHLA_BINDING_DATA_FOLDER, 'NetMHCpan_dataset')

In [6]:
# binding affinity data
ba_files = glob.glob(f'{netmhcpan_raw_train_folder}/*_ba')
# eluted ligand data
el_files = glob.glob(f'{netmhcpan_raw_train_folder}/*el')
# Test set files
test_files = glob.glob(f'{netmhcpan_raw_test_folder}/*HLA*')

# Make dict with allelelist data for Multi-allelic data
with open(alleles_list_file, 'r') as f:
    alleles_dict = {}
    for line in f:
        line = line.strip()
        if line:
            line = line.replace(':', '-')
            allele, hla_list = line.split()
            hla_list = hla_list.split(',')
            alleles_dict[allele] = hla_list


## Process NetMHCpan data

We will generate a dataframe with the following columns:
- `peptide`: the peptide/epitope amino acid sequence
- `is_mono_allelic`: whether the epitope is presented by a single HLA allele (True, False)
- `hla_allele`: the HLA allele or alleles that the epitope binds (name format: HLA-A01-01)
- `label`: Whether the peptide binds to the HLA allele (1: Binder, 0: Non-binder)

### Warnings
- We won't apply the filter for the peptide length, as we are not going to encode the peptides.
- Only HLA with allele A, B or C will be kept.
- HLA-alleles names Normalized to HLA-A01-01 format

In [4]:
MAX_PEPTIDE_LEN = 15 # Usually epitopes are between 8 to 14 aminoacids, but we set to 15 to be sure


netmhcpan_raw_train_folder = os.path.join(RAW_pHLA_BINDING_DATA_FOLDER, 'NetMHCpan_train')
alleles_list_file = os.path.join(netmhcpan_raw_train_folder, 'allelelist')
netmhcpan_raw_test_folder = os.path.join(RAW_pHLA_BINDING_DATA_FOLDER, 'CD8_benchmark_filtered')
netmhcpan_process_folder = os.path.join(PROCESS_pHLA_BINDING_DATA_FOLDER, 'NetMHCpan_dataset')

In [5]:
# binding affinity data
ba_files = glob.glob(f'{netmhcpan_raw_train_folder}/*_ba')
# eluted ligand data
el_files = glob.glob(f'{netmhcpan_raw_train_folder}/*el')
# Test set files
test_files = glob.glob(f'{netmhcpan_raw_test_folder}/*HLA*')

# Make dict with allelelist data for Multi-allelic data
with open(alleles_list_file, 'r') as f:
    alleles_dict = {}
    for line in f:
        line = line.strip()
        if line:
            line = line.replace(':', '-')
            allele, hla_list = line.split()
            hla_list = hla_list.split(',')
            alleles_dict[allele] = hla_list


In [6]:
# Process Binding Affinity data
ba_seq_pep_list = [] # List of peptide sequences for all files
ba_epi_labels_list = [] # List of epitope labels for all files
ba_hla_list = [] # List of HLA alleles per epitope for all files
ba_is_multi_allelic = [] # List of whether the epitope is presented by multiple alleles

for f in ba_files:
    print(f'Processing {f}')
    df = pd.read_csv(f, sep=' ', header=None, names=('epitope', 'binding_affinity', 'hla_allele'))
    df = df[df['hla_allele'].str.startswith('HLA')] # Only consider HLAs
    # Normalize HLA naming to HLA-A01-01 format
    df['hla_allele'] = df['hla_allele'].str.replace(':', '-')
    # Filter the HLA alleles to only keep A, B or C
    df = df[df['hla_allele'].str[4].isin(['A', 'B', 'C'])]
    seqs_epi_raw = df['epitope'].values.tolist()
    valid_epi, valid_epi_idx = filter_peptides_by_len(seqs_epi_raw, max_len=MAX_PEPTIDE_LEN)
    
    df = df.iloc[valid_epi_idx] # Filter out invalid epitopes
    binding_labels = df['binding_affinity'] >= 0.426 # Above is considered a binder
    binding_labels_arr = binding_labels.to_numpy().astype(int)
    
    # Get array of HLAs
    hla_array = df['hla_allele'].values
    
    ba_seq_pep_list.append(df['epitope'].values)
    ba_epi_labels_list.append(binding_labels_arr)
    ba_hla_list.append(hla_array)
    ba_is_multi_allelic.append(np.zeros(binding_labels_arr.shape[0], dtype=bool))
    
all_seq_pep = np.concatenate(ba_seq_pep_list)
all_ba_epi_labels = np.concatenate(ba_epi_labels_list)
all_ba_hla = np.concatenate(ba_hla_list)
all_ba_is_multi_allelic = np.concatenate(ba_is_multi_allelic)

assert (all_ba_epi_labels.shape[0] == all_ba_hla.shape[0]
        == all_ba_is_multi_allelic.shape[0]), 'Mismatch in data shapes.'

binding_affinity_df = pd.DataFrame({
    'peptide': all_seq_pep,
    'is_mono_allelic': ~all_ba_is_multi_allelic,
    'hla_allele': all_ba_hla,
    'label': all_ba_epi_labels,
})

binding_affinity_df.to_csv(os.path.join(netmhcpan_process_folder, 'train_binding_affinity_peptides_data_MaxLenPep15_hla_ABC.csv'), index=False)

Processing ../data/raw/pHLA_binding/NetMHCpan_train/c003_ba
	 0 peptides out of 33789 were not betweem 8 and 15 AA.
Processing ../data/raw/pHLA_binding/NetMHCpan_train/c002_ba
	 0 peptides out of 33897 were not betweem 8 and 15 AA.
Processing ../data/raw/pHLA_binding/NetMHCpan_train/c000_ba
	 0 peptides out of 33446 were not betweem 8 and 15 AA.
Processing ../data/raw/pHLA_binding/NetMHCpan_train/c004_ba
	 0 peptides out of 34527 were not betweem 8 and 15 AA.
Processing ../data/raw/pHLA_binding/NetMHCpan_train/c001_ba
	 0 peptides out of 34448 were not betweem 8 and 15 AA.


In [14]:
binding_affinity_df.peptide.str.len().value_counts()

peptide
9     126375
10     31939
8       5562
11      5391
13       387
12       233
14       220
Name: count, dtype: int64

In [19]:
# Process Eluted Ligand data

el_seq_pep_list = [] # List of peptide sequences for all files
el_epi_labels_list = [] # List of epitope labels for all files
el_hla_list = [] # List of HLA alleles per epitope for all files
el_is_multi_allelic = [] # List of whether the epitope is presented by multiple alleles

for f in el_files:
    print(f'Processing {f}')
    df = pd.read_csv(f, sep=' ', header=None, names=('epitope', 'eluted_ligand', 'mhc_codes'))
    mhc_codes_in_file = df['mhc_codes'].values
    mhc_equivalences = [alleles_dict.get(mhc, [mhc]) for mhc in mhc_codes_in_file]
    # Check if each equivalence in mhc_equivalences are HLAs
    has_hla = [all([e.startswith('HLA') for e in equivalences]) for equivalences in mhc_equivalences]
    is_multi_allelic = [len(equivalences) > 1 for equivalences in mhc_equivalences]
    df['is_multi_allelic'] = is_multi_allelic
    df['hla_allele'] = mhc_equivalences
    df = df[has_hla] # Only consider HLAs
    
    # DEBUG
    # df = df[:1000]
    
    seqs_epi_raw = df['epitope'].values.tolist()
    valid_epi, valid_epi_idx = filter_peptides_by_len(seqs_epi_raw, max_len=MAX_PEPTIDE_LEN)
    
    df = df.iloc[valid_epi_idx] # Filter out invalid epitopes
    el_labels = df['eluted_ligand'] == 1 
    el_labels_arr = el_labels.to_numpy().astype(int)
    
    # Normalize HLA naming
    hla_array = df['hla_allele'].values
    
    el_seq_pep_list.append(df['epitope'].values)
    el_epi_labels_list.append(el_labels_arr)
    el_hla_list.append(hla_array)
    el_is_multi_allelic.append(df['is_multi_allelic'].values)
    
    
all_seq_pep = np.concatenate(el_seq_pep_list)
all_el_epi_labels = np.concatenate(el_epi_labels_list)
all_el_hla = np.concatenate(el_hla_list)
all_el_is_multi_allelic = np.concatenate(el_is_multi_allelic)

assert (all_el_epi_labels.shape[0] == all_el_hla.shape[0] == 
        all_el_is_multi_allelic.shape[0]), 'Mismatch in data shapes.'

eluted_ligand_df = pd.DataFrame({
    'peptide': all_seq_pep,
    'is_mono_allelic': ~all_el_is_multi_allelic,
    'hla_allele': all_el_hla,
    'label': all_el_epi_labels,
})

# Separate between multialelic and monoalelic data in EL data
eluted_ligand_df_mono = eluted_ligand_df[~all_el_is_multi_allelic]
eluted_ligand_df_multi = eluted_ligand_df[all_el_is_multi_allelic]
# Change hla_allele to string for monoalelic data
eluted_ligand_df_mono['hla_allele'] = eluted_ligand_df_mono['hla_allele'].apply(lambda x: x[0])
# Filter Monoallelic data to only keep HLA-A, B or C
eluted_ligand_df_mono = eluted_ligand_df_mono[eluted_ligand_df_mono['hla_allele'].str[4].isin(['A', 'B', 'C'])]
# Filter Multiallelic data to only keep instances with all HLAs as HLA-A, B or C
eluted_ligand_df_multi = eluted_ligand_df_multi[eluted_ligand_df_multi['hla_allele'].apply(lambda x: all([hla[4] in ['A', 'B', 'C'] for hla in x]))]

eluted_ligand_df_mono.to_csv(os.path.join(netmhcpan_process_folder, 'train_eluted_ligand_peptides_data_mono_MaxLenPep15_hla_ABC.csv.gz'), index=False)
eluted_ligand_df_multi.to_pickle(os.path.join(netmhcpan_process_folder, 'train_eluted_ligand_peptides_data_multi_MaxLenPep15_hla_ABC.pkl'))


Processing ../data/raw/pHLA_binding/NetMHCpan_train/c003_el
	 4 peptides out of 2139289 were not betweem 8 and 15 AA.
Processing ../data/raw/pHLA_binding/NetMHCpan_train/c001_el
	 1 peptides out of 2128029 were not betweem 8 and 15 AA.
Processing ../data/raw/pHLA_binding/NetMHCpan_train/c000_el
	 0 peptides out of 2135167 were not betweem 8 and 15 AA.
Processing ../data/raw/pHLA_binding/NetMHCpan_train/c002_el
	 0 peptides out of 2137018 were not betweem 8 and 15 AA.
Processing ../data/raw/pHLA_binding/NetMHCpan_train/c004_el
	 0 peptides out of 2130923 were not betweem 8 and 15 AA.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  eluted_ligand_df_mono['hla_allele'] = eluted_ligand_df_mono['hla_allele'].apply(lambda x: x[0])


In [10]:
eluted_ligand_df_mono.peptide.str.len().value_counts()

peptide
9     633595
10    566243
11    552255
8     542317
12    481920
13    467900
14    422991
15     12180
Name: count, dtype: int64

In [12]:
eluted_ligand_df_multi.peptide.str.len().value_counts()

peptide
9     1294911
10    1145518
11    1123207
8     1094057
12     781594
13     777007
14     774726
Name: count, dtype: int64

In [17]:
# Process Test set data 

test_seq_pep_list = [] # List of peptide sequences for all files
test_epi_labels_list = [] # List of epitope labels for all files
test_hla_list = [] # List of HLA alleles per epitope for all files
test_is_multi_allelic = [] # List of whether the epitope is presented by multiple alleles

for f in test_files:
    print(f'Processing {f}')
    df = pd.read_csv(f, sep=' ', header=None, names=('epitope', 'binding_label', 'hla_allele'))
    df = df[df['hla_allele'].str.startswith('HLA')] # Only consider HLAs
    # Normalize HLA naming to HLA-A01-01 format
    df['hla_allele'] = df['hla_allele'].str.replace(':', '-')
    # Filter the HLA alleles to only keep A, B or C
    df = df[df['hla_allele'].str[4].isin(['A', 'B', 'C'])]
    
    seqs_epi_raw = df['epitope'].values.tolist()
    valid_epi, valid_epi_idx = filter_peptides_by_len(seqs_epi_raw, max_len=MAX_PEPTIDE_LEN)
    
    df = df.iloc[valid_epi_idx] # Filter out invalid epitopes
    binding_labels = df['binding_label'] == 1 
    binding_labels_arr = binding_labels.to_numpy().astype(int)
    
    # HLA to array
    hla_array = df['hla_allele'].values
    
    test_seq_pep_list.append(df['epitope'].values)
    test_epi_labels_list.append(binding_labels_arr)
    test_hla_list.append(hla_array)
    test_is_multi_allelic.append(np.zeros(binding_labels_arr.shape[0], dtype=bool))
    
all_seq_pep = np.concatenate(test_seq_pep_list)
all_test_epi_labels = np.concatenate(test_epi_labels_list)
all_test_hla = np.concatenate(test_hla_list)
all_test_is_multi_allelic = np.concatenate(test_is_multi_allelic)

assert (all_test_epi_labels.shape[0] == all_test_hla.shape[0] ==
        all_test_is_multi_allelic.shape[0]), 'Mismatch in data shapes.'

test_set_df = pd.DataFrame({
    'peptide': all_seq_pep,
    'is_mono_allelic': ~all_test_is_multi_allelic,
    'hla_allele': all_test_hla,
    'label': all_test_epi_labels,
})

test_set_df.to_csv(os.path.join(netmhcpan_process_folder, 'test_set_peptides_data_MaxLenPep15_hla_ABC.csv.gz'), index=False)


Processing ../data/raw/pHLA_binding/CD8_benchmark_filtered/SLCKLNNVFY_HLA-A03:01
	 0 peptides out of 1092 were not betweem 8 and 15 AA.
Processing ../data/raw/pHLA_binding/CD8_benchmark_filtered/LAMPFATPM_HLA-B35:01
	 0 peptides out of 1190 were not betweem 8 and 15 AA.
Processing ../data/raw/pHLA_binding/CD8_benchmark_filtered/IVSPFIPLL_HLA-A02:01
	 0 peptides out of 2730 were not betweem 8 and 15 AA.
Processing ../data/raw/pHLA_binding/CD8_benchmark_filtered/IETVPVKL_HLA-B40:01
	 0 peptides out of 9975 were not betweem 8 and 15 AA.
Processing ../data/raw/pHLA_binding/CD8_benchmark_filtered/FLFTFFASI_HLA-A02:01
	 0 peptides out of 2128 were not betweem 8 and 15 AA.
Processing ../data/raw/pHLA_binding/CD8_benchmark_filtered/GMPPHMLPVL_HLA-A02:01
	 0 peptides out of 5075 were not betweem 8 and 15 AA.
Processing ../data/raw/pHLA_binding/CD8_benchmark_filtered/LLGRDSFEV_HLA-A02:01
	 0 peptides out of 2660 were not betweem 8 and 15 AA.
Processing ../data/raw/pHLA_binding/CD8_benchmark_filt

In [18]:
test_set_df.peptide.str.len().value_counts()

peptide
8     1686889
9     1685112
10    1683467
11    1681756
12    1680045
13    1678335
14    1676626
Name: count, dtype: int64