# Running Binder Dataset Benchmark (with all peptide data)
### Option 1, 2, 3

### Libraries

In [1]:
#standard library
import ast
import logging

#analysis
import numpy as np
import pandas as pd

#plotting
import matplotlib.pyplot as plt
# Configure the root logger at DEBUG level for this notebook session.
logging.basicConfig(level=logging.DEBUG)
# Disable the 'matplotlib.font_manager' logger so its records are not emitted.
logging.getLogger('matplotlib.font_manager').disabled = True

### Class Dataset
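One can generate a dataset and analyze or visualize parts of it.
Elements can only be added to a dataset once: `add_dfs_to_dataset` requires the dataset to still be empty.
The analysis is restricted to alleles of the A, B and C loci (allele names starting with A, B or C).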

In [2]:
class DataSet:
    """
    Collection of peptide/allele pairs read from several delimited text files.

    The dataset is filled once with ``add_dfs_to_dataset`` and is held as a DataFrame
    with the columns ``Allele``, ``Peptide`` and ``Source``.
    """

    def __init__(self):
        """
        Initializes the dataset object.
        The mode is set to ABC by default meaning to only look at MHC alleles starting with A, B or C (not E, G).

        """
        self.dataframe = pd.DataFrame({'Allele': [], 'Peptide': [], 'Source': []})
        # every character of the mode string is one accepted first letter of an allele name
        self.mode = "ABC"

    def get_dataframe(self):
        """
        Returns the DataFrame held by the dataset (the object itself, not a copy).

        """
        return self.dataframe

    def add_dfs_to_dataset(self, paths, sources, seps):
        """
        Adds dataframes to the dataset object, assumes the datasets are annotated with Allele(separator)Peptide.
        Gets you a dataset with an Allele, a Peptide and a Source column.

        Duplicated (Allele, Peptide) pairs are removed within each file and again across all
        files; the first occurrence is kept, so the Source of a pair is the first file in
        ``paths`` that contains it. Afterwards only alleles whose name starts with one of the
        letters in ``self.mode`` are kept.

        Args:
            paths (list): list of paths to dataframes
            sources (list): list of Sources of dataframes (if you want to annotate where the data comes from)
            seps (list): list of separators of dataframes

        Raises:
            AssertionError: if the three lists differ in length, if no path is given or if the
                dataset already contains rows.

        """
        assert len(paths) == len(sources) == len(seps), "paths, sources and seps must have the same length"
        assert len(self.dataframe) == 0, "the dataset already contains data; elements can only be added once"
        assert len(paths) > 0, "at least one path is required"

        key_columns = ['Allele', 'Peptide']
        frames = []
        for path, source, sep in zip(paths, sources, seps):
            frame = pd.read_csv(path, sep=sep)
            #remove duplicates input df
            frame = frame.drop_duplicates(subset=key_columns, keep='first')
            # assign() builds a new frame, so no column is written into a slice of another frame
            frames.append(frame.assign(Source=source))

        joined_df = pd.concat(frames)
        joined_df = joined_df.drop_duplicates(subset=key_columns, keep='first')

        # keep only the alleles whose first character is listed in the mode ("ABC" -> A, B, C);
        # rows without an allele name do not match and are dropped
        first_letter = joined_df['Allele'].str[0]
        # copy() detaches the result from joined_df so that later column updates are unambiguous
        self.dataframe = joined_df[first_letter.isin(list(self.mode))].copy()

    def cleanup_peptide(self):
        """
        removes peptide annotation: AYSSY(Oxidation)KIIK -> AYSSYKIIK.

        Every parenthesised annotation is removed on its own, so the residues between two
        annotations are kept: AM(Oxidation)KLM(Oxidation)R -> AMKLMR. Nested annotations
        such as M(Oxidation (M)) are removed completely as well.

        Note: the sequences are only rewritten, rows are not removed. Two rows that differed
        only in their annotation are identical afterwards.

        """
        # matches one parenthesised group that contains no further parentheses
        innermost_annotation = r"\([^()]*\)"
        peptides = self.dataframe["Peptide"]
        # strip from the inside out until no complete group is left; a single greedy
        # "\(.*\)" would also swallow the residues between the first "(" and the last ")"
        while peptides.str.contains(innermost_annotation, regex=True, na=False).any():
            peptides = peptides.str.replace(innermost_annotation, "", regex=True)
        self.dataframe["Peptide"] = peptides


# Datasets

In [3]:
# Every input file is listed next to its source label, so a file can never end up
# with the label of another one. The order matters: for a peptide/allele pair that
# occurs in several files, the first file listed here provides the Source.
READOUT_FILES = [
    ('data_readout_final/1_readout_data_mhcmotifatlas_classI_MS_Peptides_all_peptides.csv', 'mhcmotifatlas'),
    ('data_readout_final/2_1readout_data_MSV000084172_2016.csv', 'msv84172_2016'),
    ('data_readout_final/2_2readout_data_MSV000084172_2017.csv', 'msv84172_2017'),
    ('data_readout_final/3_1readout_data_MSV000090323_batch12.csv', 'msv90323_12'),
    ('data_readout_final/3_2readout_data_MSV000090323_batch15.csv', 'msv90323_15'),
    ('data_readout_final/4_readout_data_PXD009531.csv', 'pxd9531'),
    ('data_readout_final/5_1breadout_data_netmhcpan_binder.csv', 'netmhcpan_binder'),
    ('data_readout_final/mhcflurry_data.csv', 'mhcflurry'),
]
readout_paths = [path for path, _ in READOUT_FILES]
readout_sources = [source for _, source in READOUT_FILES]

all_peptide_allele_data = DataSet()
all_peptide_allele_data.add_dfs_to_dataset(
    paths=readout_paths,
    sources=readout_sources,
    # all files are read tab-separated; the list length follows the number of files
    seps=['\t'] * len(readout_paths),
)
# strip annotations such as "(Oxidation)" from the peptide sequences
all_peptide_allele_data.cleanup_peptide()

# Binder Dataset Run with all peptides
### Settings

In [None]:
# SWITCH between the three scoring options:
# OPTION 1: SCALED_OVER_SPECIFICITY = False and IMPORTANCE_SCORE = 1
#           every peptide is scored equally
# OPTION 2: SCALED_OVER_SPECIFICITY = True and IMPORTANCE_SCORE = 1
#           a peptide that has multiple alleles is scored with 1/(number of alleles of that peptide)
# OPTION 3: SCALED_OVER_SPECIFICITY = True and IMPORTANCE_SCORE = integer
#           a peptide that has multiple alleles is scored with 1/(number of alleles of that peptide),
#           a peptide that has exactly one allele is scored with IMPORTANCE_SCORE
# NOTE(review): Option 3 only has an effect if the scoring cell reads IMPORTANCE_SCORE while
#               SCALED_OVER_SPECIFICITY is True - confirm this before relying on it.

SCALED_OVER_SPECIFICITY = False
IMPORTANCE_SCORE = 1

In [4]:
# Tab-separated tuedb table; it is read with its first column as index and provides the
# donor_code, peptide_sequence and all_hla_alleles_donor columns used below.
PATH_TO_TUEDB  = '../db_dump_311023_cleaned.tsv'
# Text file whose first line is evaluated into a dict; the notebook prints it as the
# per-allele lists of peptides that were filtered out by GibbsCluster.
PATH_TO_FILTERED_OUT_PEPTIDES = 'filtered_peptides_after_gibbs_clustering.txt'

In [5]:
# Prepare our own datasets
# One row per peptide together with the list of alleles it is annotated with.
# Identical (Peptide, Allele) rows are dropped before grouping: cleanup_peptide() runs
# after the de-duplication inside add_dfs_to_dataset, so stripping the annotations can
# make rows identical again, and an allele listed twice for a peptide would be counted
# twice by the scoring loop.
binder_df = (
    all_peptide_allele_data.get_dataframe()
    .drop_duplicates(subset=['Peptide', 'Allele'], keep='first')
    .groupby('Peptide')['Allele']
    .apply(list)
    .reset_index()
)

with open(PATH_TO_FILTERED_OUT_PEPTIDES, 'r') as f:
    # The first line of the file holds the dict as a Python literal. It is parsed with
    # ast.literal_eval instead of eval, so the file content is never executed as code.
    filtered_peptides = ast.literal_eval(f.readline())

print('This is the database with the monoallelic peptide-allele data:')
display(binder_df.head())
print('This is the list of peptides that were filtered out by GibbsCluster (per allele):')
print(filtered_peptides)

This is the database with the monoallelic peptide-allele data:


Unnamed: 0,Peptide,Allele
0,AAAAAAAAA,[C1202]
1,AAAAAAAAAA,[C1202]
2,AAAAAAAAAAA,"[A0205, B5601]"
3,AAAAAAAAAAAAAYSS,[B5701]
4,AAAAAAAAAAAAHQ,[B5701]


This is the list of peptides that were filtered out by GibbsCluster (per allele):
{'A0101': ['APGSAAPAAGSAPAA', 'ALGQNPTNAEVLK', 'TEIRLRLHY', 'QGPGLGVYAY', 'PKPQDGKETKAAD', 'SDVSLTACKV', 'SQRYESLK', 'AHFGGADAARRY', 'ERLYPLRKYAVKA', 'ASVAWAVLK', 'LECFININY', 'AKKGGEKKK', 'IVDARPAMAATSF', 'ASFLRAQER', 'NTLPVSGNLID', 'LAKAGKNQGD', 'ADKASASAPAPASA', 'EKLRQNLNK', 'TKDGKKDKKEEDKK', 'MCPFLFLAV', 'KREPEDEGEDDD', 'SKNASKVANKGKSKS', 'TLDRLLALNS', 'VLPPPPPDT', 'YGGGGGGGY', 'KVRGTAKANVGAGKK', 'KSAQKAQKA', 'NFKTPRGPV', 'IIHEIAVLEL', 'ALDNTDLVFG', 'VSLTACKV', 'ASVPAGGAVAV', 'VRYHTKVR', 'SENSSQPAKK', 'ANFVMNPGDA', 'MVASVAFGH', 'AMSARAEAIK', 'NCRACAKSY', 'VLDINSIDN', 'GMGQKDSYVGD', 'LGPTAQWSVEDEEE', 'PRYHEVHY', 'GPGARARYQKSYR', 'AVLNRNRPEKN', 'RIHGVGFK', 'IVGRPRHQGVMV', 'SAPKKSKADGQ', 'ITELKAQLA', 'VTGKSKKRN', 'KKHKKLELDGSY', 'KIEARERK', 'VLLADGNN', 'RSHRRDQKW', 'GKGDPKKPRGK', 'VIILNHPGQISAG', 'SKKPAGGVDFDET', 'LFDNAMLR', 'SQSDTVFDY', 'VVVQPPTP', 'AGGKAGKDSGKAK', 'YESINYIF', 'APSGSSGGICEKSK', 'RVKENDQ

In [6]:
# Prepare the tuedb database
tuedb_df = pd.read_csv(PATH_TO_TUEDB, sep='\t', index_col=0)

# all further steps work on a copy; tuedb_df keeps the table as it was read
tdb_df = tuedb_df.copy()

# NOTE(review): the scoring cell parses this column with eval(), so the entries look like
# strings; explode() would then leave them untouched - confirm what `alleles` should hold.
alleles = tdb_df['all_hla_alleles_donor'].explode().unique()
donors = tdb_df['donor_code'].unique()

tdb_df = tdb_df.sort_values(by=['donor_code'])
donor_groups = tdb_df.groupby('donor_code')

# one row per donor: every peptide sequence of the donor collected into a list ...
group1_df = donor_groups['peptide_sequence'].apply(list).reset_index()
# ... plus the allele entry of the first row of that donor
alleles_per_donor = donor_groups['all_hla_alleles_donor'].apply(list).reset_index()
group1_df['alleles'] = alleles_per_donor['all_hla_alleles_donor'].apply(lambda x: x[0])

display(group1_df.head())


Unnamed: 0,donor_code,peptide_sequence,alleles
0,04-001,"[TTDLFGRDLSY, AYLEAHETF, NRFQIATV, TAASRLVTL, ...","['A0101', 'A2402', 'B0801', 'B1402', 'C0701', ..."
1,1003,"[DAVTAFESI, PIDGNFFTY, FLSFMNTEL, YTWEEVFRV, T...","['A0101', 'A0201', 'B5101', 'B5701', 'C0401', ..."
2,1008,"[SLFEEMLQV, TLIDLPGITKV, VVYEGQLISI, AEFKEAFQL...","['A0101', 'A0201', 'B0801', 'B4001', 'C0304', ..."
3,1010,"[ALWSLPLYL, FLLPILSQI, DAYVILKTV, IYEPNFIFF, L...","['A0201', 'A2402', 'B5001', 'B5101', 'C0102', ..."
4,1012,"[TLLPLRVFL, VLWDRTFSLF, SLLDIIEKV, SRLPVLLLL, ...","['A0201', 'A0301', 'B0702', 'B4402', 'C0501', ..."


In [7]:
grouped_tuedb_df = group1_df.copy()

# Peptide -> allele list, built once so that each peptide is resolved with a dict access
# instead of two scans over the whole binder table. For a repeated peptide the first row
# wins, which is the row the former `.values[0]` selection returned.
peptide_to_alleles = {}
for binder_peptide, binder_allele_list in zip(binder_df['Peptide'], binder_df['Allele']):
    peptide_to_alleles.setdefault(binder_peptide, binder_allele_list)

# Filtered-out peptides per allele as sets for fast membership tests.
filtered_lookup = {allele: set(allele_peptides) for allele, allele_peptides in filtered_peptides.items()}

result_dict = {}

for _, row in grouped_tuedb_df.iterrows():
    #get data of one donor in tuedb database
    donor = row['donor_code']
    peptides = row['peptide_sequence']
    # the allele list is stored as text; ast.literal_eval parses it without executing it
    alleles = ast.literal_eval(row['alleles'])

    #create result_dict entry for donor and initialise it (every donor allele starts at 0)
    donor_scores = dict.fromkeys(alleles, 0)
    non_donor_scores = {}
    result_dict[donor] = {
        'overall peptides in tuedb': len(peptides),
        'peptide not in monoallelic ds': 0,
        'peptides': {'donor allele': donor_scores, 'not donor allele': non_donor_scores}
    }

    #Iterate through the peptide list of a donor
    for peptide in peptides:
        #peptide in tuedb dataset is interesting if it is in monoallelic dataset
        if peptide not in peptide_to_alleles:
            result_dict[donor]['peptide not in monoallelic ds'] += 1
            continue

        # drop every allele for which this peptide is listed in filtered_peptides
        binder_alleles = [
            binder_allele
            for binder_allele in peptide_to_alleles[peptide]
            if peptide not in filtered_lookup.get(binder_allele, ())
        ]
        if not binder_alleles:
            continue

        # importance added to every remaining allele of this peptide
        if not SCALED_OVER_SPECIFICITY:
            # Option 1: every peptide counts the same
            importance = IMPORTANCE_SCORE
        elif len(binder_alleles) == 1:
            # Option 3: a peptide with exactly one allele gets the importance score.
            # With IMPORTANCE_SCORE = 1 this equals 1/1, i.e. Option 2 is unchanged;
            # float() keeps the scaled results floating point as before.
            importance = float(IMPORTANCE_SCORE)
        else:
            # Options 2 and 3: a peptide with several alleles is shared between them
            importance = 1 / len(binder_alleles)

        for binder_allele in binder_alleles:
            # donor_scores holds exactly the donor's alleles as keys
            target_dict = donor_scores if binder_allele in donor_scores else non_donor_scores
            target_dict[binder_allele] = target_dict.get(binder_allele, 0) + importance

    print('\''+str(donor)+'\' : ',result_dict[donor], ', ')

with open('result_dict_monoallelic_dataset_benchmark_all_data.txt', 'w') as f:
    f.write(str(result_dict))


            Peptide          Allele
0         AAAAAAAAA         [C1202]
1        AAAAAAAAAA         [C1202]
2       AAAAAAAAAAA  [A0205, B5601]
3  AAAAAAAAAAAAAYSS         [B5701]
4    AAAAAAAAAAAAHQ         [B5701]


'04-001' :  {'overall peptides in tuedb': 1610, 'peptide not in monoallelic ds': 147, 'peptides': {'donor allele': {'A0101': 266.5390984015985, 'A2402': 283.4170103280765, 'B0801': 62.0847070759142, 'B1402': 137.6479829810712, 'C0701': 8.976925338470988, 'C0802': 4.450327011393126}, 'not donor allele': {'A0301': 5.981161893661893, 'A3601': 77.37063492063491, 'A0201': 18.54753972729779, 'B2705': 19.683512610623612, 'B3501': 5.007166489424554, 'A2301': 143.98959330065918, 'A2407': 76.05857299047713, 'C0702': 22.423876997922648, 'C1402': 28.217733638582345, 'C1403': 19.590804750804736, 'A2413': 14.301482961482956, 'B3924': 6.195670995670996, 'B3901': 3.4424274255156604, 'C1601': 11.224126836887319, 'C1203': 3.1226939560426796, 'B0702': 9.458802076060143, 'B3502': 1.4165873015873016, 'B3503': 1.8834846157426801, 'B6701': 1.0337301587301586, 'B8101': 0.9813852813852814, 'B4201': 4.197734254992319, 'C0202': 3.651885597734321, 'C0501': 6.56067035162777, 'C1701': 3.7017121600608838, 'A6802': 2