# Preparing Gibbs Clustering Input and Running Monoallelic Dataset Benchmark
### Libraries

In [6]:
#analysis
import numpy as np
import pandas as pd
import re

#plotting
import matplotlib.pyplot as plt

#logging
import logging
logging.basicConfig(level=logging.DEBUG)
logging.getLogger('matplotlib.font_manager').disabled = True

### Class Dataset
One can generate a dataset and analyze or visualize parts of it.
You can only add elements to dataset once.
Analysis works on ABC loci.

In [7]:
class DataSet:
    def __init__(self):
        """
        Initializes the dataset object.
        The mode is set to ABC by default meaning to only look at MHC alleles starting with A, B or C (not E, G).

        """
        self.dataframe = pd.DataFrame({'Allele': [], 'Peptide': [], 'Source': []})
        self.mode = "ABC"

    def get_dataframe(self):
        return self.dataframe

    def add_dfs_to_dataset(self, paths, sources, seps):
        """
        Adds dataframes to dataset object, assumes datsets are annotated with Allele(seperator)Peptide.
        Gets you a datset with Allele Peptide and Source column

        Args:
            paths (list): list of paths to dataframes
            sources (list): list of Sources of dataframes (if you want to annotate where the data comes from)
            seps (list): list of seperators of dataframes
        
        """
        assert len(paths) == len(sources) == len(seps)
        assert len(self.dataframe) == 0
        assert len(paths) > 0
        
        df_list = []
        for i,path in enumerate(paths):
            df = pd.read_csv(path, sep=seps[i])
            #remove duplicates input df
            df = df.drop_duplicates(subset=['Allele','Peptide'], keep='first')
            df['Source'] = sources[i]
            df_list.append(df)
        joined_df = pd.concat(df_list)
        joined_df = joined_df.drop_duplicates(subset=['Allele','Peptide'], keep='first')
        joined_df = joined_df[joined_df['Allele'].str.startswith('A') | joined_df['Allele'].str.startswith('B') | joined_df['Allele'].str.startswith('C')]
        self.dataframe = joined_df

    
    def cleanup_peptide(self):
        """
        removes peptide annotation: AYSSY(Oxidation)KIIK -> AYSSYKIIK.

        """
        self.dataframe["Peptide"] = self.dataframe["Peptide"].str.replace(r"\(.*\)","", regex=True)


def _check_dict_allele_specific(dict):
    """
    Checks if dictionary belongs to an allele specific peptide. There needs to be exactly one 1 value and two 0 values in the dictionary.

    Args:
        dict (dict): dictionary with allele counts
    
    Returns:
        bool: True if dict belongs to allele specific peptide, False otherwise

    """
    return sum(1 for value in dict.values() if value == 0) == 2 and sum(1 for value in dict.values() if value == 1) == 1


# Allele-associated dataset

In [8]:
all_peptide_allele_data = DataSet()
all_peptide_allele_data.add_dfs_to_dataset(paths=['../../DATA/data_readout_final/1_readout_data_mhcmotifatlas_classI_MS_Peptides_all_peptides.csv', 
                                    '../../DATA/data_readout_final/2_1readout_data_MSV000084172_2016.csv',
                                    '../../DATA/data_readout_final/2_2readout_data_MSV000084172_2017.csv',
                                    '../../DATA/data_readout_final/3_1readout_data_MSV000090323_batch12.csv',
                                    '../../DATA/data_readout_final/3_2readout_data_MSV000090323_batch15.csv',
                                    '../../DATA/data_readout_final/4_readout_data_PXD009531.csv',
                                    '../../DATA/data_readout_final/5_1breadout_data_netmhcpan_binder.csv',
                                    '../../DATA/data_readout_final/mhcflurry_data.csv'], 
                            sources=['mhcmotifatlas',
                                    'msv84172_2016',
                                    'msv84172_2017',
                                    'msv90323_12',
                                    'msv90323_15',
                                    'pxd9531',
                                    'netmhcpan_binder',
                                    'mhcflurry'],
                            seps=['\t']*8)
all_peptide_allele_data.cleanup_peptide()

# Binder Dataset Run with only allele specific peptides
### Settings

In [9]:
#SWITCH:
# ON: Gibbs Clustering: peptides that were filtered out while clustering peptides per alleles are not taken into account
# OFF: Use all allele specific peptides
GIBBS_CLUSTERING = True

In [10]:
PATH_TO_TUEDB  = '../db_dump_311023_cleaned.tsv'
PATH_TO_FILTERED_OUT_PEPTIDES = '../../DATA/filtered_peptides_after_gibbs_clustering.txt'

In [11]:
# Prepare our own datasets
binder_df = all_peptide_allele_data.get_dataframe()

#only allele specific ones
binder_df_allele_specific = binder_df.groupby('Peptide')['Allele'].apply(list).reset_index()
temp_df = binder_df_allele_specific['Allele'].apply(lambda x: {'A': ''.join(x).count('A'), 'B': ''.join(x).count('B'), 'C': ''.join(x).count('C')})
binder_df_allele_specific = binder_df_allele_specific[temp_df.apply(_check_dict_allele_specific)]
binder_df_allele_specific['Allele'] = binder_df_allele_specific['Allele'].apply(lambda x: x[0])

with open(PATH_TO_FILTERED_OUT_PEPTIDES, 'r') as f:
    filtered_peptides = f.readline()
    filtered_peptides = eval(filtered_peptides)


In [12]:
if GIBBS_CLUSTERING:
    for allele, peptides in filtered_peptides.items():
        mask = binder_df_allele_specific[(binder_df_allele_specific['Allele'] == allele) & binder_df_allele_specific['Peptide'].isin(peptides)]
        binder_df_allele_specific = binder_df_allele_specific.drop(mask.index)

    print('This is the database with the monoallelic peptide-allele data:')
    display(binder_df_allele_specific.head())
    print('This is the list of peptides that were filtered out by GibbsCluster (per allele):')
    print(filtered_peptides)
else:
    print('No GibbsClustering was performed, so no peptides were filtered out.')

This is the database with the monoallelic peptide-allele data:


Unnamed: 0,Peptide,Allele
0,AAAAAAAAA,C1202
1,AAAAAAAAAA,C1202
14,AAAAAAAAAF,C1202
16,AAAAAAAAALL,C1202
26,AAAAAAAAFT,C1202


This is the list of peptides that were filtered out by GibbsCluster (per allele):
{'A0101': ['APGSAAPAAGSAPAA', 'ALGQNPTNAEVLK', 'TEIRLRLHY', 'QGPGLGVYAY', 'PKPQDGKETKAAD', 'SDVSLTACKV', 'SQRYESLK', 'AHFGGADAARRY', 'ERLYPLRKYAVKA', 'ASVAWAVLK', 'LECFININY', 'AKKGGEKKK', 'IVDARPAMAATSF', 'ASFLRAQER', 'NTLPVSGNLID', 'LAKAGKNQGD', 'ADKASASAPAPASA', 'EKLRQNLNK', 'TKDGKKDKKEEDKK', 'MCPFLFLAV', 'KREPEDEGEDDD', 'SKNASKVANKGKSKS', 'TLDRLLALNS', 'VLPPPPPDT', 'YGGGGGGGY', 'KVRGTAKANVGAGKK', 'KSAQKAQKA', 'NFKTPRGPV', 'IIHEIAVLEL', 'ALDNTDLVFG', 'VSLTACKV', 'ASVPAGGAVAV', 'VRYHTKVR', 'SENSSQPAKK', 'ANFVMNPGDA', 'MVASVAFGH', 'AMSARAEAIK', 'NCRACAKSY', 'VLDINSIDN', 'GMGQKDSYVGD', 'LGPTAQWSVEDEEE', 'PRYHEVHY', 'GPGARARYQKSYR', 'AVLNRNRPEKN', 'RIHGVGFK', 'IVGRPRHQGVMV', 'SAPKKSKADGQ', 'ITELKAQLA', 'VTGKSKKRN', 'KKHKKLELDGSY', 'KIEARERK', 'VLLADGNN', 'RSHRRDQKW', 'GKGDPKKPRGK', 'VIILNHPGQISAG', 'SKKPAGGVDFDET', 'LFDNAMLR', 'SQSDTVFDY', 'VVVQPPTP', 'AGGKAGKDSGKAK', 'YESINYIF', 'APSGSSGGICEKSK', 'RVKENDQ

### TueDB for validation

In [7]:
# Prepare the tuedb database
tuedb_df = pd.read_csv(PATH_TO_TUEDB, sep='\t',index_col=0)
tdb_df = tuedb_df.copy()

alleles = tdb_df['all_hla_alleles_donor'].explode().unique()
donors = tdb_df['donor_code'].unique()
tdb_df = tdb_df.sort_values(by=['donor_code'])
group1_df = tdb_df.groupby('donor_code')['peptide_sequence'].apply(list).reset_index()
group1_df['alleles'] = tdb_df.groupby('donor_code')['all_hla_alleles_donor'].apply(list).reset_index()['all_hla_alleles_donor']
group1_df['alleles'] = group1_df['alleles'].apply(lambda x: x[0])
display(group1_df.head())


Unnamed: 0,donor_code,peptide_sequence,alleles
0,04-001,"[TTDLFGRDLSY, AYLEAHETF, NRFQIATV, TAASRLVTL, ...","['A0101', 'A2402', 'B0801', 'B1402', 'C0701', ..."
1,1003,"[DAVTAFESI, PIDGNFFTY, FLSFMNTEL, YTWEEVFRV, T...","['A0101', 'A0201', 'B5101', 'B5701', 'C0401', ..."
2,1008,"[SLFEEMLQV, TLIDLPGITKV, VVYEGQLISI, AEFKEAFQL...","['A0101', 'A0201', 'B0801', 'B4001', 'C0304', ..."
3,1010,"[ALWSLPLYL, FLLPILSQI, DAYVILKTV, IYEPNFIFF, L...","['A0201', 'A2402', 'B5001', 'B5101', 'C0102', ..."
4,1012,"[TLLPLRVFL, VLWDRTFSLF, SLLDIIEKV, SRLPVLLLL, ...","['A0201', 'A0301', 'B0702', 'B4402', 'C0501', ..."


### Run for TueDB validation set

In [8]:
group = group1_df.copy()

result_dict = {}

for _, row in group.iterrows():
    donor = row['donor_code']
    peptides = row['peptide_sequence']
    alleles = eval(row['alleles'])
    
    result_dict[donor] = {
        'overall peptides in tuedb': len(peptides),
        'peptide not in monoallelic ds': 0,
        'peptides': {'donor allele': {}, 'not donor allele': {}}
    }

    for allele in alleles:
        if allele not in result_dict[donor]['peptides']['donor allele']:
            result_dict[donor]['peptides']['donor allele'][allele] = 0
    
    for i, peptide in enumerate(peptides):
        if peptide in binder_df_allele_specific['Peptide'].values:
            binder_alleles = binder_df_allele_specific[binder_df_allele_specific['Peptide'] == peptide]['Allele'].values
            if len(binder_alleles) > 1:
                raise Exception("Monoallelic allele should only have one allele")
            binder_alleles = binder_alleles[0]
            
            target_dict = result_dict[donor]['peptides']['donor allele'] if binder_alleles in alleles else result_dict[donor]['peptides']['not donor allele']

            if binder_alleles not in target_dict:
                target_dict[binder_alleles] = 1
            else:
                target_dict[binder_alleles] += 1
        else:
            result_dict[donor]['peptide not in monoallelic ds'] += 1
    
    #print('\''+str(donor)+'\': '+str(result_dict[donor])+', ')

with open('monoallelic_dataset_benchmark_result_dict_RUN.txt', 'w') as f:
    f.write(str(result_dict))

'04-001': {'overall peptides in tuedb': 1610, 'peptide not in monoallelic ds': 1249, 'peptides': {'donor allele': {'A0101': 143, 'A2402': 112, 'B0801': 15, 'B1402': 74, 'C0701': 0, 'C0802': 1}, 'not donor allele': {'A2301': 5, 'A2902': 1, 'A6802': 1, 'C1402': 1, 'B2705': 3, 'A0207': 1, 'A2501': 1, 'A0201': 1, 'C0401': 1, 'B5703': 1}}}, 
'1003': {'overall peptides in tuedb': 2018, 'peptide not in monoallelic ds': 1574, 'peptides': {'donor allele': {'A0101': 118, 'A0201': 74, 'B5101': 177, 'B5701': 47, 'C0401': 5, 'C0602': 7}, 'not donor allele': {'C0303': 1, 'B5801': 3, 'A0207': 1, 'C0501': 1, 'B1501': 1, 'B2705': 1, 'C1505': 2, 'A6901': 1, 'A3201': 2, 'A2402': 1, 'A0205': 2}}}, 
'1008': {'overall peptides in tuedb': 1484, 'peptide not in monoallelic ds': 1197, 'peptides': {'donor allele': {'A0101': 92, 'A0201': 55, 'B0801': 34, 'B4001': 101, 'C0304': 0, 'C0701': 0}, 'not donor allele': {'C0303': 1, 'B1801': 1, 'B1501': 2, 'B4002': 1}}}, 
'1010': {'overall peptides in tuedb': 1866, 'pep

# RUN for USER

In [19]:
peptides_df = pd.read_csv('../../DATA/HNSCC_1_34_Benign.tsv', sep='\t')
peptides = peptides_df['sequence']
#remove (...) information in peptides
pattern = re.compile(r'\(.*?\)')
peptides = [pattern.sub('', peptide) for peptide in peptides]
#remove duplicates
peptides = list(set(peptides))

result_dict = {}

peptides_not_in_ds = 0

result_dict = {}

for i, peptide in enumerate(peptides):
    if peptide in binder_df_allele_specific['Peptide'].values:
        binder_alleles = binder_df_allele_specific[binder_df_allele_specific['Peptide'] == peptide]['Allele'].values
        binder_alleles = binder_alleles[0]
        
        if binder_alleles not in result_dict:
            result_dict[binder_alleles] = 1
        else:
            result_dict[binder_alleles] += 1
    else:
        peptides_not_in_ds += 1

print(result_dict)
result_dict = {'B2705': 6, 'A0301': 51, 'B0801': 267, 'A1101': 130, 'B1801': 9, 'A3001': 1, 'A1102': 4, 'B4001': 380, 'A0201': 413, 'C1601': 2, 'B1501': 2, 'B4002': 158, 'A3101': 3, 'C0501': 3, 'B4403': 14, 'B0702': 4, 'C0303': 35, 'B4901': 11, 'B3701': 2, 'C0702': 1, 'C1602': 1, 'B4405': 7, 'A7401': 1, 'A0252': 5, 'C0304': 22, 'A6801': 3, 'A2902': 3, 'B0801L': 3, 'C0401': 1, 'A3402': 2, 'A0101': 1, 'B1302': 1, 'A0207': 5, 'B4032': 1, 'A0205': 6, 'C1202': 2, 'B3503': 3, 'C1502': 2, 'C0102': 1, 'B4402': 3, 'B5108': 1, 'B5001': 1, 'C0704': 1, 'A0211': 1, 'A0204': 2, 'B1402': 3, 'A0302': 1, 'C0701': 3, 'A6802': 1, 'C1203': 1, 'C0801': 1, 'B4104': 1, 'A0203': 1}

print(f'There are {peptides_not_in_ds} peptides out of {len(peptides)} not in allele-associated dataset.')

dict_a = {k: v for k, v in result_dict.items() if k.startswith('A')}
dict_a = sorted(dict_a.items(), key=lambda item: item[1], reverse=True)
dict_b = {k: v for k, v in result_dict.items() if k.startswith('B')}
dict_b = sorted(dict_b.items(), key=lambda item: item[1], reverse=True)
dict_c = {k: v for k, v in result_dict.items() if k.startswith('C')}
dict_c = sorted(dict_c.items(), key=lambda item: item[1], reverse=True)

prediction = [key for key,_ in dict_a[:2]] + [key for key,_ in dict_b[:2]] + [key for key,_ in dict_c[:2]]

print('The prediction is: ', end='')
for i, p in enumerate(prediction):
    if i == len(prediction)-1:
        print(p)
    else:
        print(p, end=', ')


There are 6283 peptides out of 7870 not in allele-associated dataset.
The prediction is: A0201, A1101, B4001, B0801, C0303, C0304


  '''peptides_df = pd.read_csv('../../DATA/HNSCC_1_34_Benign.tsv', sep='\t')
