In [1]:
import numpy as np
import pandas as pd
import pickle as pkl
from tqdm.notebook import tqdm

In [2]:
def element_wise_compare(text1, text2):
    """Return the fraction of positions at which two sequences agree.

    The sequences are compared position by position (index 0 with index 0,
    and so on). The number of agreeing positions is divided by the length of
    the longer sequence, so the score is symmetric and a short sequence that
    is merely a prefix of a longer one is never reported as a perfect match.

    Parameters
    ----------
    text1, text2 : sized iterables (e.g. str)
        The sequences to compare.

    Returns
    -------
    float
        A value in [0, 1]. 0.0 is returned when both inputs are empty,
        because there is no position to compare.

    Notes
    -----
    A message is printed when the lengths differ; the comparison still runs,
    with the surplus positions of the longer sequence counted as mismatches.
    """
    if len(text1) != len(text2):
        print("texts not equal length")
    # Normalising by the longer length keeps unequal inputs from scoring 1.0
    # just because the shorter one matches a prefix of the other.
    longest = max(len(text1), len(text2))
    if longest == 0:
        # Nothing to compare; avoid dividing by zero.
        return 0.0
    return sum(c0 == c1 for c0, c1 in zip(text1, text2)) / longest

def find_matches(list1, list2, threshold=0.80):
    """Count the pairs (a, b), a from list1 and b from list2, that match.

    Every element of ``list1`` is scored against every element of ``list2``
    with ``element_wise_compare``; a pair counts as a match when its score is
    greater than or equal to ``threshold``.

    Parameters
    ----------
    list1, list2 : sequences
        Items to compare pairwise; both must support ``len``.
    threshold : float, default 0.80
        Minimum score for a pair to be counted.

    Returns
    -------
    numpy integer
        Total number of matching pairs. An element of ``list1`` that matches
        several elements of ``list2`` is counted once per matching pair.
    """
    # Row i / column j holds the score of list1[i] against list2[j].
    # The reshape keeps the array two-dimensional when either list is empty.
    scores = np.array(
        [[element_wise_compare(a, b) for b in list2] for a in list1],
        dtype=float,
    ).reshape(len(list1), len(list2))
    is_match = scores >= threshold
    # Alternative kept from the original as a disabled variant: count the
    # rows with at least one match, i.e. (is_match.sum(axis=1) > 0).sum()
    return is_match.sum()

In [3]:
# Species name -> list of HPV type numbers assigned to that species.
species = {
    'alpha1': [32, 42],
    'alpha2': [3, 10, 28, 29],
    'alpha3': [61],
    'alpha4': [2, 27, 57],
    'alpha5': [51, 69],
    'alpha6': [56],
    'alpha7': [18, 39, 45, 59, 68, 70],
    'alpha8': [7],
    'alpha9': [16, 31, 33, 35, 52, 58, 67],
    'alpha11': [34],
    'alpha13': [54],
    'beta1': [5, 8, 12, 14, 19, 20, 21, 24, 25, 36, 47, 93],
    'beta2': [9, 15, 17, 22, 23, 37, 38, 80, 100, 151],
    'beta3': [49, 75, 76],
    'beta4': [92],
    'beta5': [96],
    'gamma1': [4, 65, 95],
    'gamma2': [48],
    'gamma3': [50],
    'gamma4': [60],
    'gamma5': [88],
    'gamma12': [132],
    'gamma24': [197],
    'mu': [1],
    'nu': [41],
}

# Invert the mapping: type number -> species name. Should a type number be
# listed under several species, the species appearing last in `species` wins.
inverted_dict = {
    type_number: species_name
    for species_name, type_numbers in species.items()
    for type_number in type_numbers
}

# Re-key by virus label ('HPV<number>'), inserted in ascending numeric order.
inverted_dict = {
    'HPV' + str(type_number): inverted_dict[type_number]
    for type_number in sorted(inverted_dict)
}

In [4]:
# Load the selected results table (presumably netMHCpan output, judging by
# the file name -- confirm against the step that produces it).
full_df = pd.read_csv('../results/resultsnetMHCpan-selected-500.csv')
# Keep only the three columns used below. Duplicate rows are retained on
# purpose; the original had a disabled `.drop_duplicates()` here.
df = full_df.loc[:, ['virus', 'peptide', 'allele']]
df

Unnamed: 0,virus,peptide,allele
0,HPV36,FAFPNPFPM,HLA-C*03:04
1,HPV8,FAMSLIQVL,HLA-C*03:04
2,HPV61,FLLCKDYEV,HLA-A*02:01
3,HPV69,IPFPNTFPF,HLA-B*35:01
4,HPV21,TMWRYVYYV,HLA-A*02:01
...,...,...,...
19991,HPV65,EVDYDGLYF,HLA-C*05:01
19992,HPV4,EVDYDGLYF,HLA-C*05:01
19993,HPV132,VELASFHYK,HLA-A*11:01
19994,HPV34,ALYWYRTSL,HLA-B*07:02


In [5]:
# Score "1": each distinct (virus, peptide) pair contributes exactly 1, so the
# per-virus value is the number of distinct peptides listed for that virus,
# regardless of how many alleles each peptide appears with.
temp_df_1 = (
    df.groupby(['virus', 'peptide']).count()   # rows per (virus, peptide)
    .groupby(['virus']).count()                # distinct peptides per virus
    .sort_values('allele', ascending=False)
    .reset_index()
    .rename(columns={'allele': 'immunogenicity_count_1'})
    # Viruses missing from inverted_dict get NaN as their species.
    .assign(species=lambda frame: frame['virus'].map(inverted_dict))
    .loc[:, ['virus', 'species', 'immunogenicity_count_1']]
)
# The positional index is written as the first (unnamed) CSV column.
temp_df_1.to_csv('../results/immunogenicity_counts_1.csv')
temp_df_1

Unnamed: 0,virus,species,immunogenicity_count_1
0,HPV41,nu,211
1,HPV132,gamma12,211
2,HPV21,beta1,210
3,HPV48,gamma2,209
4,HPV60,gamma4,209
...,...,...,...
62,HPV33,alpha9,154
63,HPV3,alpha2,153
64,HPV32,alpha1,153
65,HPV54,alpha13,151


In [6]:
# Score "sqrt": each distinct (virus, peptide) pair contributes the square
# root of the number of rows (alleles) it appears with; contributions are
# summed per virus.
temp_df_sqrt = (
    df.groupby(['virus', 'peptide']).count()   # rows per (virus, peptide)
    .assign(allele=lambda frame: np.sqrt(frame['allele']))
    .groupby(['virus']).sum()
    .sort_values('allele', ascending=False)
    .reset_index()
    .rename(columns={'allele': 'immunogenicity_count_sqrt'})
    # Viruses missing from inverted_dict get NaN as their species.
    .assign(species=lambda frame: frame['virus'].map(inverted_dict))
    .loc[:, ['virus', 'species', 'immunogenicity_count_sqrt']]
)
# The positional index is written as the first (unnamed) CSV column.
temp_df_sqrt.to_csv('../results/immunogenicity_counts_sqrt.csv')
temp_df_sqrt

Unnamed: 0,virus,species,immunogenicity_count_sqrt
0,HPV41,nu,262.243888
1,HPV21,beta1,258.144216
2,HPV95,gamma1,257.824400
3,HPV132,gamma12,256.312285
4,HPV60,gamma4,254.865054
...,...,...,...
62,HPV33,alpha9,190.010739
63,HPV32,alpha1,189.739390
64,HPV54,alpha13,189.537979
65,HPV3,alpha2,184.760793


In [7]:
# Score "n": each distinct (virus, peptide) pair contributes the number of
# rows (alleles) it appears with; summed per virus this is the total number
# of rows with a non-null allele for that virus.
temp_df_n = (
    df.groupby(['virus', 'peptide']).count()   # rows per (virus, peptide)
    .groupby(['virus']).sum()
    .sort_values('allele', ascending=False)
    .reset_index()
    .rename(columns={'allele': 'immunogenicity_count_n'})
    # Viruses missing from inverted_dict get NaN as their species.
    .assign(species=lambda frame: frame['virus'].map(inverted_dict))
    .loc[:, ['virus', 'species', 'immunogenicity_count_n']]
)
# The positional index is written as the first (unnamed) CSV column.
temp_df_n.to_csv('../results/immunogenicity_counts_n.csv')
temp_df_n

Unnamed: 0,virus,species,immunogenicity_count_n
0,HPV41,nu,350
1,HPV95,gamma1,349
2,HPV21,beta1,340
3,HPV8,beta1,338
4,HPV17,beta2,335
...,...,...,...
62,HPV32,alpha1,252
63,HPV33,alpha9,249
64,HPV96,beta5,244
65,HPV10,alpha2,240


In [8]:
# Combine the three per-virus scores into one table. Both joins use the
# default inner merge on (species, virus), so only viruses present in all
# three tables are kept; row order follows temp_df_1.
temp_df_all = pd.merge(
    pd.merge(temp_df_1, temp_df_sqrt, on=['species', 'virus']),
    temp_df_n,
    on=['species', 'virus'],
)
# The positional index is written as the first (unnamed) CSV column.
temp_df_all.to_csv('../results/immunogenicity_counts_all.csv')
temp_df_all

Unnamed: 0,virus,species,immunogenicity_count_1,immunogenicity_count_sqrt,immunogenicity_count_n
0,HPV41,nu,211,262.243888,350
1,HPV132,gamma12,211,256.312285,329
2,HPV21,beta1,210,258.144216,340
3,HPV48,gamma2,209,249.620149,314
4,HPV60,gamma4,209,254.865054,331
...,...,...,...,...,...
62,HPV33,alpha9,154,190.010739,249
63,HPV3,alpha2,153,184.760793,237
64,HPV32,alpha1,153,189.739390,252
65,HPV54,alpha13,151,189.537979,253
