In [1]:
import numpy as np
import pandas as pd
import pickle as pkl
from tqdm.notebook import tqdm

In [2]:
def element_wise_compare(text1, text2):
    """Return the fraction of positions at which two texts hold the same character.

    Characters are compared pairwise with ``zip``, i.e. only up to the length
    of the shorter text, and the number of matches is divided by
    ``len(text1)``.

    Parameters
    ----------
    text1 : str
        Reference text; its length is the denominator of the returned fraction.
    text2 : str
        Text compared against ``text1`` position by position.

    Returns
    -------
    float
        Number of matching positions divided by ``len(text1)``. Returns ``0.0``
        when ``text1`` is empty, since there are no positions to compare.

    Notes
    -----
    A message is printed when the two texts differ in length; the comparison
    still proceeds over the overlapping prefix.
    """
    if len(text1) != len(text2):
        print("texts not equal length")
    if len(text1) == 0:
        # Nothing to compare; return early instead of dividing by zero.
        return 0.0
    # Booleans sum as 0/1, so no intermediate list or int() cast is needed.
    return sum(c0 == c1 for c0, c1 in zip(text1, text2)) / len(text1)

def find_matches(list1, list2, threshold=0.80):
    """Count the pairs (one item from each list) whose similarity reaches ``threshold``.

    Every item of ``list1`` is scored against every item of ``list2`` with
    ``element_wise_compare``; a pair counts when its score is greater than or
    equal to ``threshold``. The result is the number of matching pairs, not
    the number of distinct ``list1`` items that have a match.

    Parameters
    ----------
    list1, list2 : sized iterables of str
        Texts to compare pairwise.
    threshold : float, default 0.80
        Minimum score (inclusive) for a pair to count as a match.

    Returns
    -------
    numpy integer
        Total number of matching pairs.
    """
    scores = np.zeros((len(list1), len(list2)))
    for row, left in enumerate(list1):
        # Fill one row of the score matrix at a time.
        scores[row, :] = [element_wise_compare(left, right) for right in list2]
    return (scores >= threshold).sum()

In [3]:
# Species name -> HPV type numbers belonging to it (the numeric suffix of the
# 'HPV<n>' names found in the `virus` column).
species = {
    'beta1': [5, 8, 12, 14, 19, 20, 21, 24, 25, 36, 47, 93],
    'beta2': [9, 15, 17, 22, 23, 37, 38, 80, 100, 151],
    'beta3': [49, 75, 76],
    'gamma1': [4, 65, 95],
}

# Reverse lookup: type number -> species name, with keys in ascending order.
inverted_dict = dict(sorted({
    hpv_type: name
    for name, members in species.items()
    for hpv_type in members
}.items()))

# Flat array of every type number listed above, in the order the species are declared.
relevant_species = np.concatenate([members for members in species.values()])

In [4]:
def species_coverage(virus_coverage):
    """Map virus names to their species and return the distinct species.

    Parameters
    ----------
    virus_coverage : iterable of str
        Virus names whose characters from index 3 onward parse as an integer
        key of ``inverted_dict`` (e.g. ``'HPV36'``).

    Returns
    -------
    numpy.ndarray
        Result of ``np.unique`` over the looked-up species names, i.e. the
        distinct names in sorted order.

    Raises
    ------
    KeyError
        If a type number is not a key of ``inverted_dict``.
    """
    labels = []
    for virus_name in virus_coverage:
        type_number = int(virus_name[3:])  # strip the 3-character prefix
        labels.append(inverted_dict[type_number])
    return np.unique(labels)

In [5]:
# Load the full results table, then keep one row per distinct
# (virus, peptide, protein) combination.
full_df = pd.read_csv('../results/resultsnetMHCpan-selected-500.csv')
df = full_df.loc[:, ['virus', 'peptide', 'protein']].drop_duplicates()
df

Unnamed: 0,virus,peptide,protein
0,HPV36,FAFPNPFPM,E1
1,HPV8,FAMSLIQVL,E1
2,HPV61,FLLCKDYEV,E6
3,HPV69,IPFPNTFPF,E1
4,HPV21,TMWRYVYYV,E2
...,...,...,...
19989,HPV197,LLKHNNQVK,E1
19990,HPV31,VETQQMVQV,E1
19991,HPV65,EVDYDGLYF,E2
19992,HPV4,EVDYDGLYF,E2


In [6]:
# Keep only peptides from viruses whose type number is listed in `species`,
# and label each remaining row with its species.
#
# The type number is parsed once into a stand-alone Series instead of a scratch
# column on `df`, and the filtered frame is taken with a single `.loc` call
# plus an explicit `.copy()`, so the later column assignment never writes into
# a slice of the original frame (the SettingWithCopyWarning pattern).
virus_numbers = df['virus'].apply(lambda name: int(name[3:]))  # 'HPV36' -> 36
is_relevant = virus_numbers.isin(relevant_species)
df = df.loc[is_relevant, ['virus', 'peptide', 'protein']].copy()
# Assignment aligns on the index, so each kept row gets the species of its own virus.
df['species'] = virus_numbers[is_relevant].apply(lambda number: inverted_dict[number])
df

Unnamed: 0,virus,peptide,protein,species
0,HPV36,FAFPNPFPM,E1,beta1
1,HPV8,FAMSLIQVL,E1,beta1
4,HPV21,TMWRYVYYV,E2,beta1
6,HPV25,TMWRYIYYV,E2,beta1
7,HPV36,FAMSLIRVL,E1,beta1
...,...,...,...,...
19974,HPV151,RFQGLNFIV,E1,beta2
19983,HPV5,LGYQPVPVK,E2,beta1
19984,HPV36,LGYQPVPVK,E2,beta1
19991,HPV65,EVDYDGLYF,E2,gamma1


In [7]:
# Build one coverage table per protein and collect them in `results`
# (same order as the protein list below).
#
# For every peptide of a protein, find all peptides of that protein whose
# position-wise identity with it is strictly above COVERAGE_THRESHOLD, and
# record which viruses (and hence which species) those matching peptides come
# from. Note the comparison here is strict (`>`), whereas `find_matches` uses
# an inclusive `>=` with the same default value.
COVERAGE_THRESHOLD = 0.8

results = []
for protein in ['E1', 'E2', 'E6', 'E7']:
    temp_df = df[df['protein'] == protein].copy()
    peptides = temp_df['peptide']
    viruses = temp_df['virus']

    # One cell per row, each holding an array of virus names. Filling an
    # object array element by element keeps every per-row array intact and
    # avoids both the dtype-less empty Series and a scratch boolean column
    # on the frame.
    virus_coverage = np.empty(len(temp_df), dtype=object)
    # Iterating the Series lets tqdm infer the total and show real progress.
    for position, peptide in enumerate(tqdm(peptides, desc=protein)):
        is_match = peptides.apply(
            lambda other: element_wise_compare(peptide, other) > COVERAGE_THRESHOLD)
        # Distinct viruses among the matching rows, in row order.
        virus_coverage[position] = viruses[is_match].drop_duplicates().values
    temp_df['virus_coverage'] = virus_coverage

    temp_df['virus_coverage_count'] = temp_df['virus_coverage'].apply(len)
    temp_df['species_coverage'] = temp_df['virus_coverage'].apply(species_coverage)
    temp_df['species_coverage_count'] = temp_df['species_coverage'].apply(len)

    # 0/1 indicator column per virus type: is that virus among the covered ones?
    for vir in inverted_dict.keys():
        temp_df['HPV' + str(vir) + '_coverage'] = temp_df['virus_coverage'].apply(
            lambda x: ('HPV' + str(vir)) in x).astype(int)
    # 0/1 indicator column per species.
    for spec in ['beta1', 'beta2', 'beta3', 'gamma1']:
        temp_df[spec + '_coverage'] = temp_df['species_coverage'].apply(
            lambda x: (spec in x)).astype(int)

    # Peptides covering the most viruses come first.
    results.append(temp_df.sort_values('virus_coverage_count', ascending=False))

  after removing the cwd from sys.path.


0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [12]:
# Preview the top rows of the first frame in `results` (the 'E1' table, since
# 'E1' is the first protein processed when `results` is built).
results[0].head()

Unnamed: 0,virus,peptide,protein,species,virus_coverage,virus_coverage_count,species_coverage,species_coverage_count,HPV4_coverage,HPV5_coverage,...,HPV76_coverage,HPV80_coverage,HPV93_coverage,HPV95_coverage,HPV100_coverage,HPV151_coverage,beta1_coverage,beta2_coverage,beta3_coverage,gamma1_coverage
13118,HPV12,LTDQSWKSF,E1,beta1,"[HPV23, HPV151, HPV100, HPV38, HPV22, HPV12, H...",25,"[beta1, beta2, beta3]",3,0,1,...,1,1,1,0,1,1,1,1,1,0
10513,HPV75,QSWKSFFKR,E1,beta3,"[HPV38, HPV23, HPV151, HPV100, HPV22, HPV8, HP...",25,"[beta1, beta2, beta3]",3,0,1,...,1,1,1,0,1,1,1,1,1,0
13132,HPV15,LTDQSWKSF,E1,beta2,"[HPV23, HPV151, HPV100, HPV38, HPV22, HPV12, H...",25,"[beta1, beta2, beta3]",3,0,1,...,1,1,1,0,1,1,1,1,1,0
13131,HPV75,LTDQSWKSF,E1,beta3,"[HPV23, HPV151, HPV100, HPV38, HPV22, HPV12, H...",25,"[beta1, beta2, beta3]",3,0,1,...,1,1,1,0,1,1,1,1,1,0
13134,HPV47,LTDQSWKSF,E1,beta1,"[HPV23, HPV151, HPV100, HPV38, HPV22, HPV12, H...",25,"[beta1, beta2, beta3]",3,0,1,...,1,1,1,0,1,1,1,1,1,0


In [11]:
# Write each per-protein coverage table to its own CSV file; `results` holds
# the tables in the same order as this protein list.
for position, protein_name in enumerate(['E1', 'E2', 'E6', 'E7']):
    output_path = f'../results/peptide_coverage_{protein_name}.csv'
    results[position].to_csv(output_path)