In [1]:
%matplotlib inline

#system and time
import os
import sys
import multiprocessing as mp
from tqdm import tqdm, tqdm_notebook, tqdm_pandas, trange
import time

#data manipulation
import pandas as pd
import numpy as np
import networkx as nx
import seaborn as sns
import matplotlib.pyplot as plt

#statistics
import itertools
from scipy import stats
from scipy.stats import shapiro

#local modules
from modules.kidera import alprop_sequence

#jupyter settings
import warnings
warnings.filterwarnings('ignore')

In [2]:
directory = 'bash_code/'

# Keep only entries whose name contains 'txt', and zero-pad '_8'/'_9' to
# '_08'/'_09' before sorting (presumably so that the lexicographic sort
# follows the numeric order of the file suffixes -- confirm against the
# actual file names).
ls = np.sort([
    name.replace('_8', '_08').replace('_9', '_09')
    for name in filter(lambda entry: 'txt' in entry, os.listdir(directory))
])

# Drop the arguments listing; it is not a prediction file.
keep_mask = ls != 'all_arguments.txt'
ls = ls[keep_mask]

In [3]:
def unite_df(lst):
    """Read every CSV named in ``lst`` and stack them into one DataFrame.

    Parameters
    ----------
    lst : iterable of str
        File names located inside the module-level ``directory``. Any
        ``_08``/``_09`` in a name is rewritten to ``_8``/``_9`` before the
        file is opened.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all files read, with a fresh 0..n-1 index.
        If a file does not have exactly 4 columns, its column count, name and
        first rows are printed, reading stops, and only the files read before
        it are returned. An empty DataFrame is returned when nothing was read.
    """
    # Collect the per-file frames and concatenate once at the end; calling
    # pd.concat inside the loop re-copies all accumulated rows on every pass.
    frames = []

    for filename in tqdm(lst):
        # Restore the on-disk name from its zero-padded form.
        filename = filename.replace('_08', '_8').replace('_09', '_9')
        # os.path.join avoids a doubled separator when `directory` already
        # ends with one.
        df1 = pd.read_csv(os.path.join(directory, filename))
        if len(df1.columns) != 4:
            # Unexpected layout: show what was found and stop reading.
            print(len(df1.columns))
            print(filename)
            print(df1.head())
            break
        frames.append(df1)

    if not frames:
        # pd.concat raises on an empty list; keep returning an empty frame.
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

In [4]:
# Read every file listed in `ls` and concatenate them into a single DataFrame.
df = unite_df(ls)

100%|██████████| 60/60 [00:43<00:00,  1.33s/it]


In [5]:
# Preview the first rows of the combined table.
df.head()

Unnamed: 0,HLA,Peptide,Aff(nM),Origin_protein
0,HLA-A*01:01,KLPYSITV,38855.4,UL107_HCMVA
1,HLA-A*01:01,LPYSITVT,41355.8,UL107_HCMVA
2,HLA-A*01:01,PYSITVTY,34020.0,UL107_HCMVA
3,HLA-A*01:01,YSITVTYD,29186.2,UL107_HCMVA
4,HLA-A*01:01,SITVTYDH,39723.4,UL107_HCMVA


In [6]:
# Save the combined table as CSV, without writing the row index.
# NOTE(review): assumes the output/iedb_data/ directory already exists -- confirm.
df.to_csv("output/iedb_data/predicted_new_alleles.csv", index=False)

In [6]:
# (rows, columns) of the combined table.
df.shape

(3802620, 4)

In [7]:
# Affinity cutoff (in the units of the 'Aff(nM)' column) at or below which a
# row is counted in the "bind count" column.
BIND_THRESHOLD_NM = 500

hlalst = df.HLA.unique()

# Rows per allele, derived from the data instead of a hard-coded allele count.
n_epitopes = df.shape[0] // len(hlalst) if len(hlalst) else 0
print("The number of epitopes taken for prediction {}".format(n_epitopes))
print("{:<15} {:>15} {:>15}".format("HLA", "bind count", "bind fraction"))
for hla in hlalst:
    tdf = df[df['HLA'] == hla]
    # Compute the binder count once and reuse it for both columns.
    bind_count = len(tdf[tdf['Aff(nM)'] <= BIND_THRESHOLD_NM])
    # The denominator is this allele's own row count, so the fraction stays
    # correct if the number of rows per allele changes.
    print("{:<15} {:>15} {:>15.03}".format(hla, bind_count, bind_count / len(tdf)))

The number of epitopes taken for prediction 380262
HLA                  bind count   bind fraction
HLA-A*01:01                1027          0.0027
HLA-A*02:01                9660          0.0254
HLA-A*24:02                2483         0.00653
HLA-A*31:01               11734          0.0309
HLA-B*07:02                3991          0.0105
HLA-B*08:01                3093         0.00813
HLA-B*35:01                2849         0.00749
HLA-B*35:02                 454         0.00119
HLA-B*55:01                 747         0.00196
HLA-C*06:02                 761           0.002


In [None]:
def plot_hla(hla, ax):
    """Draw the distribution of 'Aff(nM)' values for one HLA on ``ax``.

    Rows are taken from the module-level ``df`` where the 'HLA' column equals
    ``hla``. The x-axis label is blanked and the HLA name is used as the title.
    """
    affinities = df.loc[df['HLA'] == hla, 'Aff(nM)']
    sns.distplot(affinities, ax=ax)
    ax.set_title(hla)
    ax.set_xlabel("")

# One panel per allele, two panels per row. The grid is sized from
# len(hlalst) rather than assuming exactly 10 alleles, so fewer or more
# alleles no longer raise IndexError or get silently dropped.
n_cols = 2
n_rows = max(1, (len(hlalst) + n_cols - 1) // n_cols)

# squeeze=False keeps `axs` two-dimensional even for a single row.
fig, axs = plt.subplots(n_rows, n_cols, squeeze=False)
fig.set_size_inches(7, 2.5 * n_rows)  # 2.5 in per row, i.e. 12.5 in for 5 rows

# Row-major order: left panel, then right panel, then the next row.
panels = axs.ravel()
for hla, ax in zip(hlalst, panels):
    plot_hla(hla, ax)

# Hide any panel left without data (e.g. an odd number of alleles).
for ax in panels[len(hlalst):]:
    ax.set_visible(False)

plt.show()