In [99]:
%matplotlib inline

import os, sys, gc
from tqdm import tqdm, tqdm_notebook, tqdm_pandas
from tqdm import trange
import time
from scipy import stats
from scipy.stats import shapiro

import pandas as pd
import numpy as np
import networkx as nx
import seaborn as sns
import matplotlib.pyplot as plt

from multiprocessing import Pool, Process

import itertools
from modules.kidera import score_positions, score_sequence

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import roc_curve, auc, classification_report, roc_auc_score
from sklearn.model_selection import RandomizedSearchCV

from joblib import dump, load

import warnings
warnings.filterwarnings('ignore')

In [2]:
idf1 = pd.read_csv('data/immunogenic_peptides.csv')
anchorpos = pd.read_csv('output/anchorpos.csv')

In [4]:
idf1.columns

Index(['Epitope', 'Epitope Start', 'Epitope End', ' MHC Allele',
       ' Epitope Source Organism Name', 'Length', 'Immunogenicity'],
      dtype='object')

In [20]:
cho_mhc = idf1[idf1[' MHC Allele'].str.startswith("HLA")][' MHC Allele'].unique()

In [27]:
cho_mhc.shape

(143,)

In [43]:
ind = [x[:4] == 'HLA-' and x[5:][:1] == '*' and x[8:][:1] == ':' and len(x) == 11 for x in cho_mhc]
ind = np.array(ind)

In [46]:
cho_hla = cho_mhc[ind]

In [47]:
anchorpos.columns = ['HLA', 0, 1, 2]
predicted_hla = anchorpos.HLA.unique()

In [48]:
overlap_hla = np.intersect1d(cho_hla, predicted_hla)
overlap_hla.shape

(60,)

In [49]:
overlap_hla

array(['HLA-A*01:01', 'HLA-A*02:01', 'HLA-A*02:02', 'HLA-A*02:03',
       'HLA-A*02:05', 'HLA-A*02:06', 'HLA-A*02:07', 'HLA-A*03:01',
       'HLA-A*11:01', 'HLA-A*23:01', 'HLA-A*24:02', 'HLA-A*24:03',
       'HLA-A*25:01', 'HLA-A*26:01', 'HLA-A*29:02', 'HLA-A*30:01',
       'HLA-A*30:02', 'HLA-A*31:01', 'HLA-A*33:01', 'HLA-A*68:01',
       'HLA-A*68:02', 'HLA-B*07:02', 'HLA-B*08:01', 'HLA-B*14:02',
       'HLA-B*15:01', 'HLA-B*15:03', 'HLA-B*27:02', 'HLA-B*27:05',
       'HLA-B*35:01', 'HLA-B*35:02', 'HLA-B*35:03', 'HLA-B*35:14',
       'HLA-B*37:01', 'HLA-B*38:01', 'HLA-B*39:01', 'HLA-B*39:06',
       'HLA-B*40:01', 'HLA-B*40:02', 'HLA-B*40:06', 'HLA-B*41:02',
       'HLA-B*44:02', 'HLA-B*44:03', 'HLA-B*45:01', 'HLA-B*48:01',
       'HLA-B*50:01', 'HLA-B*51:01', 'HLA-B*52:01', 'HLA-B*53:01',
       'HLA-B*55:01', 'HLA-B*55:02', 'HLA-B*57:01', 'HLA-B*57:03',
       'HLA-B*58:01', 'HLA-C*03:03', 'HLA-C*03:04', 'HLA-C*04:01',
       'HLA-C*06:02', 'HLA-C*07:01', 'HLA-C*08:01', 'HLA-C*08:

In [92]:
idf1[(idf1[' MHC Allele'].isin(overlap_hla)) & (idf1.Epitope.str.len()==9)].shape

(3516, 7)

In [87]:
def mask_pos(epitope, lu):
    return "".join([k for i,k in enumerate([i for i in epitope]) if i not in lu])

In [94]:
pos = anchorpos.set_index('HLA')
ndf = idf1[(idf1[' MHC Allele'].isin(overlap_hla)) & (idf1.Epitope.str.len()==9)]
ndf = ndf.reset_index(drop=True)
grouped = ndf.groupby(' MHC Allele')
groups = []
for name, group in grouped:
    lu = [int(i) for i in pos.loc[hla] if i==i]
    group.Epitope = group.Epitope.apply(lambda x: mask_pos(x, lu))
    groups.append(group)
mask_df = pd.concat(groups)
mask_df = mask_df.reset_index(drop=True)
mask_df.head()

Unnamed: 0,Epitope,Epitope Start,Epitope End,MHC Allele,Epitope Source Organism Name,Length,Immunogenicity
0,ITFNIDTY,278,286,HLA-A*01:01,Vaccinia virus NYCBH - Dryvax,9,Positive
1,ATALMTGF,1436,1444,HLA-A*01:01,Hepatitis C virus,9,Positive
2,SSIMSESY,420,428,HLA-A*01:01,Vaccinia virus,9,Positive
3,VSKYTDMY,842,850,HLA-A*01:01,Vaccinia virus,9,Positive
4,FTWANKQY,575,583,HLA-A*01:01,Vaccinia virus,9,Positive


In [120]:
features = ["helix.bend.pref", "side.chain.size",\
        "extended.str.pref", "hydrophobicity", "double.bend.pref", "partial.spec.vol",\
        "flat.ext.pref", "occurrence.alpha.reg", "pK.C", "surrounding.hydrop"]

n_estimators = [int(x) for x in np.linspace(start = 20, stop = 600, num = 15)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]


random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

def add_kidera(idf1):
    idf1 = idf1[['Epitope', 'Immunogenicity']]
    idf2 = idf1.Epitope.apply(lambda s: score_sequence(s))
    idf = pd.concat([idf1, idf2], axis=1)
    idf.columns = ['Epitope', 'Immunogenicity'] + features
    idf.Immunogenicity = idf.Immunogenicity.map({'immunogenic': 1, 'non-immunogenic': 0,
                                             'Positive': 1, 'Negative': 0})
    return idf

def preprocess(df):
    idf = add_kidera(df)
    X = idf[features].values
    y = idf['Immunogenicity'].values
    return train_test_split(X, y, test_size=0.25, stratify=y, random_state=42)

def train_classifier(data):
    X_train, X_test, y_train, y_test = preprocess(data)
    rf_random = RandomizedSearchCV(estimator = RandomForestClassifier(), param_distributions = random_grid,
                               n_iter = 100, cv = 3, verbose=0, random_state=42, n_jobs = -1)

    rf_random.fit(X_train, y_train)
    
    print("roc auc score is {:.3}".format(roc_auc_score(y_test, rf_random.predict(X_test))))
    print(classification_report(y_test, rf_random.predict(X_test)))
    return rf_random

In [118]:
train_classifier(ndf) # These are the original Chowell data

roc auc score is 0.731
             precision    recall  f1-score   support

          0       0.75      0.86      0.80       508
          1       0.76      0.60      0.67       371

avg / total       0.75      0.75      0.75       879



In [121]:
clf = train_classifier(mask_df) # These are the HLA binding position masked data

roc auc score is 0.734
             precision    recall  f1-score   support

          0       0.76      0.82      0.79       508
          1       0.72      0.65      0.68       371

avg / total       0.75      0.75      0.75       879

