In [1]:
%matplotlib inline

import os, sys, gc
from tqdm import tqdm, tqdm_notebook, tqdm_pandas
from tqdm import trange
import time
from scipy import stats
from scipy.stats import shapiro

import pandas as pd
import numpy as np
import networkx as nx
import seaborn as sns
import matplotlib.pyplot as plt

from multiprocessing import Pool, Process

import itertools
from modules.kidera import score_positions, score_sequence

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import roc_curve, auc, roc_auc_score, classification_report
from sklearn.model_selection import RandomizedSearchCV, StratifiedShuffleSplit

from joblib import dump, load

import warnings
warnings.filterwarnings('ignore')

In [2]:
path = 'data/dataset.csv'
destination = ''
num_cores = 40
num_partitions = 120
features = ["helix.bend.pref", "side.chain.size",\
        "extended.str.pref", "hydrophobicity", "double.bend.pref", "partial.spec.vol",\
        "flat.ext.pref", "occurrence.alpha.reg", "pK.C", "surrounding.hydrop"]

def parallelize_dataframe(df, func):
    df_split = np.array_split(df, num_partitions)
    with Pool(num_cores) as pool:
        df = pd.concat(pool.map(func, df_split))
    return df

def score_kidera(df_split):
    return df_split.apply(lambda s: score_sequence(s))

"""
inds = np.permutation(15000000)
inds записываешь в файл
итерация один
df = pd.read_csv()
df = df.loc[inds[:10**6],].reset_index(drop=True)
итерация два
df = pd.read_csv()
df = df.loc[inds[1*10**6:2*10**6],].reset_index(drop=True)
"""

def load_data(path):
    idf1 = pd.read_csv(path)
    idf1 = idf1[['Epitope', 'Immunogenicity']]
    np.random.seed(42)
    high = 15000000
    size = 1000000
    low = 30000
    rand = np.random.randint(low=low, high=high, size=size)
    pos = np.array([i for i in range(low)])
    inds = np.concatenate((pos, rand))
    idf1 = idf1.iloc[inds].reset_index(drop=True)
    idf2 = parallelize_dataframe(idf1.Epitope, score_kidera)
#     idf2 = idf1.Epitope.apply(lambda s: score_sequence(s))
    idf = pd.concat([idf1, idf2], axis=1)

    idf.columns = ['Epitope', 'Immunogenicity'] + features
    idf.Immunogenicity = idf.Immunogenicity.map({'immunogenic': 1, 'non-immunogenic': 0,
                                             'Positive': 1, 'Negative': 0})
    return idf

In [None]:
# idf = load_data(path)

# X = idf[features].values
# y = idf['Immunogenicity'].values
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=True)

In [3]:
idf = load_data(path)

In [4]:
idf.Immunogenicity.value_counts()

0    1008414
1      21586
Name: Immunogenicity, dtype: int64

In [5]:
pos_ind = idf[idf['Immunogenicity']==1].index
neg_ind = idf[idf['Immunogenicity']==0].index
pos_ind.shape, neg_ind.shape

((21586,), (1008414,))

In [None]:
idf.iloc[]

In [6]:
def bootstrap_data():
    training_ind = np.append(pos_ind[5000:], np.random.choice(neg_ind, 16586))
    data = idf[features].values[training_ind]
    target = idf['Immunogenicity'].values[training_ind]
    return data, target

In [7]:
X_train, y_train = bootstrap_data()

In [8]:
n_estimators = [int(x) for x in np.logspace(start=4, stop=10, num=7, base=2)]
max_features = ["auto", "log2", None]
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]


random_grid = {'n_estimators': n_estimators,
               'max_features': max_features}
#                'max_depth': max_depth}
#                'min_samples_split': min_samples_split,
#                'min_samples_leaf': min_samples_leaf}

In [9]:
rf_random = RandomizedSearchCV(estimator = RandomForestClassifier(), param_distributions = random_grid,
                               n_iter = 20, cv = StratifiedShuffleSplit(n_splits=5, test_size=0.5),
                               verbose=0, random_state=42, n_jobs = -1)

rf_random.fit(X_train, y_train)

rf_random.best_params_, rf_random.best_score_

({'max_features': 'log2', 'n_estimators': 1024}, 0.85953213553599417)

In [10]:
clf = []

for i in tqdm_notebook(range(10)):
    rf_random = RandomizedSearchCV(estimator = RandomForestClassifier(), param_distributions = random_grid,
                               n_iter = 20, cv = StratifiedShuffleSplit(n_splits=5, test_size=0.5),
                               verbose=0, random_state=42, n_jobs = -1)

    X_train, y_train = bootstrap_data()
    
    rf_random.fit(X_train, y_train)
    
    clf.append(rf_random)




In [23]:
def make_test_data():
    training_ind = np.append(pos_ind[:5000], np.random.choice(neg_ind, 5000))
    data = idf[features].values[training_ind]
    target = idf['Immunogenicity'].values[training_ind]
    return data, target, sequences

X_test, y_test = make_test_data()

In [24]:
from scipy.stats.mstats import gmean
def predict_ens(clf):
    res = []
    for c in clf:
        res.append(c.predict_proba(X_test)[:,1])
    res = np.stack(res, axis=0)
    fin = gmean(res, axis=0)
    return [0 if i <.5 else 1 for i in fin]

scores = predict_ens(clf)
print("roc auc score is {:.3}".format(roc_auc_score(y_test, scores)))
print(classification_report(y_test, scores))

roc auc score is 0.812
             precision    recall  f1-score   support

          0       0.73      0.98      0.84      5000
          1       0.97      0.64      0.77      5000

avg / total       0.85      0.81      0.81     10000



In [25]:
from joblib import dump, load

In [14]:
dump(clf, "output/ensemble.joblib")

['output/ensemble.joblib']