In [1]:
%matplotlib inline

import os, sys, gc
from tqdm import tqdm, tqdm_notebook, tqdm_pandas
from tqdm import trange
import time
from scipy import stats
from scipy.stats import shapiro

import pandas as pd
import numpy as np
import networkx as nx
import seaborn as sns
import matplotlib.pyplot as plt

from multiprocessing import Pool, Process

import itertools
from modules.kidera import score_positions, score_sequence

from catboost import CatBoostClassifier

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import roc_curve, auc, roc_auc_score, classification_report
from sklearn.model_selection import RandomizedSearchCV, StratifiedShuffleSplit

from joblib import dump, load

import warnings
warnings.filterwarnings('ignore')

In [2]:
path = 'data/dataset.csv'
destination = ''
num_cores = 40
num_partitions = 120
features = ["helix.bend.pref", "side.chain.size",\
        "extended.str.pref", "hydrophobicity", "double.bend.pref", "partial.spec.vol",\
        "flat.ext.pref", "occurrence.alpha.reg", "pK.C", "surrounding.hydrop"]

def parallelize_dataframe(df, func):
    df_split = np.array_split(df, num_partitions)
    with Pool(num_cores) as pool:
        df = pd.concat(pool.map(func, df_split))
    return df

def score_kidera(df_split):
    return df_split.apply(lambda s: score_sequence(s))

"""
inds = np.permutation(15000000)
inds записываешь в файл
итерация один
df = pd.read_csv()
df = df.loc[inds[:10**6],].reset_index(drop=True)
итерация два
df = pd.read_csv()
df = df.loc[inds[1*10**6:2*10**6],].reset_index(drop=True)
"""

def load_data(path):
    idf1 = pd.read_csv(path)
    idf1 = idf1[['Epitope', 'Immunogenicity']]
    np.random.seed(42)
    high = 15000000
    size = 1000000
    low = 30000
    rand = np.random.randint(low=low, high=high, size=size)
    pos = np.array([i for i in range(low)])
    inds = np.concatenate((pos, rand))
    idf1 = idf1.iloc[inds].reset_index(drop=True)
    idf2 = parallelize_dataframe(idf1.Epitope, score_kidera)
#     idf2 = idf1.Epitope.apply(lambda s: score_sequence(s))
    idf = pd.concat([idf1, idf2], axis=1)

    idf.columns = ['Epitope', 'Immunogenicity'] + features
    idf.Immunogenicity = idf.Immunogenicity.map({'immunogenic': 1, 'non-immunogenic': 0,
                                             'Positive': 1, 'Negative': 0})
    return idf

In [3]:
idf = load_data(path)

In [4]:
pos_ind = idf[idf['Immunogenicity']==1].index
neg_ind = idf[idf['Immunogenicity']==0].index
pos_ind.shape, neg_ind.shape
def bootstrap_data():
    training_ind = np.append(pos_ind[5000:], np.random.choice(neg_ind, 16586))
    data = idf[features].values[training_ind]
    target = idf['Immunogenicity'].values[training_ind]
    return data, target
X_train, y_train = bootstrap_data()

In [5]:
def make_test_data():
    training_ind = np.append(pos_ind[:5000], np.random.choice(neg_ind, 5000))
    data = idf[features].values[training_ind]
    target = idf['Immunogenicity'].values[training_ind]
    return data, target

X_test, y_test = make_test_data()

In [8]:
clf = CatBoostClassifier(verbose=False)
clf.fit(X_train, y_train)
roc_auc_score(y_test, clf.predict(X_test)) # It takes forever to predict

0.73089999999999999