In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import logging

logger = logging.getLogger()
logger.setLevel(logging.INFO)

In [3]:
import glob
import itertools
import os
import sys
import gc

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from tqdm.auto import tqdm

In [4]:
sys.path.append("../../..")

In [5]:
from bertrand.immrep.data_sources import read_test, read_train_be
from bertrand.immrep.sample_test_set import sample_test
from bertrand.immrep.sample_train_set import sample_train

In [6]:
test = read_test()
train = read_train_be()

## Some tests

In [7]:
test_sample = sample_test(train, test, seed=43)

GILGFVFTL 1879 0.05
RAKFKQLL 979 0.05
IVTDFSVIK 856 0.05
NLVPMVATV 644 0.05
GLCTLVAML 531 0.05
YLQPRTFLL 394 0.05
TPRVTGGGAM 375 0.05
RPPIFIRRL 341 0.05
VTEHDTLLY 339 0.05
RPHERNGFTVL 294 0.05


In [8]:
test_sample2 = sample_test(train, test, seed=42)

GILGFVFTL 1879 0.05
RAKFKQLL 979 0.05
IVTDFSVIK 856 0.05
NLVPMVATV 644 0.05
GLCTLVAML 531 0.05
YLQPRTFLL 394 0.05
TPRVTGGGAM 375 0.05
RPPIFIRRL 341 0.05
VTEHDTLLY 339 0.05
RPHERNGFTVL 294 0.05


In [9]:
compare(test_sample, test_sample2)

NameError: name 'compare' is not defined

In [None]:
def compare(s1, s2):
    """Report, per shared peptide, how many TCR pairs two samples have in common.

    For every peptide that occurs in both ``s1`` and ``s2`` one line is printed:
    ``<pep> +: <n1>-<shared>-<n2> -: <n1>-<shared>-<n2>``, where the first
    triple describes rows with ``y == 1`` and the second rows with ``y == 0``.
    A TCR pair is identified by ``CDR3a_extended + "_" + CDR3b_extended``.

    Parameters
    ----------
    s1, s2 : pandas.DataFrame
        Frames with ``Peptide``, ``CDR3a_extended``, ``CDR3b_extended`` and
        ``y`` columns.

    Raises
    ------
    AssertionError
        If, for some peptide, a TCR pair is positive in one sample and
        negative in the other.
    """

    def pair_keys(frame):
        # One string key per row: alpha chain and beta chain joined by "_".
        return frame.CDR3a_extended + "_" + frame.CDR3b_extended

    def n_shared(left, right):
        # Number of distinct TCR keys present in both frames.
        return len(np.intersect1d(pair_keys(left), pair_keys(right)))

    for pep in np.intersect1d(s1.Peptide, s2.Peptide):
        selector = f'Peptide == "{pep}"'
        rows1 = s1.query(selector)
        rows2 = s2.query(selector)

        pos1, pos2 = rows1[rows1.y == 1], rows2[rows2.y == 1]
        shared_pos = n_shared(pos1, pos2)

        neg1, neg2 = rows1[rows1.y == 0], rows2[rows2.y == 0]
        shared_neg = n_shared(neg1, neg2)

        print(f"{pep} +: {len(pos1)}-{shared_pos}-{len(pos2)} -: {len(neg1)}-{shared_neg}-{len(neg2)}")

        # Label consistency: no pair may switch class between the two samples.
        pos_then_neg = n_shared(pos1, neg2)
        neg_then_pos = n_shared(neg1, pos2)
        assert pos_then_neg == 0 and neg_then_pos == 0

In [75]:
train_sample = sample_train(train, test, test_sample, seed=42)

In [76]:
train_sample2 = sample_train(train, test, test_sample, seed=43)

In [73]:
compare(train_sample, train_sample2)

GILGFVFTL +: 428-428-428 -: 1519-1400-1519
GLCTLVAML +: 193-193-193 -: 965-528-965
IVTDFSVIK +: 511-511-511 -: 1877-1339-1877
NLVPMVATV +: 340-340-340 -: 1644-1537-1644
RAKFKQLL +: 271-271-271 -: 1355-1017-1355
RPHERNGFTVL +: 130-130-130 -: 650-238-650
RPPIFIRRL +: 165-165-165 -: 825-388-825
TPRVTGGGAM +: 170-170-170 -: 850-395-850
VTEHDTLLY +: 157-157-157 -: 785-352-785
YLQPRTFLL +: 187-187-187 -: 935-350-935


In [77]:
train_sample = sample_train(train, test, test_sample, seed=42, include_other_peptides=True)

In [78]:
train_sample2 = sample_train(train, test, test_sample, seed=43, include_other_peptides=True)

In [79]:
compare(train_sample, train_sample2)

GILGFVFTL +: 428-428-428 -: 2140-846-2140
GLCTLVAML +: 193-193-193 -: 965-196-965
IVTDFSVIK +: 511-511-511 -: 2555-1097-2555
NLVPMVATV +: 340-340-340 -: 1700-514-1700
RAKFKQLL +: 271-271-271 -: 1355-353-1355
RPHERNGFTVL +: 130-130-130 -: 650-105-650
RPPIFIRRL +: 165-165-165 -: 825-136-825
TPRVTGGGAM +: 170-170-170 -: 850-145-850
VTEHDTLLY +: 157-157-157 -: 785-111-785
YLQPRTFLL +: 187-187-187 -: 935-179-935


## Estimators

In [7]:
from scipy.spatial.distance import cdist
from sklearn.metrics import make_scorer, roc_auc_score
from sklearn.model_selection import cross_validate
from sklearn.neighbors import KNeighborsClassifier

In [8]:
from bertrand.immrep.sample_test_set import dist

In [9]:
def compute_pairwise_dist(x, y):
    """Return the matrix of summed alpha- and beta-chain distances between two frames.

    Entry ``[i, j]`` is ``dist(x.CDR3a_extended[i], y.CDR3a_extended[j])`` plus
    the same quantity for ``CDR3b_extended``, evaluated through
    ``scipy.spatial.distance.cdist`` with the imported ``dist`` callable as the
    metric. The result has one row per row of ``x`` and one column per row of
    ``y``.

    NOTE(review): ``dist`` comes from ``bertrand.immrep.sample_test_set``;
    presumably an edit (Levenshtein) distance -- confirm there.
    """

    def chain_dist(column):
        # cdist expects 2-D inputs, so each sequence becomes a 1-element row.
        rows = x[column].values.reshape(-1, 1)
        cols = y[column].values.reshape(-1, 1)
        return cdist(rows, cols, metric=dist)

    return chain_dist("CDR3a_extended") + chain_dist("CDR3b_extended")

In [10]:
from sklearn.base import BaseEstimator
from sklearn.model_selection import RepeatedStratifiedKFold

In [11]:
class KNNLevenshtein(BaseEstimator):
    """k-nearest-neighbours classifier on precomputed pairwise TCR distances.

    Wraps ``KNeighborsClassifier(n_neighbors=10, metric="precomputed")``; the
    distance matrices are produced by ``compute_pairwise_dist``. The training
    frame is kept on the instance (``self.X``) because prediction needs the
    distances from every query row to every training row.
    """

    def __init__(self):
        self.clf = KNeighborsClassifier(n_neighbors=10, metric="precomputed")
        self.classes_ = None

    def featurize(self, X):
        """Return the middle-padded alpha + beta strings for ``X``.

        Not used by ``fit`` / ``predict_proba`` (the classifier works on
        distances, not features). The method previously built this value and
        discarded it, returning ``None``; it now returns it.
        """
        tcr_str = X.CDR3a_extended.apply(middle_padding, MAX=25) + X.CDR3b_extended.apply(middle_padding, MAX=25)
        return tcr_str

    def fit(self, X, y=None):
        """Fit the k-NN model on the train-vs-train distance matrix; returns ``self``."""
        # Named train_dist (not dist) so the imported ``dist`` metric is not shadowed.
        train_dist = compute_pairwise_dist(X, X)
        self.clf.fit(train_dist, y)
        self.classes_ = np.unique(y)
        # Needed at prediction time to compute query-vs-train distances.
        self.X = X
        return self

    def predict_proba(self, X):
        """Return class probabilities for ``X`` (requires a prior ``fit``)."""
        query_dist = compute_pairwise_dist(X, self.X)
        return self.clf.predict_proba(query_dist)

In [12]:
clf = KNNLevenshtein()

In [13]:
def middle_padding(x, MAX):
    """Pad ``x`` to exactly ``MAX`` characters by inserting ``'-'`` in its middle.

    The sequence is split at ``len(x) // 2`` and the gap characters are placed
    between the two halves, so both termini keep fixed positions.

    Parameters
    ----------
    x : str
        Sequence to pad.
    MAX : int
        Target length.

    Returns
    -------
    str
        String of length ``MAX``; removing the ``'-'`` characters gives ``x``
        back (provided ``x`` itself contains no ``'-'``).

    Raises
    ------
    NotImplementedError
        If ``x`` is longer than ``MAX`` (truncation is not supported).
    """
    d = MAX - len(x)
    # The old guard compared half the length with MAX, so inputs with
    # MAX < len(x) <= 2*MAX+1 slipped through and came back unpadded and too
    # long ('-' * negative is ''), silently misaligning fixed-width features.
    if d < 0:
        raise NotImplementedError(f"sequence of length {len(x)} does not fit into MAX={MAX}")
    l = len(x) // 2
    return x[:l] + ('-' * d) + x[l:]

In [14]:
from bertrand.model.tokenization import AA_list

# Residue letter -> integer index, in the order given by AA_list.
AA_dict = {aa: i for i, aa in enumerate(AA_list)}

def onehot(tcr_str, MAX=50):
    """One-hot encode a Series of gap-padded sequences into a flat 2-D array.

    Parameters
    ----------
    tcr_str : pandas.Series of str
        Sequences in which ``'-'`` marks a gap.
    MAX : int, default 50
        Number of positions to encode. Positions past the end of a string are
        treated like gaps (left all-zero).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(tcr_str), MAX * 20)``; each block of 20 columns
        is the one-hot vector of one position.

    Notes
    -----
    Assumes every non-gap character is a key of ``AA_dict`` with an index below
    20 -- TODO confirm against ``AA_list``. Other characters make the index
    lookup fail.
    """
    oh_matrix = np.zeros(shape=(len(tcr_str), MAX, 20))
    # Iterate over MAX positions (this was hard-coded to 50, ignoring MAX).
    for p in range(MAX):
        c = tcr_str.str[p]
        # .str[p] is NaN when a string is shorter than p + 1; skip those too.
        is_residue = (c.notna() & (c != '-')).to_numpy()
        oh_matrix[is_residue, p, c[is_residue].map(AA_dict).values] = 1

    oh_array = oh_matrix.reshape(-1, MAX*20)
    return oh_array




In [15]:
from sklearn.ensemble import ExtraTreesClassifier

In [16]:
class XTMiddlePadding(BaseEstimator):
    """Extra-trees classifier on one-hot encoded, middle-padded CDR3 sequences.

    Each chain is padded to 25 characters with ``middle_padding``; the alpha
    and beta strings are concatenated and encoded with ``onehot``. The wrapped
    ``ExtraTreesClassifier`` is created with default arguments (no
    ``random_state`` is passed).
    """

    def __init__(self):
        self.clf = ExtraTreesClassifier()
        self.classes_ = None

    def featurize(self, X):
        """Turn the CDR3a/CDR3b columns of ``X`` into the one-hot feature matrix."""
        alpha = X.CDR3a_extended.apply(middle_padding, MAX=25)
        beta = X.CDR3b_extended.apply(middle_padding, MAX=25)
        return onehot(alpha + beta)

    def fit(self, X, y=None):
        """Fit the forest on the featurized ``X``; returns ``self``."""
        features = self.featurize(X)
        self.clf.fit(features, y)
        self.classes_ = np.unique(y)
        return self

    def predict_proba(self, X):
        """Return the forest's class probabilities for ``X``."""
        features = self.featurize(X)
        return self.clf.predict_proba(features)

In [17]:
def tcr_3_regions(tcr):
    """Lay a sequence out as three gap-padded regions: head, core and tail.

    The flank width depends on the sequence length: 3 below 12 residues,
    4 below 20, otherwise 5. The head is left-aligned in 5 characters, the core
    left-aligned in 15 and the tail right-aligned in 5, all filled with ``'-'``.
    For lengths 6 to 25 the result is exactly 25 characters long; nothing is
    truncated, so longer inputs yield longer strings.
    """
    n = len(tcr)
    flank = 3 if n < 12 else (4 if n < 20 else 5)
    head = tcr[:flank]
    core = tcr[flank:n - flank]
    tail = tcr[n - flank:]
    # Format specs: '-' fill, '<' left-align / '>' right-align, fixed width.
    return f"{head:-<5}{core:-<15}{tail:->5}"

In [18]:
tcr = train.CDR3a_extended.iloc[20]

In [19]:
tcr = train[train.CDR3a_extended.str.len() == 23].CDR3a_extended.iloc[0]

In [20]:
tcr = train[train.CDR3b_extended.str.len() == 24].CDR3b_extended.iloc[0]

In [21]:
tcr = train[train.CDR3b_extended.str.len() == 10].CDR3b_extended.iloc[0]

In [22]:
tcr = test[test.CDR3a_extended.str.len() == 7].CDR3a_extended.iloc[0]

In [23]:
tcr = train[train.CDR3b_extended.str.len() == 15].CDR3b_extended.sample().iloc[0]

In [24]:
middle_padding(tcr, 25)

'CASSQTS----------GDFSEQYF'

In [25]:
tcr_3_regions(tcr)

'CASS-QTSGDFS---------EQYF'

In [26]:
# min length -> left=3, right=3
# max length -> left=5, right=5

In [27]:
class XT3Regions(BaseEstimator):
    """Extra-trees classifier on one-hot encoded, three-region CDR3 layouts.

    Each chain is laid out with ``tcr_3_regions``; the alpha and beta strings
    are concatenated and encoded with ``onehot``. The wrapped
    ``ExtraTreesClassifier`` uses ``class_weight='balanced'`` and no
    ``random_state``.
    """

    def __init__(self):
        self.clf = ExtraTreesClassifier(class_weight='balanced')
        self.classes_ = None

    def featurize(self, X):
        """Turn the CDR3a/CDR3b columns of ``X`` into the one-hot feature matrix."""
        alpha = X.CDR3a_extended.apply(tcr_3_regions)
        beta = X.CDR3b_extended.apply(tcr_3_regions)
        return onehot(alpha + beta)

    def fit(self, X, y=None):
        """Fit the forest on the featurized ``X``; returns ``self``."""
        features = self.featurize(X)
        self.clf.fit(features, y)
        self.classes_ = np.unique(y)
        return self

    def predict_proba(self, X):
        """Return the forest's class probabilities for ``X``."""
        features = self.featurize(X)
        return self.clf.predict_proba(features)

In [28]:
# clf = XTMiddlePadding()

# clf = KNNLevenshtein()

# pep = 'GILGFVFTL'

# X = data_split['test_sample'].query(f'Peptide == "{pep}"')

# roc_auc_score(X.y, clf.fit(X, X.y).predict_proba(X)[:, 1])

In [29]:
def average_predictions(cv_df, X_test):
    """Average the second-column probabilities of all estimators in ``cv_df``.

    Parameters
    ----------
    cv_df : pandas.DataFrame
        Frame with an ``estimator`` column of fitted models exposing
        ``predict_proba``.
    X_test : object
        Passed unchanged to each estimator's ``predict_proba``.

    Returns
    -------
    numpy.ndarray
        One value per test sample: the mean over estimators of
        ``predict_proba(X_test)[:, 1]``.
    """
    per_estimator = [est.predict_proba(X_test)[:, 1] for est in cv_df["estimator"]]
    # Stack to (n_estimators, n_samples) and average over the estimator axis.
    return np.vstack(per_estimator).mean(axis=0)

In [30]:
from bertrand.immrep.training.cv import train_test_generator

In [31]:
# Train/test splits are generated once and cached on disk; later runs reuse
# the pickle so that every model is evaluated on identical splits.
splits_fn = 'data_splits7.pkl'
if not os.path.isfile(splits_fn):
    # No cache yet: materialise the generator and persist the result.
    data_splits = list(train_test_generator(train, test))
    pd.to_pickle(data_splits, splits_fn)
else:
    # Locally produced cache file; do not point this at untrusted pickles.
    data_splits = pd.read_pickle(splits_fn)

In [32]:
# data_splits = list(train_test_generator(train, test))

In [33]:
# Registry of the models to evaluate as (name, estimator instance) pairs.
# The name is used in the cache file names and in the 'model' result column.
clfs = [
    ('KNNLevenshtein',  KNNLevenshtein()),
    ('XTMiddlePad', XTMiddlePadding()),
    ('XT3Regions', XT3Regions()),
]

In [202]:
# results = []


In [203]:
# Evaluate every registered classifier on every data split, one peptide at a
# time, and cache the per-(classifier, split) result table as a pickle.
for data_split in data_splits:
    for clf_name, clf in clfs:
        print(clf_name)
        # One result file per (model, test iteration, train iteration); an
        # existing file means this combination was already computed.
        fn = f"{clf_name}-{data_split['test_iteration']}-{data_split['train_iteration']}.pkl"
        if os.path.isfile(fn):
            print('skip cache')
            continue
        pep_results = []
        # Test-set predictions gathered across peptides; entries for test rows
        # whose peptide is never predicted below stay NaN.
        y_pred_global = pd.Series(index=data_split['test_sample'].index, dtype=float)
        for pep, pep_df_train in data_split['train_sample'].groupby("Peptide", sort=False):
            # if pep == 'RPPIFIRRL':
            #     continue
            pep_df_test = data_split['test_sample'].query(f'Peptide == "{pep}"')

            # The estimators only consume the two CDR3 columns.
            X = pep_df_train[["CDR3a_extended", "CDR3b_extended"]].reset_index(drop=True)
            X_test = pep_df_test[["CDR3a_extended", "CDR3b_extended"]].reset_index(drop=True)

            # 5-fold stratified CV repeated 5 times; return_estimator=True keeps
            # the fitted estimator of every fold for the ensemble prediction below.
            cv_res = cross_validate(
                estimator=clf,
                X=X,
                y=pep_df_train.y.values,
                scoring="roc_auc",
                cv=RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=42),
                return_estimator=True,
                return_train_score=True,
                n_jobs=11,
            )

            cv_df = pd.DataFrame(cv_res)
            cv_df["Peptide"] = pep

            # roc_single: AUC of this one estimator refit on the whole
            # per-peptide training set.
            clf.fit(X, pep_df_train.y)
            y_pred_test = clf.predict_proba(X_test)[:, 1]
            roc_single = roc_auc_score(pep_df_test.y, y_pred_test)
            # roc_cv: AUC of the mean prediction of all CV-fold estimators.
            y_pred_test_cv = average_predictions(cv_df, X_test)
            roc_cv = roc_auc_score(pep_df_test.y, y_pred_test_cv)
            # The global score is built from the CV-ensemble predictions.
            y_pred_global.loc[pep_df_test.index] = y_pred_test_cv

            # The per-peptide scalars are repeated on every fold row of cv_df.
            cv_df = cv_df.rename(columns={"test_score": "cv_score"})
            cv_df["roc_single"] = roc_single
            cv_df["roc_cv"] = roc_cv
            cv_df["test_iteration"] = data_split["test_iteration"]
            cv_df["train_iteration"] = data_split["train_iteration"]
            cv_df['model'] = clf_name

            pep_results.append(cv_df)
            print(f"Peptide {pep} AUC={roc_single:.3f} AUCcv={roc_cv:.3f}")
        # Global AUC over all test rows that received a prediction.
        not_na =  ~y_pred_global.isna()
        roc_global = roc_auc_score(data_split['test_sample'].y[not_na], y_pred_global[not_na])
        print('Global AUC ', roc_global)
        for r in pep_results:
            r['roc_global'] = roc_global
            # results.append(r)
        
        # The pickled table still contains the fitted estimators ('estimator' column).
        pd.to_pickle(pd.concat(pep_results).reset_index(drop=True), fn)
        gc.collect()
        

KNNLevenshtein
skip cache
XTMiddlePad
skip cache
XT3Regions
Peptide TPRVTGGGAM AUC=0.512 AUCcv=0.570
Peptide RPPIFIRRL AUC=0.556 AUCcv=0.578
Peptide IVTDFSVIK AUC=0.622 AUCcv=0.573
Peptide RAKFKQLL AUC=0.743 AUCcv=0.757
Peptide GILGFVFTL AUC=0.874 AUCcv=0.872
Peptide YLQPRTFLL AUC=0.667 AUCcv=0.708
Peptide NLVPMVATV AUC=0.689 AUCcv=0.647
Peptide VTEHDTLLY AUC=0.640 AUCcv=0.631
Peptide RPHERNGFTVL AUC=0.715 AUCcv=0.675
Peptide GLCTLVAML AUC=0.704 AUCcv=0.712
Global AUC  0.7161778006076251
KNNLevenshtein
skip cache
XTMiddlePad
skip cache
XT3Regions
Peptide RPPIFIRRL AUC=0.574 AUCcv=0.638
Peptide IVTDFSVIK AUC=0.668 AUCcv=0.686
Peptide GLCTLVAML AUC=0.729 AUCcv=0.742
Peptide NLVPMVATV AUC=0.612 AUCcv=0.589
Peptide YLQPRTFLL AUC=0.863 AUCcv=0.875
Peptide GILGFVFTL AUC=0.863 AUCcv=0.870
Peptide RAKFKQLL AUC=0.645 AUCcv=0.672
Peptide VTEHDTLLY AUC=0.649 AUCcv=0.682
Peptide RPHERNGFTVL AUC=0.614 AUCcv=0.640
Peptide TPRVTGGGAM AUC=0.789 AUCcv=0.777
Global AUC  0.7373040185091835
KNNLevenshtein

In [34]:
# Collect the cached per-(model, split) result tables, without the fitted
# estimators, into a list of frames.
results = []
for data_split in data_splits:
    for clf_name, clf in clfs:
        fn = f"{clf_name}-{data_split['test_iteration']}-{data_split['train_iteration']}.pkl"
        print(fn)

        if os.path.isfile(fn):
            # Drop the 'estimator' column right away so the unpickled models
            # can be garbage-collected before the next file is read.
            results.append(pd.read_pickle(fn).drop(columns=['estimator']))
            gc.collect()
        else:
            print('skip not cache')

KNNLevenshtein-0-0.pkl
XTMiddlePad-0-0.pkl
XT3Regions-0-0.pkl
KNNLevenshtein-1-0.pkl
XTMiddlePad-1-0.pkl
XT3Regions-1-0.pkl
KNNLevenshtein-2-0.pkl
XTMiddlePad-2-0.pkl
XT3Regions-2-0.pkl
KNNLevenshtein-3-0.pkl
XTMiddlePad-3-0.pkl
XT3Regions-3-0.pkl
KNNLevenshtein-4-0.pkl
XTMiddlePad-4-0.pkl
XT3Regions-4-0.pkl
KNNLevenshtein-5-0.pkl
XTMiddlePad-5-0.pkl
XT3Regions-5-0.pkl
KNNLevenshtein-6-0.pkl
XTMiddlePad-6-0.pkl
XT3Regions-6-0.pkl


In [35]:
results_df = pd.concat(results).reset_index(drop=True)

In [36]:
results_df.sample(3)

Unnamed: 0,fit_time,score_time,cv_score,train_score,Peptide,roc_single,roc_cv,test_iteration,train_iteration,model,roc_global
2332,3.784006,0.952806,0.661729,0.826511,YLQPRTFLL,0.77,0.758,3,0,KNNLevenshtein,0.721131
1247,0.756395,0.040966,0.73494,1.0,TPRVTGGGAM,0.673961,0.703047,1,0,XTMiddlePad,0.708703
933,1.812234,0.468928,0.563218,0.766033,VTEHDTLLY,0.572318,0.561246,1,0,KNNLevenshtein,0.70473


In [37]:
# res_agg = results_df.groupby(["model", "test_iteration", "Peptide"]).agg(
#     {"cv_score": "mean", "roc_single": "first", "roc_cv": "first", "roc_global": 'first'}
# )


In [38]:
res_agg = results_df.groupby(["model", "Peptide"]).agg(
    {"cv_score": ["mean", "std"], "roc_single": ["mean", "std"], "roc_cv": ["mean", "std"], "roc_global": ["mean", "std"]}
)

In [39]:
res_agg.cv_score['mean'].unstack()

Peptide,GILGFVFTL,GLCTLVAML,IVTDFSVIK,NLVPMVATV,RAKFKQLL,RPHERNGFTVL,RPPIFIRRL,TPRVTGGGAM,VTEHDTLLY,YLQPRTFLL
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
KNNLevenshtein,0.659559,0.665555,0.631538,0.62225,0.606861,0.566196,0.544265,0.591379,0.541936,0.669808
XT3Regions,0.742081,0.698086,0.753604,0.682131,0.687052,0.594663,0.56301,0.652599,0.608021,0.774384
XTMiddlePad,0.737664,0.689565,0.744699,0.676933,0.658038,0.580479,0.555287,0.621875,0.574208,0.764361


In [40]:
mean_cv_auc = res_agg.cv_score['mean'].unstack().mean(axis=1)

In [41]:
mean_cv_auc

model
KNNLevenshtein    0.609935
XT3Regions        0.675563
XTMiddlePad       0.660311
dtype: float64

In [42]:
res_agg.roc_cv['mean'].unstack()

Peptide,GILGFVFTL,GLCTLVAML,IVTDFSVIK,NLVPMVATV,RAKFKQLL,RPHERNGFTVL,RPPIFIRRL,TPRVTGGGAM,VTEHDTLLY,YLQPRTFLL
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
KNNLevenshtein,0.823633,0.73153,0.603322,0.656041,0.658146,0.625079,0.615522,0.63522,0.560306,0.78375
XT3Regions,0.876323,0.745483,0.648837,0.657729,0.693563,0.656063,0.63826,0.664662,0.662234,0.793286
XTMiddlePad,0.850287,0.746933,0.619192,0.650516,0.676535,0.629778,0.597627,0.625049,0.586406,0.786179


In [61]:
res_agg.roc_single['mean'].unstack().mean(axis=1)

model
KNNLevenshtein    0.656190
XT3Regions        0.688168
XTMiddlePad       0.669001
dtype: float64

In [62]:
mean_test_auc = res_agg.roc_cv['mean'].unstack().mean(axis=1)

In [63]:
mean_test_auc

model
KNNLevenshtein    0.669255
XT3Regions        0.703644
XTMiddlePad       0.676850
dtype: float64

In [45]:
mean_global_auc = res_agg.roc_global['mean'].unstack().iloc[:, 0]

In [46]:
mean_global_auc

model
KNNLevenshtein    0.704622
XT3Regions        0.732852
XTMiddlePad       0.710128
Name: GILGFVFTL, dtype: float64

In [48]:
bertrand_results = pd.read_csv('/mnt/storage2/olek/bertrand-new/results/peptide-models-5/results.csv', index_col=0).reset_index(drop=True)

In [49]:
bertrand_results.head()

Unnamed: 0,epoch,learning_rate,loss,step,eval_loss,eval_roc,eval_runtime,eval_samples_per_second,eval_steps_per_second,Peptide,test_iteration,train_iteration,model,roc,roc_global
0,4.0,,,385.0,0.024713,0.756498,0.0593,1736.597,16.86,TPRVTGGGAM,0,0,BERTrand(pep),0.757341,0.689481
1,10.0,,,759.0,0.038495,0.507895,0.0522,1741.988,19.143,RPPIFIRRL,0,0,BERTrand(pep),0.56263,0.689481
2,16.0,,,2941.0,0.078764,0.68777,0.1457,1578.731,6.864,IVTDFSVIK,0,0,BERTrand(pep),0.529367,0.689481
3,5.0,,,702.0,0.026894,0.661243,0.087,1792.133,11.488,RAKFKQLL,0,0,BERTrand(pep),0.732767,0.689481
4,7.0,,,1160.0,0.02775,0.729437,0.1164,1657.509,8.588,GILGFVFTL,0,0,BERTrand(pep),0.830247,0.689481


In [50]:
bertrand_results.sample(3)

Unnamed: 0,epoch,learning_rate,loss,step,eval_loss,eval_roc,eval_runtime,eval_samples_per_second,eval_steps_per_second,Peptide,test_iteration,train_iteration,model,roc,roc_global
64,3.0,,,224.0,0.028701,0.607527,0.0442,1674.996,22.635,RPHERNGFTVL,6,0,BERTrand(pep),0.560889,0.671133
51,19.0,,,2440.0,0.058974,0.685871,0.0876,1848.812,11.412,RAKFKQLL,5,0,BERTrand(pep),0.71454,0.680903
13,4.0,,,680.0,0.025515,0.704857,0.1004,1802.55,9.959,NLVPMVATV,1,0,BERTrand(pep),0.473242,0.663778


In [51]:
f'{bertrand_results.roc_global.mean():.3f}+-{bertrand_results.roc_global.std():.3f}'

'0.680+-0.009'

In [55]:
bertrand_agg = bertrand_results.groupby('Peptide').agg({'eval_roc': ['mean', 'std'], 'roc': ['mean', 'std']})

In [56]:
bertrand_agg.round(3)

Unnamed: 0_level_0,eval_roc,eval_roc,roc,roc
Unnamed: 0_level_1,mean,std,mean,std
Peptide,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
GILGFVFTL,0.707,0.059,0.807,0.032
GLCTLVAML,0.665,0.053,0.653,0.033
IVTDFSVIK,0.705,0.02,0.589,0.053
NLVPMVATV,0.691,0.019,0.657,0.101
RAKFKQLL,0.673,0.07,0.707,0.037
RPHERNGFTVL,0.662,0.089,0.544,0.073
RPPIFIRRL,0.588,0.084,0.539,0.065
TPRVTGGGAM,0.683,0.08,0.647,0.071
VTEHDTLLY,0.561,0.047,0.576,0.087
YLQPRTFLL,0.743,0.065,0.763,0.069


In [58]:
pd.concat([res_agg.cv_score['mean'].unstack().T, bertrand_agg[('eval_roc', 'mean')]], axis=1)

Unnamed: 0_level_0,KNNLevenshtein,XT3Regions,XTMiddlePad,"(eval_roc, mean)"
Peptide,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
GILGFVFTL,0.659559,0.742081,0.737664,0.707367
GLCTLVAML,0.665555,0.698086,0.689565,0.664922
IVTDFSVIK,0.631538,0.753604,0.744699,0.705228
NLVPMVATV,0.62225,0.682131,0.676933,0.691368
RAKFKQLL,0.606861,0.687052,0.658038,0.67261
RPHERNGFTVL,0.566196,0.594663,0.580479,0.662009
RPPIFIRRL,0.544265,0.56301,0.555287,0.587601
TPRVTGGGAM,0.591379,0.652599,0.621875,0.682561
VTEHDTLLY,0.541936,0.608021,0.574208,0.561277
YLQPRTFLL,0.669808,0.774384,0.764361,0.743153


In [60]:
pd.concat([res_agg.roc_cv['mean'].unstack().T, bertrand_agg[('roc', 'mean')]], axis=1)

Unnamed: 0_level_0,KNNLevenshtein,XT3Regions,XTMiddlePad,"(roc, mean)"
Peptide,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
GILGFVFTL,0.823633,0.876323,0.850287,0.806864
GLCTLVAML,0.73153,0.745483,0.746933,0.652675
IVTDFSVIK,0.603322,0.648837,0.619192,0.588627
NLVPMVATV,0.656041,0.657729,0.650516,0.657254
RAKFKQLL,0.658146,0.693563,0.676535,0.707043
RPHERNGFTVL,0.625079,0.656063,0.629778,0.543873
RPPIFIRRL,0.615522,0.63826,0.597627,0.539298
TPRVTGGGAM,0.63522,0.664662,0.625049,0.646537
VTEHDTLLY,0.560306,0.662234,0.586406,0.576174
YLQPRTFLL,0.78375,0.793286,0.786179,0.762714


In [None]:
# y_pred = pd.Series(index=synthetic_test_limited.index, dtype=float)
# test_results = []
# for pep, test_pep_df in synthetic_test_limited.groupby('Peptide'):


#     y_pred_list = []
#     for _, result_row in results_df[results_df.peptide == pep].iterrows():
#         dt = compute_pairwise_dist(test_pep_df, result_row['train'])
#         y_pred_pep = result_row.clf.predict_proba(dt)[:, 1]
#         y_pred_list.append(y_pred_pep)
#     y_pred_mean = np.vstack(y_pred_list).mean(axis=0)

#     y_pred_mean = np.vstack(y_pred_list).mean(axis=0)
#     local_roc = roc_auc_score(test_pep_df.y, y_pred_mean)
#     y_pred.loc[test_pep_df.index] = y_pred_mean
#     print(pep, local_roc)
#     test_results.append(dict(
#         peptide=pep,
#         roc=local_roc))