In [83]:
import os #
import random #
import numpy as np #
import pandas as pd #
import multiprocessing as mp #

#-----RANDOM FOREST------
from sklearn.model_selection import RandomizedSearchCV #
from sklearn.metrics import precision_score #
from sklearn.metrics import recall_score #
from sklearn.metrics import confusion_matrix #
from sklearn import metrics #
from sklearn.ensemble import RandomForestClassifier #
from sklearn.metrics import balanced_accuracy_score #
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold #

In [53]:
%load_ext rpy2.ipython

In [54]:
%%R
library(ggplot2)
library(RColorBrewer)
library(ggbeeswarm)
library(ggpubr)

In [202]:
#-----read FMBA metadata------
FMBA_metadata=pd.read_csv("../fmba/fmba_metadata_edited.tsv",
                          sep=",",header=0, dtype = str)
print("FMBA_metadata \n", FMBA_metadata.COVID_status.value_counts())

#-----read Adaptive metadata------
AB_metadata=pd.read_csv("../adaptive/adaptive-metadata-edited.tsv",sep="\t")
AB_metadata["sample_short_name"]=AB_metadata["sample_name"].apply(lambda x: x[:-5]) #remove _TCRB from end of names
print("\nAdaptive_metadata \n", AB_metadata["COVID-19-status"].value_counts())

FMBA_metadata 
 COVID       1061
healthy      433
precovid     118
unknown       27
Name: COVID_status, dtype: int64

Adaptive_metadata 
 acute        1140
recovered     239
baseline       74
exposed        26
non-acute       4
Name: COVID-19-status, dtype: int64


In [203]:
#----select FMBA cohorts-----
fmba_IgM_pos = set(FMBA_metadata.loc[FMBA_metadata['COVID_IgM']=='yes', "name"])
print("fmba_IgM_pos:", len(fmba_IgM_pos))
fmba_IgM_neg_IgG_pos = set(FMBA_metadata.loc[(FMBA_metadata['COVID_IgM']=='no')&((FMBA_metadata['COVID_IgG']=='yes')), "name"])
print("fmba_IgM_neg_IgG_pos:", len(fmba_IgM_neg_IgG_pos))
fmba_PCR=set(FMBA_metadata.loc[FMBA_metadata['COVID_PCR']=='positive', "name"])
print("fmba_PCR:", len(fmba_PCR))
fmba_COVID=set(FMBA_metadata.loc[FMBA_metadata['COVID_status']=='COVID', "name"])
print("fmba_COVID:", len(fmba_COVID))
fmba_healthy=set(FMBA_metadata.loc[FMBA_metadata['COVID_status']=='healthy', "name"])
print("fmba_healthy:", len(fmba_healthy))
fmba_precovid=set(FMBA_metadata.loc[FMBA_metadata['COVID_status']=='precovid', "name"])
print("fmba_precovid:", len(fmba_precovid))

#----select Adaptive cohorts-----
adaptive_acute=set(AB_metadata.loc[AB_metadata["COVID-19-status"]=="acute","sample_short_name"])
print("\nAdaptive_acute:",  len(adaptive_acute))

fmba_IgM_pos: 525
fmba_IgM_neg_IgG_pos: 432
fmba_PCR: 39
fmba_COVID: 1061
fmba_healthy: 433
fmba_precovid: 118

Adaptive_acute: 1140


In [204]:
stat=pd.read_csv("metadata/clone_number.tsv",
                    sep="\t")
Adaptive=set(stat.loc[stat['Dataset']=='Adaptive', "sample_id_short"]) 
HIP=set(stat.loc[stat['Dataset']=='HIP', "sample_id_short"]) 
KECK=set(stat.loc[stat['Dataset']=='KECK', "sample_id_short"]) 
FMBA_COVID=set(stat.loc[stat['subdataset']=='COVID', "sample_id_short"]) 
FMBA_HEALTHY=set(stat.loc[stat['subdataset']=='HEALTHY', "sample_id_short"]) 
FMBA_PRECOVID=set(stat.loc[stat['subdataset']=='PRECOVID', "sample_id_short"])

In [327]:
#-----read Adaptive and HIP data-----
AB_data=pd.read_csv("../feature_search/wilcox_test/UNWEIGHT_TABLES/Adaptive/Adaptive_full_repertoire/Adaptive_public_top_10000.tsv",
                    sep="\t", index_col=0)
AB_data.index=[xT.split("_")[0] for xT in AB_data.index]

HIP_data=pd.read_csv("../feature_search/wilcox_test/UNWEIGHT_TABLES/HIP/HIP_full_repertoire/HIP_public_top_10000.tsv",
                    sep="\t", index_col=0)
HIP_data.index=[xT.split(".")[0] for xT in HIP_data.index]

KECK_data=pd.read_csv("../feature_search/wilcox_test/UNWEIGHT_TABLES/KECK/KECK_full_repertoire/KECK_top_public_10000.tsv",
                    sep="\t", index_col=0)
KECK_data.index=[xT.split("_")[0] for xT in KECK_data.index]

#----read FMBA data------
FMBA_data=pd.read_csv("../feature_search/wilcox_test/UNWEIGHT_TABLES/FMBA/FMBA_full_repertoire/FMBA_public_top_10000.tsv",
                    sep="\t", index_col=0)
FMBA_data.index=list(map(lambda x: x[0 : (x.rfind("S")-1)], FMBA_data.index))

In [328]:
#-----STATUS-----
FMBA_data_=FMBA_data.loc[FMBA_data.index.isin(fmba_COVID) | FMBA_data.index.isin(fmba_healthy) | FMBA_data.index.isin(fmba_precovid)]
FMBA_data_.loc[FMBA_data_.index.isin(FMBA_COVID) , "status"]="COVID"
print("FMBA_COVID", len(FMBA_data_.loc[FMBA_data_.index.isin(FMBA_COVID)]))
FMBA_data_.loc[FMBA_data_.index.isin(FMBA_HEALTHY) , "status"]="healthy"
print("FMBA_HEALTHY", len(FMBA_data_.loc[FMBA_data_.index.isin(FMBA_HEALTHY)]))
FMBA_data_.loc[FMBA_data_.index.isin(FMBA_PRECOVID) , "status"]="healthy"
print("FMBA_PRECOVID", len(FMBA_data_.loc[FMBA_data_.index.isin(FMBA_PRECOVID)]))

AB_data_=AB_data.loc[AB_data.index.isin(adaptive_acute)]
AB_data_["status"]="COVID"
print("AB", len(AB_data_))
HIP_data["status"]="healthy"
print("HIP", len(HIP_data))
KECK_data["status"]="healthy"
print("KEKC", len(KECK_data))
AB_data_=pd.concat([AB_data_, HIP_data, KECK_data])

FMBA_COVID 505
FMBA_HEALTHY 343
FMBA_PRECOVID 103
AB 1041
HIP 665
KEKC 120


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


In [329]:
COVID_HEALTHY=FMBA_data_.loc[FMBA_data_.index.isin(FMBA_COVID) | FMBA_data_.index.isin(FMBA_HEALTHY)]
PRECOVID=FMBA_data_.loc[FMBA_data_.index.isin(FMBA_PRECOVID)]

In [None]:
#MAKE HOLD-OUT SET (30%)

shufflesplit = StratifiedShuffleSplit(n_splits=1, random_state=42, test_size=int(len(COVID_HEALTHY)*0.3))


for train_index, test_index in shufflesplit.split(COVID_HEALTHY, COVID_HEALTHY["status"]):
    train_set=COVID_HEALTHY.iloc[train_index]
    test_set1=COVID_HEALTHY.iloc[test_index]    

In [330]:
train_set=COVID_HEALTHY.iloc[train_index]
test_set1=COVID_HEALTHY.iloc[test_index]    

In [331]:
test_set1=pd.concat([test_set1, PRECOVID])
test_set2=AB_data_

In [333]:
train_set.to_csv("DATA_FOR_TRAINING/FMBA/PRESENCE/full_repertoire/train_set_top_10000.tsv", sep='\t')
test_set1.to_csv("DATA_FOR_TEST/FMBA/PRESENCE/full_repertoire/test_set_top_10000.tsv", sep='\t')
test_set2.to_csv("DATA_FOR_TEST/ADAPTIVE/PRESENCE/full_repertoire/test_set_top_10000.tsv", sep='\t')

In [270]:
train_set=pd.read_csv("DATA_FOR_TRAINING/FMBA/FREQUENCY/top_public_2000AA/train_set.tsv", sep='\t', index_col=0)
test_set1=pd.read_csv("DATA_FOR_TEST/FMBA/FREQUENCY/top_public_2000AA/test_set.tsv", sep='\t', index_col=0)
test_set2=pd.read_csv("DATA_FOR_TEST/ADAPTIVE/FREQUENCY/top_public_2000AA/test_set.tsv", sep='\t', index_col=0)

In [335]:
output_folder="RF_COMBINATIONS_test/"
output_file=output_folder+"/"+"RF_COMBINATIONS_test.tsv"

parameters = {'criterion': ['gini', 'entropy'], 'max_depth' : [3, 5, 7], 'min_samples_leaf': [4, 10, 20],
               'min_samples_split': [10, 20, 40], 'max_features': [15, 20, 50]} 
quality_metrics=["normalization", "features", "mismatch", "weight", "CV_score", "acc_FMBA", "precision_FMBA", "recall_FMBA", "auc_FMBA", 
                 "acc_AB", "precision_AB", "recall_AB", "auc_AB"]

def run_classifier(normalization, features, mismatch, weight,
                   train_set, test_set1, test_set2,  
                   parameters=parameters, quality_metrics=quality_metrics, output_folder=output_folder, output_file=output_file):
    #------split into features matrix X and vector with ansvers y-----
    X_to_fit=train_set[train_set.columns[~train_set.columns.isin(["status"])]]
    y_to_fit=train_set["status"]
    X_test1=test_set1[test_set1.columns[~test_set1.columns.isin(["status"])]]
    y_test1=test_set1["status"]
    #-----find hyperparameters----
    clf=RandomizedSearchCV(RandomForestClassifier(class_weight="balanced",n_estimators=500), parameters, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
    clf.fit(X_to_fit, y_to_fit)
    #-----best estimator-----
    RF=clf.best_estimator_
    #---quality metrics-----
    CV_score=clf.best_score_
    quality_metrics_values=[CV_score]
    for test_set in [test_set1, test_set2]:
        X_test=test_set[test_set.columns[~test_set.columns.isin(["status"])]]
        y_test=test_set["status"]
        y_pred=RF.predict(X_test)
        score=RF.score(X_test, y_test)
        precision=precision_score(y_test, y_pred, pos_label ='COVID')
        recall=recall_score(y_test, y_pred, pos_label ='COVID')
        y_predicted_prob = RF.predict_proba(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(y_test, y_predicted_prob[:,1], pos_label='healthy')
        auc=metrics.auc(fpr, tpr)
        quality_metrics_values.extend([score, precision, recall, auc])  
    quality_metrics_values=[*[normalization, features, mismatch, weight], *quality_metrics_values]
#-----make output file, if it doesn't exist-----
    try:
        os.mkdir(output_folder)
        with open(output_file,"w") as out_file:
            out_file.write("\t".join(quality_metrics))
    except:
        pass
    #-----write a string to a file-----
    with open(output_file,"a") as out_file:
            out_file.write("\n")
            out_file.write("\t".join(str(M) for M in quality_metrics_values)) 

In [336]:
run_classifier(normalization="full repertoire", features="public clonotypes", mismatch="without mismatch", weight="unweighted", 
               train_set=train_set, test_set1=test_set1, test_set2=test_set2)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 48 concurrent workers.
[Parallel(n_jobs=-1)]: Done  66 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   11.4s finished


In [337]:
pd.read_csv("RF_COMBINATIONS/RF_COMBINATIONS.tsv", sep='\t')

Unnamed: 0,normalization,features,mismatch,weight,CV_score,acc_FMBA,precision_FMBA,recall_FMBA,auc_FMBA,acc_AB,precision_AB,recall_AB,auc_AB
0,top-5000,enriched in COVID,single mismatch,weighted,0.700855,0.608696,0.423729,0.617284,0.663724,0.452915,0.522727,0.276,0.461135
1,top-5000,enriched in COVID,single mismatch,unweighted,0.735043,0.695652,0.520833,0.617284,0.743827,0.601457,0.602265,0.851,0.555499
2,top-5000,public clonotypes,single mismatch,weighted,0.717949,0.758893,0.628205,0.604938,0.75323,0.616592,0.604359,0.915,0.678392
3,top-5000,public clonotypes,single mismatch,unweighted,0.7151,0.762846,0.62963,0.62963,0.757752,0.618274,0.599007,0.965,0.739954
4,top-2000 public,enriched in COVID,single mismatch,weighted,0.766382,0.816,0.6875,0.723684,0.802405,0.536628,0.542547,0.958422,0.410385
5,top-2000 public,enriched in COVID,single mismatch,unweighted,0.766382,0.792,0.639535,0.723684,0.809437,0.557558,0.553865,0.970149,0.487489
6,top-2000 public,public clonotypes,single mismatch,weighted,0.769231,0.808,0.684211,0.684211,0.813596,0.622674,0.599313,0.929638,0.686964
7,top-2000 public,public clonotypes,single mismatch,unweighted,0.766382,0.812,0.693333,0.684211,0.825242,0.605814,0.585865,0.945629,0.76655
8,top-2000 public,public clonotypes,without mismatch,weighted,0.752137,0.768,0.584906,0.815789,0.826452,0.602907,0.583716,0.947761,0.538215
9,full repertoire,public clonotypes,without mismatch,weighted,0.735043,0.774704,0.630769,0.901099,0.869556,0.44414,0.933333,0.026897,0.504444
