In [1]:
import os
import sys

# Work from the project root so that relative paths such as 'data/...' and the
# local `utils` / `source` packages resolve.
# The guards keep this cell idempotent: an unconditional os.chdir('../') climbs
# one directory further every time the cell is re-run, and an unconditional
# append adds a duplicate sys.path entry on every run.
# NOTE(review): assumes the notebook's own directory has no `utils` folder — confirm.
if not os.path.isdir('utils'):
    os.chdir('../')
if os.getcwd() not in sys.path:
    sys.path.append(os.getcwd())

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
from sklearn.decomposition import PCA
from scipy.stats import fisher_exact, chi2_contingency
from sklearn.manifold import TSNE, MDS
from tqdm import tqdm
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.metrics import f1_score, precision_score, recall_score, roc_curve, PrecisionRecallDisplay, RocCurveDisplay
from multipy.fwer import sidak, hochberg
from xgboost import XGBClassifier
import random

from utils.viz_utils import plot_usage_matrix_pca, plot_boxplots_for_usage_matrix, _plot_feature_importance, plot_v_usage_hist, \
                                plot_cluster_map, significant_clones_distribution, plot_results_for_hla_class, plot_generated_to_real_distribution, \
                            plot_olga_cleanup_data, plot_clusters_of_clonotypes, plot_cooccurence_heatmap_with_epitopes_labeling, plot_waterfall_by_column, \
                            plot_feature_importances, plot_volcano, plot_clonotype_clustering_with_epitope_labeling
from utils.ml_utils import get_parameters, prepare_data, evaluate_models, split_data_by_batch, cross_validation_between_batches, make_hla_predictor
from utils.data_utils import prepare_run_column
from utils.stats_utils import evaluate_anova_testing, evaluate_mannwhitneyu_testing, get_top_changed_clonotypes
from utils.clustering_utils import seqs2hamming, check_significant_epitopes_for_cluster, get_most_frequent_cluster_by_vdjdb_occurence, \
                                check_significant_epitopes_for_all_clusters, read_association_data
from utils.weblogo_utils import create_web_logo
from source.alpha_beta_paired_clones_search import make_metaclone_cm


import warnings
# Suppress every warning category for the rest of the kernel session.
# NOTE(review): this also hides pandas/sklearn warnings raised by later cells;
# consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')


import importlib

# Pick up on-disk edits to utils.stats_utils without restarting the kernel.
# importlib.reload() re-executes the module, but it does NOT rebind names that
# were copied into this namespace by an earlier `from utils.stats_utils import ...`,
# so the from-import is repeated after the reload to avoid calling stale functions.
imported_module = importlib.import_module("utils.stats_utils")
importlib.reload(imported_module)
from utils.stats_utils import evaluate_anova_testing, evaluate_mannwhitneyu_testing, get_top_changed_clonotypes

# Keep the module as the cell's last expression so its origin is still displayed.
imported_module

/home/evlasova/tcr-covid-classifier


<module 'utils.stats_utils' from '/home/evlasova/tcr-covid-classifier/utils/stats_utils.py'>

# Reading usage matrices (UMs)

In [2]:
# Load the normalized and the standardized joint usage matrices; in both, the
# saved 'Unnamed: 0' column is dropped and missing values are replaced by 0.
norm_um_joint = (
    pd.read_csv('data/normalized_usage_matrix_joint_v.csv')
    .drop(columns=['Unnamed: 0'])
    .fillna(0)
)
stand_um_joint = (
    pd.read_csv('data/standardized_usage_matrix_joint_v.csv')
    .drop(columns=['Unnamed: 0'])
    .fillna(0)
)

In [3]:
# Reduce each project label to its last '_'-separated token, after removing
# any '_DNA' marker from the label.
norm_um_joint['project'] = [
    label.replace('_DNA', '').split('_')[-1]
    for label in norm_um_joint['project']
]

In [4]:
# Same label clean-up as for the normalized matrix: strip '_DNA' and keep only
# the last '_'-separated token of the project label.
stand_um_joint['project'] = [
    label.replace('_DNA', '').split('_')[-1]
    for label in stand_um_joint['project']
]

In [5]:
# Preview the standardized usage matrix after the project-label clean-up.
stand_um_joint

Unnamed: 0,run,project,covid,TRBV13,TRBV6-4,TRBV6-7,TRBV5-4,TRBV7-1,TRBV6-8,TRBV16,...,TRBV7-7,TRBV25-1,TRBV21-1,TRBV18,TRBV10-1,TRBV1,TRBV8-2,TRBV5-2,TRBV22-1,TRBV26
0,050002290808_S117_L002.clonotypes.TRB.txt,NovaSeq2,covid,0.003474,0.014043,0.001055,0.023937,0.000008,0.000661,0.000450,...,0.005438,0.003233,0.014215,0.027352,0.007841,0.001079,1.696294e-06,0.000003,0.000009,6.917419e-07
1,050002630808_S113_L002.clonotypes.TRB.txt,NovaSeq2,covid,0.007076,0.021841,0.001130,0.022548,0.000008,0.000710,0.000484,...,0.007519,0.010263,0.009868,0.029526,0.016554,0.001162,1.827189e-06,0.000003,0.000010,7.451205e-07
2,050003120808_S112_L002.clonotypes.TRB.txt,NovaSeq2,covid,0.006501,0.012496,0.001329,0.018439,0.000007,0.001137,0.000444,...,0.001922,0.006333,0.027440,0.038011,0.002849,0.001065,1.674763e-06,0.000003,0.000009,6.829615e-07
3,050003130808_S114_L002.clonotypes.TRB.txt,NovaSeq2,covid,0.006554,0.010890,0.001096,0.019498,0.000007,0.000872,0.000440,...,0.007736,0.012005,0.020345,0.035833,0.009140,0.001056,1.660397e-06,0.000003,0.000009,6.771032e-07
4,050003450808_S118_L002.clonotypes.TRB.txt,NovaSeq2,covid,0.005554,0.007503,0.001363,0.028087,0.000007,0.000898,0.000445,...,0.003293,0.004600,0.018388,0.010175,0.003035,0.001068,1.678981e-06,0.000003,0.000009,6.846817e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3211,Keck0116_MC1.txt,KECK,healthy,0.005532,0.002471,0.001099,0.020060,0.000010,0.000807,0.000402,...,0.003144,0.005968,0.012771,0.005411,0.005918,0.001757,1.386017e-06,0.000003,0.000007,6.137553e-07
3212,Keck0117_MC1.txt,KECK,healthy,0.009565,0.014078,0.001816,0.024067,0.000006,0.000964,0.000557,...,0.006522,0.006600,0.014357,0.023070,0.008981,0.001154,1.544601e-06,0.000001,0.000005,6.819803e-07
3213,Keck0118_MC1.txt,KECK,healthy,0.007088,0.017838,0.001670,0.022876,0.000010,0.001317,0.000534,...,0.005692,0.013521,0.011688,0.005806,0.003133,0.001475,2.012325e-06,0.000004,0.000007,6.868246e-07
3214,Keck0119_MC1.txt,KECK,healthy,0.004732,0.002107,0.000908,0.022094,0.000009,0.000236,0.000556,...,0.005394,0.016580,0.009938,0.017636,0.011721,0.001100,6.636811e-07,0.000004,0.000011,6.426646e-07


In [6]:
# Number of samples whose project label contains 'nova' (case-insensitive).
stand_um_joint.loc[
    lambda um: um['project'].str.lower().str.contains('nova'),
    'project',
].value_counts().sum()

1225

# Building the FMBA-based classifier

In [7]:
# Load the preprocessed FMBA beta-chain metadata table.
fmba_not_nan_hla = pd.read_csv('data/preprocessed_fmba_metadata_beta.csv')

In [8]:
# Keep only the samples whose COVID status is not labelled 'unknown'.
fmba_not_nan_hla = fmba_not_nan_hla.loc[fmba_not_nan_hla['COVID_status'].ne('unknown')]

In [9]:
# Run identifiers of the FMBA samples that survived the status filter.
good_fmba_samples = fmba_not_nan_hla['run']

In [10]:
# Preview the retained FMBA run identifiers.
good_fmba_samples

0       050002290808_S117_L002.clonotypes.TRB.txt
1       050002630808_S113_L002.clonotypes.TRB.txt
2       050003120808_S112_L002.clonotypes.TRB.txt
3       050003130808_S114_L002.clonotypes.TRB.txt
4       050003450808_S118_L002.clonotypes.TRB.txt
                          ...                    
1220       p18_72_DNA_S88_L002.clonotypes.TRB.txt
1221       p18_73_DNA_S89_L002.clonotypes.TRB.txt
1222       p18_74_DNA_S90_L002.clonotypes.TRB.txt
1223       p18_75_DNA_S91_L002.clonotypes.TRB.txt
1224       p18_76_DNA_S92_L002.clonotypes.TRB.txt
Name: run, Length: 1225, dtype: object

In [11]:
# Tag each sample with a platform: run names containing 'TRB' are labelled
# 'fmba', every other run name is labelled 'adaptive'.
stand_um_joint['platform'] = [
    'fmba' if 'TRB' in run_name else 'adaptive'
    for run_name in stand_um_joint['run']
]

In [12]:
# Number of samples per platform label.
stand_um_joint['platform'].value_counts()

adaptive    1991
fmba        1225
Name: platform, dtype: int64

In [13]:
# Inspect the clonotype matrix that is used as classifier input below.
# The frame is read only for display here and is not kept in a variable.
pd.read_csv('data/sign_clone_matrix_joint_fmba_based.csv')

Unnamed: 0.1,Unnamed: 0,CASAPGGSYEQYF,CASGLGGNQPQHF,CASGLSGGNQPQHF,CASGQGGYEQYF,CASIPGGSYEQYF,CASKLGTSYEQYF,CASKLSGGNQPQHF,CASKPGGSYEQYF,CASLPGGSYEQYF,...,CSARSGHEQYV,CSARSSYEQYF,CSARTSYEQYF,CSASDRGYEQYF,CSASGGGYEQYF,CSASSGHEQYF,CSASSSYEQYF,CSATSGHEQYF,RASSLGTSYEQYF,run
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,260002830808_S80_L001.clonotypes.TRB.txt
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,780003310808_S68_L001.clonotypes.TRB.txt
2,2,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,770104420808_S61_L001.clonotypes.TRB.txt
3,3,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,2,0,2,0,360001390808_S117_L002.clonotypes.TRB.txt
4,4,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,260002150808_S40_L001.clonotypes.TRB.txt
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3211,3211,5,5,4,7,5,3,3,5,4,...,0,10,14,5,3,0,5,0,3,HIP17577.txt
3212,3212,1,6,4,4,1,0,3,3,2,...,0,2,3,5,5,1,3,1,0,HIP14213.txt
3213,3213,4,12,3,9,4,2,1,5,5,...,0,12,11,3,3,0,10,0,2,HIP17585.txt
3214,3214,2,6,1,6,0,1,0,0,1,...,0,4,1,1,8,1,7,0,0,HIP17657.txt


In [14]:
# Build the model table for the biomarker-based classifier from the joint
# clonotype matrix, with 'covid' as the target column.
data_fmba = prepare_data(run_to_number_of_clones_path = 'data/run_to_number_of_clones_joint.csv',
                 desc_path='data/standardized_usage_matrix_joint_v.csv',
                 clonotype_matrix_path='data/sign_clone_matrix_joint_fmba_based.csv',
                 hla_keys_path=None,
                 make_freq=True,
                 use_hla_clones=False,
                 use_hla_bool=False,
                 make_all_features_bool=True,
                 use_standardization=True,
                 raw_target_column='covid',
                 raw_target_clumn_success_label='covid',
                 final_target_column='covid',
                 metadata_columns=[])

# Only the header is needed to choose the feature columns, so nrows=0 avoids
# parsing every row of the matrix a second time.
# NOTE(review): the 'CAS' prefix filter also excludes clonotype columns of the
# matrix that start differently (e.g. 'CSARSGHEQYV', 'RASSLGTSYEQYF') — confirm
# that this is intended.
fmba_good_beta_biomarkers_data = [
    x
    for x in pd.read_csv('data/sign_clone_matrix_joint_fmba_based.csv', nrows=0).columns
    if x.startswith('CAS')
]

# .copy() turns the column subset into an independent frame, so the column
# assignments below are not chained assignments on a slice (the pattern behind
# pandas' SettingWithCopyWarning, which the global warning filter would hide).
data_fmba = data_fmba[fmba_good_beta_biomarkers_data + ['covid']].copy()

# These assignments align on the index.
# NOTE(review): presumably prepare_data keeps the row order/index of the usage
# matrix loaded into stand_um_joint — verify against prepare_data.
data_fmba['folder'] = stand_um_joint['project']
data_fmba['platform'] = stand_um_joint['platform']
data_fmba['run'] = stand_um_joint['run']

In [15]:
# Keep the rows that are either a retained FMBA run or an 'adaptive' sample,
# then drop the two helper columns used for the filter.
# Note: 'run' and 'platform' are gone afterwards, so this cell cannot be
# re-run without first re-running the cell that builds data_fmba.
data_fmba = data_fmba.loc[
    lambda frame: frame['run'].isin(good_fmba_samples) | frame['platform'].eq('adaptive')
].drop(columns=['run', 'platform'])

In [16]:
# Preview the filtered model table (feature columns plus 'covid' and 'folder').
data_fmba

Unnamed: 0,CASAPGGSYEQYF,CASGLGGNQPQHF,CASGLSGGNQPQHF,CASGQGGYEQYF,CASIPGGSYEQYF,CASKLGTSYEQYF,CASKLSGGNQPQHF,CASKPGGSYEQYF,CASLPGGSYEQYF,CASNPGGSYEQYF,...,CASSYGVGYEQYF,CASSYGYEQYF,CASSYRGAFGYTF,CASSYSGGSYEQYF,CASSYSYEQYF,CASSYTSYEQYF,CASTLGGSYEQYF,CASTPGGAGYTF,covid,folder
0,-1.136045,0.492604,-1.005613,-1.409281,-1.13461,-0.888622,-0.913548,-1.166719,-1.100831,-1.237854,...,-1.342160,0.205246,-0.468087,0.465122,0.174468,0.480384,0.635898,-0.654751,1,NovaSeq2
1,-1.136045,-2.030029,-1.005613,0.709582,-1.13461,-0.888622,-0.913548,-1.166719,-1.100831,-1.237854,...,0.745068,0.205246,2.136355,-2.149972,0.174468,-2.081666,-1.572578,1.527299,1,NovaSeq2
2,0.880247,0.492604,-1.005613,-1.409281,0.88136,-0.888622,-0.913548,0.857104,0.908404,0.807849,...,0.745068,0.205246,-0.468087,-2.149972,0.174468,0.480384,-1.572578,-0.654751,1,NovaSeq2
3,-1.136045,-2.030029,-1.005613,-1.409281,-1.13461,-0.888622,-0.913548,-1.166719,-1.100831,-1.237854,...,-1.342160,0.205246,-0.468087,0.465122,0.174468,-2.081666,-1.572578,-0.654751,1,NovaSeq2
4,-1.136045,-2.030029,-1.005613,-1.409281,-1.13461,-0.888622,-0.913548,-1.166719,-1.100831,-1.237854,...,-1.342160,-4.872213,2.136355,-2.149972,0.174468,-2.081666,0.635898,-0.654751,1,NovaSeq2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3211,0.880247,0.492604,-1.005613,0.709582,0.88136,1.125338,-0.913548,0.857104,0.908404,0.807849,...,0.745068,0.205246,-0.468087,0.465122,0.174468,0.480384,0.635898,1.527299,0,KECK
3212,0.880247,0.492604,0.994419,0.709582,0.88136,1.125338,1.094634,0.857104,0.908404,0.807849,...,0.745068,0.205246,2.136355,0.465122,0.174468,0.480384,0.635898,1.527299,0,KECK
3213,0.880247,0.492604,-1.005613,0.709582,0.88136,1.125338,-0.913548,0.857104,0.908404,0.807849,...,0.745068,0.205246,-0.468087,0.465122,0.174468,0.480384,0.635898,-0.654751,0,KECK
3214,0.880247,0.492604,0.994419,0.709582,0.88136,1.125338,1.094634,0.857104,0.908404,0.807849,...,0.745068,0.205246,-0.468087,0.465122,0.174468,0.480384,0.635898,1.527299,0,KECK


In [17]:
# Every project whose label does not contain 'Nova' is passed as a test batch;
# 'folder' identifies the batch and 'covid' is the target column.
non_nova_batches = [
    batch for batch in stand_um_joint['project'].unique() if 'Nova' not in batch
]
X_train_fmba, y_train_fmba, X_test_fmba, y_test_fmba = split_data_by_batch(
    data=data_fmba,
    test_batches=non_nova_batches,
    y_column='covid',
    batch_column='folder',
)

In [18]:
# RBF-kernel SVM (C=5) with probability estimates enabled and a fixed seed,
# fitted on the training split of the biomarker table.
fmba_clf = SVC(C=5, kernel='rbf', probability=True, random_state=42)
fmba_clf = fmba_clf.fit(X_train_fmba, y_train_fmba)

In [19]:
# Class predictions of the biomarker-based classifier on the test split.
y_pred_fmba = fmba_clf.predict(X_test_fmba)

In [20]:
# Distribution of predicted classes on the test split.
# NOTE(review): the recorded output shows 1982 predictions of class 1 against
# 9 of class 0, i.e. almost every test sample is predicted as class 1 — worth
# checking before relying on the metrics computed from these predictions.
pd.Series(y_pred_fmba).value_counts()

1    1982
0       9
dtype: int64

In [21]:
# One-row summary of F1, precision and recall on the test split, each rounded
# to two decimals.
metrics_fmba = pd.DataFrame({
    metric_name: [round(metric_fn(y_test_fmba, y_pred_fmba), 2)]
    for metric_name, metric_fn in (
        ('f1', f1_score),
        ('precision', precision_score),
        ('recall', recall_score),
    )
})

In [22]:
# Reload the clonotype matrix without its saved 'Unnamed: 0' column.
fmba_cm = pd.read_csv('data/sign_clone_matrix_joint_fmba_based.csv').drop(columns=['Unnamed: 0'])

# Every column except 'run' is handed to seqs2hamming as a clonotype.
fmba_clones = fmba_cm.columns.drop('run')

# seqs2hamming is called with threshold=1 and the 'drl' visualisation method;
# its result and the matrix are then passed to make_metaclone_cm.
res_fmba = seqs2hamming(fmba_clones, threshold=1, viz_method='drl')
metaclone_fmba_cm = make_metaclone_cm(fmba_cm, res_fmba)

# Persist the metaclone matrix; it is read back by path in the next step.
metaclone_fmba_cm.to_csv('data/clone_matrix_covid_fmba_and_adaptive_metaclone.csv')

In [23]:
# Build the model table for the metaclonotype-based classifier from the saved
# metaclone matrix, then attach the batch ('folder'), platform and run columns
# taken from the standardized usage matrix (aligned on the index).
data_fmba_meta = prepare_data(
    run_to_number_of_clones_path='data/run_to_number_of_clones_joint.csv',
    desc_path='data/standardized_usage_matrix_joint_v.csv',
    clonotype_matrix_path='data/clone_matrix_covid_fmba_and_adaptive_metaclone.csv',
    hla_keys_path=None,
    make_freq=True,
    use_hla_clones=False,
    use_hla_bool=False,
    make_all_features_bool=False,
    use_standardization=True,
    raw_target_column='covid',
    raw_target_clumn_success_label='covid',
    final_target_column='covid',
    metadata_columns=[],
).assign(
    folder=stand_um_joint['project'],
    platform=stand_um_joint['platform'],
    run=stand_um_joint['run'],
)

In [24]:
# Preview the metaclonotype model table (cluster_* features plus 'covid',
# 'folder', 'platform' and 'run').
data_fmba_meta

Unnamed: 0,cluster_0,cluster_1,cluster_2,cluster_3,cluster_4,cluster_5,cluster_6,cluster_7,cluster_8,cluster_9,...,cluster_53,cluster_54,cluster_55,cluster_56,cluster_57,cluster_58,covid,folder,platform,run
0,0.990006,2.598528,-0.450946,1.196177,1.508656,-0.766814,2.563293,1.601560,0.663175,-0.278615,...,-0.630503,-0.313591,-0.260116,-0.175574,-0.594344,-0.631726,1,NovaSeq2,fmba,050002290808_S117_L002.clonotypes.TRB.txt
1,0.406015,-0.968811,-0.450946,-0.336716,0.127176,-0.766814,1.091862,3.253506,-0.696776,-0.278615,...,-0.630503,2.225208,-0.260116,-0.175574,-0.594344,-0.631726,1,NovaSeq2,fmba,050002630808_S113_L002.clonotypes.TRB.txt
2,0.676887,0.611664,-0.450946,1.680870,0.009607,-0.766814,3.340575,1.287645,0.508250,-0.278615,...,-0.630503,-0.948544,-0.260116,-0.175574,-0.594344,-0.631726,1,NovaSeq2,fmba,050003120808_S112_L002.clonotypes.TRB.txt
3,0.789055,-0.968811,-0.450946,1.368387,0.259141,-0.766814,1.666088,2.410658,-0.696776,-0.278615,...,-0.630503,1.789418,-0.260116,-0.175574,-0.594344,-0.631726,1,NovaSeq2,fmba,050003130808_S114_L002.clonotypes.TRB.txt
4,0.876750,-0.968811,-0.450946,1.036827,-0.693770,2.110113,-0.124786,4.213757,-0.696776,-0.278615,...,-0.630503,1.525186,1.918865,2.750141,-0.594344,-0.631726,1,NovaSeq2,fmba,050003450808_S118_L002.clonotypes.TRB.txt
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3211,1.760854,-0.524278,-0.450946,1.118902,0.749909,1.073569,0.634505,0.597207,-0.612043,-0.278615,...,0.517276,3.640579,-0.167189,-0.175574,1.893278,0.198397,0,KECK,adaptive,Keck0116_MC1.txt
3212,-0.477001,-0.228740,0.226921,0.556461,-0.086584,-0.031474,0.616814,0.903974,-0.471070,-0.278615,...,0.898180,0.948307,-0.198233,-0.175574,1.062239,-0.078921,0,KECK,adaptive,Keck0117_MC1.txt
3213,0.890906,-0.848518,-0.450946,0.418176,0.608314,0.229221,0.370252,0.927387,-0.421625,0.439497,...,1.357306,1.492316,-0.058944,-0.175574,1.021248,0.940724,0,KECK,adaptive,Keck0118_MC1.txt
3214,1.440349,0.344189,0.425979,1.308198,0.984507,0.682750,2.138889,0.975847,0.054043,0.047976,...,0.725557,0.609469,-0.168624,-0.175574,0.875173,0.185574,0,KECK,adaptive,Keck0119_MC1.txt


In [25]:
# Same batch split as for the biomarker table: projects without 'Nova' in
# their label are the test batches. 'run' and 'platform' are removed first so
# they are not part of the table handed to split_data_by_batch.
meta_model_table = data_fmba_meta.drop(columns=['run', 'platform'])
non_nova_batches = [
    batch for batch in stand_um_joint['project'].unique() if 'Nova' not in batch
]
X_train_fmba_m, y_train_fmba_m, X_test_fmba_m, y_test_fmba_m = split_data_by_batch(
    data=meta_model_table,
    test_batches=non_nova_batches,
    y_column='covid',
    batch_column='folder',
)

In [26]:
# Same SVM configuration as the biomarker-based classifier, fitted on the
# metaclonotype training split and then applied to the test split.
fmba_clf_m = SVC(C=5, kernel='rbf', probability=True, random_state=42)
fmba_clf_m = fmba_clf_m.fit(X_train_fmba_m, y_train_fmba_m)
y_pred_fmba_m = fmba_clf_m.predict(X_test_fmba_m)

In [27]:
# One-row summary of F1, precision and recall for the metaclonotype-based
# classifier on the test split, each rounded to two decimals.
metrics_fmba_m = pd.DataFrame({
    metric_name: [round(metric_fn(y_test_fmba_m, y_pred_fmba_m), 2)]
    for metric_name, metric_fn in (
        ('f1', f1_score),
        ('precision', precision_score),
        ('recall', recall_score),
    )
})

# Final classifiers

In [31]:
# Biomarker-based classifier: the SVC fitted on the clonotype biomarker features.
fmba_clf # biomarker based classifier

SVC(C=5, probability=True, random_state=42)

In [32]:
# Metaclonotype-based classifier: the SVC fitted on the cluster_* features.
fmba_clf_m # metaclonotype based classifier

SVC(C=5, probability=True, random_state=42)