In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from os.path import exists
from tqdm import tqdm

from Levenshtein import distance

%matplotlib inline

In [3]:
def tcr_dist(tcr1, tcr2):
    d0 = distance(tcr1[0], tcr2[0])
    d1 = 1 - int(tcr1[1] == tcr2[1])
    d2 = 1 - int(tcr1[2] == tcr2[2])
    return d0 + d1 + d2

In [4]:
# settings
pep = "YLQPRTFLL"

In [5]:
# load and prepare peptide-specific seqs for SONIA
vdgdb_df = pd.read_csv('./' + pep + '/VDJdb_' + pep + '_WithAligned20.csv')
vdgdb_df = vdgdb_df.drop_duplicates().reset_index(drop=True)
t_seqs = vdgdb_df['CDR3_beta'].to_list()
t_v = vdgdb_df['TRBV_gene'].to_list()
t_j = vdgdb_df['TRBJ_gene'].to_list()
sonia_input_vdgdb = [list(a) for a in zip(t_seqs, t_v, t_j)]

In [6]:
# load and prepare second set of Emerson seqs for SONIA (to be used as negative)
filename_cdr3raw = './train_data_1.txt' 
inds_non_overlap = np.loadtxt('./1_inds_nonoverlap_0.txt').astype(np.int16)
t_seq0 = []
with open(filename_cdr3raw) as f:
    for line in f:
        linesplit = line.strip().split('\n')
        t_seq0.append(linesplit[0])

t_seq = [x.split('\t') for x in np.array(t_seq0)[inds_non_overlap]]

In [7]:
# settings (I am using the average of best k obtained for NLVPMVATV)
k = 26

In [8]:
AUROCs = []
for i in tqdm(range(50)):
    #print("Working on repl =", i, "...")
    repl = i
    ## prepare positives (train, test) ##
    path_o ='./' + pep + '/indices/index_permutation_repl' + str(repl) + '.txt'
    full_intR = (np.loadtxt(path_o)).astype(np.int16)
    data = [sonia_input_vdgdb[t] for t in full_intR]
    train_data = data[:int(80*len(data)/100)]
    val_data = data[int(80*len(data)/100):]
    
    ## prepare negatives (test) ##
    path_o ='./' + pep + '/indices/index_permutationN_repl' + str(repl) + '.txt'
    full_intR = (np.loadtxt(path_o)).astype(np.int16)
    val_dataN0 = [t_seq[t] for t in full_intR]
    val_dataN = val_dataN0[:len(val_data)]
    train_dataN = val_dataN0[len(val_data)+1:len(val_data)+len(train_data)]
        
    res_pos = []
    for x in val_data:
        ts = x
        ave_dists_plus = np.mean(sorted([tcr_dist(ts, x) for x in train_data])[:k])
        ave_dists_minus = np.mean(sorted([tcr_dist(ts, x) for x in train_dataN])[:k])
        res_pos.append(ave_dists_minus - ave_dists_plus)
    
    res_pos = np.array(res_pos)
    
    res_neg = []
    for x in val_dataN:
        ts = x
        ave_dists_plus = np.mean(sorted([tcr_dist(ts, x) for x in train_data])[:k])
        ave_dists_minus = np.mean(sorted([tcr_dist(ts, x) for x in train_dataN])[:k])
        res_neg.append(ave_dists_minus - ave_dists_plus)
        
    res_neg = np.array(res_neg)
    
    scores_positive = res_pos
    scores_negative = res_neg    
    labels = np.hstack((np.zeros((len(res_neg))), np.ones((len(res_pos))))) 
    scores = np.hstack((scores_negative, scores_positive))
    fpr, tpr, thresholds = metrics.roc_curve(labels, scores)
    metrics.auc(fpr, tpr)
    AUROCs = np.append(AUROCs, metrics.auc(fpr, tpr))

100%|███████████████████████████████████████████████████████████████████████████████████| 50/50 [00:02<00:00, 19.66it/s]


In [9]:
AUROCs.mean(), AUROCs.std()

(0.9731746031746031, 0.01284574043752873)

In [10]:
np.savetxt('./' + pep + '/AUROCs_kNN.txt', AUROCs)