In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
import re
import torch
from tqdm import tqdm
import os
import sys

from sklearn.metrics import f1_score, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from torch.utils.data import DataLoader

if torch.cuda.is_available():
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")



There are 1 GPU(s) available.
We will use the GPU: Tesla V100S-PCIE-32GB


In [2]:
sys.path.append("..")
# from my_model import CustomModel, PT5_classification_model, train_per_protein, create_dataset

In [3]:
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)

In [4]:
def add_spaces(seq):
     return ' '.join(list(seq))

In [5]:
# from epitopes.utilites import balance_majority, balance_minority, process_types, add_spaces

In [6]:
from models_mdf import save_model, load_model_

[2025-05-21 20:03:00,366] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)


# KNN

In [7]:
# create df with emb

In [8]:
alpha = pd.read_csv('../data/balanced data for clf/train_alpha.csv')
beta = pd.read_csv('../data/balanced data for clf/train_beta.csv')

In [9]:
N_LABELS_av = alpha['v'].nunique()
N_LABELS_aj = alpha['j'].nunique()


## J GENES

In [10]:
aJ_model = load_model_('../models/VJ_clf_transf/TCRbert_alfa_j.pth', mod_type='TCR-bert', num_labels=N_LABELS_aj)

Some weights of the model checkpoint at wukevin/tcr-bert-mlm-only were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at wukevin/tcr-bert-ml

In [11]:
aj_df = alpha[['cdr3aa', 'j']]

In [12]:
aj_df

Unnamed: 0,cdr3aa,j
0,CAAIGGSTLGRLYF,TRAJ18
1,CAASFSGYSTLTF,TRAJ11
2,CALGDGGNYQLIW,TRAJ33
3,CALFDFGNEKLTF,TRAJ48
4,CAGSKNAGKSTF,TRAJ27
...,...,...
26029,CAVLPLYGGSQGNLIF,TRAJ42
26030,CAEIPNYGGSQGNLIF,TRAJ42
26031,CAMRDYNVLYF,TRAJ21
26032,CLVAVPADTGRRALTF,TRAJ5


In [13]:
aj_df['cdr3aa'] = aj_df['cdr3aa'].apply(add_spaces)

l_enc_j = LabelEncoder()
l_enc_j.fit(aj_df['j'])
aj_df['j'] = l_enc_j.transform(aj_df['j'])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  aj_df['cdr3aa'] = aj_df['cdr3aa'].apply(add_spaces)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  aj_df['j'] = l_enc_j.transform(aj_df['j'])


In [14]:
mapping = dict(zip(l_enc_j.classes_, range(len(l_enc_j.classes_))))


In [15]:
from knn_setup import create_df_embs, train_clf, get_nearest_neighbours

In [16]:
aj_df = aj_df.sample(frac=0.4, random_state=42)

df_j_emb = create_df_embs(aJ_model, aj_df)
# df_j_emb = pd.read_csv('df_j_emb.csv', index_col=0)
df_j_emb.head()

100%|██████████| 10414/10414 [09:35<00:00, 18.10it/s]


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,-0.64734,-0.560809,-0.001305,-0.163974,0.079416,0.108241,-0.344863,0.001524,0.228859,-0.667988,...,0.697033,-0.052007,0.314567,-0.174897,0.279822,-0.411745,-0.510936,-0.378025,0.482202,0.124261
1,0.244388,-1.58239,-0.431907,-0.549375,-0.077844,0.631082,-0.203106,-0.494974,-0.564094,-0.02876,...,0.001689,-0.424484,-0.072142,-0.263264,-0.456467,0.00198,-0.364723,-0.508019,1.004848,-0.505774
2,-0.784028,-0.616249,-0.240298,-0.145131,0.100028,-0.12154,-0.361889,-0.119354,-0.008641,-0.110913,...,0.085095,-0.303054,0.219602,-0.442124,-0.178131,-0.117636,-0.350745,-0.340547,0.30756,-0.490311
3,-0.348778,-0.405088,-0.076944,0.001057,0.470691,0.136137,-0.211244,0.324338,0.096805,0.327968,...,0.336686,-0.315851,0.124372,-0.311242,-0.696494,-0.429463,-0.213975,-0.279047,0.266973,0.230128
4,-0.004334,-0.499696,0.008729,-0.030454,0.001965,0.1229,-0.192133,-0.110254,-0.351567,-0.168506,...,0.275831,0.035123,0.090992,0.045417,-0.145398,-0.203661,-0.184373,-0.248784,0.783612,0.009645


In [17]:
aj_df = aj_df.reset_index(drop=True)
df_j_emb = pd.concat([aj_df, df_j_emb], axis=1, ignore_index=True)
df_j_emb.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,760,761,762,763,764,765,766,767,768,769
0,C L V G A P G Y S S A S K I I F,17,-0.64734,-0.560809,-0.001305,-0.163974,0.079416,0.108241,-0.344863,0.001524,...,0.697033,-0.052007,0.314567,-0.174897,0.279822,-0.411745,-0.510936,-0.378025,0.482202,0.124261
1,C A L L G R L Y F,7,0.244388,-1.58239,-0.431907,-0.549375,-0.077844,0.631082,-0.203106,-0.494974,...,0.001689,-0.424484,-0.072142,-0.263264,-0.456467,0.00198,-0.364723,-0.508019,1.004848,-0.505774
2,C G T S N S G G S N Y K L T F,39,-0.784028,-0.616249,-0.240298,-0.145131,0.100028,-0.12154,-0.361889,-0.119354,...,0.085095,-0.303054,0.219602,-0.442124,-0.178131,-0.117636,-0.350745,-0.340547,0.30756,-0.490311
3,C A E S K E G K L I F,24,-0.348778,-0.405088,-0.076944,0.001057,0.470691,0.136137,-0.211244,0.324338,...,0.336686,-0.315851,0.124372,-0.311242,-0.696494,-0.429463,-0.213975,-0.279047,0.266973,0.230128
4,C A G Q L Y G G S Q G N L I F,29,-0.004334,-0.499696,0.008729,-0.030454,0.001965,0.1229,-0.192133,-0.110254,...,0.275831,0.035123,0.090992,0.045417,-0.145398,-0.203661,-0.184373,-0.248784,0.783612,0.009645


In [18]:
df_j_emb.drop([0, 1], axis=1)

Unnamed: 0,2,3,4,5,6,7,8,9,10,11,...,760,761,762,763,764,765,766,767,768,769
0,-0.647340,-0.560809,-0.001305,-0.163974,0.079416,0.108241,-0.344863,0.001524,0.228859,-0.667988,...,0.697033,-0.052007,0.314567,-0.174897,0.279822,-0.411745,-0.510936,-0.378025,0.482202,0.124261
1,0.244388,-1.582390,-0.431907,-0.549375,-0.077844,0.631082,-0.203106,-0.494974,-0.564094,-0.028760,...,0.001689,-0.424484,-0.072142,-0.263264,-0.456467,0.001980,-0.364723,-0.508019,1.004848,-0.505774
2,-0.784028,-0.616249,-0.240298,-0.145131,0.100028,-0.121540,-0.361889,-0.119354,-0.008641,-0.110913,...,0.085095,-0.303054,0.219602,-0.442124,-0.178131,-0.117636,-0.350745,-0.340547,0.307560,-0.490311
3,-0.348778,-0.405088,-0.076944,0.001057,0.470691,0.136137,-0.211244,0.324338,0.096805,0.327968,...,0.336686,-0.315851,0.124372,-0.311242,-0.696494,-0.429463,-0.213975,-0.279047,0.266973,0.230128
4,-0.004334,-0.499696,0.008729,-0.030454,0.001965,0.122900,-0.192133,-0.110254,-0.351567,-0.168506,...,0.275831,0.035123,0.090992,0.045417,-0.145398,-0.203661,-0.184373,-0.248784,0.783612,0.009645
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10409,0.107218,-0.597809,-0.866804,-0.205675,-0.075828,-0.023055,-0.754772,-0.062797,-0.259994,-0.861944,...,0.215896,0.024436,-0.073342,0.036956,0.166490,-0.018310,-0.111455,-0.747917,0.380730,-0.013440
10410,-0.635970,-0.024376,-0.256344,0.081329,0.473106,0.133732,-0.104639,-0.047496,0.370318,-0.154011,...,0.308429,0.017407,0.530394,0.210666,0.109712,0.155375,-0.598735,-0.228948,0.175603,0.011674
10411,-0.795809,-0.259600,-0.079311,-0.283488,0.139523,0.138584,-0.328737,-0.457066,0.073675,-0.150348,...,0.429378,-0.370843,0.258513,-0.183225,-0.366565,-0.036393,-0.232845,-0.197336,-0.041201,-0.134115
10412,-0.112686,-0.303925,-0.460364,-0.194952,0.481558,0.305201,-0.191374,-0.143161,0.253668,-0.369667,...,0.053475,0.110446,0.393816,0.040140,0.210204,-0.060648,-0.429888,-0.393008,0.563790,-0.032604


In [19]:
from sklearn.model_selection import train_test_split
# df_j_emb.drop(columns = ['cdr3aa'], inplace=True)
X_train, X_test, y_train, y_test = train_test_split(df_j_emb.drop([0, 1], axis=1), df_j_emb[1], test_size=0.20, random_state=42)

In [20]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
import pickle 


In [21]:
knn_best = train_clf(X_train, X_test, y_train, y_test, gene='j', save=True)



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Fitting 5 folds for each of 16 candidates, totalling 80 fits
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already

Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



Resulting test score: 0.993


In [22]:
nn_j = get_nearest_neighbours('./knn_j_model.pkl', aJ_model, 'C A L F D F G N E K L T F', X_train, y_train, print_info=True)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Query Point: C A L F D F G N E K L T F
Nearest Neighbors: [6957, 2581, 1778, 1928, 4283, 3548, 558, 1502, 2773, 8136]
Nearest Labels: [34 34 34 34 34 34 34 34 34 34]
Distances to Nearest Neighbors: [[1.1045881  2.21314547 2.78202561 2.79092794 2.80230889 2.80850398
  2.80850398 2.82770409 2.94296435 2.94377723]]




In [23]:
l_enc_j.inverse_transform(y_train.loc[nn_j].values)

array(['TRAJ48', 'TRAJ48', 'TRAJ48', 'TRAJ48', 'TRAJ48', 'TRAJ48',
       'TRAJ48', 'TRAJ48', 'TRAJ48', 'TRAJ48'], dtype=object)

## V GENES

In [24]:
N_LABELS_av = alpha['v'].nunique()


In [25]:
av_model = load_model_('../models/VJ_clf_transf/TCRbert_alfa_v.pth', mod_type='TCR-bert', num_labels=N_LABELS_av)

Some weights of the model checkpoint at wukevin/tcr-bert-mlm-only were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at wukevin/tcr-bert-ml

In [26]:
av_df = alpha[['cdr3aa', 'v']]

In [27]:
av_df

Unnamed: 0,cdr3aa,v
0,CAAIGGSTLGRLYF,TRAV29
1,CAASFSGYSTLTF,TRAV13
2,CALGDGGNYQLIW,TRAV6
3,CALFDFGNEKLTF,TRAV16
4,CAGSKNAGKSTF,TRAV25
...,...,...
26029,CAVLPLYGGSQGNLIF,TRAV39
26030,CAEIPNYGGSQGNLIF,TRAV5
26031,CAMRDYNVLYF,TRAV16
26032,CLVAVPADTGRRALTF,TRAV4


In [28]:
av_df['cdr3aa'] = av_df['cdr3aa'].apply(add_spaces)

l_enc_v = LabelEncoder()
l_enc_v.fit(av_df['v'])
av_df['v'] = l_enc_v.transform(av_df['v'])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  av_df['cdr3aa'] = av_df['cdr3aa'].apply(add_spaces)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  av_df['v'] = l_enc_v.transform(av_df['v'])


In [29]:
mapping = dict(zip(l_enc_v.classes_, range(len(l_enc_v.classes_))))


In [30]:
av_df = av_df.sample(frac=0.4, random_state=42)

df_v_emb = create_df_embs(av_model, av_df)
# df_v_emb.head()

100%|██████████| 10414/10414 [09:47<00:00, 17.73it/s]


In [31]:
av_df = av_df.reset_index(drop=True)
df_v_emb = pd.concat([av_df, df_v_emb], axis=1, ignore_index=True)
df_v_emb.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,760,761,762,763,764,765,766,767,768,769
0,C L V G A P G Y S S A S K I I F,23,-0.634476,-0.715426,-0.112857,-0.026188,-0.163004,0.40364,-0.354486,-0.02881,...,0.525054,-0.023938,0.282135,-0.311814,0.185906,-0.352735,-0.448487,0.025638,0.497169,0.047479
1,C A L L G R L Y F,14,-0.197206,-0.869457,-0.575098,-0.589052,0.36481,0.42524,-0.285844,-0.439713,...,0.4481,-0.214829,-0.18832,-0.257455,-0.381339,0.370652,-0.89881,-0.381585,0.596959,0.254778
2,C G T S N S G G S N Y K L T F,19,-0.356926,-0.40702,0.011429,0.016217,0.150756,0.267557,-0.530167,-0.358644,...,0.266957,-0.431619,0.063431,-0.255619,-0.081371,-0.068055,-0.308897,-0.062996,0.249711,0.156963
3,C A E S K E G K L I F,25,-0.119118,-0.653685,-0.765995,0.080028,0.23476,0.119031,-0.130814,-0.172284,...,0.30134,-0.249289,0.098097,-0.44112,-0.594371,-0.157959,-0.604804,0.065207,0.07938,0.208593
4,C A G Q L Y G G S Q G N L I F,20,-0.029738,-0.364254,-0.008347,0.007682,-0.31288,0.402974,-0.163879,-0.293151,...,0.267511,0.245688,-0.08128,-0.307766,-0.095386,0.109498,-0.794303,0.025131,0.267981,0.077586


In [32]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_v_emb.drop([0, 1], axis=1), df_v_emb[1], test_size=0.20, random_state=42)

In [33]:
knn_best = train_clf(X_train, X_test, y_train, y_test, gene='v', save=True)



Fitting 5 folds for each of 16 candidates, totalling 80 fits
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already

In [34]:
nn_v = get_nearest_neighbours('./knn_v_model.pkl', av_model, 'C A A I G G S T L G R L Y F', X_train, y_train, n_neighb=5, print_info=True)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Query Point: C A A I G G S T L G R L Y F
Nearest Neighbors: [8983, 8099, 3221, 10331, 143]
Nearest Labels: [17 17 17  7  3]
Distances to Nearest Neighbors: [[1.165384   4.10142285 4.16795175 4.19935107 4.49350682 4.51299552
  4.6255152  4.79619021 4.86849739 4.89917919 4.93790188 4.93790188
  5.09306188 5.12971227 5.13880063 5.14726428 5.18733255 5.19217977
  5.22167551 5.22167551 5.24774105 5.25847894 5.33857386 5.36837915
  5.39957292 5.41882723 5.45843952 5.47608706 5.47704842 5.48595861
  5.4871863  5.4871863  5.49239787 5.5024445  5.51526245]]




In [35]:
l_enc_v.inverse_transform(y_train.loc[nn_v].values)

array(['TRAV29', 'TRAV29', 'TRAV29', 'TRAV19', 'TRAV13'], dtype=object)