# Data preparation

In [1]:
import numpy as np
import pandas as pd
from datetime import date
import seaborn as sns
from math import sqrt
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import datetime
from collections import Counter
import re
# libs for ml
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import classification_report
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split

# my module with some func
import pepcode
from tg_bot_notification import telegram_logger

## Set parametrs

In [2]:
random_state = 42
train_size = 0.7
test_size = 0.3

AA_LIST = pepcode.AA_LIST

latent_dims = 64
use_gpu = True

# Device set
if use_gpu and torch.cuda.is_available():
    device = torch.device("cuda:0")
elif use_gpu and torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

sns.set_theme(style="darkgrid")
default_color = "xkcd:dark pastel green"

## Positive data process

In [4]:
data = pd.read_csv('./dataset/vdjdb-2024-11-27-fixed/vdjdb.slim.txt', sep = '\t')
data = data[(data.gene == 'TRB') & (data.species == 'HomoSapiens')]

#Filter CDR3
data.columns=data.columns.str.replace('.','_')
data=data[data['cdr3'].str.match(r'^C.*[FW]$')]
data=data[data['cdr3'].str.len()>10]
data=data[data['cdr3'].str.len()<20]
#Filter Epitope
data=data[data['antigen_epitope'].str.len()>=6]
data=data[data['antigen_epitope'].str.len()<=20]


data.reset_index(drop=True, inplace=True)
data_p = data[['cdr3', 'antigen_epitope']].copy()
data_p['bind'] = 1
data_p

Unnamed: 0,cdr3,antigen_epitope,bind
0,CAAADEEIGNQPQHF,ATDALMTGY,1
1,CAAAERNTGELFF,YLQPRTFLL,1
2,CAAAGTASTDTQYF,RAKFKQLL,1
3,CAAATGLYGYTF,GILGFVFTL,1
4,CAACPGTENTGELFF,KLVALGINAV,1
...,...,...,...
47029,CVSRSGAEAFF,GILGFVFTL,1
47030,CVSRSLVSTGELFF,FRDYVDRFYKTLRAEQASQE,1
47031,CVSSKGRMDFYEQYF,QYIKWPWYI,1
47032,CVSVGRSGYTF,TTDPSFLGRY,1


## Negative data process

In [6]:
# # Generate negative data

# def random_recombination(df, column_name, epitope_dist, tcr_dist, ratio=0.8):
#     unique_epitopes = df['antigen_epitope'].unique()  # все представленные эпитопы
#     unique_tcrs = df[column_name].unique()  # все TCR
#     conversion_df = df[[column_name, 'antigen_epitope']]
#     positive_pairs = set([tuple(x) for x in conversion_df.to_numpy()])   # создаем массив из пар

#     # weight the tcr choice by frequency in data
#     # epitope_dist - value_counts по эпитопам
#     # tcr_dist - value_counts по сиквенсам
#     epitope_freq_array = [epitope_dist[peptide] / len(df) for peptide in unique_epitopes]
#     tcr_freq_array = [tcr_dist[tcr] / len(df) for tcr in unique_tcrs]
#     # получили веса встречаемости эпитопов и tcr 
    
#     neg_pairs = set()
#     for pep in unique_epitopes:
#         i = 0
#         pairs_to_generate = round(epitope_dist[pep] * ratio) # определяем количество негативных пар с эпитопом
#         # ratio - параметр который мы задаем для регулирования частоты встречаемости несвязывающегося эпитопа
#         while i < pairs_to_generate:
#             tcr = np.random.choice(unique_tcrs, p=tcr_freq_array)
#             pair = (tcr, pep)
#             if pair not in positive_pairs and pair not in neg_pairs:
#                 neg_pairs.add(pair)
#                 i += 1
            
#     negative_data = pd.DataFrame(neg_pairs, columns = [column_name, 'antigen_epitope'])
#     negative_data = negative_data.assign(bind=0)
#     return negative_data


# data_n = random_recombination(data, 'cdr3', data['antigen_epitope'].value_counts(), data.cdr3.value_counts())
# data_n.to_csv(f'dataset/negative_data/negative_data.csv', index=False)

In [7]:
data_n = pd.read_csv('dataset/negative_data/negative_data.csv')
data_n

Unnamed: 0,cdr3,antigen_epitope,bind
0,CSATGGDYNEQFF,RAKFKQLL,0
1,CASSAADTQYF,ALGIGILTV,0
2,CASSVNTATDTQYF,RAKFKQLL,0
3,CASSRTGSGELFF,NQKLIANQF,0
4,CASSQDGLAGNNEQFF,LLWNGPMAV,0
...,...,...,...
37734,CASSIRFGTEAFF,KLGGALQAK,0
37735,CASSQAQASSYEQYF,YLQPRTFLL,0
37736,CASSLVGSKNIQYF,NLVPMVATV,0
37737,CSARDNNEQFF,GILGFVFTL,0


## Merge data

In [11]:
data_f = pd.concat([data_p, data_n])
data_f

Unnamed: 0,cdr3,antigen_epitope,bind
0,CAAADEEIGNQPQHF,ATDALMTGY,1
1,CAAAERNTGELFF,YLQPRTFLL,1
2,CAAAGTASTDTQYF,RAKFKQLL,1
3,CAAATGLYGYTF,GILGFVFTL,1
4,CAACPGTENTGELFF,KLVALGINAV,1
...,...,...,...
37734,CASSIRFGTEAFF,KLGGALQAK,0
37735,CASSQAQASSYEQYF,YLQPRTFLL,0
37736,CASSLVGSKNIQYF,NLVPMVATV,0
37737,CSARDNNEQFF,GILGFVFTL,0


In [12]:
pairs = data_f[['cdr3', 'antigen_epitope']].copy()
targets = data_f['bind'].copy()

In [13]:
pairs

Unnamed: 0,cdr3,antigen_epitope
0,CAAADEEIGNQPQHF,ATDALMTGY
1,CAAAERNTGELFF,YLQPRTFLL
2,CAAAGTASTDTQYF,RAKFKQLL
3,CAAATGLYGYTF,GILGFVFTL
4,CAACPGTENTGELFF,KLVALGINAV
...,...,...
37734,CASSIRFGTEAFF,KLGGALQAK
37735,CASSQAQASSYEQYF,YLQPRTFLL
37736,CASSLVGSKNIQYF,NLVPMVATV
37737,CSARDNNEQFF,GILGFVFTL


In [14]:
targets

0        1
1        1
2        1
3        1
4        1
        ..
37734    0
37735    0
37736    0
37737    0
37738    0
Name: bind, Length: 84773, dtype: int64

In [15]:
X_train,  X_test, y_train, y_test = train_test_split(pairs, targets, train_size = train_size, random_state = 42, shuffle = True)

In [16]:
X_train.to_csv(f'dataset/X_train.csv', index=False)
X_test.to_csv(f'dataset/X_test.csv', index=False)
y_train.to_csv(f'dataset/y_train.csv', index=False)
y_test.to_csv(f'dataset/y_test.csv', index=False)

In [17]:
pd.read_csv('dataset/X_train.csv')

Unnamed: 0,cdr3,antigen_epitope
0,CSARDRVEEKLFF,AVFDRKSDAK
1,CASSPKLTVSLGANVLTF,NLVPMVATV
2,CASSAGQGLPYEQYF,NLVPMVATV
3,CASSVGYRNTEAFF,NLVPMVATV
4,CASSQGLASGGASTDTQYF,KLGGALQAK
...,...,...
59336,CASSARISGGLNEQYF,FRDYVDRFYKTLRAEQASQE
59337,CASSLEKASGGLAKNIQYF,GILGFVFTL
59338,CASSYMGPEAFF,KLVALGINAV
59339,CAISGPGGPTGELFF,YSEHPTFTSQY


In [18]:
pd.read_csv('dataset/X_test.csv')

Unnamed: 0,cdr3,antigen_epitope
0,CASSLVHRGHANTEAFF,KLGGALQAK
1,CASSFVGGSYEQYF,KLGGALQAK
2,CASSNSLGDEAFF,VMNILLQYV
3,CASRFRRPYGYTF,LLLGIGILV
4,CASSAGRVLPGEQYF,GILGFVFTL
...,...,...
25427,CASSQGTGRNTEAFF,LLQTGIHVRVSQPSL
25428,CASSIRSSDEQFF,VMATRRNVL
25429,CASRSWVRAPNQPQHF,TTDPSFLGRY
25430,CASSFTSGTTDTQYF,KLGGALQAK


In [19]:
pd.read_csv('dataset/y_train.csv')

Unnamed: 0,bind
0,0
1,0
2,0
3,0
4,1
...,...
59336,1
59337,0
59338,0
59339,1


In [20]:
pd.read_csv('dataset/y_test.csv')

Unnamed: 0,bind
0,1
1,1
2,0
3,1
4,0
...,...
25427,0
25428,0
25429,0
25430,1
