# Data preprocessing

In [44]:
import os
import sys
from IPython import get_ipython
from pathlib import Path

# Ask IPython for the current working directory via the `pwd` line magic.
notebook_path = get_ipython().run_line_magic("pwd", "")
# The project root is taken to be three directory levels above that directory.
project_root = Path(notebook_path).parent.parent.parent
# Put the project root on sys.path; presumably this is what lets the
# `modules.modules_kidera.kidera` import further down resolve.
sys.path.append(str(project_root)) 
import pandas as pd
import numpy as np
from modules.modules_kidera.kidera import kidera_final_dict
import torch

In [36]:
def sequence_to_factor(sequence, kidera_dict):
    """Encode a sequence as a float32 matrix with one column per residue.

    Each character of ``sequence`` is looked up in ``kidera_dict`` and the
    resulting per-residue vectors are laid out column-wise, so a sequence of
    length L whose dictionary entries have F values yields an (F, L) array.
    A character missing from ``kidera_dict`` raises ``KeyError``.
    """
    per_residue = []
    for residue in sequence:
        per_residue.append(kidera_dict[residue])
    # Rows are residues at this point; transpose so that rows become factors.
    factors = np.asarray(per_residue, dtype=np.float32)
    return factors.transpose()

## Data preprocessing for CDR3 sequences

In [37]:
# Load the preprocessed table; later cells read its 'cdr3', 'bind' and
# 'antigen_epitope' columns.
data=pd.read_csv('../../../dataset/datasets_kidera/autoencoder_vdjdb_train/vdj_preprocessed.csv')

In [38]:
def insert_x(x):
    """Pad a CDR3 string with 'X' placeholders to a fixed length of 19.

    The residues at both ends stay in place and the padding is inserted in
    the interior of the sequence:

    * 0-4 residues: the sequence is split in half (the extra residue of an
      odd length stays in the front half) and all padding goes in between.
    * 5 residues: two residues at each end, the middle residue centred with
      the padding split evenly on either side of it.
    * 6 or more residues: the first and last three residues stay at the ends
      and the remaining middle part is centred between them; when the number
      of padding characters is odd, the extra 'X' goes before the middle.

    Args:
        x: The sequence to pad (a string).

    Returns:
        A string of length 19 for inputs of up to 19 characters. Longer
        inputs are returned unchanged (they are not truncated).
    """
    target_len = 19
    n_res = len(x)
    n_pad = target_len - n_res

    if n_res <= 4:
        # Too short for three anchor residues per side. Every residue is kept
        # exactly once; slicing three from each end here would overlap and
        # duplicate residues, producing a string longer than the target.
        head = (n_res + 1) // 2
        return x[:head] + 'X' * n_pad + x[head:]

    if n_res == 5:
        # Two anchors per side and a single centred residue.
        half = n_pad // 2
        return x[:2] + 'X' * half + x[2] + 'X' * half + x[3:]

    # Generic layout; for exactly six residues the middle part is empty and
    # the two padding runs merge into one.
    pref, suff = x[:3], x[-3:]
    mid = x[3:-3]
    right = n_pad // 2
    left = n_pad - right
    # A negative count (input longer than the target) yields an empty string,
    # so over-long inputs pass through untouched.
    return pref + 'X' * left + mid + 'X' * right + suff

In [39]:
# Replace every entry of the 'cdr3' column with its 'X'-padded form from insert_x.
data['cdr3'] = data['cdr3'].apply(insert_x)
# Keep the 'bind' column as a one-column DataFrame; it is written to
# binding.csv in a later cell.
binding=data[['bind']]

In [40]:
# Write the whole table, now with the padded 'cdr3' column, to the
# check_quality folder (presumably for manual inspection of the padding).
# The DataFrame index is written as well, since index=False is not passed.
data.to_csv('../../../dataset/datasets_kidera/check_quality/cdr3_quality.csv')

In [41]:
# One-column DataFrame holding only the padded 'cdr3' strings.
data_cdr3=data[['cdr3']]

In [42]:
# Encode every padded CDR3 string with sequence_to_factor, stack the
# per-sequence matrices along a new leading axis, and insert a singleton
# axis at position 1 (NOTE(review): presumably a channel axis - confirm
# against the model that consumes this tensor).
data_train_test_cdr3 = torch.tensor(
    np.stack(
        [sequence_to_factor(seq, kidera_final_dict) for seq in data_cdr3['cdr3']],
        axis=0,
    ),
    dtype=torch.float32,
).unsqueeze(1)

In [43]:
# Persist the encoded CDR3 tensor and the matching 'bind' column side by side
# in the autoencoder_vdjdb_train folder. binding.csv also contains the
# DataFrame index, since index=False is not passed.
torch.save(data_train_test_cdr3, '../../../dataset/datasets_kidera/autoencoder_vdjdb_train/data_train_test_cdr3.pt')
binding.to_csv('../../../dataset/datasets_kidera/autoencoder_vdjdb_train/binding.csv')

## Data preprocessing for epitope sequences

In [27]:
# Epitope names used for autoencoder training: read the table, keep linear
# peptides from Homo sapiens, take the name column, drop names containing
# lowercase letters, parentheses, digits or whitespace, and keep only names
# that are 6 to 20 characters long (inclusive).
epitopes = (
    pd.read_csv('../../../dataset/datasets_kidera/autoencoder_epitope_train/epitope_clean.csv')
    .loc[
        lambda table: (table['Epitope - Object Type'] == 'Linear peptide')
        & (table['Epitope - Species'] == 'Homo sapiens')
    ]['Epitope - Name']
    .loc[lambda names: ~names.str.contains(r'[a-z()0-9\s]')]
    .loc[lambda names: (names.str.len() >= 6) & (names.str.len() <= 20)]
)
# Epitopes from the CDR3 table loaded earlier, minus those starting with 'KLG'.
epitopes_test = data[['antigen_epitope']].loc[
    lambda table: ~table['antigen_epitope'].str.startswith('KLG')
]
# Give the training Series the same name as the column in the test table.
epitopes.name = 'antigen_epitope'

In [32]:
def func_antigen(antigen):
    """Pad an epitope with 'X' placeholders to a width of 20.

    The same number of 'X' characters is added on both sides. For an
    odd-length epitope one extra 'X' is inserted at the midpoint of the
    sequence so that the total still comes out at 20. Inputs longer than 20
    receive no flanking padding.
    """
    total_len = 20
    size = len(antigen)
    flank = 'X' * ((total_len - size) // 2)
    midpoint = size // 2
    if size % 2:
        # Odd length: split the sequence at its midpoint around a single 'X'.
        core = antigen[:midpoint] + 'X' + antigen[midpoint:]
    else:
        core = antigen
    return flank + core + flank
def epitope_to_kidera(epitopes, kidera_dict,
                      quality_csv_path='../../../dataset/datasets_kidera/check_quality/epitopes_quality.csv'):
    """Pad epitope strings and encode them as a float32 tensor.

    Every entry of ``epitopes`` is padded with ``func_antigen`` and encoded
    with ``sequence_to_factor``. The per-sequence matrices are stacked along
    a new leading axis, and a singleton axis is inserted at position 1.

    Args:
        epitopes: pandas Series of epitope strings.
        kidera_dict: Mapping from a residue character to its factor vector;
            it must also contain the padding character 'X'.
        quality_csv_path: Where the padded sequences are written (without
            the index) for inspection. The default is the original
            hard-coded location, so every call made with the default
            overwrites that file. Pass ``None`` to skip writing.

    Returns:
        A ``torch.float32`` tensor with one entry per epitope along the
        first axis and a singleton second axis.
    """
    # Only the Series passed in is processed. An earlier version also ran
    # func_antigen over the global `epitopes_test` DataFrame here and
    # discarded the result; that dead statement has been removed.
    padded = epitopes.apply(func_antigen)
    if quality_csv_path is not None:
        padded.to_csv(quality_csv_path, index=False)
    factors_array = np.stack(
        padded.map(lambda seq: sequence_to_factor(seq, kidera_dict)).values,
        axis=0,
    )
    factors_tensor = torch.tensor(factors_array, dtype=torch.float32).unsqueeze(1)
    return factors_tensor
# Encode the training epitopes, then the epitopes taken from the CDR3 table.
# NOTE(review): epitope_to_kidera writes its padded input to
# check_quality/epitopes_quality.csv on every call, so after these two lines
# that file holds the padded *test* epitopes, not the training ones.
epitope_tensor=epitope_to_kidera(epitopes,kidera_final_dict)
epitopes_test_tensor=epitope_to_kidera(epitopes_test['antigen_epitope'],kidera_final_dict)

In [33]:
# Persist both encoded epitope tensors in the autoencoder_epitope_train folder.
torch.save(epitope_tensor, '../../../dataset/datasets_kidera/autoencoder_epitope_train/epitope_tensor.pt')
torch.save(epitopes_test_tensor,'../../../dataset/datasets_kidera/autoencoder_epitope_train/epitopes_test_tensor.pt')

In [34]:
# Sanity check: read back the padded sequences written by epitope_to_kidera
# (this rebinds `epitopes` to a DataFrame) and count the total number of
# characters across all of them by concatenating the column into one string.
epitopes=pd.read_csv('../../../dataset/datasets_kidera/check_quality/epitopes_quality.csv')
len(epitopes['antigen_epitope'].str.cat(sep=''))

1247760