In [1]:
import os
import pandas as pd
from tqdm import tqdm
import numba 

### Download the data 

In [2]:
data_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir, "data"))
trbvs = os.path.join(data_path, "trbvs.tsv")
trbjs = os.path.join(data_path, "trbjs.tsv")

trbvs_df = pd.read_csv(trbvs, sep = "\t")
trbjs_df = pd.read_csv(trbjs, sep = "\t")

In [3]:
trbvs_df.loc[:, "1":"109"] = trbvs_df.loc[:, "1":"109"].fillna("-") 
trbjs_df.loc[:, "1":"17"] = trbjs_df.loc[:, "1":"17"].fillna("-")

In [4]:
trbjs_df

Unnamed: 0,Species,Gene,Allele,AccNum,Functionality,1,2,3,4,5,...,8,9,10,11,12,13,14,15,16,17
0,Homosap,TRBJ1-1,TRBJ1-1*01,K02545,F,N,T,E,A,F,...,Q,G,T,R,L,T,V,V,-,-
1,Homosap,TRBJ1-2,TRBJ1-2*01,K02545,F,N,Y,G,Y,T,...,S,G,T,R,L,T,V,V,-,-
2,Homosap,TRBJ1-3,TRBJ1-3*01,M14158,F,S,G,N,T,I,...,G,E,G,S,W,L,T,V,V,-
3,Homosap,TRBJ1-4,TRBJ1-4*01,M14158,F,T,N,E,K,L,...,G,S,G,T,Q,L,S,V,L,-
4,Homosap,TRBJ1-5,TRBJ1-5*01,M14158,F,S,N,Q,P,Q,...,G,D,G,T,R,L,S,I,L,-
5,Homosap,TRBJ1-6,TRBJ1-6*01,M14158,F,S,Y,N,S,P,...,F,G,N,G,T,R,L,T,V,T
6,Homosap,TRBJ1-6,TRBJ1-6*02,L36092,F,S,Y,N,S,P,...,F,G,N,G,T,R,L,T,V,T
7,Homosap,TRBJ2-1,TRBJ2-1*01,X02987,F,S,Y,N,E,Q,...,G,P,G,T,R,L,T,V,L,-
8,Homosap,TRBJ2-2,TRBJ2-2*01,X02987,F,N,T,G,E,L,...,G,E,G,S,R,L,T,V,L,-
9,Homosap,TRBJ2-3,TRBJ2-3*01,X02987,F,S,T,D,T,Q,...,G,P,G,T,R,L,T,V,L,-


### Create dictionary of the J and V genes based on tsv file containing sequences

The keys of the dictionary are the names of the different genes and the values are lists containing tuples. First value of the tuple is the name of the allele and the second value is the sequence as a stiring.

In [5]:
dictionary = {}

for indx, row in trbjs_df.iterrows():
    key = row["Gene"]
    if key not in dictionary.keys():
        dictionary[key] = []
    seq = row.loc["1":"17"].to_string(header = False, index = False)
    seq = seq.replace('\n', ' ')
    dictionary[key].append((row["Allele"], seq))

for indx, row in trbvs_df.iterrows():
    key = row["Gene"]
    if key not in dictionary.keys():
        dictionary[key] = []
    seq = row.loc["1":"109"].to_string(header = False, index = False)
    seq = seq.replace('\n', ' ')
    dictionary[key].append((row["Allele"], seq))

### Fetching the sequence into dataset

In [6]:
train_data_path = os.path.join(data_path, "universal_TCR_pool", "train.tsv")

train_data = pd.read_csv(train_data_path, sep= '\t')

In [7]:
train_data

Unnamed: 0,seq,v,j
0,CASSGSWTGDYEQYF,TRBV6-1,TRBJ2-7
1,CASSKFSGRAQGWNNEQFF,TRBV19,TRBJ2-1
2,CASSPRTSGSGEFF,TRBV4-2,TRBJ2-1
3,CASSLTAGTLYNEQFF,TRBV14,TRBJ2-1
4,CASSLERDTDTQYF,TRBV7-6,TRBJ2-3
...,...,...,...
49988336,CASSPRGQATDTQYF,TRBV6-6,TRBJ2-3
49988337,CATPKGGGAYEQYF,TRBV27,TRBJ2-7
49988338,CASSEGQVAEQFF,TRBV2,TRBJ2-1
49988339,CASSSPGTYTGELFF,TRBV5-1,TRBJ2-2


In [8]:
@numba.jit(forceobj=True)
def replace_v_j(data, dictionary):
    for i in range(len(train_data)):
        data[i, 1] = dictionary[data[i, 1]][0][1]
        data[i, 2] = dictionary[data[i, 2]][0][1]
    return data

In [9]:
train_data_np = train_data.to_numpy()
new_data = tqdm(replace_v_j(train_data_np, dictionary), position=0, leave=True)
new_data = pd.DataFrame(data=new_data, columns=["seq","v","j"])

 34%|███▍      | 17199759/49988341 [00:06<00:11, 2770164.15it/s]

MemoryError: 

In [None]:
new_data