In [1]:
import os
import pandas as pd
import random
from tqdm import tqdm
import numba 

### Emerson data 

In [6]:
# Data directory one level above the working directory; abspath() anchors the
# relative path at os.getcwd(), so the cwd does not need to be joined explicitly.
data_path = os.path.abspath(os.path.join(os.pardir, "data"))
# Sub-folder holding the Emerson repertoire files.
emerson_data_path = os.path.join(data_path, "emerson")

In [19]:
# Load one Emerson repertoire file (tab-separated).
# NOTE(review): the recorded output of this cell is the echoed source line of a
# warning, presumably pandas' mixed-type DtypeWarning. low_memory=False makes
# pandas infer every column's dtype from the whole file instead of chunk by chunk.
example_df = pd.read_csv(
    os.path.join(emerson_data_path, "P00002.tsv"),
    sep="\t",
    low_memory=False,
)

  example_df = pd.read_csv(os.path.join(emerson_data_path, "P00002.tsv"), sep = '\t')


In [22]:
# CDR3 sequence plus the family / gene / allele parts of the chosen V and J genes.
columns = [
    "cdr3_amino_acid",
    "chosen_v_family",
    "chosen_v_gene",
    "chosen_v_allele",
    "chosen_j_family",
    "chosen_j_gene",
    "chosen_j_allele",
]
# Keep only those columns and discard every row that has a missing value in any of them.
example_df = example_df[columns].dropna()

In [41]:
# Display the frame after column selection and dropna (the rendering is truncated).
example_df

Unnamed: 0,cdr3_amino_acid,chosen_v_family,chosen_v_gene,chosen_v_allele,chosen_j_family,chosen_j_gene,chosen_j_allele
0,CASSAFPGGGETQYF,TCRBV05,5.0,1.0,TCRBJ02,5.0,1.0
1,CASSLHSNQPQHF,TCRBV05,5.0,1.0,TCRBJ01,5.0,1.0
2,CASSLEGQGGPEAFF,TCRBV05,5.0,1.0,TCRBJ01,1.0,1.0
3,CASSLAGGGDANTEAFF,TCRBV05,5.0,1.0,TCRBJ01,1.0,1.0
4,CASSAEQALIYGYTF,TCRBV05,5.0,1.0,TCRBJ01,2.0,1.0
...,...,...,...,...,...,...,...
102315,CASSQVGTVSGNTIYF,TCRBV04,3.0,1.0,TCRBJ01,3.0,1.0
102316,CASSQRKAQPQHF,TCRBV04,3.0,1.0,TCRBJ01,5.0,1.0
102607,CASSQDRGSTEAFF,TCRBV04,3.0,1.0,TCRBJ01,1.0,1.0
102608,CASSQDLRESPLHF,TCRBV04,3.0,1.0,TCRBJ01,6.0,1.0


In [42]:
def _strip_family_zero(family):
    """Remove the padding zero at index 5 of a family code, e.g. 'TCRBV05' -> 'TCRBV5'."""
    if len(family) > 5 and family[5] == "0":
        return family[:5] + family[6:]
    return family


def _allele_code(family, gene, allele):
    """Build '<family>-<gene>*<allele>' with a two-digit, zero-padded allele number."""
    return "{}-{}*{:02d}".format(_strip_family_zero(family), int(gene), int(allele))


def fix_gene_codes(row):
    """Merge the family / gene / allele columns of one row into full V and J codes.

    The row is addressed by position: 1-3 hold the V family, gene and allele,
    4-6 hold the J family, gene and allele. Positions 1 and 4 of the result are
    overwritten with codes such as 'TCRBV5-5*01'; the other positions are
    returned unchanged.

    Parameters
    ----------
    row : pandas.Series
        One row as passed by ``DataFrame.apply(..., axis=1)``. Gene and allele
        values must be convertible with ``int()`` (no missing values).

    Returns
    -------
    pandas.Series
        A modified copy of ``row``; the input itself is left untouched because
        pandas does not support functions that mutate the object handed to
        ``apply``.
    """
    fixed = row.copy()
    # .iloc keeps the access explicitly positional; plain integer keys on a
    # string-labelled Series are deprecated in pandas.
    fixed.iloc[1] = _allele_code(row.iloc[1], row.iloc[2], row.iloc[3])
    fixed.iloc[4] = _allele_code(row.iloc[4], row.iloc[5], row.iloc[6])
    return fixed

In [50]:
# Build full V/J codes row by row, drop the now-redundant gene/allele columns,
# and shorten the remaining column names to seq / v / j.
df = (
    example_df
    .apply(fix_gene_codes, axis=1)
    .drop(columns=["chosen_v_gene", "chosen_v_allele", "chosen_j_gene", "chosen_j_allele"])
    .rename(columns={'cdr3_amino_acid': 'seq', 'chosen_v_family': 'v', 'chosen_j_family': 'j'})
)

In [51]:
# Display the reformatted frame with columns seq / v / j (the rendering is truncated).
df

Unnamed: 0,seq,v,j
0,CASSAFPGGGETQYF,TCRBV5-5*01,TCRBJ2-5*01
1,CASSLHSNQPQHF,TCRBV5-5*01,TCRBJ1-5*01
2,CASSLEGQGGPEAFF,TCRBV5-5*01,TCRBJ1-1*01
3,CASSLAGGGDANTEAFF,TCRBV5-5*01,TCRBJ1-1*01
4,CASSAEQALIYGYTF,TCRBV5-5*01,TCRBJ1-2*01
...,...,...,...
102315,CASSQVGTVSGNTIYF,TCRBV4-3*01,TCRBJ1-3*01
102316,CASSQRKAQPQHF,TCRBV4-3*01,TCRBJ1-5*01
102607,CASSQDRGSTEAFF,TCRBV4-3*01,TCRBJ1-1*01
102608,CASSQDLRESPLHF,TCRBV4-3*01,TCRBJ1-6*01


### Load the gene sequence data

In [2]:
# NOTE(review): this rebinds data_path to a directory two levels above the
# working directory, whereas the Emerson cell used one level up - confirm both
# locations are intended.
data_path = os.path.abspath(os.path.join(os.pardir, os.pardir, "data"))
trbvs = os.path.join(data_path, "trbvs.tsv")
trbjs = os.path.join(data_path, "trbjs.tsv")

# Read the V-gene table first, then the J-gene table (both tab-separated).
trbvs_df, trbjs_df = (pd.read_csv(tsv_path, sep="\t") for tsv_path in (trbvs, trbjs))

In [3]:
# Replace missing values in the per-position columns with "-".
# The slices are label-based: every column from "1" through "109" in the V table
# and from "1" through "17" in the J table, in the order they appear in the frame.
trbvs_df.loc[:, "1":"109"] = trbvs_df.loc[:, "1":"109"].fillna("-") 
trbjs_df.loc[:, "1":"17"] = trbjs_df.loc[:, "1":"17"].fillna("-")

In [4]:
# Display the J-gene table after the missing positions were filled with "-".
trbjs_df

Unnamed: 0,Species,Gene,Allele,AccNum,Functionality,1,2,3,4,5,...,8,9,10,11,12,13,14,15,16,17
0,Homosap,TRBJ1-1,TRBJ1-1*01,K02545,F,N,T,E,A,F,...,Q,G,T,R,L,T,V,V,-,-
1,Homosap,TRBJ1-2,TRBJ1-2*01,K02545,F,N,Y,G,Y,T,...,S,G,T,R,L,T,V,V,-,-
2,Homosap,TRBJ1-3,TRBJ1-3*01,M14158,F,S,G,N,T,I,...,G,E,G,S,W,L,T,V,V,-
3,Homosap,TRBJ1-4,TRBJ1-4*01,M14158,F,T,N,E,K,L,...,G,S,G,T,Q,L,S,V,L,-
4,Homosap,TRBJ1-5,TRBJ1-5*01,M14158,F,S,N,Q,P,Q,...,G,D,G,T,R,L,S,I,L,-
5,Homosap,TRBJ1-6,TRBJ1-6*01,M14158,F,S,Y,N,S,P,...,F,G,N,G,T,R,L,T,V,T
6,Homosap,TRBJ1-6,TRBJ1-6*02,L36092,F,S,Y,N,S,P,...,F,G,N,G,T,R,L,T,V,T
7,Homosap,TRBJ2-1,TRBJ2-1*01,X02987,F,S,Y,N,E,Q,...,G,P,G,T,R,L,T,V,L,-
8,Homosap,TRBJ2-2,TRBJ2-2*01,X02987,F,N,T,G,E,L,...,G,E,G,S,R,L,T,V,L,-
9,Homosap,TRBJ2-3,TRBJ2-3*01,X02987,F,S,T,D,T,Q,...,G,P,G,T,R,L,T,V,L,-


### Create dictionary of the J and V genes based on tsv file containing sequences

The keys of the dictionary are the gene names and the values are lists of tuples. The first element of each tuple is the allele name and the second element is that allele's sequence as a string.

In [5]:
def _collect_alleles(gene_df, first_col, last_col, into):
    """Append one ``(allele name, sequence)`` tuple per row of ``gene_df`` to ``into``.

    ``into`` maps the value of the "Gene" column to a list of tuples. The
    sequence is the row's values from column ``first_col`` through ``last_col``
    (label slice, inclusive) joined with single spaces.
    """
    for _, row in gene_df.iterrows():
        # Join the raw values directly; Series.to_string() is a display
        # formatter and pads values whenever their widths differ.
        seq = " ".join(row.loc[first_col:last_col].astype(str))
        into.setdefault(row["Gene"], []).append((row["Allele"], seq))


dictionary = {}
# J genes first, then V genes, so the per-gene allele order follows the tables.
_collect_alleles(trbjs_df, "1", "17", dictionary)
_collect_alleles(trbvs_df, "1", "109", dictionary)

### Load the data utilised for comparing GRU and soNNia

In [13]:
# Training split of the universal TCR pool, located under whatever data_path
# currently points to.
train_data_path = os.path.join(data_path, "universal_TCR_pool", "train.tsv")

# read_table is the tab-delimited counterpart of read_csv.
train_data = pd.read_table(train_data_path)

In [7]:
# Display the loaded training table (the rendering is truncated).
train_data

Unnamed: 0,seq,v,j
0,CASSGSWTGDYEQYF,TRBV6-1,TRBJ2-7
1,CASSKFSGRAQGWNNEQFF,TRBV19,TRBJ2-1
2,CASSPRTSGSGEFF,TRBV4-2,TRBJ2-1
3,CASSLTAGTLYNEQFF,TRBV14,TRBJ2-1
4,CASSLERDTDTQYF,TRBV7-6,TRBJ2-3
...,...,...,...
49988336,CASSPRGQATDTQYF,TRBV6-6,TRBJ2-3
49988337,CATPKGGGAYEQYF,TRBV27,TRBJ2-7
49988338,CASSEGQVAEQFF,TRBV2,TRBJ2-1
49988339,CASSSPGTYTGELFF,TRBV5-1,TRBJ2-2


### Replacing the V and J gene names in the dataset with their sequences

In [14]:
def replace_v_j(data, dictionary, rnd = False, seed = 123):
    '''
    Replaces the v and j gene names with the amino acid sequence.

    Data needs to be a 2-D numpy array whose column 1 holds the v gene name
    and column 2 the j gene name. The array is modified IN PLACE and the same
    object is returned.

    If the variable rnd is False the 1st available allele from the dictionary
    is chosen. Otherwise the allele is chosen randomly among the alleles listed
    for the gene; seed makes that choice reproducible.

    Dictionary needs to be in the form where the key is the name of the gene,
    and the value is a list of tuples, where the 1st element of the tuple is
    the name of the allele and the 2nd element is the amino acid sequence of
    the allele. A gene name missing from the dictionary raises KeyError.

    The function runs as plain Python: the loop only does dict lookups on
    Python objects, which numba's object mode cannot accelerate.
    '''
    # Private generator: yields the same draws as random.seed(seed) followed by
    # random.choice, but leaves the interpreter-wide random state untouched.
    rng = random.Random(seed)

    # Decide once how an allele is picked instead of re-testing rnd on every row.
    if rnd:
        pick = rng.choice
    else:
        def pick(alleles):
            return alleles[0]

    for i in tqdm(range(len(data)), position=0, leave=True):
        # v is resolved before j so the sequence of random draws is stable.
        data[i, 1] = pick(dictionary[data[i, 1]])[1]
        data[i, 2] = pick(dictionary[data[i, 2]])[1]
    return data

In [15]:
# Convert the table to a numpy array, which is the input type replace_v_j documents.
train_data_np = train_data.to_numpy()
# Drop the DataFrame name before the long-running step to limit memory use.
# Re-running this cell afterwards requires reloading train_data first.
del train_data #Variables take too much memory 

# rnd = True: pick a random allele per gene (default seed of replace_v_j).
new_data = replace_v_j(train_data_np, dictionary, rnd = True)
# NOTE(review): replace_v_j assigns into `data` and returns `data`, so new_data
# appears to be the same array; this del then only removes the extra name.
del train_data_np #Variables take too much memory 

100%|███████████████████████████| 49988341/49988341 [01:02<00:00, 803768.54it/s]


In [16]:
# Wrap the processed array in a DataFrame with the three column names restored.
new_data = pd.DataFrame(new_data, columns=["seq", "v", "j"])

In [20]:
# Spot check: show the "v" value stored at index label 5000000.
new_data.loc[5000000,"v"]

'E A Q V T Q N P R Y L I T V T G K K L T V T C S Q N M N H . . . . . . . E Y M S W Y R Q D P G L G L R Q I Y Y S M N . . . . V E V T D K G D V P . E G Y K V S R K . E K R N F P L I L E S P S P N Q T S L Y F C A S S L -'