In [1]:
import os
import pandas as pd
import random
from tqdm.notebook import tqdm
import numba 
import numpy as np
import math

### Emerson data 

In [2]:
# Resolve the "data" directory that sits next to the current working directory,
# and the processed Emerson sub-directory inside it.
data_path = os.path.abspath(os.path.join(os.pardir, "data"))
emerson_data_path = os.path.join(data_path, os.path.join("emerson", "emerson_processed"))

In [3]:
# Read the tab-separated training table from the processed Emerson directory.
train_tsv_path = os.path.join(emerson_data_path, "whole_seqs_nn_train.tsv")
example_df = pd.read_csv(train_tsv_path, sep="\t")

In [4]:
# Preview the loaded table; pandas abbreviates the repr to the first and last rows.
example_df

Unnamed: 0,patient_id,seq,nn,v,v_allele,v_deletions,j,j_allele,j_deletions
0,P00421,CASSSPRLAGVPDTQYF,GCAAAGCTTGAGGACTCGGCCGTGTATCTCTGTGCCAGCAGTTCAC...,TRBV11-2,2.0,6,TRBJ2-3,1.0,4
1,P00245,CASSPEGQGANTGELFF,ACACAGCAGGAGGACTCCGCCGTGTATCTCTGTGCCAGCAGCCCAG...,TRBV7-8,1.0,5,TRBJ2-2,1.0,0
2,P00473,CASSTGTSGFYEQYF,CAGCCTGCAAAGCTTGAGGACTCGGCCGTGTATCTCTGTGCCAGCA...,TRBV11-2,2.0,5,TRBJ2-7,1.0,3
3,P00484,CASSRAGQTSEKLFF,CACGCCCTGCAGCCAGAAGACTCGGCCCTGTATCTCTGTGCCAGCA...,TRBV4-2,1.0,4,TRBJ1-4,1.0,7
4,P00440,CASSLGGIYSNTGELFF,ACACAGCAGGAGGACTCGGCCGTGTATCTCTGTGCCAGCAGCTTAG...,TRBV7-2,1.0,1,TRBJ2-2,1.0,0
...,...,...,...,...,...,...,...,...,...
26427239,Keck0102_MC1,CASSLGQEFGTDTQYF,GCCTTGGAGCTGGACGACTCGGCCCTGTATCTCTGTGCCAGCAGCT...,TRBV5-4,1.0,0,TRBJ2-3,1.0,3
26427240,P00332,CASSPRTSGRRGNIQYF,TTGGAGCTGGACGACTCGGCCCTGTATCTCTGTGCCAGCAGCCCGA...,TRBV5-4,1.0,4,TRBJ2-4,1.0,6
26427241,Keck0092_MC1,CSVGAGTYEQYF,CTGACTGTGAGCAACATGAGCCCTGAAGACAGCAGCATATATCTCT...,TRBV29-1,1.0,6,TRBJ2-7,1.0,2
26427242,P00352,CASSQIRQGPNTEAFF,ACCCTGCAGCCAGAAGACTCGGCCCTGTATCTCTGTGCCAGCAGCC...,TRBV4-2,1.0,3,TRBJ1-1,1.0,2


In [5]:
# Keep only the sequence and the V/J gene + allele columns, then drop the
# reference to the full table so that its memory can be reclaimed.
kept_columns = ['seq', 'v', 'v_allele', 'j', 'j_allele']
train_df = example_df.loc[:, kept_columns]
del example_df
train_df

Unnamed: 0,seq,v,v_allele,j,j_allele
0,CASSSPRLAGVPDTQYF,TRBV11-2,2.0,TRBJ2-3,1.0
1,CASSPEGQGANTGELFF,TRBV7-8,1.0,TRBJ2-2,1.0
2,CASSTGTSGFYEQYF,TRBV11-2,2.0,TRBJ2-7,1.0
3,CASSRAGQTSEKLFF,TRBV4-2,1.0,TRBJ1-4,1.0
4,CASSLGGIYSNTGELFF,TRBV7-2,1.0,TRBJ2-2,1.0
...,...,...,...,...,...
26427239,CASSLGQEFGTDTQYF,TRBV5-4,1.0,TRBJ2-3,1.0
26427240,CASSPRTSGRRGNIQYF,TRBV5-4,1.0,TRBJ2-4,1.0
26427241,CSVGAGTYEQYF,TRBV29-1,1.0,TRBJ2-7,1.0
26427242,CASSQIRQGPNTEAFF,TRBV4-2,1.0,TRBJ1-1,1.0


The following line keeps only the first half of the rows, purely to save memory; remove it to process the full dataset.

In [6]:
# Discard the second half of the rows (selected by index label), keeping the first half.
half = int(len(train_df) / 2)
train_df = train_df.drop(index=train_df.index[half:])

In [7]:
# Plain Python on purpose: the loop only manipulates Python strings stored in
# an object array, which numba can run in object mode only, so JIT compilation
# adds compile overhead without making the loop faster.
def fix_gene_codes(data):
    """Append the zero-padded allele number to the V and J gene names, in place.

    For every row, column 1 (V gene) becomes ``<gene>*NN`` using the allele
    number found in column 2, and column 3 (J gene) becomes ``<gene>*NN`` using
    the allele number found in column 4. For example ``TRBV11-2`` with allele
    ``2.0`` becomes ``TRBV11-2*02`` and allele ``10.0`` gives ``*10``.

    Parameters
    ----------
    data : 2-D object array (e.g. the result of ``DataFrame.to_numpy()``)
        Rows laid out as ``[seq, v, v_allele, j, j_allele, ...]``.
        The array is modified in place.

    Returns
    -------
    The same ``data`` array that was passed in.

    Raises
    ------
    ValueError
        If an allele number cannot be converted to ``int`` (e.g. it is NaN).
    TypeError
        If a gene name is not a string.

    Notes
    -----
    Not idempotent: calling it again on the same array appends a second
    ``*NN`` suffix to the already suffixed names.
    """
    for row in tqdm(range(len(data)), position=0, leave=True):
        # Two-digit zero padding keeps allele numbers >= 10 well-formed
        # ("*10" rather than "*010").
        v_suffix = format(int(data[row, 2]), '02d')
        j_suffix = format(int(data[row, 4]), '02d')

        data[row, 1] = data[row, 1] + '*' + v_suffix
        data[row, 3] = data[row, 3] + '*' + j_suffix

    return data

In [8]:
# Convert the frame to a numpy array and expand the gene names with their allele suffix.
train_df = fix_gene_codes(train_df.to_numpy())

  0%|          | 0/13213622 [00:00<?, ?it/s]

In [9]:
# Wrap the array in a DataFrame again.
# NOTE(review): "v_allle" looks like a typo for "v_allele"; it is kept as is
# because code that is not visible here may select the column by this name.
train_columns = ["seq", "v", "v_allle", "j", "j_allele"]
train_data = pd.DataFrame(train_df, columns=train_columns)

In [10]:
# Add empty-string placeholder columns for the V and J sequences.
for placeholder_column in ('v_seq', 'j_seq'):
    train_data[placeholder_column] = ''

In [11]:
# Check the allele-suffixed gene codes and the new, still empty, sequence columns.
train_data

Unnamed: 0,seq,v,v_allle,j,j_allele,v_seq,j_seq
0,CASSSPRLAGVPDTQYF,TRBV11-2*02,2.0,TRBJ2-3*01,1.0,,
1,CASSPEGQGANTGELFF,TRBV7-8*01,1.0,TRBJ2-2*01,1.0,,
2,CASSTGTSGFYEQYF,TRBV11-2*02,2.0,TRBJ2-7*01,1.0,,
3,CASSRAGQTSEKLFF,TRBV4-2*01,1.0,TRBJ1-4*01,1.0,,
4,CASSLGGIYSNTGELFF,TRBV7-2*01,1.0,TRBJ2-2*01,1.0,,
...,...,...,...,...,...,...,...
13213617,CASSLEGFGTIYF,TRBV11-2*02,2.0,TRBJ1-3*01,1.0,,
13213618,CASSAQVTNYGYTF,TRBV5-1*01,1.0,TRBJ1-2*01,1.0,,
13213619,CASSLESSAGEQYF,TRBV5-1*01,1.0,TRBJ2-7*01,1.0,,
13213620,CASSLELNNEQFF,TRBV7-3*01,1.0,TRBJ2-1*01,1.0,,


### Load the gene sequence data

In [12]:
# Locate the TRBV / TRBJ gene tables in the data directory and read them
# (both files are tab-separated).
data_path = os.path.abspath(os.path.join(os.pardir, "data"))
trbvs, trbjs = (
    os.path.join(data_path, tsv_name)
    for tsv_name in ("trbvs_full.tsv", "trbjs_full.tsv")
)

trbvs_df = pd.read_csv(trbvs, sep="\t")
trbjs_df = pd.read_csv(trbjs, sep="\t")

In [13]:
# Replace missing values in the per-position columns with "-".
v_positions = slice("1", "109")
j_positions = slice("1", "17")
trbvs_df.loc[:, v_positions] = trbvs_df.loc[:, v_positions].fillna("-")
trbjs_df.loc[:, j_positions] = trbjs_df.loc[:, j_positions].fillna("-")

In [14]:
# Inspect the J-gene table after the missing positions were filled with "-".
trbjs_df

Unnamed: 0,Species,Gene,Allele,AccNum,Functionality,1,2,3,4,5,...,8,9,10,11,12,13,14,15,16,17
0,Homo sapiens,TRBJ1-1,TRBJ1-1*01,K02545,F,N,T,E,A,F,...,Q,G,T,R,L,T,V,V,-,-
1,Homo sapiens,TRBJ1-2,TRBJ1-2*01,K02545,F,N,Y,G,Y,T,...,S,G,T,R,L,T,V,V,-,-
2,Homo sapiens,TRBJ1-3,TRBJ1-3*01,M14158,F,S,G,N,T,I,...,G,E,G,S,W,L,T,V,V,-
3,Homo sapiens,TRBJ1-4,TRBJ1-4*01,M14158,F,T,N,E,K,L,...,G,S,G,T,Q,L,S,V,L,-
4,Homo sapiens,TRBJ1-5,TRBJ1-5*01,M14158,F,S,N,Q,P,Q,...,G,D,G,T,R,L,S,I,L,-
5,Homo sapiens,TRBJ1-6,TRBJ1-6*01,M14158,F,S,Y,N,S,P,...,F,G,N,G,T,R,L,T,V,T
6,Homo sapiens,TRBJ1-6,TRBJ1-6*02,L36092,F,S,Y,N,S,P,...,F,G,N,G,T,R,L,T,V,T
7,Homo sapiens,TRBJ2-1,TRBJ2-1*01,X02987,F,S,Y,N,E,Q,...,G,P,G,T,R,L,T,V,L,-
8,Homo sapiens,TRBJ2-2,TRBJ2-2*01,X02987,F,N,T,G,E,L,...,G,E,G,S,R,L,T,V,L,-
9,Homo sapiens,TRBJ2-2P,TRBJ2-2P*01,X02987,ORF,L,R,G,A,A,...,L,G,G,G,L,L,V,L,-,-


### Create dictionary of the J and V genes based on tsv file containing sequences

The keys of the dictionary are the allele names (e.g. `TRBJ1-1*01`) and the values are lists containing the sequence of that allele as a string, built by concatenating the per-position columns of the table.

In [15]:
def _allele_sequences(genes_df, first_position, last_position):
    """Yield ``(allele, sequence)`` for every row of a gene table.

    ``sequence`` is the concatenation of the row's values in the position
    columns ``first_position`` .. ``last_position`` (label slice, inclusive).
    """
    position_columns = genes_df.loc[:, first_position:last_position]
    residue_rows = position_columns.itertuples(index=False, name=None)
    for allele, residues in zip(genes_df["Allele"], residue_rows):
        # Join the cells directly rather than going through Series.to_string()
        # and stripping newlines: the text repr depends on pandas' display
        # formatting (padding), which is not part of the data.
        yield allele, ''.join(map(str, residues))


# Maps allele name -> list of sequence strings (one entry per table row that
# carries that allele name). J genes are added first, then V genes.
dictionary = {}

for genes_df, first_position, last_position in (
    (trbjs_df, "1", "17"),
    (trbvs_df, "1", "109"),
):
    for allele, seq in _allele_sequences(genes_df, first_position, last_position):
        dictionary.setdefault(allele, []).append(seq)

### Fetching the sequences into the dataset

In [16]:
# Plain Python on purpose: the loop only performs dict lookups and string joins
# on an object array, which numba can run in object mode only, so JIT
# compilation adds compile overhead without making the loop faster.
def replace_v_j(data, dictionary):
    '''
    Fill the V and J sequence columns of ``data`` from the allele names, in place.

    For each row, the allele name in column 1 (V) and in column 3 (J) is
    looked up in ``dictionary``; the strings stored under that key are
    concatenated and written to column 5 (V sequence) and column 6
    (J sequence) respectively. The allele-name columns are left unchanged.

    Parameters
    ----------
    data : 2-D object array (e.g. the result of ``DataFrame.to_numpy()``)
        Rows laid out as ``[seq, v, v_allele, j, j_allele, v_seq, j_seq]``.
        The array is modified in place.
    dictionary : dict
        Maps an allele name (e.g. ``'TRBJ1-1*01'``) to a list of sequence
        strings. All strings of the list are joined, so a list with a single
        entry yields exactly that sequence.

    Returns
    -------
    The same ``data`` array that was passed in.

    Raises
    ------
    KeyError
        If an allele name found in ``data`` is not a key of ``dictionary``.
    '''
    for row in tqdm(range(len(data)), position=0, leave=True):
        v_allele = data[row, 1]
        j_allele = data[row, 3]
        data[row, 5] = ''.join(dictionary[v_allele])
        data[row, 6] = ''.join(dictionary[j_allele])
    return data

In [17]:
# replace_v_j fills the array it receives and returns that same array, so the
# converted frame is passed straight in and no intermediate name is kept.
# `train_data` itself is not deleted here.
new_data = replace_v_j(train_data.to_numpy(), dictionary)

  0%|          | 0/13213622 [00:00<?, ?it/s]

In [18]:
# Wrap the filled array in a DataFrame again, now including the sequence columns.
# NOTE(review): "v_allle" looks like a typo for "v_allele"; it is kept as is
# because code that is not visible here may select the column by this name.
result_columns = ["seq", "v", "v_allle", "j", "j_allele", "v_seq", "j_seq"]
new_data = pd.DataFrame(new_data, columns=result_columns)

In [19]:
# Inspect the final table with the V and J sequence columns filled in.
new_data

Unnamed: 0,seq,v,v_allle,j,j_allele,v_seq,j_seq
0,CASSSPRLAGVPDTQYF,TRBV11-2*02,2.0,TRBJ2-3*01,1.0,EAGVAQSPRYKIIEKRQSVAFWCNPISGH.......ATLYWYQQIL...,STDTQYFGPGTRLTVL-
1,CASSPEGQGANTGELFF,TRBV7-8*01,1.0,TRBJ2-2*01,1.0,GAGVSQSPRYKVAKRGQDVALRCDPISGH.......VSLFWYQQAL...,NTGELFFGEGSRLTVL-
2,CASSTGTSGFYEQYF,TRBV11-2*02,2.0,TRBJ2-7*01,1.0,EAGVAQSPRYKIIEKRQSVAFWCNPISGH.......ATLYWYQQIL...,SYEQYFGPGTRLTVT--
3,CASSRAGQTSEKLFF,TRBV4-2*01,1.0,TRBJ1-4*01,1.0,ETGVTQTPRHLVMGMTNKKSLKCEQHLGH.......NAMYWYKQSA...,TNEKLFFGSGTQLSVL-
4,CASSLGGIYSNTGELFF,TRBV7-2*01,1.0,TRBJ2-2*01,1.0,GAGVSQSPSNKVTEKGKDVELRCDPISGH.......TALYWYRQSL...,NTGELFFGEGSRLTVL-
...,...,...,...,...,...,...,...
13213617,CASSLEGFGTIYF,TRBV11-2*02,2.0,TRBJ1-3*01,1.0,EAGVAQSPRYKIIEKRQSVAFWCNPISGH.......ATLYWYQQIL...,SGNTIYFGEGSWLTVV-
13213618,CASSAQVTNYGYTF,TRBV5-1*01,1.0,TRBJ1-2*01,1.0,KAGVTQTPRYLIKTRGQQVTLSCSPISGH.......RSVSWYQQTP...,NYGYTFGSGTRLTVV--
13213619,CASSLESSAGEQYF,TRBV5-1*01,1.0,TRBJ2-7*01,1.0,KAGVTQTPRYLIKTRGQQVTLSCSPISGH.......RSVSWYQQTP...,SYEQYFGPGTRLTVT--
13213620,CASSLELNNEQFF,TRBV7-3*01,1.0,TRBJ2-1*01,1.0,GAGVSQTPSNKVTEKGKYVELRCDPISGH.......TALYWYRQSL...,SYNEQFFGPGTRLTVL-
