In [1]:
import os
import pandas as pd
import random
from tqdm.notebook import tqdm
import numba 
import numpy as np
import math

### Emerson data 

In [2]:
# The shared "data" folder sits in the parent of the current working directory;
# the Emerson files live in its "emerson" subfolder.
data_path = os.path.join(os.path.abspath(os.path.join(os.getcwd(), os.pardir)), "data")
emerson_data_path = os.path.join(data_path, "emerson")

In [3]:
# Load the tab-separated file P00002.tsv from the Emerson folder.
example_path = os.path.join(emerson_data_path, "P00002.tsv")
example_df = pd.read_csv(example_path, sep="\t")

  example_df = pd.read_csv(os.path.join(emerson_data_path, "P00002.tsv"), sep = '\t')


In [4]:
# Columns carried forward: the CDR3 sequence plus family / gene / allele for V and J.
columns = [
    "cdr3_amino_acid",
    "chosen_v_family",
    "chosen_v_gene",
    "chosen_v_allele",
    "chosen_j_family",
    "chosen_j_gene",
    "chosen_j_allele",
]
# Keep only rows that have a CDR3 sequence and both gene families.
required = ["cdr3_amino_acid", "chosen_v_family", "chosen_j_family"]
example_df = example_df[columns].dropna(subset=required)

In [5]:
# Show the filtered table (the notebook display elides the middle rows).
example_df

Unnamed: 0,cdr3_amino_acid,chosen_v_family,chosen_v_gene,chosen_v_allele,chosen_j_family,chosen_j_gene,chosen_j_allele
0,CASSAFPGGGETQYF,TCRBV05,5.0,1.0,TCRBJ02,5.0,1.0
1,CASSLHSNQPQHF,TCRBV05,5.0,1.0,TCRBJ01,5.0,1.0
2,CASSLEGQGGPEAFF,TCRBV05,5.0,1.0,TCRBJ01,1.0,1.0
3,CASSLAGGGDANTEAFF,TCRBV05,5.0,1.0,TCRBJ01,1.0,1.0
4,CASSAEQALIYGYTF,TCRBV05,5.0,1.0,TCRBJ01,2.0,1.0
...,...,...,...,...,...,...,...
102315,CASSQVGTVSGNTIYF,TCRBV04,3.0,1.0,TCRBJ01,3.0,1.0
102316,CASSQRKAQPQHF,TCRBV04,3.0,1.0,TCRBJ01,5.0,1.0
102607,CASSQDRGSTEAFF,TCRBV04,3.0,1.0,TCRBJ01,1.0,1.0
102608,CASSQDLRESPLHF,TCRBV04,3.0,1.0,TCRBJ01,6.0,1.0


In [6]:
def _format_gene_code(family, gene, allele):
    '''
    Build one gene code such as "TCRBV5-5*01" from its three parts.

    family: string like "TCRBV05"; a "0" at index 5 (zero padding of the
            family number) is removed.
    gene:   gene number, or NaN when the family has no gene number, in which
            case the "-<gene>" part is left out.
    allele: allele number, written with two digits ("*01", "*10").

    Raises ValueError if allele is NaN (it cannot be converted to int).
    '''
    if family[5:6] == "0":
        family = family[:5] + family[6:]

    code = family
    if not math.isnan(float(gene)):
        code += "-" + str(int(gene))

    # zfill instead of a hard-coded "*0" prefix so that allele 10 becomes
    # "*10" rather than "*010".
    return code + "*" + str(int(allele)).zfill(2)


def fix_gene_codes(row):
    '''
    Rewrite the V and J family cells of one row into full gene codes.

    The row is read by position: 1-3 hold the V family, gene and allele,
    4-6 hold the J family, gene and allele. Positions 1 and 4 are overwritten
    with the combined code (e.g. "TCRBV5-5*01") and the same row object is
    returned, so it can be used with DataFrame.apply(..., axis=1).

    Accepts a pandas Series or any plain mutable sequence.
    Raises ValueError if an allele cell is NaN.
    '''
    # Integer keys on a label-indexed Series are a deprecated positional
    # fallback in pandas; .iloc states the positional intent explicitly.
    cells = row.iloc if hasattr(row, "iloc") else row

    cells[1] = _format_gene_code(cells[1], cells[2], cells[3])
    cells[4] = _format_gene_code(cells[4], cells[5], cells[6])

    return row

In [7]:
# Rewrite the V/J cells row by row, drop the now-merged gene/allele columns and
# give the three remaining columns short names.
df = (
    example_df
    .apply(fix_gene_codes, axis=1)
    .drop(columns=["chosen_v_gene", "chosen_v_allele", "chosen_j_gene", "chosen_j_allele"])
    .rename(columns={'cdr3_amino_acid': 'seq', 'chosen_v_family': 'v', 'chosen_j_family': 'j'})
)

In [8]:
# Show the reformatted table (columns seq, v, j).
df

Unnamed: 0,seq,v,j
0,CASSAFPGGGETQYF,TCRBV5-5*01,TCRBJ2-5*01
1,CASSLHSNQPQHF,TCRBV5-5*01,TCRBJ1-5*01
2,CASSLEGQGGPEAFF,TCRBV5-5*01,TCRBJ1-1*01
3,CASSLAGGGDANTEAFF,TCRBV5-5*01,TCRBJ1-1*01
4,CASSAEQALIYGYTF,TCRBV5-5*01,TCRBJ1-2*01
...,...,...,...
102315,CASSQVGTVSGNTIYF,TCRBV4-3*01,TCRBJ1-3*01
102316,CASSQRKAQPQHF,TCRBV4-3*01,TCRBJ1-5*01
102607,CASSQDRGSTEAFF,TCRBV4-3*01,TCRBJ1-1*01
102608,CASSQDLRESPLHF,TCRBV4-3*01,TCRBJ1-6*01


### Load the data utilised for comparing GRU and soNNia

In [9]:
# Training split lives under <data_path>/universal_TCR_pool/train.tsv.
train_data_path = os.path.join(data_path, "universal_TCR_pool", "train.tsv")

# Tab-separated file, read fully into memory.
train_data = pd.read_csv(train_data_path, sep= '\t')

In [10]:
# Show the loaded training table (columns seq, v, j).
train_data

Unnamed: 0,seq,v,j
0,CASSGSWTGDYEQYF,TRBV6-1,TRBJ2-7
1,CASSKFSGRAQGWNNEQFF,TRBV19,TRBJ2-1
2,CASSPRTSGSGEFF,TRBV4-2,TRBJ2-1
3,CASSLTAGTLYNEQFF,TRBV14,TRBJ2-1
4,CASSLERDTDTQYF,TRBV7-6,TRBJ2-3
...,...,...,...
49988336,CASSPRGQATDTQYF,TRBV6-6,TRBJ2-3
49988337,CATPKGGGAYEQYF,TRBV27,TRBJ2-7
49988338,CASSEGQVAEQFF,TRBV2,TRBJ2-1
49988339,CASSSPGTYTGELFF,TRBV5-1,TRBJ2-2


Combine the Emerson information with the dataset

In [40]:
@numba.jit(forceobj=True)
def add_alleles(data, emerson):
    '''
    Scan data row by row and print where each row's first-column value
    occurs in emerson.

    Both arguments are indexed as 2-D arrays (x[i, 0], x[:, 1]); in this
    notebook they are DataFrame.to_numpy() results with columns seq, v, j.
    Nothing is returned and neither array is modified.

    NOTE(review): despite the name, no allele information is written
    anywhere; the function only prints matching row indices.
    '''
    for i in tqdm(range(len(data)),position=0, leave=True):
        # Length of the first-column value (the sequence string in this notebook).
        length = len(data[i,0])
        # NOTE(review): `emerson[:,1][length]` / `emerson[:,2][length]` select the
        # single entry at row `length`, i.e. the row is chosen by the sequence
        # length, not by where the sequence was found. This looks unintended and
        # raises IndexError when `length >= len(emerson)`. When that entry is a
        # str, `in` is a substring test.
        # NOTE(review): `data[i,0] in emerson[:,0]` scans the whole first column
        # for every row of `data`; presumably why the recorded run was
        # interrupted -- TODO confirm.
        if data[i,0] in emerson[:,0] and data[i,1] in emerson[:,1][length] and data[i,2] in emerson[:,2][length]:
            # Row indices of every cell of `emerson` (any column) equal to data[i,0].
            print(np.where(emerson == data[i,0])[0])
    

In [41]:
# Exact duplicate Emerson rows are dropped before converting to an array.
emerson = df.drop_duplicates()
emerson_np = emerson.to_numpy()
train_data_np = train_data.to_numpy()

# NOTE: the recorded run of this call was stopped with KeyboardInterrupt at 0 %.
add_alleles(train_data_np, emerson_np)

  0%|          | 0/49988341 [00:00<?, ?it/s]

KeyboardInterrupt: 

### Download the gene sequence data 

In [None]:
# NOTE: `data_path` is rebound here to the "data" folder two levels above the
# working directory; the earlier cells resolved it one level up.
data_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir, "data"))
trbvs, trbjs = (os.path.join(data_path, name) for name in ("trbvs.tsv", "trbjs.tsv"))

# Both gene tables are tab-separated.
trbvs_df = pd.read_csv(trbvs, sep="\t")
trbjs_df = pd.read_csv(trbjs, sep="\t")

In [None]:
# Replace missing values with "-" in the columns labelled "1" through the last
# position label of each table ("109" for V genes, "17" for J genes).
for gene_frame, last_position in ((trbvs_df, "109"), (trbjs_df, "17")):
    position_columns = slice("1", last_position)
    gene_frame.loc[:, position_columns] = gene_frame.loc[:, position_columns].fillna("-")

In [None]:
# Show the J-gene table after the missing values were replaced.
trbjs_df

### Create dictionary of the J and V genes based on tsv file containing sequences

The keys of the dictionary are the gene names and the values are lists of tuples. The first element of each tuple is the allele name and the second element is the sequence as a string.

In [None]:
def _collect_alleles(gene_table, frame, first_position, last_position):
    '''
    Append one (allele name, sequence string) tuple per row of frame to
    gene_table, keyed by the row's "Gene" value.

    The sequence string is the row's cells from column first_position through
    column last_position (both labels, inclusive), separated by single spaces.
    '''
    for _, row in frame.iterrows():
        cells = row.loc[first_position:last_position]
        # Join the cells directly rather than via Series.to_string(): that
        # method produces display formatting, so its spacing is not a stable
        # data format.
        sequence = " ".join(map(str, cells))
        gene_table.setdefault(row["Gene"], []).append((row["Allele"], sequence))


dictionary = {}

# J genes first, then V genes, so the order within each list matches the
# order of the rows in the source tables.
_collect_alleles(dictionary, trbjs_df, "1", "17")
_collect_alleles(dictionary, trbvs_df, "1", "109")

### Fetching the sequence into dataset

In [None]:
@numba.jit(forceobj=True)
def replace_v_j(data, dictionary, rnd = False, seed = 123):
    '''
    Replace the V and J gene names in data with allele sequences.

    data must be a 2-D numpy array whose column 1 holds the V gene name and
    column 2 the J gene name. The array is modified in place and also
    returned.

    dictionary maps a gene name to a list of tuples, where the 1st element
    of each tuple is the allele name and the 2nd element is the sequence of
    that allele.

    If rnd is False the 1st allele listed for the gene is used. Otherwise an
    allele is drawn at random from the list, reproducibly for a given seed.

    Raises KeyError if a gene name is missing from dictionary and IndexError
    if a gene has an empty allele list.
    '''
    # A private generator seeded with `seed` produces the same picks as
    # random.seed(seed) followed by random.choice(...), but does not reset the
    # module-level generator shared with the rest of the notebook.
    rng = random.Random(seed)
    for i in tqdm(range(len(data)),position=0, leave=True):
        # Look up both genes before writing, so a missing J gene does not
        # leave the row half-updated.
        v_alleles = dictionary[data[i, 1]]
        j_alleles = dictionary[data[i, 2]]
        if rnd:
            data[i, 1] = rng.choice(v_alleles)[1]
            data[i, 2] = rng.choice(j_alleles)[1]
        else:
            data[i, 1] = v_alleles[0][1]
            data[i, 2] = j_alleles[0][1]
    return data

In [None]:
# Convert to a NumPy array and drop the DataFrame right away to limit memory use.
# NOTE: after this `del`, cells that read `train_data` need the load cell re-run first.
train_data_np = train_data.to_numpy()
del train_data  # the DataFrame is no longer needed once the array exists

# rnd = True: one allele is drawn at random per gene (seed left at its default).
new_data = replace_v_j(train_data_np, dictionary, rnd = True)
# replace_v_j returns the array it was given, so this only removes the extra
# name; `new_data` still refers to the same array.
del train_data_np

In [None]:
# Wrap the array in a DataFrame again, restoring the seq / v / j column labels.
new_data = pd.DataFrame(new_data, columns=["seq", "v", "j"])

In [None]:
# Spot check: the "v" entry of the row labelled 5000000 after the replacement.
new_data.loc[5000000,"v"]