This notebook contains the analysis of the processed Emerson data: distributions of V- and J-gene combinations, deletions on the V gene and the J gene, CDR3 length, and deletions on individual V and J genes. The figures are saved in the `figures` directory, located in the same directory as this notebook.

In [139]:
import numpy as np
import numba
import matplotlib.pyplot as plt
import pandas as pd 
import os 
from tqdm import tqdm 

In [2]:
# Directory that holds the processed Emerson data.
# Run the process_data.py script on the raw Emerson data to produce it.
_emerson_subdirs = ("data", "emerson", "emerson_processed")
data_path = os.path.join(os.getcwd(), os.pardir, *_emerson_subdirs)

If `whole_data_flag` is set to `True`, the whole dataset is used for the analysis. Otherwise, only the training set is used.

In [3]:
# True -> the next cell loads whole_seqs_nn.tsv; False -> it loads whole_seqs_nn_train.tsv.
whole_data_flag = False

In [4]:
# Choose the TSV that matches the flag and read it once; `data` is the frame
# every later cell works on.
_tsv_name = "whole_seqs_nn.tsv" if whole_data_flag else "whole_seqs_nn_train.tsv"
data = pd.read_csv(os.path.join(data_path, _tsv_name), sep='\t')
# Keep the split-specific alias as well.
if whole_data_flag:
    whole_data = data
else:
    train_data = data

In [5]:
# Number of rows in the loaded frame.
data.shape[0]

26427244

# V- and J-gene combinations distribution

In [None]:
# Column views of the V-gene and J-gene names.
v_genes = data["v"]
j_genes = data["j"]

In [None]:
@numba.jit(forceobj=True)
def create_combinations(data):
    """Build one "<col 3>, <col 6>" label per row of a 2-D array.

    Parameters
    ----------
    data : 2-D array indexable as ``data[row, col]``.
        Columns 3 and 6 are read; judging by the ``data.head()`` output in this
        notebook these are the ``v`` and ``j`` columns -- confirm if the column
        order of the input changes.

    Returns
    -------
    list of str
        One label per row, in row order.
    """
    n_rows = len(data)
    labels = []
    for row_idx in tqdm(range(n_rows), position=0, leave=True):
        v_name = data[row_idx, 3]
        j_name = data[row_idx, 6]
        labels.append(f'{v_name}, {j_name}')
    return labels

In [None]:
# One "<v>, <j>" label per row of `data` (see create_combinations).
v_j_combinations = create_combinations(data.to_numpy())

In [None]:
# Tally every V-J label, then convert the tallies into fractions of all rows.
n_sequences = data.shape[0]
gene_count = {}
for label in tqdm(v_j_combinations, total=n_sequences, position=0, leave=True):
    label = str(label)
    gene_count[label] = gene_count.get(label, 0) + 1
gene_count = {label: hits / n_sequences for label, hits in gene_count.items()}

In [None]:
# Order the combinations from most to least frequent.
_ranked_combinations = sorted(gene_count.items(), key=lambda pair: pair[1], reverse=True)
gene_count_sorted = dict(_ranked_combinations)

In [None]:
# Sanity check: the fractions were each divided by the row count, so they should add up to ~1.
sum(gene_count_sorted.values())


In [None]:
# Bar chart of every V-J combination, in the order of gene_count_sorted.
fig = plt.figure(figsize=(50,50))
plt.bar(x=list(gene_count_sorted.keys()), height=list(gene_count_sorted.values()), align='edge')
plt.xticks(rotation=-90)
# A bare `plt.show` only references the function; it must be called to render the figure.
plt.show()

### Latex codes generation for the tables

The table is divided into seven parts so that each part fits on a page of the LaTeX document.

To use the tables, copy the printed LaTeX code and paste it into your document. You may need to add the following lines around the printed LaTeX code:

<code>\begin{table}[]
\centering
\caption{Your caption}    
%%% Paste the printed table here%%% 
\end{table}
</code>

In [None]:
# Number of rows per table part when the combinations are split into seven parts.
indx = len(gene_count_sorted) // 7

In [None]:
def gene_combination(begin_indx, end_indx):
    """Print a LaTeX table for one slice of the V-J combination frequencies.

    Reads the module-level ``gene_count_sorted`` dict and prints rows
    ``begin_indx`` (inclusive) to ``end_indx`` (exclusive); fractions are
    multiplied by 100 and formatted with three significant digits.
    """
    combos = list(gene_count_sorted.keys())[begin_indx:end_indx]
    fractions = list(gene_count_sorted.values())[begin_indx:end_indx]
    table = pd.DataFrame(data={
        'gene combination': combos,
        'fraction (%)': [f'{frac*100:.3}' for frac in fractions],
    })
    print(table.to_latex(index=False))

In [None]:
# Print the seven table parts; the last part runs to the end so no row is dropped
# when the number of combinations is not a multiple of seven.
n_parts = 7
for part in range(n_parts):
    start = part * indx
    stop = (part + 1) * indx if part < n_parts - 1 else len(gene_count_sorted)
    gene_combination(start, stop)

# Distribution of deletions on the V gene

In [None]:
# Per-row number of deletions on the V gene.
v_deletions = data["v_deletions"]

In [None]:
# Tally each V-deletion value (keyed by its string form), then normalise by the row count.
n_sequences = data.shape[0]
v_deletion_count = {}
for n_deleted in tqdm(v_deletions, total=n_sequences, position=0, leave=True):
    deletion_key = str(n_deleted)
    v_deletion_count[deletion_key] = v_deletion_count.get(deletion_key, 0) + 1
v_deletion_count = {k: hits / n_sequences for k, hits in v_deletion_count.items()}

In [None]:
# Most frequent deletion counts first.
_ranked_v_deletions = sorted(v_deletion_count.items(), key=lambda pair: pair[1], reverse=True)
v_deletion_count_sorted = dict(_ranked_v_deletions)
v_deletion_count_sorted

In [None]:
# Sanity check: the fractions should add up to ~1.
sum(v_deletion_count_sorted.values())

In [None]:
# LaTeX table of the V-deletion fractions (as percentages, three significant digits).
_v_deletion_table = pd.DataFrame(data={
    'number of deletions': list(v_deletion_count_sorted.keys()),
    'fraction (%)': [f'{frac *100:.3}' for frac in v_deletion_count_sorted.values()],
})
print(_v_deletion_table.to_latex(index=False))

In [None]:
# Combined fraction of the first 11 entries; when run in notebook order the dict is
# sorted by descending fraction here, so these are the 11 most frequent values.
sum(list(v_deletion_count_sorted.values())[:11])

In [None]:
# The deletion counts belonging to those first 11 entries.
list(v_deletion_count_sorted.keys())[:11]

In [None]:
# Convert the string keys back to integers and reorder by deletion count (ascending) for plotting.
_v_deletion_int_keys = {int(k): frac for k, frac in v_deletion_count_sorted.items()}
v_deletion_count_sorted = dict(sorted(_v_deletion_int_keys.items(), reverse=False))

In [None]:
# Bar chart of the V-gene deletion distribution.
# NOTE(review): the y-label and file name say "train" even if whole_data_flag is True -- confirm.
fig = plt.figure(figsize = (10,5))
plt.bar(v_deletion_count_sorted.keys(), v_deletion_count_sorted.values())
plt.title("Deletions on V gene")
plt.xlabel("No. of deletions")
plt.ylabel("Fraction of the train set")
# savefig raises if the target directory does not exist, so create it first.
os.makedirs("figures", exist_ok=True)
# Save before showing: once show() is actually called the current figure may be
# released, and a later savefig could then write an empty image.
plt.savefig(os.path.join("figures", "V_deletion_train.png"))
# The original `plt.show` (no parentheses) never called the function.
plt.show()

# Distribution of deletions on the J gene

In [None]:
# Per-row number of deletions on the J gene.
j_deletions = data["j_deletions"]

In [None]:
# Tally each J-deletion value (keyed by its string form), then normalise by the row count.
n_sequences = data.shape[0]
j_deletion_count = {}
for n_deleted in tqdm(j_deletions, total=n_sequences, position=0, leave=True):
    deletion_key = str(n_deleted)
    j_deletion_count[deletion_key] = j_deletion_count.get(deletion_key, 0) + 1
j_deletion_count = {k: hits / n_sequences for k, hits in j_deletion_count.items()}

In [None]:
# Most frequent deletion counts first.
_ranked_j_deletions = sorted(j_deletion_count.items(), key=lambda pair: pair[1], reverse=True)
j_deletion_count_sorted = dict(_ranked_j_deletions)
j_deletion_count_sorted

In [None]:
# LaTeX table of the J-deletion fractions (as percentages, three significant digits).
_j_deletion_table = pd.DataFrame(data={
    'number of deletions': list(j_deletion_count_sorted.keys()),
    'fraction (%)': [f'{frac *100:.3}' for frac in j_deletion_count_sorted.values()],
})
print(_j_deletion_table.to_latex(index=False))

In [None]:
# Combined fraction of the first 12 entries; when run in notebook order the dict is
# sorted by descending fraction here, so these are the 12 most frequent values.
sum(list(j_deletion_count_sorted.values())[:12])

In [None]:
# The deletion counts belonging to those first 12 entries.
list(j_deletion_count_sorted.keys())[:12]

In [None]:
# Convert the string keys back to integers and reorder by deletion count (ascending) for plotting.
_j_deletion_int_keys = {int(k): frac for k, frac in j_deletion_count_sorted.items()}
j_deletion_count_sorted = dict(sorted(_j_deletion_int_keys.items(), reverse=False))

In [None]:
# Bar chart of the J-gene deletion distribution.
# NOTE(review): the y-label and file name say "train" even if whole_data_flag is True -- confirm.
fig = plt.figure(figsize = (10,5))
plt.bar(j_deletion_count_sorted.keys(), j_deletion_count_sorted.values())
plt.title("Deletions on J gene")
plt.xlabel("No. of deletions")
plt.ylabel("Fraction of the train set")
# savefig raises if the target directory does not exist, so create it first.
os.makedirs("figures", exist_ok=True)
# Save before showing: once show() is actually called the current figure may be
# released, and a later savefig could then write an empty image.
plt.savefig(os.path.join("figures", "J_deletion_train.png"))
# The original `plt.show` (no parentheses) never called the function.
plt.show()

# Distribution of Lengths of cdr3 sequences 

In [None]:
# The `seq` column (the CDR3 sequences whose lengths are analysed below).
cdr3 = data['seq']

In [None]:
@numba.jit(forceobj=True)
def count_seq_len(data):
    """Return ``len(item)`` for every item of ``data``, in order.

    Parameters
    ----------
    data : indexable sequence whose items support ``len()``.

    Returns
    -------
    list of int
        One length per item; a progress bar is shown while iterating.
    """
    n_items = len(data)
    lengths = []
    for idx in tqdm(range(n_items), position=0, leave=True):
        item = data[idx]
        lengths.append(len(item))
    return lengths

In [None]:
# Length of every entry in the `seq` column.
cdr3_lengths = count_seq_len(cdr3.to_numpy())

In [None]:
# Tally each CDR3 length (keyed by its string form), then normalise by the row count.
n_sequences = data.shape[0]
cdr3_len_count = {}
for seq_len in tqdm(cdr3_lengths, total=n_sequences, position=0, leave=True):
    length_key = str(seq_len)
    cdr3_len_count[length_key] = cdr3_len_count.get(length_key, 0) + 1
cdr3_len_count = {k: hits / n_sequences for k, hits in cdr3_len_count.items()}

In [None]:
# Display the length -> fraction mapping.
cdr3_len_count

In [None]:
# Sanity check: the fractions should add up to ~1.
sum(cdr3_len_count.values())

In [None]:
# Most frequent CDR3 lengths first.
_ranked_cdr3_lengths = sorted(cdr3_len_count.items(), key=lambda pair: pair[1], reverse=True)
sorted_cdr3_len = dict(_ranked_cdr3_lengths)

In [None]:
# Combined fraction of the first 9 entries; when run in notebook order the dict is
# sorted by descending fraction here, so these are the 9 most frequent lengths.
sum(list(sorted_cdr3_len.values())[:9])

In [None]:
# The lengths belonging to those first 9 entries.
list(sorted_cdr3_len.keys())[:9]

In [None]:
# LaTeX table of the CDR3-length fractions (as percentages, three significant digits).
_cdr3_len_table = pd.DataFrame(data={
    'len of cdr3': sorted_cdr3_len.keys(),
    'fraction (%)': [f'{frac*100:.3}' for frac in sorted_cdr3_len.values()],
})
print(_cdr3_len_table.to_latex(index=False))

In [None]:
# Convert the string keys back to integers and reorder by length (ascending) for plotting.
_cdr3_len_int_keys = {int(k): frac for k, frac in sorted_cdr3_len.items()}
sorted_cdr3_len = dict(sorted(_cdr3_len_int_keys.items(), reverse=False))

In [None]:
# Bar chart of the CDR3 length distribution.
# NOTE(review): the y-label and file name say "train" even if whole_data_flag is True -- confirm.
fig = plt.figure(figsize = (10,5))
plt.bar(sorted_cdr3_len.keys(), sorted_cdr3_len.values())
plt.title("Lengths of CDR3 sequences")
plt.xlabel("Length of CDR3 seq")
plt.ylabel("Fraction of the train set")
# savefig raises if the target directory does not exist, so create it first.
os.makedirs("figures", exist_ok=True)
# Save before showing: once show() is actually called the current figure may be
# released, and a later savefig could then write an empty image.
plt.savefig(os.path.join("figures","CDR3_train.png"))
# The original `plt.show` (no parentheses) never called the function.
plt.show()

# Specific V gene deletions 

In [None]:
# Two-column frame: V-gene name and its number of deletions, per row.
v_genes_deletions = data[['v', 'v_deletions']]

In [None]:
@numba.jit(forceobj=True)
def create_list(data):
    """Turn the first two columns of a 2-D array into a list of pairs.

    Parameters
    ----------
    data : 2-D array indexable as ``data[row, col]``; columns 0 and 1 are read.

    Returns
    -------
    list of tuple
        ``(data[row, 0], data[row, 1])`` for every row, in row order.
    """
    n_rows = len(data)
    pairs = []
    for row_idx in tqdm(range(n_rows), position=0, leave=True):
        first = data[row_idx, 0]
        second = data[row_idx, 1]
        pairs.append((first, second))
    return pairs

In [None]:
# Display the (v, v_deletions) frame.
v_genes_deletions

In [None]:
# One (gene, deletions) pair per row.
v_genes_deletions_list = create_list(v_genes_deletions.to_numpy())

In [None]:
# Group the deletion values by V-gene name (gene name -> list of deletion values).
gene_deletion_list = {}
for gene_name, n_deleted in tqdm(v_genes_deletions_list, total=data.shape[0], position=0, leave=True):
    gene_deletion_list.setdefault(str(gene_name), []).append(n_deleted)

In [None]:
# For every V gene, compute the fraction of its sequences having each deletion value.
# The per-gene tally is kept in `deletion_tally`; it used to be called `gene_count`,
# which overwrote the V-J combination frequencies computed earlier in the notebook.
v_gene_deletions = {}
for gene_name, deletions in tqdm(gene_deletion_list.items(), total = len(gene_deletion_list), position = 0, leave = True):
    deletion_tally = {}
    for n_deleted in deletions:
        deletion_tally[n_deleted] = deletion_tally.get(n_deleted, 0) + 1
    # Every gene has at least one recorded value, so the denominator is never zero.
    n_seen = len(deletions)
    v_gene_deletions[gene_name] = {n: hits / n_seen for n, hits in deletion_tally.items()}

In [None]:
# Display the nested mapping: gene -> {deletion value -> fraction}.
v_gene_deletions

In [None]:
# Deletion distribution of a single example gene, TRBV11-2.
trbv112 = v_gene_deletions['TRBV11-2']

In [None]:
# Cast the keys to int and order them ascending for plotting.
_trbv112_int_keys = {int(n_deleted): frac for n_deleted, frac in trbv112.items()}
trbv112 = dict(sorted(_trbv112_int_keys.items()))
trbv112

In [None]:
# Bar chart of the TRBV11-2 deletion distribution; title and axis labels added so
# the figure can be read on its own.
fig = plt.figure(figsize = (10,5))
plt.bar(trbv112.keys(), trbv112.values())
plt.title("Deletions on TRBV11-2")
plt.xlabel("No. of deletions")
plt.ylabel("Fraction of TRBV11-2 sequences")
plt.show()

In [None]:
# Number of distinct V genes found.
len(v_gene_deletions)

# Specific J gene deletions 

In [None]:
# Two-column frame: J-gene name and its number of deletions, per row.
j_genes_deletions = data[['j', 'j_deletions']]

In [None]:
@numba.jit(forceobj=True)
def create_list(data):
    """Turn the first two columns of a 2-D array into a list of pairs.

    NOTE(review): this repeats the ``create_list`` definition from the
    "Specific V gene deletions" section; consider keeping a single definition.

    Parameters
    ----------
    data : 2-D array indexable as ``data[row, col]``; columns 0 and 1 are read.

    Returns
    -------
    list of tuple
        ``(data[row, 0], data[row, 1])`` for every row, in row order.
    """
    n_rows = len(data)
    pairs = []
    for row_idx in tqdm(range(n_rows), position=0, leave=True):
        first = data[row_idx, 0]
        second = data[row_idx, 1]
        pairs.append((first, second))
    return pairs

In [None]:
# Display the (j, j_deletions) frame.
j_genes_deletions

In [None]:
# One (gene, deletions) pair per row.
j_genes_deletions_list = create_list(j_genes_deletions.to_numpy())

In [None]:
# Group the deletion values by J-gene name (gene name -> list of deletion values).
gene_deletion_list = {}
for gene_name, n_deleted in tqdm(j_genes_deletions_list, total=data.shape[0], position=0, leave=True):
    gene_deletion_list.setdefault(str(gene_name), []).append(n_deleted)

In [None]:
# For every J gene, compute the fraction of its sequences having each deletion value.
# The per-gene tally is kept in `deletion_tally`; it used to be called `gene_count`,
# which overwrote the V-J combination frequencies computed earlier in the notebook.
j_gene_deletions = {}
for gene_name, deletions in tqdm(gene_deletion_list.items(), total = len(gene_deletion_list), position = 0, leave = True):
    deletion_tally = {}
    for n_deleted in deletions:
        deletion_tally[n_deleted] = deletion_tally.get(n_deleted, 0) + 1
    # Every gene has at least one recorded value, so the denominator is never zero.
    n_seen = len(deletions)
    j_gene_deletions[gene_name] = {n: hits / n_seen for n, hits in deletion_tally.items()}

In [None]:
# Display the nested mapping: gene -> {deletion value -> fraction}.
j_gene_deletions

### Tests

In [100]:
# Preview the first rows; the column order shown here is what the positional
# indices used by fix_gene_codes rely on.
data.head()

Unnamed: 0,patient_id,seq,nn,v,v_allele,v_deletions,j,j_allele,j_deletions
0,P00421,CASSSPRLAGVPDTQYF,GCAAAGCTTGAGGACTCGGCCGTGTATCTCTGTGCCAGCAGTTCAC...,TRBV11-2,2.0,6,TRBJ2-3,1.0,4
1,P00245,CASSPEGQGANTGELFF,ACACAGCAGGAGGACTCCGCCGTGTATCTCTGTGCCAGCAGCCCAG...,TRBV7-8,1.0,5,TRBJ2-2,1.0,0
2,P00473,CASSTGTSGFYEQYF,CAGCCTGCAAAGCTTGAGGACTCGGCCGTGTATCTCTGTGCCAGCA...,TRBV11-2,2.0,5,TRBJ2-7,1.0,3
3,P00484,CASSRAGQTSEKLFF,CACGCCCTGCAGCCAGAAGACTCGGCCCTGTATCTCTGTGCCAGCA...,TRBV4-2,1.0,4,TRBJ1-4,1.0,7
4,P00440,CASSLGGIYSNTGELFF,ACACAGCAGGAGGACTCGGCCGTGTATCTCTGTGCCAGCAGCTTAG...,TRBV7-2,1.0,1,TRBJ2-2,1.0,0


In [101]:
@numba.jit(forceobj=True)
def fix_gene_codes(data):
    """Append the allele number to the gene names, in place.

    For every row, column 3 becomes ``<col 3>*<col 4 as two digits>`` and
    column 6 becomes ``<col 6>*<col 7 as two digits>`` (e.g. ``TRBV11-2`` with
    allele ``2.0`` becomes ``TRBV11-2*02``). Judging by the ``data.head()``
    output in this notebook, columns 3/4 are ``v``/``v_allele`` and columns 6/7
    are ``j``/``j_allele`` -- confirm if the column order changes.

    Parameters
    ----------
    data : 2-D array indexable as ``data[row, col]``; it is modified in place.
        The allele columns must be convertible with ``int()``.

    Returns
    -------
    The same ``data`` object that was passed in.
    """
    for i in tqdm(range(len(data)),position=0, leave=True):
        # zfill(2) gives "01".."09" exactly like the former hard-coded '*0' prefix,
        # but keeps two-digit alleles as "10", "11", ... instead of "010", "011".
        v_allele = str(int(data[i,4])).zfill(2)
        j_allele = str(int(data[i,7])).zfill(2)

        data[i,3] = data[i,3] + '*' + v_allele
        data[i,6] = data[i,6] + '*' + j_allele

    return data

In [108]:
# Add the allele suffix to the gene names, then keep only columns 1, 3 and 6
# (sequence, V gene, J gene).
gene_coded_array = fix_gene_codes(data.to_numpy())
new_df = pd.DataFrame(data=gene_coded_array).iloc[:,[1,3,6]]
# The J-gene column was previously labelled 'q', a typo for 'j'; later cells
# access the columns by position, so the new label does not affect them.
new_df = new_df.rename(columns = {1:'seq', 3:'v', 6:'j'})

100%|██████████| 26427244/26427244 [00:43<00:00, 609095.27it/s]


Unnamed: 0,seq,v,q
0,CASSSPRLAGVPDTQYF,TRBV11-2*02,TRBJ2-3*01
1,CASSPEGQGANTGELFF,TRBV7-8*01,TRBJ2-2*01
2,CASSTGTSGFYEQYF,TRBV11-2*02,TRBJ2-7*01
3,CASSRAGQTSEKLFF,TRBV4-2*01,TRBJ1-4*01
4,CASSLGGIYSNTGELFF,TRBV7-2*01,TRBJ2-2*01
...,...,...,...
26427239,CASSLGQEFGTDTQYF,TRBV5-4*01,TRBJ2-3*01
26427240,CASSPRTSGRRGNIQYF,TRBV5-4*01,TRBJ2-4*01
26427241,CSVGAGTYEQYF,TRBV29-1*01,TRBJ2-7*01
26427242,CASSQIRQGPNTEAFF,TRBV4-2*01,TRBJ1-1*01


In [113]:
# Display the reduced frame (sequence plus allele-qualified V and J gene names).
new_df

Unnamed: 0,seq,v,q
0,CASSSPRLAGVPDTQYF,TRBV11-2*02,TRBJ2-3*01
1,CASSPEGQGANTGELFF,TRBV7-8*01,TRBJ2-2*01
2,CASSTGTSGFYEQYF,TRBV11-2*02,TRBJ2-7*01
3,CASSRAGQTSEKLFF,TRBV4-2*01,TRBJ1-4*01
4,CASSLGGIYSNTGELFF,TRBV7-2*01,TRBJ2-2*01
...,...,...,...
26427239,CASSLGQEFGTDTQYF,TRBV5-4*01,TRBJ2-3*01
26427240,CASSPRTSGRRGNIQYF,TRBV5-4*01,TRBJ2-4*01
26427241,CSVGAGTYEQYF,TRBV29-1*01,TRBJ2-7*01
26427242,CASSQIRQGPNTEAFF,TRBV4-2*01,TRBJ1-1*01


In [148]:
@numba.jit(forceobj=True)
def CDR3_amino_acids_distribution(start, end, data, desc=False):
    """Per-position distribution of the elements found in a set of sequences.

    Parameters
    ----------
    start, end : int
        Positions ``range(start, end, step)`` are analysed, so ``end`` is
        exclusive. Negative positions count from the end of each sequence.
    data : indexable collection of sequences (e.g. strings).
    desc : bool, default False
        If True the positions are walked with step -1 (use with
        ``start > end``, e.g. ``-1, -3``); otherwise with step +1.

    Returns
    -------
    dict
        ``{position: {element: fraction}}``. Fractions are relative to
        ``len(data)``; sequences too short to have the position are skipped but
        still count in the denominator, so a position's fractions may sum to
        less than 1.
    """
    if desc:
        step = -1
    else:
        step = 1
    n_sequences = len(data)
    CDR3_positions = {}
    for j in range(start, end, step):
        cdr3_elements = {}
        for i in range(n_sequences):
            seq = data[i]
            seq_len = len(seq)
            # Valid indices are -seq_len .. seq_len - 1. The former check only
            # guarded the non-negative side, so a negative position on a short
            # sequence raised IndexError.
            if j >= seq_len or j < -seq_len:
                continue
            key = seq[j]
            cdr3_elements[key] = cdr3_elements.get(key, 0) + 1
        # Built via a list plus an explicit loop, as in the original code.
        cdr3_elements_list = [(x, y / n_sequences) for x, y in cdr3_elements.items()]
        cdr3_elements = {}
        for x, y in cdr3_elements_list:
            cdr3_elements[x] = y

        CDR3_positions[j] = cdr3_elements
    return CDR3_positions

In [142]:
@numba.jit(forceobj=True)
def seq_gene_dict(data, gene = 1):
    """Group the values of column 0 by the value found in column ``gene``.

    Parameters
    ----------
    data : 2-D array indexable as ``data[row, col]``.
    gene : int, default 1
        Index of the column whose values become the dictionary keys.

    Returns
    -------
    dict
        ``{data[row, gene]: [data[row, 0], ...]}`` with values in row order.
    """
    grouped = {}
    n_rows = len(data)
    for row_idx in tqdm(range(n_rows), position=0, leave=False):
        gene_name = data[row_idx, gene]
        sequence = data[row_idx, 0]
        grouped.setdefault(gene_name, []).append(sequence)
    return grouped

In [120]:
# Convert the frame to an array once and reuse it for both groupings
# (seq_gene_dict only reads from the array).
new_df_array = new_df.to_numpy()
# Sequences grouped by V gene (column 1) and by J gene (column 2).
v_gene_cdr3_dict = seq_gene_dict(new_df_array)
j_gene_cdr3_dict = seq_gene_dict(new_df_array, gene=2)

100%|██████████| 26427244/26427244 [00:19<00:00, 1363130.54it/s]
100%|██████████| 26427244/26427244 [00:27<00:00, 951543.99it/s] 


In [149]:
# Per V gene: element distribution at positions 0, 1 and 2 of its sequences.
result = {
    gene_name: CDR3_amino_acids_distribution(0,3, sequences)
    for gene_name, sequences in tqdm(v_gene_cdr3_dict.items(), position = 0, leave= True)
}

100%|██████████| 31/31 [01:15<00:00,  2.44s/it]


In [150]:
# Display the distributions computed in the previous cell.
result

{'TRBV11-2*02': {0: {'C': 1.0},
  1: {'A': 0.9990771519347671,
   'T': 0.00021196427368949452,
   'Q': 3.826069922193042e-06,
   'G': 0.00011325166969691404,
   'V': 0.00038643306214149723,
   'P': 3.213898734642155e-05,
   'E': 9.947781797701909e-06,
   'S': 6.504318867728172e-05,
   'F': 1.453906570433356e-05,
   'D': 3.443462929973738e-05,
   'L': 1.453906570433356e-05,
   'R': 9.182567813263301e-06,
   'K': 5.356497891070259e-06,
   'N': 3.0608559377544335e-06,
   'Y': 3.0608559377544335e-06,
   'M': 3.0608559377544335e-06,
   'C': 5.356497891070259e-06,
   'I': 3.0608559377544335e-06,
   'W': 2.2956419533158253e-06,
   'H': 2.2956419533158253e-06},
  2: {'S': 0.9910699528016015,
   'T': 0.00427831138699626,
   'V': 0.0003443462929973738,
   'W': 7.346054250610641e-05,
   'C': 0.0002387467631448458,
   'A': 0.0003275115853397244,
   'G': 0.0012541857204948791,
   'I': 0.0007675096263919242,
   'L': 0.00015686886680991473,
   'R': 0.0005142237975427448,
   'E': 0.0001224342375101773

In [153]:
# Per V gene: element distribution at positions -1 and -2 (the end bound -3 is exclusive).
result = {
    gene_name: CDR3_amino_acids_distribution(-1,-3, sequences, desc=True)
    for gene_name, sequences in tqdm(v_gene_cdr3_dict.items(), position = 0, leave= True)
}

100%|██████████| 31/31 [00:50<00:00,  1.63s/it]


In [154]:
# Display the distributions computed in the previous cell.
result

{'TRBV11-2*02': {-1: {'F': 0.9963047816691459, 'V': 0.00369521833085404},
  -2: {'Y': 0.3901619498876666,
   'F': 0.40663700697262983,
   'H': 0.07573399325387352,
   'D': 0.0004323459012078138,
   'T': 0.1240901605725025,
   'S': 0.0003887287040948131,
   'L': 0.0004476501808965859,
   'V': 0.0004116851236279713,
   'C': 0.00010483431586808935,
   'G': 0.0001652862206387394,
   'P': 0.00020431213384510845,
   'R': 0.0001385037311833881,
   'N': 0.0003535288608106371,
   'I': 0.00032445072940197,
   'A': 0.000275477034397899,
   'W': 1.453906570433356e-05,
   'Q': 2.6782489455351295e-05,
   'E': 5.739104883289563e-05,
   'K': 2.2956419533158253e-05,
   'M': 8.417353828824693e-06}},
 'TRBV7-8*01': {-1: {'F': 0.9967199608432867, 'V': 0.003280039156713228},
  -2: {'F': 0.3829196556512572,
   'Y': 0.35152784268214626,
   'T': 0.12668348397298895,
   'H': 0.1358580702030258,
   'P': 0.00038868796218985245,
   'I': 0.0002469442038983963,
   'L': 0.00036875524618011646,
   'R': 8.748247582050

In [155]:
# Per J gene: element distribution at positions 0, 1 and 2 of its sequences.
result = {
    gene_name: CDR3_amino_acids_distribution(0,3, sequences)
    for gene_name, sequences in tqdm(j_gene_cdr3_dict.items(), position = 0, leave= True)
}

100%|██████████| 12/12 [01:13<00:00,  6.16s/it]


In [156]:
# Display the distributions computed in the previous cell.
result

{'TRBJ2-3*01': {0: {'C': 1.0},
  1: {'A': 0.9233487813823226,
   'S': 0.07477621066908739,
   'G': 0.00032034966234389397,
   'T': 0.00048499289009359915,
   'V': 0.00046230718439113454,
   'P': 6.668222585269897e-05,
   'D': 4.365279733656066e-05,
   'R': 0.0002732596368705963,
   'M': 1.6154972242664182e-05,
   'L': 3.059133041695983e-05,
   'I': 5.087097642370849e-05,
   'C': 1.3405189733274533e-05,
   'E': 2.0279646006748654e-05,
   'Y': 4.124673764084472e-06,
   'K': 2.1654537261443477e-05,
   'N': 4.434024296390807e-05,
   'W': 1.374891254694824e-06,
   'F': 7.218179087147826e-06,
   'H': 5.499565018779296e-06,
   'Q': 8.249347528168944e-06},
  2: {'S': 0.870893586510392,
   'A': 0.014829233350324697,
   'I': 0.038110954411699496,
   'V': 0.0528944726278056,
   'D': 0.0004464959349621441,
   'G': 0.0057343277005184375,
   'T': 0.010582881710199735,
   'N': 0.0006396681562467669,
   'L': 0.0012724618562200596,
   'W': 0.00012855233231396604,
   'P': 0.0007850629064307445,
   'F': 

In [159]:
# Per J gene: element distribution at position -1 only (the end bound -2 is exclusive).
result = {
    gene_name: CDR3_amino_acids_distribution(-1,-2, sequences, desc=True)
    for gene_name, sequences in tqdm(j_gene_cdr3_dict.items(), position = 0, leave= True)
}

100%|██████████| 12/12 [00:24<00:00,  2.06s/it]


In [160]:
# Display the distributions computed in the previous cell.
result

{'TRBJ2-3*01': {-1: {'F': 0.9997666122095156, 'V': 0.00023338779048444637}},
 'TRBJ2-2*01': {-1: {'F': 0.9999989981586155, 'V': 1.001841384464646e-06}},
 'TRBJ2-7*01': {-1: {'F': 0.9875540966145211, 'V': 0.012445903385478954}},
 'TRBJ1-4*01': {-1: {'F': 0.9999976227681755, 'V': 2.377231824477881e-06}},
 'TRBJ1-3*01': {-1: {'F': 0.9998247556285449, 'V': 0.00017524437145510544}},
 'TRBJ1-5*01': {-1: {'F': 0.9999993988383151, 'V': 6.011616848397844e-07}},
 'TRBJ1-2*01': {-1: {'F': 0.9999981489137209, 'V': 1.8510862791314704e-06}},
 'TRBJ1-1*01': {-1: {'F': 0.9999980963844085, 'V': 1.9036155914819185e-06}},
 'TRBJ2-1*01': {-1: {'F': 1.0}},
 'TRBJ1-6*01': {-1: {'F': 0.9999988931857876, 'V': 1.1068142123799384e-06}},
 'TRBJ2-4*01': {-1: {'F': 0.9998801262322893, 'V': 0.00011987376771069769}},
 'TRBJ2-6*01': {-1: {'F': 1.0}}}