In [1]:
import pandas as pd
import json
import tcrdist as td
from tcrdist.repertoire import TCRrep
import parasail

Using Cython-powered Fisher's exact test


In [2]:
data = pd.read_csv('../vdjdb-dump/vdjdb.txt', sep='\t')

In [3]:
ge30 = data.groupby('antigen.epitope').cdr3.count() >= 30

In [4]:
# removing all epitopes which have less then 30 unique TCRs
vdj = data.loc[(data['antigen.epitope'].isin(ge30[ge30].index))]

In [5]:
# finding epitopes with more than 1000 unique TCRs
ge1000 = vdj.groupby('antigen.epitope').cdr3.count().sort_values() > 1000

In [6]:
# pseudo-sampling of 1000 rows from each group
# yes, it's just keeping top 1000, will change this later
sampled = vdj.loc[vdj['antigen.epitope'].isin(ge1000.loc[ge1000].index)].groupby('antigen.epitope').head(1000)

In [7]:
# separate df for epitopes which counts are 30 <= x <= 1000
bw30_1000 = vdj.loc[~vdj['antigen.epitope'].isin(ge1000.loc[ge1000].index)]

In [8]:
# concatenating two dfs
vdj = pd.concat([sampled, bw30_1000])

In [9]:
vdj.shape

(26070, 21)

In [10]:
# tcrdist has a specific naming format
# our mapping is not in default lib mappers
mapper = {'complex.id': 'complex_id', 'gene': 'gene', 'cdr3': 'cdr3_b_aa', 'v.segm': 'v_b_gene',
          'j.segm': 'j_b_gene', 'species': 'organism', 'mhc.a': 'mhc_b_a', 'mhc.b': 'mhc_b_b', 
          'mhc.class': 'mhc_b_class', 'antigen.epitope': 'epitope', 'antigen.gene': 'epitope_gene',
          'antigen.species': 'epitope_species', 'reference.id': 'reference', 'vdjdb.score': 'score'}

In [11]:
# dropping unnecessary variables (j.start, v.end) - they do not have corresponding names in the lib
vdj.drop(set(vdj.columns) - mapper.keys(), axis=1, inplace=True)

# renaming columns
vdj.rename(mapper=mapper, axis=1, inplace=True)

In [12]:
# removing references to www.10xgenomics.com
vdj = vdj[~vdj.reference.str.startswith('https://www.10xgenomics.com/resources/applicat')]

In [39]:
vdj_trb, vdj_tra = vdj.loc[vdj.gene == 'TRB'], vdj.loc[vdj.gene == 'TRA']

In [40]:
vdj_tra.rename(mapper={'v_b_gene': 'v_a_gene', 'j_b_gene': 'j_a_gene', 'mhc_b_class': 'mhc_a_class', 
                       'mhc_b_a': 'mhc_a_a', 'mhc_b_b': 'mhc_a_b', 'cdr3_b_aa': 'cdr3_a_aa'}, 
               axis=1, inplace=True)

In [41]:
# these lines are from example 2
tr_a, tr_b = TCRrep(cell_df=vdj_tra, organism="human"), TCRrep(cell_df=vdj_trb, organism="human")

In [42]:
# not sure whether this step is OK: need to figure out more about imgt_aligned argument
tr_a.infer_cdrs_from_v_gene(chain='alpha', imgt_aligned=True) 
tr_b.infer_cdrs_from_v_gene(chain='beta', imgt_aligned=True) 

In [44]:
# these columns will determine duplicates
tr_b.index_cols = ['epitope', 'cdr3_b_aa', 'cdr1_b_aa', 'cdr2_b_aa', 'pmhc_b_aa', 'v_b_gene', 'j_b_gene']
tr_a.index_cols = ['epitope', 'cdr3_a_aa', 'cdr1_a_aa', 'cdr2_a_aa', 'pmhc_a_aa', 'v_a_gene', 'j_a_gene']

In [45]:
# this clearly does not work and I need to manualy add a column 'count'
tr_b.cell_df['count'], tr_a.cell_df['count'] = 1, 1
tr_b.deduplicate()
tr_a.deduplicate()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


tcrdist.repertoire.TCRrep for <Your TCR Repertoire Project>
 with index_cols: ['epitope', 'cdr3_a_aa', 'cdr1_a_aa', 'cdr2_a_aa', 'pmhc_a_aa', 'v_a_gene', 'j_a_gene']
 with model organism: human

In [None]:
# computing pairwise distances for beta chains with Needleman–Wunsch algorithm using Blosum62
# this fills such attributes as cdr3_b_aa_pw and also tcrdist (required for the next line)
tr_b.compute_pairwise_all(chain="beta", metric="nw", processes=4, matrix=parasail.blosum62)
tr_a.compute_pairwise_all(chain="alpha", metric="nw", processes=4, matrix=parasail.blosum62)

# this fills paired_tcrdist between CDR regions 
tr_b.compute_paired_tcrdist(store_result=True)
tr_a.compute_paired_tcrdist(store_result=True)