In [1]:
import tcrdist as td
import pandas as pd
from tcrdist.repertoire import TCRrep

Using Cython-powered Fisher's exact test


In [2]:
# Load the VDJdb "slim" dump (a tab-separated text file) into a DataFrame
# and display its (rows, columns) dimensions.
vdj = pd.read_csv('../vdjdb-dump/vdjdb.slim.txt', delimiter='\t')
vdj.shape

(61557, 16)

In [3]:
# Restrict the table to human records of the TCR beta chain (gene == 'TRB').
vdj = vdj[vdj['species'].eq('HomoSapiens') & vdj['gene'].eq('TRB')]

In [4]:
# Boolean Series indexed by epitope: True where the epitope has at least 30 records.
# NOTE: count() tallies non-null cdr3 rows per epitope; it does not de-duplicate
# CDR3 sequences.
ge30 = vdj.groupby('antigen.epitope')['cdr3'].count().ge(30)

In [5]:
# Drop every epitope with fewer than 30 TCR records
# (keep only rows whose epitope is flagged True in ge30).
vdj = vdj[vdj['antigen.epitope'].isin(ge30.index[ge30.values])]

In [6]:
# Boolean Series indexed by epitope (ordered by ascending record count):
# True where the epitope has strictly more than 1000 records.
ge1000 = vdj.groupby('antigen.epitope')['cdr3'].count().sort_values().gt(1000)

In [7]:
# Cap each over-represented epitope (more than 1000 records) at 1000 rows.
# TODO: this is not a random sample -- head(1000) simply keeps the first 1000
# rows of each epitope in their current order; replace with real sampling later.
sampled = (
    vdj[vdj['antigen.epitope'].isin(ge1000.index[ge1000.values])]
    .groupby('antigen.epitope')
    .head(1000)
)

In [8]:
# Rows of the remaining epitopes, i.e. those with between 30 and 1000 records
# (everything whose epitope is not flagged in ge1000).
bw30_1000 = vdj[~vdj['antigen.epitope'].isin(ge1000.index[ge1000.values])]

In [9]:
# Stack the capped large-epitope rows on top of the mid-sized-epitope rows;
# the original row labels are retained.
vdj = pd.concat((sampled, bw30_1000), axis=0)

In [10]:
# Dimensions of the table after filtering and capping the large epitopes.
vdj.shape

(13798, 16)

In [11]:
# tcrdist expects its own column names, and the library ships no ready-made
# mapping for the VDJdb layout, so the translation is spelled out here:
# VDJdb column name -> tcrdist column name (beta chain).
mapper = {
    # record identity and receptor
    'complex.id': 'complex_id',
    'gene': 'gene',
    'cdr3': 'cdr3_b_aa',
    'v.segm': 'v_b_gene',
    'j.segm': 'j_b_gene',
    'species': 'organism',
    # MHC restriction
    'mhc.a': 'mhc_b_a',
    'mhc.b': 'mhc_b_b',
    'mhc.class': 'mhc_b_class',
    # antigen
    'antigen.epitope': 'epitope',
    'antigen.gene': 'epitope_gene',
    'antigen.species': 'epitope_species',
    # provenance
    'reference.id': 'reference',
    'vdjdb.score': 'score',
}

In [12]:
# Keep only the columns tcrdist understands and give them tcrdist's names.
# VDJdb columns with no tcrdist counterpart (j.start, v.end) are discarded.
#
# Columns that already carry a tcrdist name are kept as well, so re-running this
# cell after the rename is a no-op. The previous in-place drop/rename version
# would, on a second run, have dropped every column except 'gene'.
known_columns = set(mapper) | set(mapper.values())
vdj = (
    vdj.loc[:, [col for col in vdj.columns if col in known_columns]]
    .rename(columns=mapper)
)

In [13]:
# Preview the first rows to confirm the columns now carry the tcrdist names.
vdj.head()

Unnamed: 0,gene,cdr3_b_aa,organism,epitope,epitope_gene,epitope_species,complex_id,v_b_gene,j_b_gene,mhc_b_a,mhc_b_b,mhc_b_class,reference,score
5,TRB,CSASILGLAGYNEQFF,HomoSapiens,KLGGALQAK,IE1,CMV,8227,TRBV20-1*01,TRBJ2-1*01,HLA-A*03:01,B2M,MHCI,https://www.10xgenomics.com/resources/applicat...,0
7,TRB,CASSYFSATNEQFF,HomoSapiens,KLGGALQAK,IE1,CMV,3088,TRBV6-5*01,TRBJ2-1*01,HLA-A*03:01,B2M,MHCI,https://www.10xgenomics.com/resources/applicat...,0
8,TRB,CASSAFPCREGRNNEQFF,HomoSapiens,NLVPMVATV,pp65,CMV,0,TRBV6-1*01,TRBJ2-1*01,HLA-A*02,B2M,MHCI,PMID:28423320,0
10,TRB,CASSLWTTNYGYTF,HomoSapiens,KLGGALQAK,IE1,CMV,19039,TRBV12-4*01,TRBJ1-2*01,HLA-A*03:01,B2M,MHCI,https://www.10xgenomics.com/resources/applicat...,0
12,TRB,CASSLTTESGEQYF,HomoSapiens,NLVPMVATV,pp65,CMV,0,TRBV7-9*01,TRBJ2-7*01,HLA-A*02,B2M,MHCI,PMID:28423320,0


In [14]:
# Wrap the renamed table in a TCRrep repertoire object. organism is "human"
# because only HomoSapiens records were kept earlier in the notebook.
# NOTE(review): the call pattern was taken from an "example 2" -- presumably the
# tcrdist documentation; confirm the source and link it here.
tr = TCRrep(cell_df=vdj, organism="human")

In [15]:
# Infer the V-gene-derived regions for the beta chain; afterwards tr.cell_df
# carries the additional columns cdr1_b_aa, cdr2_b_aa and pmhc_b_aa.
tr.infer_cdrs_from_v_gene(chain='beta')



In [16]:
# Display the repertoire table (pandas truncates the middle rows) to check the
# inferred CDR columns.
tr.cell_df

Unnamed: 0,gene,cdr3_b_aa,organism,epitope,epitope_gene,epitope_species,complex_id,v_b_gene,j_b_gene,mhc_b_a,mhc_b_b,mhc_b_class,reference,score,cdr1_b_aa,cdr2_b_aa,pmhc_b_aa
5,TRB,CSASILGLAGYNEQFF,HomoSapiens,KLGGALQAK,IE1,CMV,8227,TRBV20-1*01,TRBJ2-1*01,HLA-A*03:01,B2M,MHCI,https://www.10xgenomics.com/resources/applicat...,0,DFQATT,SNEGSKA,ASLTL
7,TRB,CASSYFSATNEQFF,HomoSapiens,KLGGALQAK,IE1,CMV,3088,TRBV6-5*01,TRBJ2-1*01,HLA-A*03:01,B2M,MHCI,https://www.10xgenomics.com/resources/applicat...,0,MNHEY,SVGAGI,STTED
8,TRB,CASSAFPCREGRNNEQFF,HomoSapiens,NLVPMVATV,pp65,CMV,0,TRBV6-1*01,TRBJ2-1*01,HLA-A*02,B2M,MHCI,PMID:28423320,0,MNHNS,SASEGT,LNKRE
10,TRB,CASSLWTTNYGYTF,HomoSapiens,KLGGALQAK,IE1,CMV,19039,TRBV12-4*01,TRBJ1-2*01,HLA-A*03:01,B2M,MHCI,https://www.10xgenomics.com/resources/applicat...,0,SGHDY,FNNNVP,PNASF
12,TRB,CASSLTTESGEQYF,HomoSapiens,NLVPMVATV,pp65,CMV,0,TRBV7-9*01,TRBJ2-7*01,HLA-A*02,B2M,MHCI,PMID:28423320,0,SEHNR,FQNEAQ,PKGSF
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61522,TRB,CASSLGLAGNTDTQYF,HomoSapiens,PKYVKQNTLKLAT,HA,InfluenzaA,0,TRBV12-4*01,TRBJ2-3*01,HLA-DRA*01,HLA-DRB1*01,MHCII,https://github.com/antigenomics/vdjdb-db/issue...,1,SGHDY,FNNNVP,PNASF
61530,TRB,CASSLEAADNYGYTF,HomoSapiens,GLCTLVAML,BMLF1,EBV,0,TRBV5-5*01,TRBJ1-2*01,HLA-A*02,B2M,MHCI,PMID:29483360,0,SGHKS,YYEKEE,FPNYS
61537,TRB,CASSRRTSGGADEQFF,HomoSapiens,FRDYVDRFYKTLRAEQASQE,Gag,HIV-1,0,TRBV2*01,TRBJ2-1*01,HLA-DRA*01:01,HLA-DRB5*01:01,MHCII,PMID:27111229,0,SNHLY,FYNNEI,PDGSN
61538,TRB,CASRTSGNEQFF,HomoSapiens,IVTDFSVIK,EBNA4,EBV,20029,TRBV12-4*01,TRBJ2-1*01,HLA-A*11:01,B2M,MHCI,https://www.10xgenomics.com/resources/applicat...,0,SGHDY,FNNNVP,PNASF
