In [1]:
# 1. load vdj dataset
import pandas as pd
import numpy as np
# Read the tab-separated VDJdb export (read_table splits on "\t" by default)
# and preview the first rows.
vdjdb = pd.read_table("../vdjdb-2022-03-30/vdjdb.txt")
vdjdb.head(5)

Unnamed: 0,complex.id,gene,cdr3,v.segm,j.segm,species,mhc.a,mhc.b,mhc.class,antigen.epitope,...,antigen.species,reference.id,method,meta,cdr3fix,vdjdb.score,web.method,web.method.seq,web.cdr3fix.nc,web.cdr3fix.unmp
0,1,TRA,CIVRAPGRADMRF,TRAV26-1*01,TRAJ43*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEKGGL,...,HIV-1,PMID:15596521,"{""frequency"": """", ""identification"": ""tetramer-...","{""cell.subset"": ""CD8+"", ""clone.id"": """", ""donor...","{""cdr3"": ""CIVRAPGRADMRF"", ""cdr3_old"": ""CIVRAPG...",2,sort,sanger,no,no
1,1,TRB,CASSYLPGQGDHYSNQPQHF,TRBV13*01,TRBJ1-5*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEKGGL,...,HIV-1,PMID:15596521,"{""frequency"": """", ""identification"": ""tetramer-...","{""cell.subset"": ""CD8+"", ""clone.id"": """", ""donor...","{""cdr3"": ""CASSYLPGQGDHYSNQPQHF"", ""cdr3_old"": ""...",2,sort,sanger,no,no
2,0,TRB,CASSFEAGQGFFSNQPQHF,TRBV13*01,TRBJ1-5*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEKGGL,...,HIV-1,PMID:15596521,"{""frequency"": """", ""identification"": ""tetramer-...","{""cell.subset"": ""CD8+"", ""clone.id"": """", ""donor...","{""cdr3"": ""CASSFEAGQGFFSNQPQHF"", ""cdr3_old"": ""C...",2,sort,sanger,no,no
3,2,TRA,CAVPSGAGSYQLTF,TRAV20*01,TRAJ28*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEKGGL,...,HIV-1,PMID:15596521,"{""frequency"": """", ""identification"": ""tetramer-...","{""cell.subset"": ""CD8+"", ""clone.id"": """", ""donor...","{""cdr3"": ""CAVPSGAGSYQLTF"", ""cdr3_old"": ""CAVPSG...",2,sort,sanger,no,no
4,2,TRB,CASSFEPGQGFYSNQPQHF,TRBV13*01,TRBJ1-5*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEKGGL,...,HIV-1,PMID:15596521,"{""frequency"": """", ""identification"": ""tetramer-...","{""cell.subset"": ""CD8+"", ""clone.id"": """", ""donor...","{""cdr3"": ""CASSFEPGQGFYSNQPQHF"", ""cdr3_old"": ""C...",2,sort,sanger,no,no


Each column has its own meaning.

**complex.id**: TCR alpha and beta chain records having the same complex identifier belong to the same T-cell clone.

**gene**: TCR chain: alpha or beta.

**cdr3**: TCR complementarity determining region 3 (CDR3) amino acid sequence.

**v.segm**:	TCR Variable segment allele.

**j.segm**:	TCR Joining segment allele.

**species**: TCR parent species (Human or Mouse).

**mhc.a**: First MHC chain allele.

**mhc.b**: Second MHC chain allele (defaults to Beta2Microglobulin for MHC class I).

**mhc.class**: MHC class (I or II).

**antigen.epitope**: Amino acid sequence of the epitope.

**antigen.gene**: Representative parent gene of the epitope.

**antigen.species**: Representative parent species of the epitope.

**reference.id**: Pubmed reference / URL / or submitter details in case unpublished.

**method**: Details on method used to assay TCR specificity.

**meta**: Various meta-information: cell subset, donor status, etc.

**cdr3fix**: Details on CDR3 sequence fixing (if applied) and consistency between V, J and reported CDR3 sequence.

**vdjdb.score**: VDJdb confidence score; the higher the score, the more confidence we have in the antigen specificity annotation of a given TCR clonotype/clone. A score of zero indicates that there are insufficient method details to draw any conclusion.

**web.method**, **web.method.seq**, **web.cdr3fix.nc**, **web.cdr3fix.unmp**: internal fields (type: factor).

We drop some irrelevant columns.

In [2]:
# Remove the internal web.* fields and the free-form annotation columns,
# then preview the slimmed-down table.
vdjdb = vdjdb.drop(
    columns=[
        'web.method',
        'web.method.seq',
        'web.cdr3fix.nc',
        'web.cdr3fix.unmp',
        'cdr3fix',
        'method',
        'reference.id',
        'meta',
    ]
)
vdjdb.head(5)

Unnamed: 0,complex.id,gene,cdr3,v.segm,j.segm,species,mhc.a,mhc.b,mhc.class,antigen.epitope,antigen.gene,antigen.species,vdjdb.score
0,1,TRA,CIVRAPGRADMRF,TRAV26-1*01,TRAJ43*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEKGGL,Nef,HIV-1,2
1,1,TRB,CASSYLPGQGDHYSNQPQHF,TRBV13*01,TRBJ1-5*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEKGGL,Nef,HIV-1,2
2,0,TRB,CASSFEAGQGFFSNQPQHF,TRBV13*01,TRBJ1-5*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEKGGL,Nef,HIV-1,2
3,2,TRA,CAVPSGAGSYQLTF,TRAV20*01,TRAJ28*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEKGGL,Nef,HIV-1,2
4,2,TRB,CASSFEPGQGFYSNQPQHF,TRBV13*01,TRBJ1-5*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEKGGL,Nef,HIV-1,2


In [3]:
# Drop every record whose vdjdb.score is exactly 0, selecting the rows to
# remove by their index labels.
vdjdb = vdjdb.drop(index=vdjdb.index[vdjdb['vdjdb.score'].eq(0)])
# Save the filtered table as a tab-separated file with a header row and no index column.
vdjdb.to_csv(
    r"../processed_data/vdjdb_preprocessd_all.txt",
    sep="\t",
    header=True,
    index=False,
)
# Show (rows, columns) of the filtered table.
vdjdb.shape

(11643, 13)

In [4]:
# Drop every record that has a missing value in at least one column,
# selecting the rows to remove by their index labels.
vdjdb = vdjdb.drop(index=vdjdb.index[vdjdb.isna().any(axis="columns")])
# Save the null-free table as a tab-separated file with a header row and no index column.
vdjdb.to_csv(
    r"../processed_data/vdjdb_preprocessd_nonull.txt",
    sep="\t",
    header=True,
    index=False,
)
# Show (rows, columns) of the null-free table.
vdjdb.shape

(11331, 13)