In [1]:
import pandas as pd
import pyrepseq as prs

## Basic standardization

In [12]:
# Toy table of three TCR / epitope / MHC records, written one column at a time.
# The gene and MHC names mix several spellings (e.g. "bv13*1", "TCRBV28S1*01",
# "TRBV7-2*01"), and the third record uses "unknown" for its alpha-chain fields.
df = pd.DataFrame(
    {
        "TRAV": ["av26.1*1", "TCRAV20*01", "unknown"],
        "CDR3A": ["CIVRAPGRADMRF", "CAVPSGAGSYQLTF", "unknown"],
        "TRAJ": ["aj43*1", "TCRAJ28*01", "unknown"],
        "TRBV": ["bv13*1", "TCRBV28S1*01", "TRBV7-2*01"],
        "CDR3B": ["CASSYLPGQGDHYSNQPQHF", "CASSLGQSGANVLTF", "CASSDWGSQNTLYF"],
        "TRBJ": ["bj1.5*1", "TCRBJ2S6*01", "TRBJ2-4*01"],
        "Epitope": ["FLKEKGGL", "LQPFPQPELPYPQPQ", "YMPYFFTLL"],
        "MHCA": ["b8", "HLA-DQA1*05", "HLA-A*02"],
        "MHCB": ["b2m", "HLA-DQB1*02", "B2M"],
    }
)
df

Unnamed: 0,TRAV,CDR3A,TRAJ,TRBV,CDR3B,TRBJ,Epitope,MHCA,MHCB
0,av26.1*1,CIVRAPGRADMRF,aj43*1,bv13*1,CASSYLPGQGDHYSNQPQHF,bj1.5*1,FLKEKGGL,b8,b2m
1,TCRAV20*01,CAVPSGAGSYQLTF,TCRAJ28*01,TCRBV28S1*01,CASSLGQSGANVLTF,TCRBJ2S6*01,LQPFPQPELPYPQPQ,HLA-DQA1*05,HLA-DQB1*02
2,unknown,unknown,unknown,TRBV7-2*01,CASSDWGSQNTLYF,TRBJ2-4*01,YMPYFFTLL,HLA-A*02,B2M


In [4]:
# Standardize the toy frame; the returned frame is shown as the cell output.
# In the recorded output the V/J genes appear as TRAV/TRAJ/TRBV/TRBJ symbols
# without allele suffixes (e.g. "av26.1*1" -> "TRAV26-1"), the MHC names are
# reduced to gene level (e.g. "b8" -> "HLA-B"), and the "unknown" entries show
# up as empty cells.
prs.io.standardize_dataframe(df)



Unnamed: 0,TRAV,CDR3A,TRAJ,TRBV,CDR3B,TRBJ,Epitope,MHCA,MHCB
0,TRAV26-1,CIVRAPGRADMRF,TRAJ43,TRBV13,CASSYLPGQGDHYSNQPQHF,TRBJ1-5,FLKEKGGL,HLA-B,B2M
1,TRAV20,CAVPSGAGSYQLTF,TRAJ28,TRBV28,CASSLGQSGANVLTF,TRBJ2-6,LQPFPQPELPYPQPQ,HLA-DQA1,HLA-DQB1
2,,,,TRBV7-2,CASSDWGSQNTLYF,TRBJ2-4,YMPYFFTLL,HLA-A,B2M


## Extra columns are allowed

In [13]:
# Same records plus one extra column; `assign` returns a new frame,
# so `df` itself is left as it was.
extended_df = df.assign(clone_count=[1, 2, 3])
extended_df

Unnamed: 0,TRAV,CDR3A,TRAJ,TRBV,CDR3B,TRBJ,Epitope,MHCA,MHCB,clone_count
0,av26.1*1,CIVRAPGRADMRF,aj43*1,bv13*1,CASSYLPGQGDHYSNQPQHF,bj1.5*1,FLKEKGGL,b8,b2m,1
1,TCRAV20*01,CAVPSGAGSYQLTF,TCRAJ28*01,TCRBV28S1*01,CASSLGQSGANVLTF,TCRBJ2S6*01,LQPFPQPELPYPQPQ,HLA-DQA1*05,HLA-DQB1*02,2
2,unknown,unknown,unknown,TRBV7-2*01,CASSDWGSQNTLYF,TRBJ2-4*01,YMPYFFTLL,HLA-A*02,B2M,3


In [7]:
# Standardize the frame that carries the additional `clone_count` column.
# In the recorded output that column comes through with its values (1, 2, 3)
# unchanged, next to the standardized columns.
prs.io.standardize_dataframe(extended_df)



Unnamed: 0,TRAV,CDR3A,TRAJ,TRBV,CDR3B,TRBJ,Epitope,MHCA,MHCB,clone_count
0,TRAV26-1,CIVRAPGRADMRF,TRAJ43,TRBV13,CASSYLPGQGDHYSNQPQHF,TRBJ1-5,FLKEKGGL,HLA-B,B2M,1
1,TRAV20,CAVPSGAGSYQLTF,TRAJ28,TRBV28,CASSLGQSGANVLTF,TRBJ2-6,LQPFPQPELPYPQPQ,HLA-DQA1,HLA-DQB1,2
2,,,,TRBV7-2,CASSDWGSQNTLYF,TRBJ2-4,YMPYFFTLL,HLA-A,B2M,3


## Subsets of columns are allowed

In [14]:
# Keep only the beta-chain columns. Selecting first and copying second copies
# just the three columns that are kept (instead of all nine), and makes
# `beta_only_df` an independent frame rather than a selection taken from
# another frame, so assigning a column to it later does not trigger pandas'
# SettingWithCopyWarning (on pandas versions without copy-on-write).
beta_only_df = df[["TRBV", "CDR3B", "TRBJ"]].copy()
beta_only_df

Unnamed: 0,TRBV,CDR3B,TRBJ
0,bv13*1,CASSYLPGQGDHYSNQPQHF,bj1.5*1
1,TCRBV28S1*01,CASSLGQSGANVLTF,TCRBJ2S6*01
2,TRBV7-2*01,CASSDWGSQNTLYF,TRBJ2-4*01


In [15]:
# Standardize a frame that holds only the three beta-chain columns.
# The recorded output contains exactly those columns (TRBV, CDR3B, TRBJ).
prs.io.standardize_dataframe(beta_only_df)

Unnamed: 0,TRBV,CDR3B,TRBJ
0,TRBV13,CASSYLPGQGDHYSNQPQHF,TRBJ1-5
1,TRBV28,CASSLGQSGANVLTF,TRBJ2-6
2,TRBV7-2,CASSDWGSQNTLYF,TRBJ2-4


## Rename columns with col_mapper

In [9]:
# Same beta-chain data under arbitrary headers; `set_axis` hands back a
# relabelled frame, so `beta_only_df` keeps its original column names.
beta_only_misnamed = beta_only_df.set_axis(["foo", "bar", "baz"], axis="columns")
beta_only_misnamed

Unnamed: 0,foo,bar,baz
0,bv13*1,CASSYLPGQGDHYSNQPQHF,bj1.5*1
1,TCRBV28S1*01,CASSLGQSGANVLTF,TCRBJ2S6*01
2,TRBV7-2*01,CASSDWGSQNTLYF,TRBJ2-4*01


In [16]:
# Map each ad-hoc header of `beta_only_misnamed` to the column name used in the
# earlier cells, then pass the mapping along with the frame.
col_mapper = dict(foo="TRBV", bar="CDR3B", baz="TRBJ")
prs.io.standardize_dataframe(beta_only_misnamed, col_mapper=col_mapper)

Unnamed: 0,TRBV,CDR3B,TRBJ
0,TRBV13,CASSYLPGQGDHYSNQPQHF,TRBJ1-5
1,TRBV28,CASSLGQSGANVLTF,TRBJ2-6
2,TRBV7-2,CASSDWGSQNTLYF,TRBJ2-4
