In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import SubsetRandomSampler
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# %pip install openpyxl

In [4]:
# load excel file data
df = pd.read_excel("~/data/project/pMHC-TCR/230213_info.xlsx")

In [16]:
df["nSeqHLA"].value_counts()

ATGGCCGTCATGGCGCCCCGAACCCTCCTCCTGCTACTCTCGGGGGCCCTGGCCCTGACCCAGACCTGGGCGGGCTCCCACTCCATGAGGTATTTCTACACCTCCGTGTCCCGGCCCGGCCGCGGGGAGCCCCGCTTCATCGCCGTGGGCTACGTGGACGACACGCAGTTCGTGCGGTTCGACAGCGACGCCGCGAGCCAGAGGATGGAGCCGCGGGCGCCGTGGATAGAGCAGGAGGGGCCGGAGTATTGGGACCAGGAGACACGGAATGTGAAGGCCCAGTCACAGACTGACCGAGTGGACCTGGGGACCCTGCGCGGCTACTACAACCAGAGCGAGGACGGTTCTCACACCATCCAGATAATGTATGGCTGCGACGTGGGGCCGGACGGGCGCTTACTCCGCGGGTACCGGCAGGACGCCTACGACGGCAAGGATTACATCGCCCTGAACGAGGACCTGCGCTCTTGGACCGCGGCGGACATGGCAGCTCAGATCACCAAGCGCAAGTGGGAGGCGGCCCATGCGGCGGAGCAGCAGAGAGCCTACCTGGAGGGCCGGTGCGTGGAGTGGCTCCGCAGATACCTGGAGAACGGGAAGGAGACGCTGCAGCGCACGGACCCCCCCAAGACACATATGACCCACCACCCCATCTCTGACCATGAGGCCACCCTGAGGTGCTGGGCCCTGGGCTTCTACCCTGCGGAGATCACACTGACCTGGCAGCGGGATGGGGAGGACCAGACCCAGGACACGGAGCTCGTGGAGACCAGGCCTGCAGGGGATGGAACCTTCCAGAAGTGGGCGGCTGTGGTGGTGCCTTCTGGAGAGGAGCAGAGATACACCTGCCATGTGCAGCATGAGGGTCTGCCCAAGCCCCTCACCCTGAGATGGGAGCTGTCTTCCCAGCCCACCATCCCCATCGTGGGCATCATTGCTGGCCTGGTTCTCCTTGGAGCTGTGATCACTGGAGCTGTGGTCGCTGCCGTGATGTGGAGGA

In [31]:
# check HLA class types
df.sort_values(by=['cellname'])
# df['HLA'].value_counts()

Unnamed: 0,cellname,NeoGeneID,NeoAA,HLA,nSeqHLA,aaSeqHLA,chain,top1.ID,top1.TCRnt,libraryID,...,nSeqFR3,nSeqCDR3,nSeqFR4,aaSeqFR1,aaSeqCDR1,aaSeqFR2,aaSeqCDR2,aaSeqFR3,aaSeqCDR3,aaSeqFR4
2129,V350085868_L01_502,CRC06C1,VLLSHLSYL,HLA-A*02:01,ATGGCCGTCATGGCGCCCCGAACCCTCGTCCTGCTACTCTCGGGGG...,MAVMAPRTLVLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRF...,TRB,TRBV7-2_CTTAGGCGTTTACGA_TRBJ2-7,GGAGCTGGAGTCTCCCAGTCCCCCAGTAACAAGGTCACAGAGAAGG...,35th-6-2,...,CCAGACAAATCAGGGCTGCCCAGTGATCGCTTCTCTGCAGAGAGGA...,TGTGCCAGCAGCTTAGGCGTTTACGAGCAGTACTTC,GGGCCGGGCACCAGGCTCACGGTCACAG,GAGVSQSPSNKVTEKGKDVELRCDPI,SGHTA,LYWYRQRLGQGLEFLIY,FQGNSA,PDKSGLPSDRFSAERTGESVSTLTIQRTQQEDSAVYL,CASSLGVYEQYF,GPGTRLTVT_
2128,V350085868_L01_502,CRC06C1,VLLSHLSYL,HLA-A*02:01,ATGGCCGTCATGGCGCCCCGAACCCTCGTCCTGCTACTCTCGGGGG...,MAVMAPRTLVLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRF...,TRA,TRAV38-2_DV8_ATAGGGCTTACATGGAA_TRAJ47,GCTCAGACAGTCACTCAGTCTCAACCAGAGATGTCTGTGCAGGAGG...,35th-6-2,...,GCAACAGAGAATCGTTTCTCTGTGAACTTCCAGAAAGCAGCCAAAT...,TGTGCTTATAGGGCTTACATGGAATATGGAAACAAACTGGTCTTT,GGCGCAGGAACCATTCTGAGAGTCAAGTCCT,AQTVTQSQPEMSVQEAETVTLSCTYD,TSESDYY,LFWYKQPPSRQMILVIR,QEAYKQQN,ATENRFSVNFQKAAKSFSLKISDSQLGDAAMYF,CAYRAYMEYGNKLVF,GAGTILRVKS_
2126,V350085868_L01_503,CRC06C1,VLLSHLSYL,HLA-A*02:01,ATGGCCGTCATGGCGCCCCGAACCCTCGTCCTGCTACTCTCGGGGG...,MAVMAPRTLVLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRF...,TRB,TRBV5-6_CAGCTGGACGGCGAACA_TRBJ1-1,GACGCTGGAGTCACCCAAAGTCCCACACACCTGATCAAAACGAGAG...,35th-6-2,...,AGACAGAGAGGCAACTTCCCTGATCGATTCTCAGGTCACCAGTTCC...,TGTGCCAGCAGCTGGACGGCGAACACTGAAGCTTTCTTT,GGACAAGGCACCAGACTCACAGTTGTAG,DAGVTQSPTHLIKTRGQQVTLRCSPK,SGHDT,VSWYQQALGQGPQFIFQ,YYEEEE,RQRGNFPDRFSGHQFPNYSSELNVNALLLGDSALYL,CASSWTANTEAFF,GQGTRLTVV_
2127,V350085868_L01_503,CRC06C1,VLLSHLSYL,HLA-A*02:01,ATGGCCGTCATGGCGCCCCGAACCCTCGTCCTGCTACTCTCGGGGG...,MAVMAPRTLVLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRF...,TRA,TRAV13-1_GCAGCCATCATGGAA_TRAJ47,GACGCTGGAGTCACCCAAAGTCCCACACACCTGATCAAAACGAGAG...,35th-6-2,...,AGACAGAGAGGCAACTTCCCTGATCGATTCTCAGGTCACCAGTTCC...,TGTGCCAGCAGCTGGACGGCGAACACTGAAGCTTTCTTT,GGACAAGGCACCAGACTCACAGTTGTAG,DAGVTQSPTHLIKTRGQQVTLRCSPK,SGHDT,VSWYQQALGQGPQFIFQ,YYEEEE,RQRGNFPDRFSGHQFPNYSSELNVNALLLGDSALYL,CASSWTANTEAFF,GQGTRLTVV_
2124,V350085868_L01_504,CRC06C1,VLLSHLSYL,HLA-A*02:01,ATGGCCGTCATGGCGCCCCGAACCCTCGTCCTGCTACTCTCGGGGG...,MAVMAPRTLVLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRF...,TRB,TRBV20-1_CAGTGTAAGGAATGGGTACGA_TRBJ2-7,GGTGCTGTCGTCTCTCAACATCCGAGCAGGGTTATCTGTAAGAGTG...,35th-6-2,...,ACATACGAGCAAGGCGTCGAGAAGGACAAGTTTCTCATCAACCATG...,TGCAGTGTAAGGAATGGGTACGAGCAGTACTTC,GGGCCGGGCACCAGGCTCACGGTCACAG,GAVVSQHPSRVICKSGTSVKIECRSL,DFQATT,MFWYRQFPKKSLMLMAT,SNEGSKA,TYEQGVEKDKFLINHASLTLSTLTVTSAHPEDSSFYI,CSVRNGYEQYF,GPGTRLTVT_
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,V350110758_L02_523,NRAS_G13D,VVVGAGDVGK,HLA-A*11:01,ATGGCCGTCATGGCGCCCCGAACCCTCCTCCTGCTACTCTCGGGGG...,MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFYTSVSRPGRGEPRF...,TRB,TRBV13_GCAGCCCCCCAACGACAGCCGCTAAC_TRBJ1-2,GCTGCTGGAGTCATCCAGTCCCCAAGACATCTGATCAAAGAAAAGA...,36th-5-2,...,AGCGATAAAGGAAGCATCCCTGATCGATTCTCAGCTCAACAGTTCA...,TGTGCCAGCAGCCCCCCAACGACAGCCGCTAACTATGGCTACACCTTC,GGTTCGGGGACCAGGTTAACCGTTGTAG,AAGVIQSPRHLIKEKRETATLKCYPI,PRHDT,VYWYQQGPGQDPQFLIS,FYEKMQ,SDKGSIPDRFSAQQFSDYHSELNMSSLELGDSALYF,CASSPPTTAANYGYTF,GSGTRLTVV_
3,V350110758_L02_531,NRAS_G13D,VVVGAGDVGK,HLA-A*11:01,ATGGCCGTCATGGCGCCCCGAACCCTCCTCCTGCTACTCTCGGGGG...,MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFYTSVSRPGRGEPRF...,TRB,TRBV30_TGGAGCCCCGGGCGTTCAC_TRBJ1-6,TCTCAGACTATTCATCAATGGCCAGCGACCCTGGTGCAGCCTGTGG...,36th-5-2,...,CAGATCAGCTCTGAGGTGCCCCAGAATCTCTCAGCCTCCAGACCCC...,TGTGCCTGGAGCCCCGGGCGTTCACCCCTCCACTTT,GGGAATGGGACCAGGCTCACTGTGACAG,SQTIHQWPATLVQPVGSPLSLECTVE,GTSNPN,LYWYRQAAGRGLQLLFY,SVGIG,QISSEVPQNLSASRPQDRQFILSSKKLLLSDSGFYL,CAWSPGRSPLHF,GNGTRLTVT_
2,V350110758_L02_531,NRAS_G13D,VVVGAGDVGK,HLA-A*11:01,ATGGCCGTCATGGCGCCCCGAACCCTCCTCCTGCTACTCTCGGGGG...,MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFYTSVSRPGRGEPRF...,TRA,TRAV21_TGAGGGGGAATA_TRAJ31,AAACAGGAGGTGACGCAGATTCCTGCAGCTCTGAGTGTCCCAGAAG...,36th-5-2,...,CAAACAAGTGGAAGACTTAATGCCTCGCTGGATAAATCATCAGGAC...,TGTGCTGTGAGGGGGAATAACAATGCCAGACTCATGTTT,GGAGATGGAACTCAGCTGGTGGTGAAGCCCA,KQEVTQIPAALSVPEGENLVLNCSFT,DSAIYN,LQWFRQDPGKGLTSLLL,IQSSQRE,QTSGRLNASLDKSSGRSTLYIAASQPGDSATYL,CAVRGNNNARLMF,GDGTQLVVKP_
1,V350110758_L02_532,NRAS_G13D,VVVGAGDVGK,HLA-A*11:01,ATGGCCGTCATGGCGCCCCGAACCCTCCTCCTGCTACTCTCGGGGG...,MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFYTSVSRPGRGEPRF...,TRB,TRBV5-8_CTTGGTCAGCTCCT_TRBJ2-7,GAGGCTGGAGTCACACAAAGTCCCACACACCTGATCAAAACGAGAG...,36th-5-2,...,AGAAACAGAGGAAACTTCCCTCCTAGATTTTCAGGTCGCCAGTTCC...,TGTGCCAGCAGCTTGGTCAGCTCCTACGAGCAGTACTTC,GGGCCGGGCACCAGGCTCACGGTCACAG,EAGVTQSPTHLIKTRGQQATLRCSPI,SGHTS,VYWYQQALGLGLQFLLW,YDEGEE,RNRGNFPPRFSGRQFPNYSSELNVNALELEDSALYL,CASSLVSSYEQYF,GPGTRLTVT_


In [37]:
# sorting data: combine the sequence from the same cell (has the same cellname), 
# add the prefix of the column chain to the columns CDR1, CDR2, CDR3
data = df.groupby("cellname").agg(
    {
        "NeoGeneID": "first",
        "NeoAA" : "first",
        "HLA": "first",
        "aaSeqCDR1": lambda x: "B:" + x.iloc[0] + " A:" + x.iloc[1],
        "aaSeqCDR2": lambda x: "B:" + x.iloc[0] + " A:" + x.iloc[1],
        "aaSeqCDR3": lambda x: "B:" + x.iloc[0] + " A:" + x.iloc[1],
    }
)
data

Unnamed: 0_level_0,NeoGeneID,NeoAA,HLA,aaSeqCDR1
cellname,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
V350085868_L01_502,CRC06C1,VLLSHLSYL,HLA-A*02:01,B:TSESDYY A:SGHTA
V350085868_L01_503,CRC06C1,VLLSHLSYL,HLA-A*02:01,B:SGHDT A:SGHDT
V350085868_L01_504,CRC06C1,VLLSHLSYL,HLA-A*02:01,B:DFQATT A:DFQATT
V350085868_L01_505,CRC06C1,VLLSHLSYL,HLA-A*02:01,B:LGHNT A:NIATNDY
V350085868_L01_506,CRC06C1,VLLSHLSYL,HLA-A*02:01,B:KGHSH A:ATGYPS
...,...,...,...,...
V350110758_L02_520,CTNNB1_S45P,TTAPPLSGK,HLA-A*11:01,B:SGHDT A:SGHDT
V350110758_L02_522,NRAS_G13D,VVVGAGDVGK,HLA-A*11:01,B:SGHNS A:NSMFDY
V350110758_L02_523,NRAS_G13D,VVVGAGDVGK,HLA-A*11:01,B:PRHDT A:SSNFYA
V350110758_L02_531,NRAS_G13D,VVVGAGDVGK,HLA-A*11:01,B:DSAIYN A:GTSNPN


In [38]:
df

Unnamed: 0,cellname,NeoGeneID,NeoAA,HLA,nSeqHLA,aaSeqHLA,chain,top1.ID,top1.TCRnt,libraryID,...,nSeqFR3,nSeqCDR3,nSeqFR4,aaSeqFR1,aaSeqCDR1,aaSeqFR2,aaSeqCDR2,aaSeqFR3,aaSeqCDR3,aaSeqFR4
0,V350110758_L02_532,NRAS_G13D,VVVGAGDVGK,HLA-A*11:01,ATGGCCGTCATGGCGCCCCGAACCCTCCTCCTGCTACTCTCGGGGG...,MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFYTSVSRPGRGEPRF...,TRA,TRAV29_DV5_AAGCGCAGGA_TRAJ40,GACCAGCAAGTTAAGCAAAATTCACCATCCCTGAGCGTCCAGGAAG...,36th-5-2,...,AATGAAGATGGAAGATTCACTGTCTTCTTAAACAAAAGTGCCAAGC...,TGTGCAGCAAGCGCAGGAACCTACAAATACATCTTT,GGAACAGGCACCAGGCTGAAGGTTTTAGCAA,DQQVKQNSPSLSVQEGRISILNCDYT,NSMFDY,FLWYKKYPAEGPTFLIS,ISSIKDK,NEDGRFTVFLNKSAKHLSLHIVPSQPGDSAVYF,CAASAGTYKYIF,GTGTRLKVLA_
1,V350110758_L02_532,NRAS_G13D,VVVGAGDVGK,HLA-A*11:01,ATGGCCGTCATGGCGCCCCGAACCCTCCTCCTGCTACTCTCGGGGG...,MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFYTSVSRPGRGEPRF...,TRB,TRBV5-8_CTTGGTCAGCTCCT_TRBJ2-7,GAGGCTGGAGTCACACAAAGTCCCACACACCTGATCAAAACGAGAG...,36th-5-2,...,AGAAACAGAGGAAACTTCCCTCCTAGATTTTCAGGTCGCCAGTTCC...,TGTGCCAGCAGCTTGGTCAGCTCCTACGAGCAGTACTTC,GGGCCGGGCACCAGGCTCACGGTCACAG,EAGVTQSPTHLIKTRGQQATLRCSPI,SGHTS,VYWYQQALGLGLQFLLW,YDEGEE,RNRGNFPPRFSGRQFPNYSSELNVNALELEDSALYL,CASSLVSSYEQYF,GPGTRLTVT_
2,V350110758_L02_531,NRAS_G13D,VVVGAGDVGK,HLA-A*11:01,ATGGCCGTCATGGCGCCCCGAACCCTCCTCCTGCTACTCTCGGGGG...,MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFYTSVSRPGRGEPRF...,TRA,TRAV21_TGAGGGGGAATA_TRAJ31,AAACAGGAGGTGACGCAGATTCCTGCAGCTCTGAGTGTCCCAGAAG...,36th-5-2,...,CAAACAAGTGGAAGACTTAATGCCTCGCTGGATAAATCATCAGGAC...,TGTGCTGTGAGGGGGAATAACAATGCCAGACTCATGTTT,GGAGATGGAACTCAGCTGGTGGTGAAGCCCA,KQEVTQIPAALSVPEGENLVLNCSFT,DSAIYN,LQWFRQDPGKGLTSLLL,IQSSQRE,QTSGRLNASLDKSSGRSTLYIAASQPGDSATYL,CAVRGNNNARLMF,GDGTQLVVKP_
3,V350110758_L02_531,NRAS_G13D,VVVGAGDVGK,HLA-A*11:01,ATGGCCGTCATGGCGCCCCGAACCCTCCTCCTGCTACTCTCGGGGG...,MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFYTSVSRPGRGEPRF...,TRB,TRBV30_TGGAGCCCCGGGCGTTCAC_TRBJ1-6,TCTCAGACTATTCATCAATGGCCAGCGACCCTGGTGCAGCCTGTGG...,36th-5-2,...,CAGATCAGCTCTGAGGTGCCCCAGAATCTCTCAGCCTCCAGACCCC...,TGTGCCTGGAGCCCCGGGCGTTCACCCCTCCACTTT,GGGAATGGGACCAGGCTCACTGTGACAG,SQTIHQWPATLVQPVGSPLSLECTVE,GTSNPN,LYWYRQAAGRGLQLLFY,SVGIG,QISSEVPQNLSASRPQDRQFILSSKKLLLSDSGFYL,CAWSPGRSPLHF,GNGTRLTVT_
4,V350110758_L02_523,NRAS_G13D,VVVGAGDVGK,HLA-A*11:01,ATGGCCGTCATGGCGCCCCGAACCCTCCTCCTGCTACTCTCGGGGG...,MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFYTSVSRPGRGEPRF...,TRB,TRBV13_GCAGCCCCCCAACGACAGCCGCTAAC_TRBJ1-2,GCTGCTGGAGTCATCCAGTCCCCAAGACATCTGATCAAAGAAAAGA...,36th-5-2,...,AGCGATAAAGGAAGCATCCCTGATCGATTCTCAGCTCAACAGTTCA...,TGTGCCAGCAGCCCCCCAACGACAGCCGCTAACTATGGCTACACCTTC,GGTTCGGGGACCAGGTTAACCGTTGTAG,AAGVIQSPRHLIKEKRETATLKCYPI,PRHDT,VYWYQQGPGQDPQFLIS,FYEKMQ,SDKGSIPDRFSAQQFSDYHSELNMSSLELGDSALYF,CASSPPTTAANYGYTF,GSGTRLTVV_
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2125,V350085868_L01_504,CRC06C1,VLLSHLSYL,HLA-A*02:01,ATGGCCGTCATGGCGCCCCGAACCCTCGTCCTGCTACTCTCGGGGG...,MAVMAPRTLVLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRF...,TRA,TRAV12-3_CAATGGGAACAAC_TRAJ24,GGTGCTGTCGTCTCTCAACATCCGAGCAGGGTTATCTGTAAGAGTG...,35th-6-2,...,ACATACGAGCAAGGCGTCGAGAAGGACAAGTTTCTCATCAACCATG...,TGCAGTGTAAGGAATGGGTACGAGCAGTACTTC,GGGCCGGGCACCAGGCTCACGGTCACAG,GAVVSQHPSRVICKSGTSVKIECRSL,DFQATT,MFWYRQFPKKSLMLMAT,SNEGSKA,TYEQGVEKDKFLINHASLTLSTLTVTSAHPEDSSFYI,CSVRNGYEQYF,GPGTRLTVT_
2126,V350085868_L01_503,CRC06C1,VLLSHLSYL,HLA-A*02:01,ATGGCCGTCATGGCGCCCCGAACCCTCGTCCTGCTACTCTCGGGGG...,MAVMAPRTLVLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRF...,TRB,TRBV5-6_CAGCTGGACGGCGAACA_TRBJ1-1,GACGCTGGAGTCACCCAAAGTCCCACACACCTGATCAAAACGAGAG...,35th-6-2,...,AGACAGAGAGGCAACTTCCCTGATCGATTCTCAGGTCACCAGTTCC...,TGTGCCAGCAGCTGGACGGCGAACACTGAAGCTTTCTTT,GGACAAGGCACCAGACTCACAGTTGTAG,DAGVTQSPTHLIKTRGQQVTLRCSPK,SGHDT,VSWYQQALGQGPQFIFQ,YYEEEE,RQRGNFPDRFSGHQFPNYSSELNVNALLLGDSALYL,CASSWTANTEAFF,GQGTRLTVV_
2127,V350085868_L01_503,CRC06C1,VLLSHLSYL,HLA-A*02:01,ATGGCCGTCATGGCGCCCCGAACCCTCGTCCTGCTACTCTCGGGGG...,MAVMAPRTLVLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRF...,TRA,TRAV13-1_GCAGCCATCATGGAA_TRAJ47,GACGCTGGAGTCACCCAAAGTCCCACACACCTGATCAAAACGAGAG...,35th-6-2,...,AGACAGAGAGGCAACTTCCCTGATCGATTCTCAGGTCACCAGTTCC...,TGTGCCAGCAGCTGGACGGCGAACACTGAAGCTTTCTTT,GGACAAGGCACCAGACTCACAGTTGTAG,DAGVTQSPTHLIKTRGQQVTLRCSPK,SGHDT,VSWYQQALGQGPQFIFQ,YYEEEE,RQRGNFPDRFSGHQFPNYSSELNVNALLLGDSALYL,CASSWTANTEAFF,GQGTRLTVV_
2128,V350085868_L01_502,CRC06C1,VLLSHLSYL,HLA-A*02:01,ATGGCCGTCATGGCGCCCCGAACCCTCGTCCTGCTACTCTCGGGGG...,MAVMAPRTLVLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRF...,TRA,TRAV38-2_DV8_ATAGGGCTTACATGGAA_TRAJ47,GCTCAGACAGTCACTCAGTCTCAACCAGAGATGTCTGTGCAGGAGG...,35th-6-2,...,GCAACAGAGAATCGTTTCTCTGTGAACTTCCAGAAAGCAGCCAAAT...,TGTGCTTATAGGGCTTACATGGAATATGGAAACAAACTGGTCTTT,GGCGCAGGAACCATTCTGAGAGTCAAGTCCT,AQTVTQSQPEMSVQEAETVTLSCTYD,TSESDYY,LFWYKQPPSRQMILVIR,QEAYKQQN,ATENRFSVNFQKAAKSFSLKISDSQLGDAAMYF,CAYRAYMEYGNKLVF,GAGTILRVKS_


In [25]:
# torch dataset
class TCRDataset(Dataset):
    '''
    Use each two rows of data as a sample (one for alpha chain, one for beta chain), each sample has the common cellname
    Different from the chain, the HLA class is the same for each sample
    The aaSeqCDR1, aaSeqCDR2, aaSeqCDR3 are the CDR1, CDR2, CDR3 of the alpha chain and beta chain, respectively.
    '''
    def __init__(self) -> None:
        super().__init__()
        self.data = data
        # each sample has two rows of data, one for alpha chain, one for beta chain
        self.len = len(self.data) / 2
        # aggregate the cellname
        self.cellname = self.data["cellname"]
        # aggregate the NeoAA
        self.neoAA = self.data["NeoAA"]
        # aggregate the HLA class
        self.HLA = self.data["HLA"]
        # aggregate the CDR1, CDR2, CDR3 of the alpha chain and beta chain, respectively
        self.aCDR1 = self.data["aaSeqCDR1"] if self.data["chain"] == "A" else None
        self.aCDR2 = self.data["aaSeqCDR2"] if self.data["chain"] == "A" else None
        self.aCDR3 = self.data["aaSeqCDR3"] if self.data["chain"] == "A" else None
        self.bCDR1 = self.data["aaSeqCDR1"] if self.data["chain"] == "B" else None
        self.bCDR2 = self.data["aaSeqCDR2"] if self.data["chain"] == "B" else None
        self.bCDR3 = self.data["aaSeqCDR3"] if self.data["chain"] == "B" else None



<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fcb1c78bf70>