In [52]:
import os
import glob
from distutils.dir_util import copy_tree

import pandas as pd
from python_pdb.parsers import parse_pdb_to_pandas
from python_pdb.formats.residue import THREE_TO_ONE_CODE

In [2]:
TCRMODEL2_STRUCTURES_PATH = '/project/koohylab/bmcmaste/data/TCRModel2-tcrs/data'
export_path = '../data/apo-holo-mhc-class-I-synthetic_refined'

In [4]:
tcrmodel_2_structures = pd.DataFrame(glob.glob('**/ranked_*.pdb', root_dir=TCRMODEL2_STRUCTURES_PATH, recursive=True), columns=['path'])

tcrmodel_2_structures['form'] = tcrmodel_2_structures['path'].map(lambda path: path.split('/')[0])
tcrmodel_2_structures['pdb_id'] = tcrmodel_2_structures['path'].map(lambda path: path.split('/')[1].split('_')[0])
tcrmodel_2_structures['state'] = tcrmodel_2_structures['form'].map(lambda form: 'apo' if form == 'unbound' else 'holo')
tcrmodel_2_structures['model_rank'] = tcrmodel_2_structures['path'].map(lambda path: int(path.split('/')[-1].split('_')[-1].split('.')[0]))

tcrmodel_2_structures

Unnamed: 0,path,form,pdb_id,state,model_rank
0,unbound/6uk2_DE/ranked_4.pdb,unbound,6uk2,apo,4
1,unbound/6uk2_DE/ranked_2.pdb,unbound,6uk2,apo,2
2,unbound/6uk2_DE/ranked_1.pdb,unbound,6uk2,apo,1
3,unbound/6uk2_DE/ranked_0.pdb,unbound,6uk2,apo,0
4,unbound/6uk2_DE/ranked_3.pdb,unbound,6uk2,apo,3
...,...,...,...,...,...
2380,mhcI/6zkx_DECA/ranked_4.pdb,mhcI,6zkx,holo,4
2381,mhcI/6zkx_DECA/ranked_2.pdb,mhcI,6zkx,holo,2
2382,mhcI/6zkx_DECA/ranked_1.pdb,mhcI,6zkx,holo,1
2383,mhcI/6zkx_DECA/ranked_0.pdb,mhcI,6zkx,holo,0


In [22]:
crystal_structures = pd.DataFrame(glob.glob('**/*.pdb', root_dir='../data/apo-holo-mhc-class-I_refined', recursive=True), columns=['path'])

crystal_structures['group'] = crystal_structures['path'].map(lambda path: path.split('/')[0])
crystal_structures['pdb_id'] = crystal_structures['path'].map(lambda path: path.split('/')[-1].split('_')[0])

crystal_structures

Unnamed: 0,path,group,pdb_id
0,YSGSPE-HISR-ALSGFNNAGNMLT-SGHAT-FQNNGV-ASSLGGA...,YSGSPE-HISR-ALSGFNNAGNMLT-SGHAT-FQNNGV-ASSLGGA...,7n1c
1,YSGSPE-HISR-ALSGFNNAGNMLT-SGHAT-FQNNGV-ASSLGGA...,YSGSPE-HISR-ALSGFNNAGNMLT-SGHAT-FQNNGV-ASSLGGA...,7n1e
2,NIATNDY-GYKTK-LVGEILDNFNKFY-MDHEN-SYDVKM-ASSQR...,NIATNDY-GYKTK-LVGEILDNFNKFY-MDHEN-SYDVKM-ASSQR...,6at6
3,NIATNDY-GYKTK-LVGEILDNFNKFY-MDHEN-SYDVKM-ASSQR...,NIATNDY-GYKTK-LVGEILDNFNKFY-MDHEN-SYDVKM-ASSQR...,6avf
4,DRGSQS-IYSNGD-GTYNQGGKLI-MNHEY-SMNVEV-ASSGASHE...,DRGSQS-IYSNGD-GTYNQGGKLI-MNHEY-SMNVEV-ASSGASHEQY,3vxt
...,...,...,...
70,DRGSQS-IYSNGD-AVNFGGGKLI-MRHNA-SNTAGT-ASSLSFGT...,DRGSQS-IYSNGD-AVNFGGGKLI-MRHNA-SNTAGT-ASSLSFGTEAF,3qdg
71,NSAFDY-ILSVSNK-AASASFGDNSKLI-MSHET-SYDVDS-ASSL...,NSAFDY-ILSVSNK-AASASFGDNSKLI-MSHET-SYDVDS-ASSL...,5ivx
72,NSAFDY-ILSVSNK-AASASFGDNSKLI-MSHET-SYDVDS-ASSL...,NSAFDY-ILSVSNK-AASASFGDNSKLI-MSHET-SYDVDS-ASSL...,5iw1
73,DSAIYN-IQSSQRE-AQLNQAGTALI-MNHEY-SVGAGI-ASSYGT...,DSAIYN-IQSSQRE-AQLNQAGTALI-MNHEY-SVGAGI-ASSYGT...,7r7z


In [11]:
AHO_CDR1 = set(range(20, 47 + 1))
AHO_CDR2 = set(range(51, 82 + 1))
AHO_CDR3 = set(range(102, 144 + 1))

def assign_cdr(seq_id):
    if seq_id in AHO_CDR1:
        return 1
    
    if seq_id in AHO_CDR2:
        return 2
    
    if seq_id in AHO_CDR3:
        return 3
    
    return None

In [12]:
cdr_1_alpha_seq = []
cdr_2_alpha_seq = []
cdr_3_alpha_seq = []

cdr_1_beta_seq = []
cdr_2_beta_seq = []
cdr_3_beta_seq = []

peptide_seq = []

mhc_chain_1_seq = []
mhc_chain_2_seq = []


for index, entry in tcrmodel_2_structures.iterrows():
    with open(os.path.join(TCRMODEL2_STRUCTURES_PATH, entry['path']), 'r') as fh:
            structure_df = parse_pdb_to_pandas(fh.read())

    structure_df['cdr'] = structure_df['residue_seq_id'].map(lambda id_: assign_cdr(id_))
    structure_df['res_olc'] = structure_df['residue_name'].map(lambda res_name: THREE_TO_ONE_CODE[res_name] if res_name in THREE_TO_ONE_CODE else None)

    # Alpha CDRs
    cdr_1_alpha_seq.append(''.join(structure_df.query(f"cdr == 1 & chain_id == 'D'")
                                               .drop_duplicates(['residue_seq_id', 'residue_insert_code'])['res_olc']
                                               .dropna()))
    cdr_2_alpha_seq.append(''.join(structure_df.query(f"cdr == 2 & chain_id == 'D'")
                                               .drop_duplicates(['residue_seq_id', 'residue_insert_code'])['res_olc']
                                               .dropna()))
    cdr_3_alpha_seq.append(''.join(structure_df.query(f"cdr == 3 & chain_id == 'D'")
                                               .drop_duplicates(['residue_seq_id', 'residue_insert_code'])['res_olc']
                                               .dropna()))
    # Beta CDRs
    cdr_1_beta_seq.append(''.join(structure_df.query(f"cdr == 1 & chain_id == 'E'")
                                              .drop_duplicates(['residue_seq_id', 'residue_insert_code'])['res_olc']
                                              .dropna()))
    cdr_2_beta_seq.append(''.join(structure_df.query(f"cdr == 2 & chain_id == 'E'")
                                              .drop_duplicates(['residue_seq_id', 'residue_insert_code'])['res_olc']
                                              .dropna()))
    cdr_3_beta_seq.append(''.join(structure_df.query(f"cdr == 3 & chain_id == 'E'")
                                              .drop_duplicates(['residue_seq_id', 'residue_insert_code'])['res_olc']
                                              .dropna()))
    if entry['form'] == 'mhcI':
        peptide_seq.append(''.join(structure_df.query(f"chain_id == 'C'")
                                               .drop_duplicates(['residue_seq_id', 'residue_insert_code'])['res_olc']
                                               .dropna())
             )
        
        mhc_chain_1_seq.append(''.join(structure_df.query(f"chain_id == 'A'")
                                                   .drop_duplicates(['residue_seq_id', 'residue_insert_code'])['res_olc']
                                                   .dropna())
             )
    
        

    else:
        peptide_seq.append(None)
        mhc_chain_1_seq.append(None)
        mhc_chain_2_seq.append(None)

tcrmodel_2_structures['cdr_1_alpha_seq'] = cdr_1_alpha_seq
tcrmodel_2_structures['cdr_2_alpha_seq'] = cdr_2_alpha_seq
tcrmodel_2_structures['cdr_3_alpha_seq'] = cdr_3_alpha_seq

tcrmodel_2_structures['cdr_1_beta_seq'] = cdr_1_beta_seq
tcrmodel_2_structures['cdr_2_beta_seq'] = cdr_2_beta_seq
tcrmodel_2_structures['cdr_3_beta_seq'] = cdr_3_beta_seq

tcrmodel_2_structures['peptide_seq'] = peptide_seq

tcrmodel_2_structures['mhc_chain_1_seq'] = mhc_chain_1_seq

tcrmodel_2_structures

Unnamed: 0,path,form,pdb_id,state,model_rank,cdr_1_alpha_seq,cdr_2_alpha_seq,cdr_3_alpha_seq,cdr_1_beta_seq,cdr_2_beta_seq,cdr_3_beta_seq,peptide_seq,mhc_chain_1_seq
0,unbound/6uk2_DE/ranked_4.pdb,unbound,6uk2,apo,4,TLSCTYDTSESDYYLFWYKQP,QMILVIRQEAYKQQNATENRFSVNF,AMYFCAFMDSNYQLIWGAGTK,TLRCKPISGHNSLFWYRQT,GLELLIYFNNNVPIDDSGMPEDRFSAKM,AVYFCASSRTSPTDTQYFGPGTR,,
1,unbound/6uk2_DE/ranked_2.pdb,unbound,6uk2,apo,2,TLSCTYDTSESDYYLFWYKQP,QMILVIRQEAYKQQNATENRFSVNF,AMYFCAFMDSNYQLIWGAGTK,TLRCKPISGHNSLFWYRQT,GLELLIYFNNNVPIDDSGMPEDRFSAKM,AVYFCASSRTSPTDTQYFGPGTR,,
2,unbound/6uk2_DE/ranked_1.pdb,unbound,6uk2,apo,1,TLSCTYDTSESDYYLFWYKQP,QMILVIRQEAYKQQNATENRFSVNF,AMYFCAFMDSNYQLIWGAGTK,TLRCKPISGHNSLFWYRQT,GLELLIYFNNNVPIDDSGMPEDRFSAKM,AVYFCASSRTSPTDTQYFGPGTR,,
3,unbound/6uk2_DE/ranked_0.pdb,unbound,6uk2,apo,0,TLSCTYDTSESDYYLFWYKQP,QMILVIRQEAYKQQNATENRFSVNF,AMYFCAFMDSNYQLIWGAGTK,TLRCKPISGHNSLFWYRQT,GLELLIYFNNNVPIDDSGMPEDRFSAKM,AVYFCASSRTSPTDTQYFGPGTR,,
4,unbound/6uk2_DE/ranked_3.pdb,unbound,6uk2,apo,3,TLSCTYDTSESDYYLFWYKQP,QMILVIRQEAYKQQNATENRFSVNF,AMYFCAFMDSNYQLIWGAGTK,TLRCKPISGHNSLFWYRQT,GLELLIYFNNNVPIDDSGMPEDRFSAKM,AVYFCASSRTSPTDTQYFGPGTR,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2380,mhcI/6zkx_DECA/ranked_4.pdb,mhcI,6zkx,holo,4,VLNCSFTDSAIYNLQWFRQD,GLTSLLLIQSSQREQTSGRLNASL,ATYLCAVTNQAGTALIFGKGTT,TLQCSQDMNHEYMSWYRQD,GLRLIHYSVGAGITDQGEVPNGYNVSR,SVYFCASSYSIRGSRGEQFFGPGTR,RLPAKAPLLGCG,SHSLKYFHTSVSRPGRGEPRFISVGYVDDTQFVRFDNDASPRMVPR...
2381,mhcI/6zkx_DECA/ranked_2.pdb,mhcI,6zkx,holo,2,VLNCSFTDSAIYNLQWFRQD,GLTSLLLIQSSQREQTSGRLNASL,ATYLCAVTNQAGTALIFGKGTT,TLQCSQDMNHEYMSWYRQD,GLRLIHYSVGAGITDQGEVPNGYNVSR,SVYFCASSYSIRGSRGEQFFGPGTR,RLPAKAPLLGCG,SHSLKYFHTSVSRPGRGEPRFISVGYVDDTQFVRFDNDASPRMVPR...
2382,mhcI/6zkx_DECA/ranked_1.pdb,mhcI,6zkx,holo,1,VLNCSFTDSAIYNLQWFRQD,GLTSLLLIQSSQREQTSGRLNASL,ATYLCAVTNQAGTALIFGKGTT,TLQCSQDMNHEYMSWYRQD,GLRLIHYSVGAGITDQGEVPNGYNVSR,SVYFCASSYSIRGSRGEQFFGPGTR,RLPAKAPLLGCG,SHSLKYFHTSVSRPGRGEPRFISVGYVDDTQFVRFDNDASPRMVPR...
2383,mhcI/6zkx_DECA/ranked_0.pdb,mhcI,6zkx,holo,0,VLNCSFTDSAIYNLQWFRQD,GLTSLLLIQSSQREQTSGRLNASL,ATYLCAVTNQAGTALIFGKGTT,TLQCSQDMNHEYMSWYRQD,GLRLIHYSVGAGITDQGEVPNGYNVSR,SVYFCASSYSIRGSRGEQFFGPGTR,RLPAKAPLLGCG,SHSLKYFHTSVSRPGRGEPRFISVGYVDDTQFVRFDNDASPRMVPR...


In [24]:
holo_pdb_ids = tcrmodel_2_structures.query("state == 'holo'")['pdb_id'].unique().tolist()

tcrmodel_2_structures = tcrmodel_2_structures.query("state == 'holo' or (state == 'apo' and pdb_id not in @holo_pdb_ids)")
tcrmodel_2_structures = tcrmodel_2_structures.drop_duplicates(['pdb_id', 'model_rank'])

tcrmodel_2_structures

Unnamed: 0,path,form,pdb_id,state,model_rank,cdr_1_alpha_seq,cdr_2_alpha_seq,cdr_3_alpha_seq,cdr_1_beta_seq,cdr_2_beta_seq,cdr_3_beta_seq,peptide_seq,mhc_chain_1_seq
0,unbound/6uk2_DE/ranked_4.pdb,unbound,6uk2,apo,4,TLSCTYDTSESDYYLFWYKQP,QMILVIRQEAYKQQNATENRFSVNF,AMYFCAFMDSNYQLIWGAGTK,TLRCKPISGHNSLFWYRQT,GLELLIYFNNNVPIDDSGMPEDRFSAKM,AVYFCASSRTSPTDTQYFGPGTR,,
1,unbound/6uk2_DE/ranked_2.pdb,unbound,6uk2,apo,2,TLSCTYDTSESDYYLFWYKQP,QMILVIRQEAYKQQNATENRFSVNF,AMYFCAFMDSNYQLIWGAGTK,TLRCKPISGHNSLFWYRQT,GLELLIYFNNNVPIDDSGMPEDRFSAKM,AVYFCASSRTSPTDTQYFGPGTR,,
2,unbound/6uk2_DE/ranked_1.pdb,unbound,6uk2,apo,1,TLSCTYDTSESDYYLFWYKQP,QMILVIRQEAYKQQNATENRFSVNF,AMYFCAFMDSNYQLIWGAGTK,TLRCKPISGHNSLFWYRQT,GLELLIYFNNNVPIDDSGMPEDRFSAKM,AVYFCASSRTSPTDTQYFGPGTR,,
3,unbound/6uk2_DE/ranked_0.pdb,unbound,6uk2,apo,0,TLSCTYDTSESDYYLFWYKQP,QMILVIRQEAYKQQNATENRFSVNF,AMYFCAFMDSNYQLIWGAGTK,TLRCKPISGHNSLFWYRQT,GLELLIYFNNNVPIDDSGMPEDRFSAKM,AVYFCASSRTSPTDTQYFGPGTR,,
4,unbound/6uk2_DE/ranked_3.pdb,unbound,6uk2,apo,3,TLSCTYDTSESDYYLFWYKQP,QMILVIRQEAYKQQNATENRFSVNF,AMYFCAFMDSNYQLIWGAGTK,TLRCKPISGHNSLFWYRQT,GLELLIYFNNNVPIDDSGMPEDRFSAKM,AVYFCASSRTSPTDTQYFGPGTR,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2380,mhcI/6zkx_DECA/ranked_4.pdb,mhcI,6zkx,holo,4,VLNCSFTDSAIYNLQWFRQD,GLTSLLLIQSSQREQTSGRLNASL,ATYLCAVTNQAGTALIFGKGTT,TLQCSQDMNHEYMSWYRQD,GLRLIHYSVGAGITDQGEVPNGYNVSR,SVYFCASSYSIRGSRGEQFFGPGTR,RLPAKAPLLGCG,SHSLKYFHTSVSRPGRGEPRFISVGYVDDTQFVRFDNDASPRMVPR...
2381,mhcI/6zkx_DECA/ranked_2.pdb,mhcI,6zkx,holo,2,VLNCSFTDSAIYNLQWFRQD,GLTSLLLIQSSQREQTSGRLNASL,ATYLCAVTNQAGTALIFGKGTT,TLQCSQDMNHEYMSWYRQD,GLRLIHYSVGAGITDQGEVPNGYNVSR,SVYFCASSYSIRGSRGEQFFGPGTR,RLPAKAPLLGCG,SHSLKYFHTSVSRPGRGEPRFISVGYVDDTQFVRFDNDASPRMVPR...
2382,mhcI/6zkx_DECA/ranked_1.pdb,mhcI,6zkx,holo,1,VLNCSFTDSAIYNLQWFRQD,GLTSLLLIQSSQREQTSGRLNASL,ATYLCAVTNQAGTALIFGKGTT,TLQCSQDMNHEYMSWYRQD,GLRLIHYSVGAGITDQGEVPNGYNVSR,SVYFCASSYSIRGSRGEQFFGPGTR,RLPAKAPLLGCG,SHSLKYFHTSVSRPGRGEPRFISVGYVDDTQFVRFDNDASPRMVPR...
2383,mhcI/6zkx_DECA/ranked_0.pdb,mhcI,6zkx,holo,0,VLNCSFTDSAIYNLQWFRQD,GLTSLLLIQSSQREQTSGRLNASL,ATYLCAVTNQAGTALIFGKGTT,TLQCSQDMNHEYMSWYRQD,GLRLIHYSVGAGITDQGEVPNGYNVSR,SVYFCASSYSIRGSRGEQFFGPGTR,RLPAKAPLLGCG,SHSLKYFHTSVSRPGRGEPRFISVGYVDDTQFVRFDNDASPRMVPR...


In [42]:
apo_holo_dfs = []

for sequences, tcr_group in tcrmodel_2_structures.groupby(['cdr_1_alpha_seq', 'cdr_2_alpha_seq', 'cdr_3_alpha_seq', 'cdr_1_beta_seq', 'cdr_2_beta_seq', 'cdr_3_beta_seq']):
    # Screen out groups that don't have apo and holo forms
    states = tcr_group['state'].unique().tolist()
    if 'apo' in states and 'holo' in states:
        tcr_group = tcr_group.copy()
        tcr_group['cdr_sequence_collated'] = '-'.join(sequences)
    
        apo_holo_dfs.append(tcr_group)

apo_holo_tcrs = pd.concat(apo_holo_dfs).reset_index(drop=True)
apo_holo_tcrs

Unnamed: 0,path,form,pdb_id,state,model_rank,cdr_1_alpha_seq,cdr_2_alpha_seq,cdr_3_alpha_seq,cdr_1_beta_seq,cdr_2_beta_seq,cdr_3_beta_seq,peptide_seq,mhc_chain_1_seq,cdr_sequence_collated
0,unbound/6vth_DE/ranked_4.pdb,unbound,6vth,apo,4,AFNCTYSNSASQSFFWYRQD,EPKLLMSVYSSGNEDGRFTAQL,ATYLCVVQPGGYQKVTFGTGTK,TLQCAQDMNHNSMYWYRQD,GLRLIYYSASEGTTDKGEVPNGYNVSR,SVYFCASSEGLWQVGDEQYFGPGTR,,,AFNCTYSNSASQSFFWYRQD-EPKLLMSVYSSGNEDGRFTAQL-AT...
1,unbound/6vth_DE/ranked_2.pdb,unbound,6vth,apo,2,AFNCTYSNSASQSFFWYRQD,EPKLLMSVYSSGNEDGRFTAQL,ATYLCVVQPGGYQKVTFGTGTK,TLQCAQDMNHNSMYWYRQD,GLRLIYYSASEGTTDKGEVPNGYNVSR,SVYFCASSEGLWQVGDEQYFGPGTR,,,AFNCTYSNSASQSFFWYRQD-EPKLLMSVYSSGNEDGRFTAQL-AT...
2,unbound/6vth_DE/ranked_1.pdb,unbound,6vth,apo,1,AFNCTYSNSASQSFFWYRQD,EPKLLMSVYSSGNEDGRFTAQL,ATYLCVVQPGGYQKVTFGTGTK,TLQCAQDMNHNSMYWYRQD,GLRLIYYSASEGTTDKGEVPNGYNVSR,SVYFCASSEGLWQVGDEQYFGPGTR,,,AFNCTYSNSASQSFFWYRQD-EPKLLMSVYSSGNEDGRFTAQL-AT...
3,unbound/6vth_DE/ranked_0.pdb,unbound,6vth,apo,0,AFNCTYSNSASQSFFWYRQD,EPKLLMSVYSSGNEDGRFTAQL,ATYLCVVQPGGYQKVTFGTGTK,TLQCAQDMNHNSMYWYRQD,GLRLIYYSASEGTTDKGEVPNGYNVSR,SVYFCASSEGLWQVGDEQYFGPGTR,,,AFNCTYSNSASQSFFWYRQD-EPKLLMSVYSSGNEDGRFTAQL-AT...
4,unbound/6vth_DE/ranked_3.pdb,unbound,6vth,apo,3,AFNCTYSNSASQSFFWYRQD,EPKLLMSVYSSGNEDGRFTAQL,ATYLCVVQPGGYQKVTFGTGTK,TLQCAQDMNHNSMYWYRQD,GLRLIYYSASEGTTDKGEVPNGYNVSR,SVYFCASSEGLWQVGDEQYFGPGTR,,,AFNCTYSNSASQSFFWYRQD-EPKLLMSVYSSGNEDGRFTAQL-AT...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
405,mhcI/2bnq_DECA/ranked_4.pdb,mhcI,2bnq,holo,4,VLNCSFTDSAIYNLQWFRQD,GLTSLLLIQSSQREQTSGRLNASL,ATYLCAVRPTSGGSYIPTFGRGTS,TLQCAQDMNHEYMSWYRQD,GLRLIHYSVGAGITDQGEVPNGYNVSR,SVYFCASSYVGNTGELFFGEGSR,SLLMWITQV,GSHSMRYFFTSVSRPGRGEPRFIAVGYVDDTQFVRFDSDAASQRME...,VLNCSFTDSAIYNLQWFRQD-GLTSLLLIQSSQREQTSGRLNASL-...
406,mhcI/2bnq_DECA/ranked_2.pdb,mhcI,2bnq,holo,2,VLNCSFTDSAIYNLQWFRQD,GLTSLLLIQSSQREQTSGRLNASL,ATYLCAVRPTSGGSYIPTFGRGTS,TLQCAQDMNHEYMSWYRQD,GLRLIHYSVGAGITDQGEVPNGYNVSR,SVYFCASSYVGNTGELFFGEGSR,SLLMWITQV,GSHSMRYFFTSVSRPGRGEPRFIAVGYVDDTQFVRFDSDAASQRME...,VLNCSFTDSAIYNLQWFRQD-GLTSLLLIQSSQREQTSGRLNASL-...
407,mhcI/2bnq_DECA/ranked_1.pdb,mhcI,2bnq,holo,1,VLNCSFTDSAIYNLQWFRQD,GLTSLLLIQSSQREQTSGRLNASL,ATYLCAVRPTSGGSYIPTFGRGTS,TLQCAQDMNHEYMSWYRQD,GLRLIHYSVGAGITDQGEVPNGYNVSR,SVYFCASSYVGNTGELFFGEGSR,SLLMWITQV,GSHSMRYFFTSVSRPGRGEPRFIAVGYVDDTQFVRFDSDAASQRME...,VLNCSFTDSAIYNLQWFRQD-GLTSLLLIQSSQREQTSGRLNASL-...
408,mhcI/2bnq_DECA/ranked_0.pdb,mhcI,2bnq,holo,0,VLNCSFTDSAIYNLQWFRQD,GLTSLLLIQSSQREQTSGRLNASL,ATYLCAVRPTSGGSYIPTFGRGTS,TLQCAQDMNHEYMSWYRQD,GLRLIHYSVGAGITDQGEVPNGYNVSR,SVYFCASSYVGNTGELFFGEGSR,SLLMWITQV,GSHSMRYFFTSVSRPGRGEPRFIAVGYVDDTQFVRFDSDAASQRME...,VLNCSFTDSAIYNLQWFRQD-GLTSLLLIQSSQREQTSGRLNASL-...


In [43]:
apo_holo_tcrs['chains'] = apo_holo_tcrs['path'].map(lambda path: path.split('/')[1].split('_')[-1])

In [53]:
base_output_path = '/project/koohylab/bmcmaste/projects/tcr-loop-comparison/data/apo-holo-mhc-class-I-synthetic_refined'

for group_name, group_data in apo_holo_tcrs.drop_duplicates(['pdb_id', 'chains', 'form']).groupby('cdr_sequence_collated'):
    output_path = os.path.join(base_output_path, group_name)
    
    if not os.path.exists(output_path):
        os.mkdir(output_path)
    
    summary[group_name] = []
    
    for _, entry in group_data.iterrows():
        dir_name = os.path.join(TCRMODEL2_STRUCTURES_PATH, entry.path.rsplit('/', 1)[0])
        output_name = f'{entry.pdb_id}_{entry.chains}_{entry.state}'
        
        copy_tree(dir_name, os.path.join(output_path, output_name))