In [27]:
import os
import glob

import pandas as pd
from python_pdb.parsers import parse_pdb_to_pandas
from python_pdb.formats.residue import THREE_TO_ONE_CODE

In [3]:
DATA_DIR = '/project/koohylab/bmcmaste/projects/tcr-pmhc-interface-analysis/data/apo-holo-mhc-class-I_refined_aligned'

In [12]:
df = pd.DataFrame({
    'path': glob.glob(os.path.join(DATA_DIR, '**/*.pdb'), recursive=True)
})

df

Unnamed: 0,path
0,/project/koohylab/bmcmaste/projects/tcr-pmhc-i...
1,/project/koohylab/bmcmaste/projects/tcr-pmhc-i...
2,/project/koohylab/bmcmaste/projects/tcr-pmhc-i...
3,/project/koohylab/bmcmaste/projects/tcr-pmhc-i...
4,/project/koohylab/bmcmaste/projects/tcr-pmhc-i...
...,...
70,/project/koohylab/bmcmaste/projects/tcr-pmhc-i...
71,/project/koohylab/bmcmaste/projects/tcr-pmhc-i...
72,/project/koohylab/bmcmaste/projects/tcr-pmhc-i...
73,/project/koohylab/bmcmaste/projects/tcr-pmhc-i...


In [23]:
df['group'] = df['path'].map(lambda path: path.split('/')[-2])
df[['pdb_id', 'chains', 'state']] = df['path'].map(lambda path: path.rsplit('/', 1)[-1].replace('.pdb', '').split('_')).apply(pd.Series)

df['chains'] = df['chains'].apply(list)
df[['alpha_chain_id', 'beta_chain_id', 'antigen_chain_id', 'mhc_chain_id']] = df['chains'].apply(pd.Series)

df

Unnamed: 0,path,group,pdb_id,chains,state,alpha_chain_id,beta_chain_id,antigen_chain_id,mhc_chain_id
0,/project/koohylab/bmcmaste/projects/tcr-pmhc-i...,YSGSPE-HISR-ALSGFNNAGNMLT-SGHAT-FQNNGV-ASSLGGA...,7n1c,"[D, E]",apo,D,E,,
1,/project/koohylab/bmcmaste/projects/tcr-pmhc-i...,YSGSPE-HISR-ALSGFNNAGNMLT-SGHAT-FQNNGV-ASSLGGA...,7n1e,"[D, E, C, A]",holo,D,E,C,A
2,/project/koohylab/bmcmaste/projects/tcr-pmhc-i...,NIATNDY-GYKTK-LVGEILDNFNKFY-MDHEN-SYDVKM-ASSQR...,6at6,"[A, B]",apo,A,B,,
3,/project/koohylab/bmcmaste/projects/tcr-pmhc-i...,NIATNDY-GYKTK-LVGEILDNFNKFY-MDHEN-SYDVKM-ASSQR...,6avf,"[A, B, P, H]",holo,A,B,P,H
4,/project/koohylab/bmcmaste/projects/tcr-pmhc-i...,DRGSQS-IYSNGD-GTYNQGGKLI-MNHEY-SMNVEV-ASSGASHEQY,3vxt,"[A, B]",apo,A,B,,
...,...,...,...,...,...,...,...,...,...
70,/project/koohylab/bmcmaste/projects/tcr-pmhc-i...,DRGSQS-IYSNGD-AVNFGGGKLI-MRHNA-SNTAGT-ASSLSFGTEAF,3qdg,"[D, E, C, A]",holo,D,E,C,A
71,/project/koohylab/bmcmaste/projects/tcr-pmhc-i...,NSAFDY-ILSVSNK-AASASFGDNSKLI-MSHET-SYDVDS-ASSL...,5ivx,"[E, F, P, A]",holo,E,F,P,A
72,/project/koohylab/bmcmaste/projects/tcr-pmhc-i...,NSAFDY-ILSVSNK-AASASFGDNSKLI-MSHET-SYDVDS-ASSL...,5iw1,"[A, B]",apo,A,B,,
73,/project/koohylab/bmcmaste/projects/tcr-pmhc-i...,DSAIYN-IQSSQRE-AQLNQAGTALI-MNHEY-SVGAGI-ASSYGT...,7r7z,"[A, B]",apo,A,B,,


In [28]:
def get_sequence(df):
    residue_names = df.drop_duplicates(['chain_id', 'residue_seq_id', 'residue_insert_code'])['residue_name']
    return ''.join(residue_names.map(THREE_TO_ONE_CODE).tolist())

In [93]:
pdb_ids = []
chain_ids = []
chain_types = []
sequences = []

for _, row in df.iterrows():
    
    with open(row.path, 'r') as fh:
        structure_df = parse_pdb_to_pandas(fh.read())
    
    for chain, chain_type in zip(row.chains, ['alpha', 'beta', 'antigen', 'mhc']):
        chain_df = structure_df.query('chain_id == @chain')
        
        sequence = get_sequence(chain_df)
        
        pdb_ids.append(row.pdb_id)
        chain_ids.append(chain)
        chain_types.append(chain_type)
        sequences.append(sequence)

In [94]:
sequences_df = pd.DataFrame({
    'pdb_id': pdb_ids,
    'chain_id': chain_ids,
    'chain_type': chain_types,
    'sequence': sequences,
})

sequences_df

Unnamed: 0,pdb_id,chain_id,chain_type,sequence
0,7n1c,D,alpha,QRVTQPEKLLSVFKGAPVELKCNYSYSGSPELFWYVQYSRQRLQLL...
1,7n1c,E,beta,GVAQSPRYKIIEKRQSVAFWCNPISGHATLYWYQQILGQGPKLLIQ...
2,7n1e,D,alpha,QRVTQPEKLLSVFKGAPVELKCNYSYSGSPELFWYVQYSRQRLQLL...
3,7n1e,E,beta,GVAQSPRYKIIEKRQSVAFWCNPISGHATLYWYQQILGQGPKLLIQ...
4,7n1e,C,antigen,RLQSLQTYV
...,...,...,...,...
259,7r7z,B,beta,AGVTQTPKFQVLKTGQSMTLQCAQDMNHEYMSWYRQDPGMGLRLIH...
260,7r80,A,alpha,KQEVTQIPAALSVPEGENLVLNCSFTDSAIYNLQWFRQDPGKGLTS...
261,7r80,B,beta,NAGVTQTPKFQVLKTGQSMTLQCAQDMNHEYMSWYRQDPGMGLRLI...
262,7r80,E,antigen,QASQEVKNW


In [95]:
sequences_df = sequences_df.pivot(index='pdb_id', columns='chain_type', values=['sequence', 'chain_id'])

In [96]:
sequences_df.columns = ['_'.join(reversed(col)) for col in sequences_df.columns]

In [97]:
sequences_df = sequences_df.reset_index()

In [98]:
sequences_df

Unnamed: 0,pdb_id,alpha_sequence,antigen_sequence,beta_sequence,mhc_sequence,alpha_chain_id,antigen_chain_id,beta_chain_id,mhc_chain_id
0,1ao7,KEVEQNSGPLSVPEGAIASLNCTYSDRGSQSFFWYRQYSGKSPELI...,LLFGYPVYV,GVTQTPKFQVLKTGQSMTLQCAQDMNHEYMSWYRQDPGMGLRLIHY...,GSHSMRYFFTSVSRPGRGEPRFIAVGYVDDTQFVRFDSDAASQRME...,D,C,E,A
1,1g6r,QSVTQPDARVTVSEGASLQLRCKYSYSATPYLFWYVQYPRQGLQLL...,SIYRYYGL,EAAVTQSPRNKVAVTGGKVTLSCNQTNNHNNMYWYRQDTGHGLRLI...,GPHSLRYFVTAVSRPGLGEPRYMEVGYVDDTEFVRFDSDAENPRYE...,A,P,B,H
2,1kgc,KTTQPNSMESNEEEPVHLPCNHSTISGTDYIHWYRQLPSQGPEYVI...,,GVSQSPRYKVAKRGQDVALRCDPISGHVSLFWYQQALGQGPEFLTY...,,D,,E,
3,1mi5,KTTQPNSMESNEEEPVHLPCNHSTISGTDYIHWYRQLPSQGPEYVI...,FLRGRAYGL,GVSQSPRYKVAKRGQDVALRCDPISGHVSLFWYQQALGQGPEFLTY...,GSHSMRYFDTAMSRPGRGEPRFISVGYVDDTQFVRFDSDAASPREE...,D,C,E,A
4,1mwa,QSVTQPDARVTVSEGASLQLRCKYSYSATPYLFWYVQYPRQGLQLL...,EQYKFYSV,EAAVTQSPRNKVAVTGGKVTLSCNQTNNHNNMYWYRQDTGHGLRLI...,GPHSLRYFVTAVSRPGLGEPRYMEVGYVDDTEFVRFDSDAENPRYE...,A,P,B,H
...,...,...,...,...,...,...,...,...,...
70,7n1e,QRVTQPEKLLSVFKGAPVELKCNYSYSGSPELFWYVQYSRQRLQLL...,RLQSLQTYV,GVAQSPRYKIIEKRQSVAFWCNPISGHATLYWYQQILGQGPKLLIQ...,GSHSMRYFFTSVSRPGRGEPRFIAVGYVDDTQFVRFDSDAASQRME...,D,C,E,A
71,7n1f,KEVEQNSGPLSVPEGAIASLNCTYSDRGSQSFFWYRQYSGKSPELI...,YLQPRTFLL,TGVSQNPRHKITKRGQNVTFRCDPISEHNRLYWYRQTLGQGPEFLT...,GSHSMRYFFTSVSRPGRGEPRFIAVGYVDDTQFVRFDSDAASQRME...,D,C,E,A
72,7r7z,KQEVTQIPAALSVPEGENLVLNCSFTDSAIYNLQWFRQDPGKGLTS...,,AGVTQTPKFQVLKTGQSMTLQCAQDMNHEYMSWYRQDPGMGLRLIH...,,A,,B,
73,7r80,KQEVTQIPAALSVPEGENLVLNCSFTDSAIYNLQWFRQDPGKGLTS...,QASQEVKNW,NAGVTQTPKFQVLKTGQSMTLQCAQDMNHEYMSWYRQDPGMGLRLI...,GSHSMRYFYTAMSRPGRGEPRFIAVGYVDDTQFVRFDSDAASPRTE...,A,E,B,C


In [102]:
combinded_df = df.merge(sequences_df, how='left')
combinded_df

Unnamed: 0,path,group,pdb_id,chains,state,alpha_chain_id,beta_chain_id,antigen_chain_id,mhc_chain_id,alpha_sequence,antigen_sequence,beta_sequence,mhc_sequence
0,/project/koohylab/bmcmaste/projects/tcr-pmhc-i...,YSGSPE-HISR-ALSGFNNAGNMLT-SGHAT-FQNNGV-ASSLGGA...,7n1c,"[D, E]",apo,D,E,,,QRVTQPEKLLSVFKGAPVELKCNYSYSGSPELFWYVQYSRQRLQLL...,,GVAQSPRYKIIEKRQSVAFWCNPISGHATLYWYQQILGQGPKLLIQ...,
1,/project/koohylab/bmcmaste/projects/tcr-pmhc-i...,YSGSPE-HISR-ALSGFNNAGNMLT-SGHAT-FQNNGV-ASSLGGA...,7n1e,"[D, E, C, A]",holo,D,E,C,A,QRVTQPEKLLSVFKGAPVELKCNYSYSGSPELFWYVQYSRQRLQLL...,RLQSLQTYV,GVAQSPRYKIIEKRQSVAFWCNPISGHATLYWYQQILGQGPKLLIQ...,GSHSMRYFFTSVSRPGRGEPRFIAVGYVDDTQFVRFDSDAASQRME...
2,/project/koohylab/bmcmaste/projects/tcr-pmhc-i...,NIATNDY-GYKTK-LVGEILDNFNKFY-MDHEN-SYDVKM-ASSQR...,6at6,"[A, B]",apo,A,B,,,LAKTTQPISMDSYEGQEVNITCSHNNIATNDYITWYQQFPSQGPRF...,,KVTQSSRYLVKRTGEKVFLECVQDMDHENMFWYRQDPGLGLRLIYF...,
3,/project/koohylab/bmcmaste/projects/tcr-pmhc-i...,NIATNDY-GYKTK-LVGEILDNFNKFY-MDHEN-SYDVKM-ASSQR...,6avf,"[A, B, P, H]",holo,A,B,P,H,LAKTTQPISMDSYEGQEVNITCSHNNIATNDYITWYQQFPSQGPRF...,APRGPHGGAASGL,VKVTQSSRYLVKRTGEKVFLECVQDMDHENMFWYRQDPGLGLRLIY...,GSHSMRYFYTSVSRPGRGEPRFISVGYVDDTQFVRFDSDAASPREE...
4,/project/koohylab/bmcmaste/projects/tcr-pmhc-i...,DRGSQS-IYSNGD-GTYNQGGKLI-MNHEY-SMNVEV-ASSGASHEQY,3vxt,"[A, B]",apo,A,B,,,KEVEQNSGPLSVPEGAIASLNCTYSDRGSQSFFWYRQYSGKSPELI...,,EAQVTQNPRYLITVTGKKLTVTCSQNMNHEYMSWYRQDPGLGLRQI...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
70,/project/koohylab/bmcmaste/projects/tcr-pmhc-i...,DRGSQS-IYSNGD-AVNFGGGKLI-MRHNA-SNTAGT-ASSLSFGTEAF,3qdg,"[D, E, C, A]",holo,D,E,C,A,KEVEQNSGPLSVPEGAIASLNCTYSDRGSQSFFWYRQYSGKSPELI...,ELAGIGILTV,IAGITQAPTSQILAAGRRMTLRCTQDMRHNAMYWYRQDLGLGLRLI...,GSHSMRYFFTSVSRPGRGEPRFIAVGYVDDTQFVRFDSDAASQRME...
71,/project/koohylab/bmcmaste/projects/tcr-pmhc-i...,NSAFDY-ILSVSNK-AASASFGDNSKLI-MSHET-SYDVDS-ASSL...,5ivx,"[E, F, P, A]",holo,E,F,P,A,QQVRQSPQSLTVWEGETAILNCSYENSAFDYFPWYQQFPGEGPALL...,RGPGRAFVTI,MKVTQMPRYLIKRMGENVLLECGQDMSHETMYWYRQDPGLGLQLIY...,MSHSLRYFVTAVSRPGFGEPRYMEVGYVDNTEFVRFDSDAENPRYE...
72,/project/koohylab/bmcmaste/projects/tcr-pmhc-i...,NSAFDY-ILSVSNK-AASASFGDNSKLI-MSHET-SYDVDS-ASSL...,5iw1,"[A, B]",apo,A,B,,,QQVRQSPQSLTVWEGETAILNCSYENSAFDYFPWYQQFPGEGPALL...,,VTQMPRYLIKRMGENVLLECGQDMSHETMYWYRQDPGLGLQLIYIS...,
73,/project/koohylab/bmcmaste/projects/tcr-pmhc-i...,DSAIYN-IQSSQRE-AQLNQAGTALI-MNHEY-SVGAGI-ASSYGT...,7r7z,"[A, B]",apo,A,B,,,KQEVTQIPAALSVPEGENLVLNCSFTDSAIYNLQWFRQDPGKGLTS...,,AGVTQTPKFQVLKTGQSMTLQCAQDMNHEYMSWYRQDPGMGLRLIH...,


In [104]:
combinded_df['chains_combo'] = combinded_df['chains'].map(lambda chains: ''.join(chains))

In [107]:
combinded_df[['group',
              'pdb_id',
              'chains_combo',
              'state',
              'alpha_sequence',
              'beta_sequence',
              'antigen_sequence',
              'mhc_sequence']].to_csv(os.path.join(DATA_DIR, 'sequences.csv'), index=False)