A notebook to collect information on nonstandard monomers (e.g. phosphorylated residues) from our structures.

In [17]:
import os
import pandas as pd
from Bio.PDB.MMCIFParser import MMCIFParser
from Bio.PDB.MMCIF2Dict import MMCIF2Dict
from biopandas.mmcif import PandasMmcif

In [18]:
def get_structure_dict(fn, path):
    '''
    Takes a .cif file name and the directory that contains it and returns
    the MMCIF2Dict built from that file
    '''

    # Join the path and the file name
    full_path = os.path.join(path, fn)

    # Make an MMCIF2Dict object to grab the information from the .cif file
    return MMCIF2Dict(full_path)

def nonstandard_monomers(mmcif_dict):
    '''
    Takes a mmcif_dict and returns two lists of chemical component names:
    first the names whose '_chem_comp.mon_nstd_flag' is 'n' (nonstandard monomers),
    then the names whose flag is '.' (non-polymers)
    '''

    # Pair every component name with its flag, one row per component
    comps = pd.DataFrame({
        'flags': mmcif_dict['_chem_comp.mon_nstd_flag'],
        'nme': mmcif_dict['_chem_comp.name'],
    })

    def names_flagged(flag):
        # Names of the components carrying exactly this flag, in file order
        return comps.loc[comps['flags'] == flag, 'nme'].tolist()

    return names_flagged('n'), names_flagged('.')

def extra_polypeptides(mmcif_dict):
    '''
    Takes a mmcif_dict and returns a tuple (has_partner, num_partners): whether there is
    more than one 'polypeptide(L)' entry in '_entity_poly.type', and how many such entries
    there are beyond the first
    '''

    # One row per polymer entity: its id and its type
    entities = pd.DataFrame({
        'eid': mmcif_dict['_entity_poly.entity_id'],
        'typ': mmcif_dict['_entity_poly.type'],
    })

    # Count the polypeptide entities; every one beyond the first is a partner
    is_polypeptide = entities['typ'] == 'polypeptide(L)'
    num_partners = int(is_polypeptide.sum()) - 1

    # Zero or one polypeptide means there is no partner
    if num_partners < 1:
        return False, 0

    return True, num_partners

def add_nstd_mons(path, row):
    '''
    Takes a row of a dataframe (and a file path), reads the structure file
    <path>/<uniprot>/<pdb>.cif named by the row's 'uniprot' and 'pdb' values, and returns
    the row with these entries added: 'nstds', 'non-polymers', 'has_partner',
    'num_partners', 'has_nstds' and 'has_nplymrs'
    '''

    uniprot, pdb = row['uniprot'], row['pdb']

    # Get the mmcif_dict for this structure
    mmcif_dict = get_structure_dict(f'{pdb}.cif', os.path.join(path, uniprot))

    # Get the nonstandard monomers and the non-polymers
    nstds, nplymrs = nonstandard_monomers(mmcif_dict)

    # Get the extra polypeptides
    has_partner, num_partners = extra_polypeptides(mmcif_dict)

    # New entries, in the order they are added to the row
    additions = {
        'nstds': nstds,
        'non-polymers': nplymrs,
        'has_partner': has_partner,
        'num_partners': num_partners,
        'has_nstds': len(nstds) > 0,
        'has_nplymrs': len(nplymrs) > 0,
    }

    for column, value in additions.items():
        row[column] = value

    return row

def get_nonstandard_monomers(path, df):
    '''
    Takes a dataframe of structures (and a directory) and returns the result of applying
    add_nstd_mons to every row, i.e. one row per structure with the monomer entries added
    '''

    def annotate(row):
        # Bind the directory so apply only has to supply the row
        return add_nstd_mons(path, row)

    # NOTE: the lists in 'nstds' are returned as they are; exploding them into one row
    # per nonstandard monomer (df.explode('nstds').reset_index(drop=True)) is disabled
    return df.apply(annotate, axis=1)



In [19]:
# Directory holding the .cif files (add_nstd_mons looks in one subdirectory per uniprot id)
path = './project_pipeline/data/input/RCSB_cif/'

# Load the classified structures and add the monomer information to each row
df = get_nonstandard_monomers(
    path,
    pd.read_csv('./project_pipeline/data/classified_files.tsv', sep='\t'),
)

# Same for the structures listed in the md_rmsds table
df2 = get_nonstandard_monomers(
    path,
    pd.read_csv('./project_pipeline/data/md_rmsds.tsv', sep='\t'),
)

In [20]:
# Write both annotated tables as tab-separated files, without the index column
df.to_csv(
    './project_pipeline/data/ai_nonstandard_monomers.tsv',
    sep='\t',
    index=False,
)

df2.to_csv(
    './project_pipeline/data/md_nonstandard_monomers.tsv',
    sep='\t',
    index=False,
)