A notebook for computing various protein features: hydrophobicity, electrostatic charge, and secondary structure.

In [7]:
import os
import pandas as pd
import numpy as np
from Bio.PDB.MMCIFParser import MMCIFParser
from Bio.PDB.MMCIF2Dict import MMCIF2Dict
from biopandas.mmcif import PandasMmcif
from pymol import cmd, stored

In [8]:
def string2range(x):

    """
    Expand a region string into the list of residue positions it covers.

    Parameters:

        x (string): String containing a region or several regions of interest
            in a protein. Bounds are inclusive.
            Format of x: single region -> 'start-end'
                         multiple regions -> 'start1-end1,start2-end2'

    Returns:

        list of int: Every position covered by the region(s), with both the
            start and the end position of each region included. Positions
            covered by more than one region appear only once, and the list
            is in ascending order.

            Example: '1-3'     -> [1, 2, 3]
                     '5-6,1-3' -> [1, 2, 3, 5, 6]

    Raises:

        ValueError: If a region is not of the form 'start-end' with integer
            bounds.
    """
    positions = set()

    # A string without a comma is simply the one-region case of the same loop
    for region in x.split(','):
        start, end = (int(bound) for bound in region.split('-'))

        # 1 is added to the end so that the last position is included
        positions.update(range(start, end + 1))

    # Sets have no guaranteed order, so sort before handing the list back
    return sorted(positions)

def get_structure_dict(fn, path):
    '''
    Read a .cif file into an MMCIF2Dict object.

    Parameters:

        fn (string): Name of the .cif file.
        path (string): Directory that contains the file.

    Returns:

        MMCIF2Dict: Dictionary-like object built from the file at path/fn.
    '''
    return MMCIF2Dict(os.path.join(path, fn))

def num_hydrophobic(df, range_list, entity_id, column_names):
    '''
    Select the hydrophobic residues of one entity that fall inside a set of positions.

    Parameters:

        df (DataFrame): One row per residue of the sequence.
        range_list (list-like of int): Positions to keep.
        entity_id: Value the entity column must equal.
        column_names (list of string): column_names[0] is the column containing
            the amino acid (three-letter code), column_names[1] the column
            containing the position of the amino acid and column_names[2] the
            column containing the entity id.

    Returns:

        DataFrame: The rows of df that satisfy all three conditions. Callers
            take len() of the result to get the number of residues.
    '''
    residue_col = column_names[0]
    position_col = column_names[1]
    entity_col = column_names[2]

    # Three-letter codes treated as hydrophobic
    hydrophobic = ['GLY', 'ALA', 'VAL', 'LEU', 'ILE', 'PRO', 'PHE', 'MET', 'TRP']

    is_hydrophobic = df[residue_col].isin(hydrophobic)
    in_positions = df[position_col].isin(range_list)
    in_entity = df[entity_col] == entity_id

    return df[is_hydrophobic & in_positions & in_entity]

def num_charged(df, range_list, entity_id, column_names):
    '''
    Select the charged residues of one entity that fall inside a set of positions.

    Parameters:

        df (DataFrame): One row per residue of the sequence.
        range_list (list-like of int): Positions to keep.
        entity_id: Value the entity column must equal.
        column_names (list of string): column_names[0] is the column containing
            the amino acid (three-letter code), column_names[1] the column
            containing the position of the amino acid and column_names[2] the
            column containing the entity id.

    Returns:

        DataFrame: The rows of df that satisfy all three conditions. Callers
            take len() of the result to get the number of residues.
    '''
    residue_col = column_names[0]
    position_col = column_names[1]
    entity_col = column_names[2]

    # Three-letter codes treated as charged
    charged = ['ARG', 'LYS', 'HIS', 'ASP', 'GLU']

    is_charged = df[residue_col].isin(charged)
    in_positions = df[position_col].isin(range_list)
    in_entity = df[entity_col] == entity_id

    return df[is_charged & in_positions & in_entity]

def add_num_hydrophobic(row, path):
    '''
    Add hydrophobic-residue counts and fractions to a row.

    The structure file row['af_filename'] is read from `path`, and the
    hydrophobic residues of chain 'A' (as selected by num_hydrophobic) are
    counted in the inhibitory module (row['region_1']), in the functional
    domain (row['region_2']), in both regions together, in the whole sequence
    of the chain's entity and, when row['number_interface_residues'] is not 0,
    among the residues listed in row['interface_residues'].

    Parameters:

        row (Series): Must provide 'af_filename', 'region_1', 'region_2',
            'number_interface_residues' and, when that number is not 0,
            'interface_residues' (a string such as '{1, 2, 3}').
        path (string): Directory that contains the .cif file.

    Returns:

        Series: The same row with these fields added: num_interface_hydro,
            percent_interface_hydro, num_im_hydro, percent_im_hydro,
            num_fd_hydro, percent_fd_hydro, num_domains_hydro,
            percent_domains_hydro, num_protein_hydro, percent_protein_hydro
            and protein_length. The percent_* values are fractions between
            0 and 1 rounded to 3 decimals.
    '''

    chain = 'A'
    im_range = string2range(row['region_1'])
    fd_range = string2range(row['region_2'])
    num_interface_residues = int(row['number_interface_residues'])

    # mmCIF items read from the structure dictionary
    mon_id = '_entity_poly_seq.mon_id'
    num = '_entity_poly_seq.num'
    entity_id_col = '_entity_poly_seq.entity_id'
    label_entity_id = '_atom_site.label_entity_id'
    chains = '_atom_site.label_asym_id'
    columns = [mon_id, num, entity_id_col]

    # Get the structure dict
    struct_dict = get_structure_dict(row['af_filename'], path)

    # The entity id of the chain is taken from the first atom record of that chain
    entity_index = struct_dict[chains].index(chain)
    entity_id = struct_dict[label_entity_id][entity_index]

    # Make a dataframe with one row per sequence position of every entity
    df = pd.DataFrame({mon_id: struct_dict[mon_id],
                       num: struct_dict[num],
                       entity_id_col: struct_dict[entity_id_col]})
    df[num] = df[num].astype(int)

    def count_hydrophobic(positions):
        # Number of hydrophobic residues of this entity at the given positions
        return len(num_hydrophobic(df, positions, entity_id, columns))

    # Hydrophobic residues in the inhibitory module and the functional domain
    im_hydrophobic_residues = count_hydrophobic(im_range)
    im_perc_hydrophobic = im_hydrophobic_residues / len(im_range)
    fd_hydrophobic_residues = count_hydrophobic(fd_range)
    fd_perc_hydrophobic = fd_hydrophobic_residues / len(fd_range)

    # Hydrophobic residues in both regions together
    num_domains_hydro = im_hydrophobic_residues + fd_hydrophobic_residues
    perc_domains_hydro = num_domains_hydro / (len(im_range) + len(fd_range))

    # Hydrophobic residues in the whole protein. The count only includes rows
    # of this entity, so the length used as denominator must be restricted to
    # the same entity; len(df) would also count the residues of other entities.
    protein_length = int((df[entity_id_col] == entity_id).sum())
    # 1..len(df) is a superset of the positions of any single entity
    num_protein_hydro = count_hydrophobic(list(range(1, len(df) + 1)))
    percent_protein_hydro = num_protein_hydro / protein_length if protein_length else 0

    # Hydrophobic residues among the interacting interface residues
    if num_interface_residues != 0:
        # Splitting on ',' alone works with or without a space after the comma,
        # because int() ignores surrounding whitespace
        interface_list = [int(x) for x in row['interface_residues'].strip('}{').split(',')]

        num_interface_hydro = count_hydrophobic(interface_list)
        row['num_interface_hydro'] = num_interface_hydro
        row['percent_interface_hydro'] = round(num_interface_hydro / len(interface_list), 3)
    else:
        row['num_interface_hydro'] = 0
        row['percent_interface_hydro'] = 0

    row['num_im_hydro'] = im_hydrophobic_residues
    row['percent_im_hydro'] = round(im_perc_hydrophobic, 3)
    row['num_fd_hydro'] = fd_hydrophobic_residues
    row['percent_fd_hydro'] = round(fd_perc_hydrophobic, 3)
    row['num_domains_hydro'] = num_domains_hydro
    row['percent_domains_hydro'] = round(perc_domains_hydro, 3)
    row['num_protein_hydro'] = num_protein_hydro
    row['percent_protein_hydro'] = round(percent_protein_hydro, 3)
    row['protein_length'] = protein_length

    return row

def add_num_charged(row, path):
    '''
    Add charged-residue counts and fractions to a row.

    The structure file row['af_filename'] is read from `path`, and the
    charged residues of chain 'A' (as selected by num_charged) are counted in
    the inhibitory module (row['region_1']), in the functional domain
    (row['region_2']), in both regions together, in the whole sequence of the
    chain's entity and, when row['number_interface_residues'] is not 0,
    among the residues listed in row['interface_residues'].

    Parameters:

        row (Series): Must provide 'af_filename', 'region_1', 'region_2',
            'number_interface_residues' and, when that number is not 0,
            'interface_residues' (a string such as '{1, 2, 3}').
        path (string): Directory that contains the .cif file.

    Returns:

        Series: The same row with these fields added: num_interface_charged,
            percent_interface_charged, num_im_charged, percent_im_charged,
            num_fd_charged, percent_fd_charged, num_domains_charged,
            percent_domains_charged, num_protein_charged and
            percent_protein_charged. The percent_* values are fractions
            between 0 and 1 rounded to 3 decimals.
    '''

    chain = 'A'
    im_range = string2range(row['region_1'])
    fd_range = string2range(row['region_2'])
    num_interface_residues = int(row['number_interface_residues'])

    # mmCIF items read from the structure dictionary
    mon_id = '_entity_poly_seq.mon_id'
    num = '_entity_poly_seq.num'
    entity_id_col = '_entity_poly_seq.entity_id'
    label_entity_id = '_atom_site.label_entity_id'
    chains = '_atom_site.label_asym_id'
    columns = [mon_id, num, entity_id_col]

    # Get the structure dict
    struct_dict = get_structure_dict(row['af_filename'], path)

    # The entity id of the chain is taken from the first atom record of that chain
    entity_index = struct_dict[chains].index(chain)
    entity_id = struct_dict[label_entity_id][entity_index]

    # Make a dataframe with one row per sequence position of every entity
    df = pd.DataFrame({mon_id: struct_dict[mon_id],
                       num: struct_dict[num],
                       entity_id_col: struct_dict[entity_id_col]})
    df[num] = df[num].astype(int)

    def count_charged(positions):
        # Number of charged residues of this entity at the given positions
        return len(num_charged(df, positions, entity_id, columns))

    # Charged residues in the inhibitory module and the functional domain
    im_charged_residues = count_charged(im_range)
    im_perc_charged = im_charged_residues / len(im_range)
    fd_charged_residues = count_charged(fd_range)
    fd_perc_charged = fd_charged_residues / len(fd_range)

    # Charged residues in both regions together
    num_domains_charged = im_charged_residues + fd_charged_residues
    perc_domains_charged = num_domains_charged / (len(im_range) + len(fd_range))

    # Charged residues in the whole protein. The count only includes rows of
    # this entity, so the length used as denominator must be restricted to the
    # same entity; len(df) would also count the residues of other entities.
    protein_length = int((df[entity_id_col] == entity_id).sum())
    # 1..len(df) is a superset of the positions of any single entity
    num_protein_charged = count_charged(list(range(1, len(df) + 1)))
    percent_protein_charged = num_protein_charged / protein_length if protein_length else 0

    # Charged residues among the interacting interface residues
    if num_interface_residues != 0:
        # Splitting on ',' alone works with or without a space after the comma,
        # because int() ignores surrounding whitespace
        interface_list = [int(x) for x in row['interface_residues'].strip('}{').split(',')]

        num_interface_charged = count_charged(interface_list)
        row['num_interface_charged'] = num_interface_charged
        row['percent_interface_charged'] = round(num_interface_charged / len(interface_list), 3)
    else:
        row['num_interface_charged'] = 0
        row['percent_interface_charged'] = 0

    row['num_im_charged'] = im_charged_residues
    row['percent_im_charged'] = round(im_perc_charged, 3)
    row['num_fd_charged'] = fd_charged_residues
    row['percent_fd_charged'] = round(fd_perc_charged, 3)
    row['num_domains_charged'] = num_domains_charged
    row['percent_domains_charged'] = round(perc_domains_charged, 3)
    row['num_protein_charged'] = num_protein_charged
    row['percent_protein_charged'] = round(percent_protein_charged, 3)

    return row

def get_ss_region_info(fp, chain, region):
    '''
    Get the secondary structure information for a region of interest.

    Parameters:

        fp (string): Path of the structure file to load into PyMOL.
        chain (string): Chain identifier used in the selection.
        region (string): Residue selection placed after `resi`
            (callers pass e.g. '10-50' or '10-50+80-120').

    Returns:

        list of tuple: One (resv, ss) pair per selected CA atom, as collected
            by PyMOL's iterate command.
    '''
    # Clear the PyMOL session so only the structure loaded here is selected
    cmd.delete('all')
    cmd.load(fp)

    # PyMOL evaluates the expression once per selected atom and appends to this list
    stored.secondary_structure_list_by_resnumber = []
    selection = f'(chain {chain} and resi {region} and name ca)'
    expression = 'stored.secondary_structure_list_by_resnumber.append((resv, ss))'
    cmd.iterate(selection, expression, quiet=1)

    return list(stored.secondary_structure_list_by_resnumber)

def get_ss_protein_info(fp, chain):
    '''
    Get the secondary structure information for the full protein structure.

    Parameters:

        fp (string): Path of the structure file to load into PyMOL.
        chain (string): Chain identifier used in the selection.

    Returns:

        list of tuple: One (resv, ss) pair per CA atom of the chain, as
            collected by PyMOL's iterate command.
    '''
    # Clear the PyMOL session so only the structure loaded here is selected
    cmd.delete('all')
    cmd.load(fp)

    # PyMOL evaluates the expression once per selected atom and appends to this list
    stored.secondary_structure_list_by_resnumber = []
    selection = f'(chain {chain} and name ca)'
    expression = 'stored.secondary_structure_list_by_resnumber.append((resv, ss))'
    cmd.iterate(selection, expression, quiet=1)

    return list(stored.secondary_structure_list_by_resnumber)

def categorize_ss(ss):
    '''
    Split residues into helix and sheet residues.

    Parameters:

        ss (iterable): Entries such as (residue number, secondary structure
            code), e.g. the lists returned by get_ss_region_info.

    Returns:

        tuple of two lists: The first element of every entry that contains
            'H', and the first element of every entry that contains 'S' but
            not 'H'. Entries containing neither are ignored.
    '''
    residues_by_code = {'H': [], 'S': []}

    for entry in ss:
        # 'H' is checked first, so an entry is never added to both lists
        for code in ('H', 'S'):
            if code in entry:
                residues_by_code[code].append(entry[0])
                break

    return residues_by_code['H'], residues_by_code['S']

def add_ss_info(row, path):
    '''
    Add helix and sheet residue counts and fractions to a row.

    Secondary structure is read with PyMOL from the file row['af_filename']
    in `path` for chain 'A', separately for the inhibitory module
    (row['region_1']), the functional domain (row['region_2']) and the full
    chain.

    Parameters:

        row (Series): Must provide 'uniprot', 'af_filename', 'region_1' and
            'region_2'.
        path (string): Directory that contains the structure file.

    Returns:

        Series: The same row with num_/percent_ helix and sheet fields added
            for the two regions, and num_ fields only for the whole protein.
            The percent_* values are fractions of the region length, rounded
            to 3 decimals.
    '''

    uniprot = row['uniprot']  # looked up like in the other add_* functions; not used below
    fp = os.path.join(path, row['af_filename'])
    chain = 'A'

    # Region lengths come from the region strings, not from the structure
    region1_len = len(string2range(row['region_1']))
    region2_len = len(string2range(row['region_2']))

    # PyMOL joins several residue ranges with '+' where the table uses ','
    im_selection = row['region_1'].replace(',', '+')
    fd_selection = row['region_2'].replace(',', '+')

    # Inhibitory module
    im_helix_res, im_sheet_res = categorize_ss(get_ss_region_info(fp, chain, im_selection))

    # Functional domain
    fd_helix_res, fd_sheet_res = categorize_ss(get_ss_region_info(fp, chain, fd_selection))

    # Full protein
    protein_helix_res, protein_sheet_res = categorize_ss(get_ss_protein_info(fp, chain))

    num_im_helix, num_im_sheet = len(im_helix_res), len(im_sheet_res)
    num_fd_helix, num_fd_sheet = len(fd_helix_res), len(fd_sheet_res)

    im_perc_helix = num_im_helix / region1_len
    im_perc_sheet = num_im_sheet / region1_len
    fd_perc_helix = num_fd_helix / region2_len
    fd_perc_sheet = num_fd_sheet / region2_len

    # Save the secondary structure information to the row
    row['num_im_helix'] = num_im_helix
    row['num_im_sheet'] = num_im_sheet
    row['percent_im_helix'] = round(im_perc_helix, 3)
    row['percent_im_sheet'] = round(im_perc_sheet, 3)
    row['num_fd_helix'] = num_fd_helix
    row['num_fd_sheet'] = num_fd_sheet
    row['percent_fd_helix'] = round(fd_perc_helix, 3)
    row['percent_fd_sheet'] = round(fd_perc_sheet, 3)
    row['num_protein_helix'] = len(protein_helix_res)
    row['num_protein_sheet'] = len(protein_sheet_res)
    # NOTE(review): percent_protein_helix / percent_protein_sheet are not
    # written; the disabled code for them relied on row['pdb_length'].

    return row
 
def get_hydrophobic(df, path):
    '''
    Build the per-protein feature table.

    Despite its name, this function adds the hydrophobic, secondary structure
    and charged residue features, in that order.

    Parameters:

        df (DataFrame): Must contain the columns 'uniprot', 'region_1',
            'region_2', 'af_filename', 'number_interface_residues' and
            'interface_residues'. It is not modified.
        path (string): Directory that contains the structure files.

    Returns:

        DataFrame: The six input columns plus the columns added by
            add_num_hydrophobic, add_ss_info and add_num_charged.
    '''
    # Keep only the relevant information
    kept_columns = ['uniprot', 'region_1', 'region_2', 'af_filename',
                    'number_interface_residues', 'interface_residues']

    # Missing values become 0; file names are forced to strings afterwards
    filled = df.fillna(0)
    filled['af_filename'] = filled['af_filename'].astype(str)
    sub_df = filled[kept_columns].copy()

    # Each function receives one row plus the path and returns the extended row
    for add_features in (add_num_hydrophobic, add_ss_info, add_num_charged):
        sub_df = sub_df.apply(add_features, axis=1, args=(path,))

    return sub_df

In [9]:
# gt_path = './project_pipeline/data/input/RCSB_cif'

# df = pd.read_csv('./test_interface.csv')
# df = get_hydrophobic(df, gt_path)

In [10]:
# # Define the path to the .cif files
# gt_path = './project_pipeline/data/input/RCSB_cif_trim'

# # Read in the dataframe
# df1 = pd.read_csv('./project_pipeline/data/proteins_interface.tsv', sep = '\t')

# # Get the number of hydrophobic residues in the interface
# df1 = get_hydrophobic(df1, gt_path)

# df1.to_csv('./project_pipeline/data/ai_structure_information.tsv', sep = '\t')

In [11]:
# df2 = pd.read_csv('./project_pipeline/data/md_proteins_interface.tsv', sep = '\t')

# df2 = get_hydrophobic(df2, gt_path)

# df2.to_csv('./project_pipeline/data/md_structure_information.tsv', sep = '\t')

In [12]:
# Directories containing the .cif files of the two datasets
gt_path = './project_pipeline/data/input/Alphafold_cif'
md_path = './project_pipeline/data/input/Alphafold_multi_domain'

# First dataset: read the interface table, compute the hydrophobic,
# secondary structure and charged residue features, and save the result
df1 = get_hydrophobic(
    pd.read_csv('./project_pipeline/data/alphafold_interface.tsv', sep='\t'),
    gt_path,
)
df1.to_csv('./project_pipeline/data/ai_full_depth_structure_information.tsv', sep='\t')

# Second dataset: same steps with the files under md_path
df2 = get_hydrophobic(
    pd.read_csv('./project_pipeline/data/md_alphafold_interface.tsv', sep='\t'),
    md_path,
)
df2.to_csv('./project_pipeline/data/md_full_depth_structure_information.tsv', sep='\t')

 ExecutiveLoad-Detail: Detected mmCIF
 ExecutiveLoad-Detail: Detected mmCIF
 ExecutiveLoad-Detail: Detected mmCIF
 ExecutiveLoad-Detail: Detected mmCIF
 ExecutiveLoad-Detail: Detected mmCIF
 ExecutiveLoad-Detail: Detected mmCIF
 ExecutiveLoad-Detail: Detected mmCIF
 ExecutiveLoad-Detail: Detected mmCIF
 ExecutiveLoad-Detail: Detected mmCIF
 ExecutiveLoad-Detail: Detected mmCIF
 ExecutiveLoad-Detail: Detected mmCIF
 ExecutiveLoad-Detail: Detected mmCIF
 ExecutiveLoad-Detail: Detected mmCIF
 ExecutiveLoad-Detail: Detected mmCIF
 ExecutiveLoad-Detail: Detected mmCIF
 ExecutiveLoad-Detail: Detected mmCIF
 ExecutiveLoad-Detail: Detected mmCIF
 ExecutiveLoad-Detail: Detected mmCIF
 ExecutiveLoad-Detail: Detected mmCIF
 ExecutiveLoad-Detail: Detected mmCIF
 ExecutiveLoad-Detail: Detected mmCIF
 ExecutiveLoad-Detail: Detected mmCIF
 ExecutiveLoad-Detail: Detected mmCIF
 ExecutiveLoad-Detail: Detected mmCIF
 ExecutiveLoad-Detail: Detected mmCIF
 ExecutiveLoad-Detail: Detected mmCIF
 ExecutiveLo