# Generate the main dataframe 
Author: Amish Mishra  
Date: March 29, 2023  
- This notebook generates the main dataframe with protein metadata (stability scores, subject matter expert features) and pds file paths  
- It merges together the 113 selected SME features from `protein_metadata/Rocklin.v5.structural_metrics.csv`, the stability scores calibrated using a convolutional neural network from `protein_metadata/Rocklin.v6.experimental_stability_scores.csv`, and the file paths for the PDs for each protein.  
- Use `cder2` kernel

In [4]:
import pandas
import os
import re
import time
from IPython.display import display

In [5]:
def stability_threshold_labeller(label_col_value, thresh=1.0):
    '''
    Takes a stability score and returns a lable for it depending on a threshold
    '''
    if label_col_value > thresh:
        return "green"
    else:
        return "red"

In [6]:
# Path with datasets
pds_dir = './protein_pds'  # persistence diagrams for proteins
stability_scores_file = 'protein_metadata/Rocklin.v6.experimental_stability_scores.csv'  # stability scores for proteins
SME_file = 'protein_metadata/Rocklin.v5.structural_metrics.csv'  # subject matter expert measurements for proteins
SME_cols = ['name', 'AlaCount', 'T1_absq', 'T1_netq', 'Tend_absq', 'Tend_netq',
       'Tminus1_absq', 'Tminus1_netq', 'abego_res_profile',
       'abego_res_profile_penalty', 'avg_all_frags', 'avg_best_frag',
       'bb', 'buns_bb_heavy', 'buns_nonheavy', 'buns_sc_heavy',
       'buried_minus_exposed', 'buried_np', 'buried_np_AFILMVWY',
       'buried_np_AFILMVWY_per_res', 'buried_np_per_res',
       'buried_over_exposed', 'chymo_cut_sites',
       'chymo_with_LM_cut_sites', 'contact_all', 'contact_core_SASA',
       'contact_core_SCN', 'contig_not_hp_avg', 'contig_not_hp_avg_norm',
       'contig_not_hp_internal_max', 'contig_not_hp_max', 'degree',
       'dslf_fa13', 'entropy', 'exposed_hydrophobics',
       'exposed_np_AFILMVWY', 'exposed_polars', 'exposed_total', 'fa_atr',
       'fa_atr_per_res', 'fa_dun_dev', 'fa_dun_rot', 'fa_dun_semi',
       'fa_elec', 'fa_intra_atr_xover4', 'fa_intra_elec',
       'fa_intra_rep_xover4', 'fa_intra_sol_xover4', 'fa_rep',
       'fa_rep_per_res', 'fa_sol', 'frac_helix', 'frac_loop',
       'frac_sheet', 'fxn_exposed_is_np', 'hbond_bb_sc', 'hbond_lr_bb',
       'hbond_lr_bb_per_sheet', 'hbond_sc', 'hbond_sr_bb',
       'hbond_sr_bb_per_helix', 'helix_sc', 'holes', 'hphob_sc_contacts',
       'hphob_sc_degree', 'hxl_tors', 'hydrophobicity',
       'largest_hphob_cluster', 'lk_ball', 'lk_ball_bridge',
       'lk_ball_bridge_uncpl', 'lk_ball_iso', 'loop_sc',
       'mismatch_probability', 'n_charged', 'n_hphob_clusters',
       'n_hydrophobic', 'n_hydrophobic_noA', 'n_polar_core', 'n_res',
       'nearest_chymo_cut_to_Cterm', 'nearest_chymo_cut_to_Nterm',
       'nearest_chymo_cut_to_term', 'nearest_tryp_cut_to_Cterm',
       'nearest_tryp_cut_to_Nterm', 'nearest_tryp_cut_to_term',
       'net_atr_net_sol_per_res', 'net_atr_per_res', 'net_sol_per_res',
       'netcharge', 'nres', 'nres_helix', 'nres_loop', 'nres_sheet',
       'omega', 'one_core_each', 'p_aa_pp', 'pack', 'percent_core_SASA',
       'percent_core_SCN', 'pro_close', 'rama_prepro', 'ref',
       'res_count_core_SASA', 'res_count_core_SCN', 'score_per_res',
       'ss_contributes_core', 'ss_sc', 'sum_best_frags', 'total_score',
       'tryp_cut_sites', 'two_core_each', 'worst6frags', 'worstfrag']


topologies = ['HEEH','EHEE','HHH','EEHEE']
columns = ['topology', 'stabilityscore_cnn_calibrated', 'pd_path']
main_df = pandas.DataFrame(columns=columns)
stability_df = pandas.read_csv(stability_scores_file, comment='#', usecols=['name', 'stabilityscore_cnn_calibrated'])
SME_df = pandas.read_csv(SME_file, comment='#', usecols=SME_cols)

print('main_df')
display(main_df)
print('stability_df')
display(stability_df)
print('SME_df')
display(SME_df)

main_df


Unnamed: 0,topology,stabilityscore_cnn_calibrated,pd_path


stability_df


Unnamed: 0,name,stabilityscore_cnn_calibrated
0,EEHEE_rd1_0001,-0.418984
1,EEHEE_rd1_0002,-0.181598
2,EEHEE_rd1_0003,-0.042318
3,EEHEE_rd1_0004,-0.201871
4,EEHEE_rd1_0005,-1.171825
...,...,...
16169,HHH_rd4_0996,2.064374
16170,HHH_rd4_0997,0.531378
16171,HHH_rd4_0998,0.294450
16172,HHH_rd4_0999,1.391039


SME_df


Unnamed: 0,name,AlaCount,T1_absq,T1_netq,Tend_absq,Tend_netq,Tminus1_absq,Tminus1_netq,abego_res_profile,abego_res_profile_penalty,...,res_count_core_SCN,score_per_res,ss_contributes_core,ss_sc,sum_best_frags,total_score,tryp_cut_sites,two_core_each,worst6frags,worstfrag
0,HEEH_rd2_0365,5.0,3.0,-3.0,6.0,6.0,3.0,3.0,0.265625,-0.039350,...,8.0,-2.374226,1.0,0.773357,10.0275,-102.091714,10,0.50,3.6066,0.8311
1,EEHEE_rd2_0259,3.0,1.0,-1.0,3.0,3.0,2.0,2.0,0.302820,-0.043151,...,6.0,-2.115025,1.0,0.744365,9.6064,-86.716041,6,0.40,3.1315,0.6469
2,EEHEE_rd1_0176,4.0,3.0,-1.0,5.0,3.0,2.0,2.0,0.271764,-0.040629,...,8.0,-1.990048,1.0,0.738257,17.3089,-81.591967,11,0.20,6.4043,1.3895
3,EEHEE_rd1_1091,4.0,3.0,-1.0,5.0,3.0,2.0,2.0,0.302535,-0.051434,...,5.0,-2.243351,0.6,0.675238,16.0869,-91.977394,10,0.20,6.2374,1.4039
4,HEEH_rd2_1282,5.0,4.0,-4.0,8.0,8.0,4.0,4.0,0.261995,-0.043626,...,6.0,-2.298392,1.0,0.668926,9.3067,-98.830840,11,0.50,3.8221,1.1585
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16169,HHH_rd4_0012,0.0,8.0,-8.0,14.0,12.0,6.0,4.0,0.237022,-0.022913,...,11.0,-3.130214,1.0,0.809614,6.9431,-134.599193,10,1.00,2.3726,0.4668
16170,HEEH_rd2_0908,3.0,3.0,-3.0,6.0,4.0,3.0,1.0,0.260765,-0.031453,...,7.0,-2.031653,1.0,0.765651,16.5237,-87.361072,8,0.50,6.3974,1.2575
16171,EEHEE_rd2_0446,2.0,2.0,-2.0,4.0,4.0,2.0,2.0,0.310850,-0.059348,...,5.0,-2.319726,1.0,0.759351,11.5641,-95.108782,3,0.00,3.8175,0.9423
16172,EEHEE_rd2_0834,3.0,2.0,-2.0,4.0,4.0,2.0,2.0,0.357115,-0.037620,...,5.0,-2.230106,1.0,0.752723,11.1539,-91.434348,5,0.20,4.0081,1.1561


## Put stability scores, SME, and PDs paths into one dataframe

In [7]:
num_missing_scores = 0
total_protein_pds = 0
verbose = False

tic = time.time()
for file in os.listdir(pds_dir):
    name = file[:-4]
    topology_group = re.search('^[A-Z]+[^_]*', file)
    if topology_group == None:  # found a file that is not a protein file, so skip it
        continue
    total_protein_pds += 1
    
    row = stability_df[stability_df['name'] == name]
    if row.empty:  # this protein has no stability score in the csv file, so skip it
        num_missing_scores += 1
        if verbose:
            print('No stability score for', name, 'so not used')
        continue
    
    # At this point, we found a protein PD file whose protein has a stability score
    topology = topology_group.group()
    stability_score = row['stabilityscore_cnn_calibrated'].iloc[0]
    pd_path = os.path.join(pds_dir, file)
    
    # Update main_df with new protein information
    new_row_df = pandas.DataFrame({'name': [name], 'topology': [topology], 
                                   'stabilityscore_cnn_calibrated': [stability_score], 'pd_path': [pd_path]})
    main_df = pandas.concat([main_df, new_row_df], ignore_index=True)

main_df = main_df.merge(SME_df, on='name')  # join the subject matter experts features as well
runtime = time.time() - tic
print(str(num_missing_scores) + '/' + str(total_protein_pds), 'proteins not used due to missing stability scores')
print('All files processed and dataframe generated in', runtime, 'sec')
display(main_df)

289/16463 proteins not used due to missing stability scores
All files processed and dataframe generated in 104.12267541885376 sec


Unnamed: 0,topology,stabilityscore_cnn_calibrated,pd_path,name,AlaCount,T1_absq,T1_netq,Tend_absq,Tend_netq,Tminus1_absq,...,res_count_core_SCN,score_per_res,ss_contributes_core,ss_sc,sum_best_frags,total_score,tryp_cut_sites,two_core_each,worst6frags,worstfrag
0,EEHEE,-0.418984,./protein_pds/EEHEE_rd1_0001.pkl,EEHEE_rd1_0001,5.0,2.0,-2.0,4.0,4.0,2.0,...,6.0,-2.380152,1.0,0.714617,14.7423,-97.586214,7,0.200000,5.8800,1.4102
1,EEHEE,-0.181598,./protein_pds/EEHEE_rd1_0002.pkl,EEHEE_rd1_0002,3.0,1.0,-1.0,3.0,3.0,2.0,...,8.0,-2.449642,1.0,0.761711,12.8237,-100.435324,8,0.600000,4.4051,1.0108
2,EEHEE,-0.042318,./protein_pds/EEHEE_rd1_0003.pkl,EEHEE_rd1_0003,5.0,3.0,-3.0,5.0,3.0,2.0,...,5.0,-2.453422,1.0,0.754639,11.2324,-100.590297,7,0.400000,3.7392,0.8318
3,EEHEE,-0.201871,./protein_pds/EEHEE_rd1_0004.pkl,EEHEE_rd1_0004,4.0,3.0,-1.0,5.0,3.0,2.0,...,5.0,-2.302929,1.0,0.754089,11.6521,-94.420098,8,0.000000,4.3599,1.0135
4,EEHEE,-1.171825,./protein_pds/EEHEE_rd1_0005.pkl,EEHEE_rd1_0005,4.0,1.0,-1.0,4.0,2.0,3.0,...,6.0,-1.874187,0.6,0.706007,13.7001,-76.841666,8,0.200000,6.0469,1.6184
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16169,HHH,2.064374,./protein_pds/HHH_rd4_0996.pkl,HHH_rd4_0996,2.0,7.0,-7.0,11.0,11.0,4.0,...,6.0,-3.115125,1.0,0.751669,6.4531,-133.950370,13,1.000000,2.1059,0.4221
16170,HHH,0.531378,./protein_pds/HHH_rd4_0997.pkl,HHH_rd4_0997,2.0,7.0,-1.0,11.0,3.0,4.0,...,5.0,-3.053967,1.0,0.786528,5.9061,-131.320578,10,0.333333,1.7070,0.3360
16171,HHH,0.294450,./protein_pds/HHH_rd4_0998.pkl,HHH_rd4_0998,1.0,6.0,-4.0,11.0,9.0,5.0,...,7.0,-3.074040,1.0,0.757832,6.5762,-132.183727,12,0.666667,1.9819,0.4760
16172,HHH,1.391039,./protein_pds/HHH_rd4_0999.pkl,HHH_rd4_0999,0.0,5.0,-3.0,11.0,5.0,6.0,...,9.0,-3.073015,1.0,0.748210,6.8451,-132.139637,13,1.000000,2.0127,0.3622


## Remove columns if values are duplicates of other columns

In [8]:
main_df_dups_removed = main_df.T.drop_duplicates().T
s = set(main_df_dups_removed)
removed_cols = [x for x in main_df if x not in s]
print('Duplicate columns removed:', removed_cols)
display(main_df_dups_removed)

Duplicate columns removed: ['contact_core_SASA', 'contact_core_SCN', 'entropy', 'nres']


Unnamed: 0,topology,stabilityscore_cnn_calibrated,pd_path,name,AlaCount,T1_absq,T1_netq,Tend_absq,Tend_netq,Tminus1_absq,...,res_count_core_SCN,score_per_res,ss_contributes_core,ss_sc,sum_best_frags,total_score,tryp_cut_sites,two_core_each,worst6frags,worstfrag
0,EEHEE,-0.418984,./protein_pds/EEHEE_rd1_0001.pkl,EEHEE_rd1_0001,5.0,2.0,-2.0,4.0,4.0,2.0,...,6.0,-2.380152,1.0,0.714617,14.7423,-97.586214,7,0.2,5.88,1.4102
1,EEHEE,-0.181598,./protein_pds/EEHEE_rd1_0002.pkl,EEHEE_rd1_0002,3.0,1.0,-1.0,3.0,3.0,2.0,...,8.0,-2.449642,1.0,0.761711,12.8237,-100.435324,8,0.6,4.4051,1.0108
2,EEHEE,-0.042318,./protein_pds/EEHEE_rd1_0003.pkl,EEHEE_rd1_0003,5.0,3.0,-3.0,5.0,3.0,2.0,...,5.0,-2.453422,1.0,0.754639,11.2324,-100.590297,7,0.4,3.7392,0.8318
3,EEHEE,-0.201871,./protein_pds/EEHEE_rd1_0004.pkl,EEHEE_rd1_0004,4.0,3.0,-1.0,5.0,3.0,2.0,...,5.0,-2.302929,1.0,0.754089,11.6521,-94.420098,8,0.0,4.3599,1.0135
4,EEHEE,-1.171825,./protein_pds/EEHEE_rd1_0005.pkl,EEHEE_rd1_0005,4.0,1.0,-1.0,4.0,2.0,3.0,...,6.0,-1.874187,0.6,0.706007,13.7001,-76.841666,8,0.2,6.0469,1.6184
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16169,HHH,2.064374,./protein_pds/HHH_rd4_0996.pkl,HHH_rd4_0996,2.0,7.0,-7.0,11.0,11.0,4.0,...,6.0,-3.115125,1.0,0.751669,6.4531,-133.95037,13,1.0,2.1059,0.4221
16170,HHH,0.531378,./protein_pds/HHH_rd4_0997.pkl,HHH_rd4_0997,2.0,7.0,-1.0,11.0,3.0,4.0,...,5.0,-3.053967,1.0,0.786528,5.9061,-131.320578,10,0.333333,1.707,0.336
16171,HHH,0.29445,./protein_pds/HHH_rd4_0998.pkl,HHH_rd4_0998,1.0,6.0,-4.0,11.0,9.0,5.0,...,7.0,-3.07404,1.0,0.757832,6.5762,-132.183727,12,0.666667,1.9819,0.476
16172,HHH,1.391039,./protein_pds/HHH_rd4_0999.pkl,HHH_rd4_0999,0.0,5.0,-3.0,11.0,5.0,6.0,...,9.0,-3.073015,1.0,0.74821,6.8451,-132.139637,13,1.0,2.0127,0.3622


In [9]:
# Save data frame
main_df_dups_removed.to_csv('main_df.csv', index=False)