# Train ML classifier using Subject Matter Expert (SME) features
Author: Amish Mishra  
Date: April 8, 2023  
Use `cder2` kernel  
This notebook trains a random forest classifier on the SME features and evaluates it on a held-out test set. It downsamples the unstable proteins to match the number of stable proteins.

In [2]:
import time
import pickle
import pandas
import numpy as np
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import scipy.stats as sps
from sklearn import model_selection
from IPython.display import display

In [3]:
def stability_threshold_labeller(label_col_value, thresh=1.0):
    '''
    Convert a stability score into a color label using a cutoff.

    Returns "green" when the score is strictly greater than `thresh`;
    any other value (including a score equal to `thresh`) yields "red".
    '''
    return "green" if label_col_value > thresh else "red"

## Load in the dataframe with protein information and label
red: unstable  
green: stable

In [4]:
# Read the protein table from disk
raw_df = pandas.read_csv('main_df.csv')

# Distinct topology types present in the table
topologies_arr = raw_df['topology'].unique()

# Scores strictly above this cutoff are labelled "green" (stable); all others "red" (unstable)
stability_threshold = 1.0
label_col = raw_df['stabilityscore_cnn_calibrated'].apply(
    lambda score: stability_threshold_labeller(score, thresh=stability_threshold)
)

# Put the new label column at positional index 2 of the frame
raw_df.insert(2, 'label', label_col)
raw_df

Unnamed: 0,topology,stabilityscore_cnn_calibrated,label,pd_path,name,AlaCount,T1_absq,T1_netq,Tend_absq,Tend_netq,...,res_count_core_SCN,score_per_res,ss_contributes_core,ss_sc,sum_best_frags,total_score,tryp_cut_sites,two_core_each,worst6frags,worstfrag
0,HHH,0.542808,red,./protein_pds/HHH_rd1_0825.pkl,HHH_rd1_0825,6.0,9.0,-5.0,13.0,9.0,...,5.0,-2.635028,1.00,0.764841,5.3591,-113.306191,15,0.333333,1.5963,0.3444
1,EEHEE,1.687863,green,./protein_pds/EEHEE_rd4_0226.pkl,EEHEE_rd4_0226,4.0,2.0,-2.0,4.0,2.0,...,7.0,-3.385700,1.00,0.786320,9.6969,-145.585091,6,0.200000,3.0023,0.5696
2,HEEH,-0.325246,red,./protein_pds/HEEH_rd2_0035.pkl,HEEH_rd2_0035,3.0,5.0,-5.0,9.0,7.0,...,8.0,-2.240384,1.00,0.764385,12.5276,-96.336497,11,0.750000,4.6847,1.1061
3,EHEE,0.244920,red,./protein_pds/EHEE_rd3_0179.pkl,EHEE_rd3_0179,4.0,1.0,1.0,3.0,-1.0,...,5.0,-2.048340,1.00,0.781405,8.5412,-81.933599,8,0.250000,2.8469,0.5045
4,EEHEE,0.985595,red,./protein_pds/EEHEE_rd3_1627.pkl,EEHEE_rd3_1627,4.0,1.0,-1.0,3.0,3.0,...,5.0,-2.640597,1.00,0.764510,9.7160,-113.545656,6,0.200000,3.0599,0.6737
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16169,EEHEE,-0.042749,red,./protein_pds/EEHEE_rd3_0672.pkl,EEHEE_rd3_0672,5.0,1.0,-1.0,3.0,1.0,...,7.0,-2.339757,1.00,0.766706,13.7124,-100.609572,6,0.600000,4.4130,0.8748
16170,HEEH,-0.037665,red,./protein_pds/HEEH_rd1_0808.pkl,HEEH_rd1_0808,6.0,3.0,-3.0,7.0,3.0,...,6.0,-2.275365,0.75,0.680462,22.1786,-97.840676,12,0.500000,9.0900,2.0749
16171,HHH,1.815568,green,./protein_pds/HHH_rd3_0061.pkl,HHH_rd3_0061,6.0,5.0,-1.0,8.0,2.0,...,3.0,-2.766525,1.00,0.811374,6.0599,-118.960588,9,0.000000,1.5719,0.2963
16172,EEHEE,0.681623,red,./protein_pds/EEHEE_rd4_0744.pkl,EEHEE_rd4_0744,1.0,0.0,0.0,2.0,2.0,...,8.0,-3.088340,1.00,0.762311,10.8362,-132.798628,6,0.600000,3.3353,0.6535


## Downsample the unstable and stable proteins 
For each topology, select the same number of unstable and stable proteins (capped at a maximum per class), where a protein counts as stable if its stability score exceeds the threshold.

In [5]:
# Build `main_df`: for every topology keep the highest-scoring stable designs and
# an equal number (where available) of the lowest-scoring unstable designs.
stable_proportion = 1  # determines what portion of the stable proteins to use
max_num = 30  # max number of proteins to pick for each topology

score_col = 'stabilityscore_cnn_calibrated'
topology_groups = raw_df.groupby('topology')  # group once instead of once per topology

# Collect the per-topology picks and concatenate a single time at the end.
# Growing a frame with concat inside the loop is quadratic, and starting from an
# empty object-dtype frame relies on deprecated dtype handling in pandas.concat.
selected_frames = []
for top in topologies_arr:
    temp_df = topology_groups.get_group(top)
    num_stable = int((temp_df[score_col] > stability_threshold).sum())
    num_unstable = int((temp_df[score_col] <= stability_threshold).sum())

    # choose no more than the allowed max of stable designs
    stable_num_to_choose = min(int(stable_proportion * num_stable), max_num)
    # match the stable count, limited by how many unstable designs exist
    unstable_num_to_choose = min(num_unstable, stable_num_to_choose)
    print('Choosing', stable_num_to_choose, 'stable designs out of', num_stable,'for', top)
    print('Choosing', unstable_num_to_choose, 'unstable designs out of', num_unstable, 'for', top)

    most_stable = temp_df.nlargest(n=stable_num_to_choose, columns=score_col)
    least_stable = temp_df.nsmallest(n=unstable_num_to_choose, columns=score_col)
    selected_frames.extend([most_stable, least_stable])

if selected_frames:
    # ignore_index gives the final frame a fresh 0..n-1 index
    main_df = pandas.concat(selected_frames, ignore_index=True)
else:
    # no topologies at all: keep the original behaviour of an empty frame with the same columns
    main_df = pandas.DataFrame(columns=raw_df.columns)
main_df

Choosing 30 stable designs out of 1346 for HHH
Choosing 30 unstable designs out of 1123 for HHH
Choosing 30 stable designs out of 579 for EEHEE
Choosing 30 unstable designs out of 4669 for EEHEE
Choosing 30 stable designs out of 118 for HEEH
Choosing 30 unstable designs out of 4872 for HEEH
Choosing 30 stable designs out of 679 for EHEE
Choosing 30 unstable designs out of 2788 for EHEE


Unnamed: 0,topology,stabilityscore_cnn_calibrated,label,pd_path,name,AlaCount,T1_absq,T1_netq,Tend_absq,Tend_netq,...,res_count_core_SCN,score_per_res,ss_contributes_core,ss_sc,sum_best_frags,total_score,tryp_cut_sites,two_core_each,worst6frags,worstfrag
0,HHH,2.675084,green,./protein_pds/HHH_rd4_0122.pkl,HHH_rd4_0122,1.0,7.0,-5.0,12.0,10.0,...,8.0,-3.233831,1.00,0.778585,5.9791,-139.054743,12,1.000000,1.6708,0.3360
1,HHH,2.663947,green,./protein_pds/HHH_rd4_0395.pkl,HHH_rd4_0395,2.0,6.0,-2.0,11.0,7.0,...,7.0,-3.185288,1.00,0.763438,5.4102,-136.967366,13,0.666667,1.4301,0.2915
2,HHH,2.614201,green,./protein_pds/HHH_rd4_0616.pkl,HHH_rd4_0616,1.0,5.0,-5.0,10.0,10.0,...,8.0,-3.044371,1.00,0.810521,4.7901,-130.907940,11,1.000000,1.3618,0.2776
3,HHH,2.613560,green,./protein_pds/HHH_rd4_0228.pkl,HHH_rd4_0228,2.0,7.0,-5.0,11.0,9.0,...,7.0,-3.183216,1.00,0.800401,5.7477,-136.878274,11,0.666667,1.7759,0.3556
4,HHH,2.610852,green,./protein_pds/HHH_rd4_0200.pkl,HHH_rd4_0200,1.0,8.0,-8.0,13.0,13.0,...,7.0,-2.959163,1.00,0.754055,4.8332,-127.244005,9,0.666667,1.3956,0.3211
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235,EHEE,-1.403157,red,./protein_pds/EHEE_rd1_0045.pkl,EHEE_rd1_0045,1.0,3.0,-3.0,5.0,3.0,...,8.0,-1.762938,1.00,0.721748,11.4578,-70.517538,5,1.000000,3.5560,0.6697
236,EHEE,-1.399824,red,./protein_pds/EHEE_rd1_0738.pkl,EHEE_rd1_0738,2.0,2.0,-2.0,4.0,2.0,...,7.0,-1.499053,1.00,0.710269,15.0872,-59.962124,7,0.500000,5.9877,1.3316
237,EHEE,-1.394263,red,./protein_pds/EHEE_rd2_1221.pkl,EHEE_rd2_1221,2.0,2.0,-2.0,4.0,4.0,...,6.0,-2.094076,1.00,0.687883,11.1541,-83.763032,7,0.500000,3.7118,0.6524
238,EHEE,-1.381898,red,./protein_pds/EHEE_rd2_0384.pkl,EHEE_rd2_0384,4.0,2.0,0.0,4.0,0.0,...,4.0,-2.011789,0.75,0.714132,9.2357,-80.471551,10,0.250000,3.0541,0.5670


## Train the models

### Train/Test split
Run multiple iterations for each topology with different train/test splits, and do a randomized search over hyperparameters to train a good random forest classifier

In [None]:
# For every topology, repeat `num_iterations` times: split the downsampled data into
# train/test sets, standardize the features, tune a random forest with a randomized
# hyperparameter search, pickle the fitted search object and record performance metrics.
sme_perf_df = pandas.DataFrame(columns=['topology', 'iteration', 'train_accuracy', 'test_accuracy', 
                                        'roc_auc_test', 'aps', 'confusion_matrix', 'classifier_path', 'runtime'])
num_iterations = 10
test_size = 0.2  # fraction of each class held out as the test set

start_time = time.time()
for topology in topologies_arr:
    # The rows for this topology are the same for every iteration, so select them once
    topology_df = main_df[main_df['topology'] == topology]

    for i in range(num_iterations):
        iter_time = time.time()
        random_state = i  # change random state each time the iteration changes

        # Split each class separately with the same test_size so that both the
        # training and the testing set contain red and green designs
        train_parts = []
        test_parts = []
        for l in ['red', 'green']:
            train, test = train_test_split(topology_df[topology_df['label']==l],
                                           test_size=test_size, random_state=random_state)
            train_parts.append(train)
            test_parts.append(test)

        # drop the index so that the axis=1 concat below will not mix up the labels
        train_df = pandas.concat(train_parts).reset_index(drop=True)
        test_df = pandas.concat(test_parts).reset_index(drop=True)

        # Rescale and standardize the features (columns from position 5 onward);
        # the first 5 columns are carried over unscaled.
        feature_scaler = StandardScaler()
        train_standardized_features = pandas.DataFrame(feature_scaler.fit_transform(train_df.iloc[:, 5:]))
        train_standardized_features = pandas.concat((train_df.iloc[:, :5], train_standardized_features), axis=1)
        # Only transform the test set: the scaler must keep the statistics learned from
        # the training set, otherwise train and test features are on different scales
        # and test-set information leaks into the preprocessing.
        test_standardized_features = pandas.DataFrame(feature_scaler.transform(test_df.iloc[:, 5:]))
        test_standardized_features = pandas.concat((test_df.iloc[:, :5], test_standardized_features), axis=1)

        # Rename columns to have original column names
        train_standardized_features.columns = list(main_df.columns)
        test_standardized_features.columns = list(main_df.columns)

        # Train and test ML model
        X_train = train_standardized_features.iloc[:, 5:]
        y_train = train_standardized_features['label']
        X_test = test_standardized_features.iloc[:, 5:]
        y_test = test_standardized_features['label']

        # =================== RF =========================
        # perform randomized search over rf hyperparameters

        # relabel classes from colors to binary labels (green -> 1, anything else -> 0)
        bin_labels_train = np.array([1 if label == 'green' else 0 for label in y_train])
        bin_labels_test = np.array([1 if label == 'green' else 0 for label in y_test])

        # Seed the forest and the search so a given (topology, iteration) is reproducible
        rf_clf = RandomForestClassifier(n_estimators=1000, class_weight='balanced',
                                        random_state=random_state)
        max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
        max_depth.append(None)
        rf_param_grid = {'max_features': sps.uniform,  # sampled via .rvs(); scipy defaults are loc=0, scale=1
                         'max_depth': max_depth,
                         'min_samples_split': [2, 5, 10, 20, 30],
                         'min_samples_leaf': [1, 2, 4, 6, 8, 10]}

        rf_clf_gs = model_selection.RandomizedSearchCV(rf_clf, 
                                                       rf_param_grid,
                                                       scoring='average_precision', 
                                                       n_iter = 100,
                                                       cv=10, 
                                                       n_jobs=-1,
                                                       verbose=0,
                                                       random_state=random_state)

        rf_clf_gs.fit(X_train, bin_labels_train)

        print('-- RF best params --')
        print(rf_clf_gs.best_params_)

        print('-- RF best APS --')
        print(rf_clf_gs.best_score_)

        # Save classifier (the whole fitted search object, not just the best estimator)
        classifier_path = f'classifiers/{topology}_sme_rf_clf_gs_{i}.pickle'
        with open(classifier_path, 'wb') as f:
            pickle.dump(rf_clf_gs, f, protocol=pickle.HIGHEST_PROTOCOL)

        # Get training accuracy
        y_train_pred = rf_clf_gs.predict(X_train)

        # Make predictions on test data using classifier
        y_pred = rf_clf_gs.predict(X_test)
        # Probability of the positive (green) class; computed once and reused for both ranking metrics
        y_test_proba = rf_clf_gs.predict_proba(X_test)[:, 1]

        # Prepare performance metrics
        train_acc = metrics.accuracy_score(bin_labels_train, y_train_pred)
        test_acc = metrics.accuracy_score(bin_labels_test, y_pred)
        roc_auc = metrics.roc_auc_score(bin_labels_test, y_test_proba)
        aps = metrics.average_precision_score(bin_labels_test, y_test_proba)
        cm = confusion_matrix(bin_labels_test, y_pred)

        iter_runtime = time.time() - iter_time

        # Create row for sme_perf_df
        row = {'topology': topology,
               'iteration': i, 
               'train_accuracy': train_acc, 
               'test_accuracy': test_acc,
               'roc_auc_test': roc_auc,
               'aps': aps,
               'confusion_matrix': cm,
               'classifier_path': classifier_path,
               'runtime': iter_runtime}
        sme_perf_df.loc[len(sme_perf_df.index)] = row

        # Save the performance dataframe after every iteration so partial results
        # survive an interrupted run
        sme_perf_df.to_csv('./perf_dataframes/sme_perf_df_downsample.csv', index=False)
        print(f'Updated and saved sme_perf_df for this iteration in time {iter_runtime:.2f} seconds')

runtime = time.time() - start_time
print('Total runtime was', f'{runtime:.2f}', 'seconds')

-- RF best params --
{'max_depth': 60, 'max_features': 0.5283247294811115, 'min_samples_leaf': 6, 'min_samples_split': 2}
-- RF best APS --
0.8997261469529312
Updated and saved sme_perf_df for this iteration in time 3499.92 seconds
-- RF best params --
{'max_depth': 100, 'max_features': 0.3336454325001459, 'min_samples_leaf': 2, 'min_samples_split': 2}
-- RF best APS --
0.8971751098835148
Updated and saved sme_perf_df for this iteration in time 2435.30 seconds
-- RF best params --
{'max_depth': 10, 'max_features': 0.5433915155555668, 'min_samples_leaf': 4, 'min_samples_split': 2}
-- RF best APS --
0.9015307706098656
Updated and saved sme_perf_df for this iteration in time 1845.11 seconds
-- RF best params --
{'max_depth': None, 'max_features': 0.4983131359253353, 'min_samples_leaf': 8, 'min_samples_split': 10}
-- RF best APS --
0.9010253656284994
Updated and saved sme_perf_df for this iteration in time 1723.03 seconds
-- RF best params --
{'max_depth': 10, 'max_features': 0.43232844986

### Train one model without doing a train/test split for assessing correlations and feature importance

In [6]:
# For every topology, fit one tuned random forest on ALL downsampled designs of that
# topology (no held-out test set), pickle the fitted search object and record the
# training accuracy and runtime.
sme_perf_df = pandas.DataFrame(columns=['topology', 'train_accuracy', 
                                        'classifier_path', 'runtime'])

# Fixed seed for the forest and the hyperparameter search, so the saved models
# (and anything derived from them) can be regenerated exactly
rf_random_state = 0

start_time = time.time()
for topology in topologies_arr:
    iter_time = time.time()
    train_df = main_df[main_df['topology'] == topology]

    train_df = train_df.reset_index(drop=True)  # drop the index so that the concat will not mix up the labels

    # Rescale and standardize the features (columns from position 5 onward);
    # the first 5 columns are carried over unscaled.
    feature_scaler = StandardScaler()
    train_standardized_features = pandas.DataFrame(feature_scaler.fit_transform(train_df.iloc[:, 5:]))
    train_standardized_features = pandas.concat((train_df.iloc[:, :5], train_standardized_features), axis=1)

    # Rename columns to have original column names
    train_standardized_features.columns = list(main_df.columns)

    # Train ML model
    X_train = train_standardized_features.iloc[:, 5:]
    y_train = train_standardized_features['label']

    # =================== RF =========================
    # perform randomized search over rf hyperparameters

    # relabel classes from colors to binary labels (green -> 1, anything else -> 0)
    bin_labels_train = np.array([1 if label == 'green' else 0 for label in y_train])

    rf_clf = RandomForestClassifier(n_estimators=1000, class_weight='balanced',
                                    random_state=rf_random_state)
    max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
    max_depth.append(None)
    rf_param_grid = {'max_features': sps.uniform,  # sampled via .rvs(); scipy defaults are loc=0, scale=1
                     'max_depth': max_depth,
                     'min_samples_split': [2, 5, 10, 20, 30],
                     'min_samples_leaf': [1, 2, 4, 6, 8, 10]}

    rf_clf_gs = model_selection.RandomizedSearchCV(rf_clf, 
                                                   rf_param_grid,
                                                   scoring='average_precision', 
                                                   n_iter = 100,
                                                   cv=10, 
                                                   n_jobs=-1,
                                                   verbose=0,
                                                   random_state=rf_random_state)

    rf_clf_gs.fit(X_train, bin_labels_train)

    print('-- RF best params --')
    print(rf_clf_gs.best_params_)

    # Note: this is the cross-validated score from the search on the training data,
    # not a score on held-out data
    print('-- RF best APS --')
    print(rf_clf_gs.best_score_)

    # Save classifier (the whole fitted search object, not just the best estimator)
    classifier_path = f'classifiers/{topology}_sme_rf_clf_gs_no_train_test_split.pickle'
    with open(classifier_path, 'wb') as f:
        pickle.dump(rf_clf_gs, f, protocol=pickle.HIGHEST_PROTOCOL)

    # Get training accuracy
    y_train_pred = rf_clf_gs.predict(X_train)

    # Prepare performance metrics
    train_acc = metrics.accuracy_score(bin_labels_train, y_train_pred)

    iter_runtime = time.time() - iter_time

    # Create row for sme_perf_df
    row = {'topology': topology,
           'train_accuracy': train_acc, 
           'classifier_path': classifier_path,
           'runtime': iter_runtime}
    sme_perf_df.loc[len(sme_perf_df.index)] = row

    # Save the performance dataframe after every topology so partial results
    # survive an interrupted run
    sme_perf_df.to_csv('./perf_dataframes/sme_perf_df_downsample_no_train_test_split.csv', index=False)
    print(f'Updated and saved sme_perf_df_downsample_no_train_test_split for this iteration in time {iter_runtime:.2f} seconds')

runtime = time.time() - start_time
print('Total runtime was', f'{runtime:.2f}', 'seconds')

-- RF best params --
{'max_depth': None, 'max_features': 0.7042164401749386, 'min_samples_leaf': 1, 'min_samples_split': 2}
-- RF best APS --
1.0
Updated and saved sme_perf_df_downsample_no_train_test_split for this iteration in time 116.33 seconds
-- RF best params --
{'max_depth': 10, 'max_features': 0.49650633071540495, 'min_samples_leaf': 4, 'min_samples_split': 2}
-- RF best APS --
1.0
Updated and saved sme_perf_df_downsample_no_train_test_split for this iteration in time 115.60 seconds
-- RF best params --
{'max_depth': 40, 'max_features': 0.5813633113586808, 'min_samples_leaf': 1, 'min_samples_split': 5}
-- RF best APS --
0.9044444444444444
Updated and saved sme_perf_df_downsample_no_train_test_split for this iteration in time 120.11 seconds
-- RF best params --
{'max_depth': 80, 'max_features': 0.20462381288140474, 'min_samples_leaf': 1, 'min_samples_split': 5}
-- RF best APS --
1.0
Updated and saved sme_perf_df_downsample_no_train_test_split for this iteration in time 113.03 s