# MS IDed Biomarker Hyperparameter Tuning

Author: Olatomiwa Bifarin<br>
Department of Biochemistry and Molecular Biology<br>
University of Georgia<br>
Edison Lab<br>

Last edited: 03NOV2020 

_This is a static version of a Jupyter notebook, and work (documentation) is still in progress_ 

**Notes**: 


<a id="0"></a>

In [1]:
# Global seed
import random  
random.seed(42)

#import os
#os.environ['PYTHONHASHSEED']=str(42)

import pandas as pd
import numpy as np
np.random.seed(42)


#To ignore warning
import warnings
warnings.filterwarnings('ignore')

# More sharp and legible graphics
%config InlineBackend.figure_format = 'retina'

# Sklearn module
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

In [2]:
#import model cohort
modelcohort = pd.read_excel('data/modelcohort.xlsx', index_col=0)

NMRMS = modelcohort.drop(['Sample ID', 'Patient ID', 'Collection', 'Gender',
                         'Race', 'BMI', 'Smoker', 'Age'], axis=1)


NMRMS.rename(columns={720:'2-Phenylacetamide', 1481:'Lys-Ile', 
                      2102: 'Dibutylamine', 3804: 'Tromethamine phosphate', 
                      6262: '2-Hydroxyhippuric acid/mannitol', 6578: '2-mercaptobenzothiazole',
                      6594: 'N-Acetyl-D-Glucosamine'}, inplace=True)

In [3]:
final_features = {720, 1481, 2102, 3804, 6262, 6578, 6594}
final_features_ID = {'2-Phenylacetamide', 'Lys-Ile','Dibutylamine', 
                     'Tromethamine phosphate', '2-Hydroxyhippuric acid/mannitol', '2-mercaptobenzothiazole','N-Acetyl-D-Glucosamine'}

In [4]:
final_features

{720, 1481, 2102, 3804, 6262, 6578, 6594}

In [5]:
# Import MS_labels
MS_labels = pd.read_excel('data/MS_labels.xlsx', index_col=0)

In [6]:
MS_labels[MS_labels.ID.isin(final_features)]

Unnamed: 0,ID,Mode,RT [min],Name,Formula
719,720,positive,2.562,2-Aminoacetophenone;O-Acetylaniline,C8 H9 N O
1480,1481,positive,6.29,1481,
2101,2102,positive,3.449,"N,N-Diisopropylethylamine (DIPEA)",C8 H19 N
3803,3804,positive,2.595,3804,C4 H12 N O6 P
6261,6262,negative,2.667,6262,C10 H20 N9 O5 P
6577,6578,negative,0.832,6578,C6 H N O5
6593,6594,negative,3.871,N-Acetyl-D-glucosaminate,C8 H15 N O7


In [7]:
MLfeatures = NMRMS[list(final_features_ID)]
MLfeatures =(MLfeatures - MLfeatures.mean(axis=0))/MLfeatures.std(axis=0) #autoscaling

Define features and labels.

In [8]:
dfgrp = NMRMS.filter(['Groups'], axis=1)
#convert strings (RCC, Control) to integers
dfgroup = dfgrp['Groups'].map({'Control': 0, 'RCC': 1}) 
X = MLfeatures.values
y = dfgroup.values

### Grid Search: Random Forest

[Method Reference: towardsdatascience.com](https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74) <br>
[GridSearchCV sklearn Documentation](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html)

In [9]:
from sklearn.model_selection import GridSearchCV
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [10, 20, 30],
    'max_features': ['auto', 'sqrt', 'log2'],
    'min_samples_leaf': [1, 2, 3, 4, 5],
    'min_samples_split': [2, 4, 6, 8],
    'n_estimators': [50, 100, 150, 200]
}
# Create a based model
rf = RandomForestClassifier(random_state=42)

# Create a custom CV so we can seed with random state
rsk = model_selection.StratifiedKFold(n_splits=5, random_state=42)

# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = rsk, scoring = 'accuracy', n_jobs = 4, verbose = 2)

In [10]:
# Fit the grid search to the data
grid_search.fit(X, y)
grid_search.best_params_

Fitting 5 folds for each of 720 candidates, totalling 3600 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    2.4s
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed:    8.2s
[Parallel(n_jobs=4)]: Done 357 tasks      | elapsed:   17.9s
[Parallel(n_jobs=4)]: Done 640 tasks      | elapsed:   30.8s
[Parallel(n_jobs=4)]: Done 1005 tasks      | elapsed:   47.9s
[Parallel(n_jobs=4)]: Done 1450 tasks      | elapsed:  1.1min
[Parallel(n_jobs=4)]: Done 1977 tasks      | elapsed:  1.5min
[Parallel(n_jobs=4)]: Done 2584 tasks      | elapsed:  2.0min
[Parallel(n_jobs=4)]: Done 3273 tasks      | elapsed:  2.5min
[Parallel(n_jobs=4)]: Done 3600 out of 3600 | elapsed:  2.7min finished


{'bootstrap': True,
 'max_depth': 10,
 'max_features': 'auto',
 'min_samples_leaf': 5,
 'min_samples_split': 2,
 'n_estimators': 50}

In [11]:
grid_search.best_score_

0.9038461538461539

In [12]:
pd.DataFrame(grid_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_bootstrap,param_max_depth,param_max_features,param_min_samples_leaf,param_min_samples_split,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.067176,0.001829,0.005434,0.001165,True,10,auto,1,2,50,"{'bootstrap': True, 'max_depth': 10, 'max_feat...",0.846154,0.769231,0.833333,0.916667,0.916667,0.856410,0.055677,496
1,0.130057,0.001489,0.008720,0.001050,True,10,auto,1,2,100,"{'bootstrap': True, 'max_depth': 10, 'max_feat...",0.846154,0.769231,0.833333,0.916667,0.916667,0.856410,0.055677,496
2,0.205084,0.021276,0.013178,0.001027,True,10,auto,1,2,150,"{'bootstrap': True, 'max_depth': 10, 'max_feat...",0.846154,0.769231,0.833333,0.916667,0.916667,0.856410,0.055677,496
3,0.283420,0.026992,0.020681,0.004322,True,10,auto,1,2,200,"{'bootstrap': True, 'max_depth': 10, 'max_feat...",0.846154,0.769231,0.833333,0.916667,0.916667,0.856410,0.055677,496
4,0.099233,0.028289,0.005704,0.001970,True,10,auto,1,4,50,"{'bootstrap': True, 'max_depth': 10, 'max_feat...",0.846154,0.769231,0.833333,0.916667,0.916667,0.856410,0.055677,496
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
715,0.274734,0.025710,0.017777,0.002808,True,30,log2,5,6,200,"{'bootstrap': True, 'max_depth': 30, 'max_feat...",0.923077,0.692308,0.916667,0.916667,0.916667,0.873077,0.090419,73
716,0.079221,0.007686,0.004936,0.001104,True,30,log2,5,8,50,"{'bootstrap': True, 'max_depth': 30, 'max_feat...",0.846154,0.923077,0.916667,0.916667,0.916667,0.903846,0.028953,1
717,0.132651,0.006143,0.008549,0.001087,True,30,log2,5,8,100,"{'bootstrap': True, 'max_depth': 30, 'max_feat...",0.846154,0.769231,0.916667,0.916667,0.916667,0.873077,0.058667,73
718,0.202574,0.009878,0.014182,0.001091,True,30,log2,5,8,150,"{'bootstrap': True, 'max_depth': 30, 'max_feat...",0.846154,0.692308,0.916667,0.916667,0.916667,0.857692,0.087085,433


### Grid Search: SVM-RBF

In [13]:
from sklearn.model_selection import GridSearchCV

param_grid = {'kernel': ['rbf'], 'C': [0.1, 1, 10, 100],
         'gamma': [0.01, 0.03, 0.1, 0.3, 1.0]}

svm_cls = svm.SVC(random_state=42)

# Create a custom CV so we can seed with random state
rsk = model_selection.StratifiedKFold(n_splits=5, random_state=42)

grid_search = GridSearchCV(svm_cls, param_grid, cv=rsk, scoring='accuracy', verbose=2, n_jobs=4)

In [14]:
# Fit the grid search to the data
grid_search.fit(X, y)
grid_search.best_params_

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=4)]: Done  86 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.1s finished


{'C': 1, 'gamma': 0.03, 'kernel': 'rbf'}

In [15]:
grid_search.best_score_

0.9038461538461539

In [16]:
pd.DataFrame(grid_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_gamma,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.001977,0.000972,0.000574,0.0003,0.1,0.01,rbf,"{'C': 0.1, 'gamma': 0.01, 'kernel': 'rbf'}",0.461538,0.461538,0.916667,0.916667,0.916667,0.734615,0.222966,17
1,0.002433,0.00187,0.000479,0.000151,0.1,0.03,rbf,"{'C': 0.1, 'gamma': 0.03, 'kernel': 'rbf'}",0.538462,0.615385,0.916667,0.916667,0.916667,0.780769,0.168208,12
2,0.001598,0.000625,0.000884,0.000853,0.1,0.1,rbf,"{'C': 0.1, 'gamma': 0.1, 'kernel': 'rbf'}",0.846154,0.615385,0.833333,0.916667,0.916667,0.825641,0.110688,8
3,0.001215,0.000338,0.000448,0.000105,0.1,0.3,rbf,"{'C': 0.1, 'gamma': 0.3, 'kernel': 'rbf'}",0.538462,0.538462,0.833333,0.916667,0.916667,0.748718,0.17435,13
4,0.001495,0.001176,0.000385,6.3e-05,0.1,1.0,rbf,"{'C': 0.1, 'gamma': 1.0, 'kernel': 'rbf'}",0.461538,0.461538,0.583333,0.75,1.0,0.651282,0.203947,20
5,0.001283,0.000606,0.000663,0.000348,1.0,0.01,rbf,"{'C': 1, 'gamma': 0.01, 'kernel': 'rbf'}",0.846154,0.692308,0.833333,0.916667,0.916667,0.841026,0.082031,5
6,0.000945,9.2e-05,0.001006,0.001229,1.0,0.03,rbf,"{'C': 1, 'gamma': 0.03, 'kernel': 'rbf'}",0.846154,0.923077,0.833333,0.916667,1.0,0.903846,0.060134,1
7,0.002799,0.003042,0.000363,6.4e-05,1.0,0.1,rbf,"{'C': 1, 'gamma': 0.1, 'kernel': 'rbf'}",0.846154,0.846154,0.833333,0.916667,1.0,0.888462,0.063016,2
8,0.000808,8.3e-05,0.00052,0.00043,1.0,0.3,rbf,"{'C': 1, 'gamma': 0.3, 'kernel': 'rbf'}",0.692308,0.846154,0.833333,0.916667,0.916667,0.841026,0.082031,5
9,0.001146,0.000362,0.000346,4.9e-05,1.0,1.0,rbf,"{'C': 1, 'gamma': 1.0, 'kernel': 'rbf'}",0.615385,0.769231,0.833333,0.916667,0.916667,0.810256,0.112104,9


### Grid Search: Lin-SVM

In [17]:
from sklearn.model_selection import GridSearchCV

param_grid = {'kernel': ['linear'], 'C': [0.001, 0.01, 0.1, 1, 5, 10.]}

svm_cls = svm.SVC(random_state=42)

# Create a custom CV so we can seed with random state
rsk = model_selection.StratifiedKFold(n_splits=5, random_state=42)

grid_search = GridSearchCV(svm_cls, param_grid, cv=rsk, scoring='accuracy', verbose=2, n_jobs=4)

In [18]:
# Fit the grid search to the data
grid_search.fit(X, y)
grid_search.best_params_

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  23 out of  30 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:    0.0s finished


{'C': 0.1, 'kernel': 'linear'}

In [19]:
grid_search.best_score_

0.8884615384615383

In [20]:
pd.DataFrame(grid_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.001794,0.00065,0.002413,0.003066,0.001,linear,"{'C': 0.001, 'kernel': 'linear'}",0.461538,0.461538,0.916667,0.916667,0.916667,0.734615,0.222966,6
1,0.000981,8.8e-05,0.000573,0.000352,0.01,linear,"{'C': 0.01, 'kernel': 'linear'}",0.846154,0.615385,0.916667,0.916667,0.916667,0.842308,0.116702,2
2,0.001114,0.000375,0.00039,8.4e-05,0.1,linear,"{'C': 0.1, 'kernel': 'linear'}",0.846154,0.846154,0.833333,0.916667,1.0,0.888462,0.063016,1
3,0.001263,0.00078,0.000385,6.4e-05,1.0,linear,"{'C': 1, 'kernel': 'linear'}",0.615385,0.846154,0.833333,0.916667,1.0,0.842308,0.128051,2
4,0.001633,0.000514,0.000436,8.4e-05,5.0,linear,"{'C': 5, 'kernel': 'linear'}",0.538462,0.923077,0.833333,0.916667,0.916667,0.825641,0.147375,5
5,0.001176,0.000272,0.000433,9.9e-05,10.0,linear,"{'C': 10.0, 'kernel': 'linear'}",0.538462,0.923077,0.833333,0.916667,1.0,0.842308,0.160825,2


### Grid Search: kNN

In [21]:
from sklearn.model_selection import GridSearchCV

param_grid = {'n_neighbors': list(range(3,30)), 'p': [1,2]}

knn_cls = KNeighborsClassifier()

# Create a custom CV so we can seed with random state
rsk = model_selection.StratifiedKFold(n_splits=5, random_state=42)

grid_search = GridSearchCV(knn_cls, param_grid, cv=rsk, scoring='accuracy', verbose=2, n_jobs=4)

In [22]:
# Fit the grid search to the data
grid_search.fit(X, y)
grid_search.best_params_

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 54 candidates, totalling 270 fits


[Parallel(n_jobs=4)]: Done 136 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 270 out of 270 | elapsed:    0.2s finished


{'n_neighbors': 7, 'p': 1}

In [23]:
grid_search.best_score_

0.9038461538461539

In [24]:
pd.DataFrame(grid_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_neighbors,param_p,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.001099,0.000649,0.001611,0.000177,3,1,"{'n_neighbors': 3, 'p': 1}",0.923077,0.846154,0.833333,0.916667,0.916667,0.887179,0.039013,15
1,0.001241,0.000843,0.002079,0.000889,3,2,"{'n_neighbors': 3, 'p': 2}",0.846154,0.846154,0.833333,0.916667,0.916667,0.871795,0.036936,32
2,0.001118,0.00069,0.001161,8.6e-05,4,1,"{'n_neighbors': 4, 'p': 1}",0.846154,0.846154,0.833333,0.833333,0.916667,0.855128,0.031299,36
3,0.000481,3.4e-05,0.001977,0.001487,4,2,"{'n_neighbors': 4, 'p': 2}",0.846154,0.846154,0.833333,0.916667,1.0,0.888462,0.063016,11
4,0.000504,4.4e-05,0.001178,0.000137,5,1,"{'n_neighbors': 5, 'p': 1}",0.923077,0.923077,0.833333,0.916667,0.916667,0.902564,0.034734,9
5,0.000463,3.1e-05,0.001194,0.000117,5,2,"{'n_neighbors': 5, 'p': 2}",0.923077,0.846154,0.833333,0.833333,1.0,0.887179,0.065573,15
6,0.000474,3.3e-05,0.001228,0.000141,6,1,"{'n_neighbors': 6, 'p': 1}",0.846154,0.846154,0.833333,0.916667,1.0,0.888462,0.063016,11
7,0.000582,0.000136,0.001179,0.000127,6,2,"{'n_neighbors': 6, 'p': 2}",0.846154,0.846154,0.833333,0.916667,1.0,0.888462,0.063016,11
8,0.000496,3.5e-05,0.00112,7.4e-05,7,1,"{'n_neighbors': 7, 'p': 1}",0.846154,0.923077,0.833333,0.916667,1.0,0.903846,0.060134,1
9,0.00479,0.006541,0.002715,0.001887,7,2,"{'n_neighbors': 7, 'p': 2}",0.923077,0.923077,0.833333,0.916667,0.916667,0.902564,0.034734,9
