# MS Upregulated Biomarker Hyperparameter Tuning

Author: Olatomiwa Bifarin<br>
Department of Biochemistry and Molecular Biology<br>
University of Georgia<br>
Edison Lab<br>

Last edited: 03NOV2020 

_This is a static version of a Jupyter notebook, and work (documentation) is still in progress_ 

**Notes**: 


<a id="0"></a>

In [1]:
# Global seed
import random  
random.seed(42)

#import os
#os.environ['PYTHONHASHSEED']=str(42)

import pandas as pd
import numpy as np
np.random.seed(42)


#To ignore warning
import warnings
warnings.filterwarnings('ignore')

# More sharp and legible graphics
%config InlineBackend.figure_format = 'retina'

# Sklearn module
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

In [2]:
#import model cohort
modelcohort = pd.read_excel('data/modelcohort.xlsx', index_col=0)

NMRMS = modelcohort.drop(['Sample ID', 'Patient ID', 'Collection', 'Gender',
                         'Race', 'BMI', 'Smoker', 'Age'], axis=1)


NMRMS.rename(columns={6578:'2-mercaptobenzothiazole', 1481:'Lys-Ile',
                          2102:'Dibutylamine', 6594:'N-Acetyl-D-Glucosamine',
                         5698:'m/z 630.19'}, 
                 inplace=True)

In [3]:
final_features = {1481, 2102, 5698, 6578, 6594}
final_features_ID = {'2-mercaptobenzothiazole','Lys-Ile', 
                     'Dibutylamine','N-Acetyl-D-Glucosamine',
                     'm/z 630.19'}

In [4]:
final_features

{1481, 2102, 5698, 6578, 6594}

In [5]:
# Import MS_labels
MS_labels = pd.read_excel('data/MS_labels.xlsx', index_col=0)

In [6]:
MS_labels[MS_labels.ID.isin(final_features)]

Unnamed: 0,ID,Mode,RT [min],Name,Formula
1480,1481,positive,6.29,1481,
2101,2102,positive,3.449,"N,N-Diisopropylethylamine (DIPEA)",C8 H19 N
5697,5698,negative,3.381,5698,C24 H43 N O12 P2 S
6577,6578,negative,0.832,6578,C6 H N O5
6593,6594,negative,3.871,N-Acetyl-D-glucosaminate,C8 H15 N O7


In [7]:
MLfeatures = NMRMS[list(final_features_ID)]
MLfeatures =(MLfeatures - MLfeatures.mean(axis=0))/MLfeatures.std(axis=0) #autoscaling

Define features and labels.

In [8]:
dfgrp = NMRMS.filter(['Groups'], axis=1)
#convert strings (RCC, Control) to integers
dfgroup = dfgrp['Groups'].map({'Control': 0, 'RCC': 1}) 
X = MLfeatures.values
y = dfgroup.values

### Grid Search: Random Forest

[Method Reference: towardsdatascience.com](https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74) <br>
[GridSearchCV sklearn Documentation](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html)

In [9]:
from sklearn.model_selection import GridSearchCV
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [10, 20, 30],
    'max_features': ['auto', 'sqrt', 'log2'],
    'min_samples_leaf': [1, 2, 3, 4, 5],
    'min_samples_split': [2, 4, 6, 8],
    'n_estimators': [50, 100, 150, 200]
}
# Create a based model
rf = RandomForestClassifier(random_state=42)

# Create a custom CV so we can seed with random state
rsk = model_selection.StratifiedKFold(n_splits=5, random_state=42)

# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = rsk, scoring = 'accuracy', n_jobs = 4, verbose = 2)

In [10]:
# Fit the grid search to the data
grid_search.fit(X, y)
grid_search.best_params_

Fitting 5 folds for each of 720 candidates, totalling 3600 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    2.6s
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed:    7.8s
[Parallel(n_jobs=4)]: Done 357 tasks      | elapsed:   17.0s
[Parallel(n_jobs=4)]: Done 640 tasks      | elapsed:   29.4s
[Parallel(n_jobs=4)]: Done 1005 tasks      | elapsed:   45.2s
[Parallel(n_jobs=4)]: Done 1450 tasks      | elapsed:  1.1min
[Parallel(n_jobs=4)]: Done 1977 tasks      | elapsed:  1.5min
[Parallel(n_jobs=4)]: Done 2584 tasks      | elapsed:  2.0min
[Parallel(n_jobs=4)]: Done 3273 tasks      | elapsed:  2.5min
[Parallel(n_jobs=4)]: Done 3600 out of 3600 | elapsed:  2.7min finished


{'bootstrap': True,
 'max_depth': 10,
 'max_features': 'auto',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 150}

In [11]:
grid_search.best_score_

0.8589743589743591

In [12]:
pd.DataFrame(grid_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_bootstrap,param_max_depth,param_max_features,param_min_samples_leaf,param_min_samples_split,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.104011,0.041077,0.005346,0.000471,True,10,auto,1,2,50,"{'bootstrap': True, 'max_depth': 10, 'max_feat...",0.615385,0.769231,0.833333,1.000000,1.0,0.843590,0.146041,10
1,0.183967,0.039736,0.009570,0.001725,True,10,auto,1,2,100,"{'bootstrap': True, 'max_depth': 10, 'max_feat...",0.538462,0.769231,0.833333,1.000000,1.0,0.828205,0.171162,118
2,0.268806,0.035894,0.013096,0.001227,True,10,auto,1,2,150,"{'bootstrap': True, 'max_depth': 10, 'max_feat...",0.692308,0.769231,0.833333,1.000000,1.0,0.858974,0.123504,1
3,0.281146,0.006896,0.016387,0.000945,True,10,auto,1,2,200,"{'bootstrap': True, 'max_depth': 10, 'max_feat...",0.615385,0.769231,0.833333,1.000000,1.0,0.843590,0.146041,10
4,0.071111,0.007055,0.005748,0.001614,True,10,auto,1,4,50,"{'bootstrap': True, 'max_depth': 10, 'max_feat...",0.615385,0.769231,0.833333,0.916667,1.0,0.826923,0.131246,163
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
715,0.268062,0.013639,0.018119,0.002062,True,30,log2,5,6,200,"{'bootstrap': True, 'max_depth': 30, 'max_feat...",0.538462,0.769231,0.833333,0.916667,1.0,0.811538,0.157102,181
716,0.069093,0.002732,0.004940,0.000482,True,30,log2,5,8,50,"{'bootstrap': True, 'max_depth': 30, 'max_feat...",0.461538,0.769231,0.833333,0.833333,1.0,0.779487,0.176421,649
717,0.141723,0.001784,0.010345,0.001539,True,30,log2,5,8,100,"{'bootstrap': True, 'max_depth': 30, 'max_feat...",0.538462,0.769231,0.833333,0.916667,1.0,0.811538,0.157102,181
718,0.208966,0.003660,0.013325,0.000995,True,30,log2,5,8,150,"{'bootstrap': True, 'max_depth': 30, 'max_feat...",0.538462,0.769231,0.833333,0.916667,1.0,0.811538,0.157102,181


### Grid Search: SVM-RBF

In [13]:
from sklearn.model_selection import GridSearchCV

param_grid = {'kernel': ['rbf'], 'C': [0.1, 1, 10, 100],
         'gamma': [0.01, 0.03, 0.1, 0.3, 1.0]}

svm_cls = svm.SVC(random_state=42)

# Create a custom CV so we can seed with random state
rsk = model_selection.StratifiedKFold(n_splits=5, random_state=42)

grid_search = GridSearchCV(svm_cls, param_grid, cv=rsk, scoring='accuracy', verbose=2, n_jobs=4)

In [14]:
# Fit the grid search to the data
grid_search.fit(X, y)
grid_search.best_params_

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  80 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.1s finished


{'C': 10, 'gamma': 0.3, 'kernel': 'rbf'}

In [15]:
grid_search.best_score_

0.9038461538461539

In [16]:
pd.DataFrame(grid_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_gamma,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.001068,0.00013,0.000395,6.3e-05,0.1,0.01,rbf,"{'C': 0.1, 'gamma': 0.01, 'kernel': 'rbf'}",0.461538,0.461538,0.583333,0.833333,0.916667,0.651282,0.189839,20
1,0.000877,0.000137,0.000419,9e-05,0.1,0.03,rbf,"{'C': 0.1, 'gamma': 0.03, 'kernel': 'rbf'}",0.461538,0.461538,0.666667,0.833333,0.916667,0.667949,0.186775,19
2,0.001014,0.000363,0.000493,9.6e-05,0.1,0.1,rbf,"{'C': 0.1, 'gamma': 0.1, 'kernel': 'rbf'}",0.461538,0.846154,0.666667,0.916667,1.0,0.778205,0.192718,15
3,0.00079,7.1e-05,0.000471,0.00017,0.1,0.3,rbf,"{'C': 0.1, 'gamma': 0.3, 'kernel': 'rbf'}",0.538462,0.923077,0.75,0.916667,1.0,0.825641,0.165151,8
4,0.000976,0.000166,0.000367,0.000115,0.1,1.0,rbf,"{'C': 0.1, 'gamma': 1.0, 'kernel': 'rbf'}",0.538462,0.538462,0.666667,0.916667,0.833333,0.698718,0.153632,18
5,0.001635,0.001464,0.000368,6.1e-05,1.0,0.01,rbf,"{'C': 1, 'gamma': 0.01, 'kernel': 'rbf'}",0.538462,0.846154,0.666667,0.916667,0.916667,0.776923,0.150246,16
6,0.000943,0.000139,0.000374,5.8e-05,1.0,0.03,rbf,"{'C': 1, 'gamma': 0.03, 'kernel': 'rbf'}",0.538462,0.846154,0.666667,0.916667,0.916667,0.776923,0.150246,16
7,0.001462,0.001093,0.000402,0.00012,1.0,0.1,rbf,"{'C': 1, 'gamma': 0.1, 'kernel': 'rbf'}",0.538462,0.846154,0.666667,0.916667,1.0,0.79359,0.168354,12
8,0.000853,0.00011,0.000459,8.6e-05,1.0,0.3,rbf,"{'C': 1, 'gamma': 0.3, 'kernel': 'rbf'}",0.538462,0.846154,0.75,0.916667,1.0,0.810256,0.158819,9
9,0.00085,0.000123,0.000356,8.6e-05,1.0,1.0,rbf,"{'C': 1, 'gamma': 1.0, 'kernel': 'rbf'}",0.538462,0.846154,0.833333,0.916667,1.0,0.826923,0.155968,7


### Grid Search: Lin-SVM

In [17]:
from sklearn.model_selection import GridSearchCV

param_grid = {'kernel': ['linear'], 'C': [0.001, 0.01, 0.1, 1, 5, 10.]}

svm_cls = svm.SVC(random_state=42)

# Create a custom CV so we can seed with random state
rsk = model_selection.StratifiedKFold(n_splits=5, random_state=42)

grid_search = GridSearchCV(svm_cls, param_grid, cv=rsk, scoring='accuracy', verbose=2, n_jobs=4)

In [18]:
# Fit the grid search to the data
grid_search.fit(X, y)
grid_search.best_params_

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:    0.0s finished


{'C': 10.0, 'kernel': 'linear'}

In [19]:
grid_search.best_score_

0.841025641025641

In [20]:
pd.DataFrame(grid_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.000851,7.5e-05,0.000337,3.6e-05,0.001,linear,"{'C': 0.001, 'kernel': 'linear'}",0.461538,0.461538,0.583333,0.75,0.833333,0.617949,0.150967,6
1,0.001643,0.000999,0.000874,0.000847,0.01,linear,"{'C': 0.01, 'kernel': 'linear'}",0.692308,0.846154,0.583333,0.833333,0.833333,0.757692,0.10387,5
2,0.002301,0.002603,0.000937,0.000844,0.1,linear,"{'C': 0.1, 'kernel': 'linear'}",0.538462,0.846154,0.666667,0.916667,1.0,0.79359,0.168354,4
3,0.000885,7.3e-05,0.000361,9.9e-05,1.0,linear,"{'C': 1, 'kernel': 'linear'}",0.538462,0.846154,0.75,0.916667,1.0,0.810256,0.158819,2
4,0.001133,0.000425,0.00036,2.1e-05,5.0,linear,"{'C': 5, 'kernel': 'linear'}",0.538462,0.846154,0.833333,0.916667,0.916667,0.810256,0.140243,2
5,0.001069,0.000213,0.000384,0.000131,10.0,linear,"{'C': 10.0, 'kernel': 'linear'}",0.692308,0.846154,0.833333,0.916667,0.916667,0.841026,0.082031,1


### Grid Search: kNN

In [21]:
from sklearn.model_selection import GridSearchCV

param_grid = {'n_neighbors': list(range(3,30)), 'p': [1,2]}

knn_cls = KNeighborsClassifier()

# Create a custom CV so we can seed with random state
rsk = model_selection.StratifiedKFold(n_splits=5, random_state=42)

grid_search = GridSearchCV(knn_cls, param_grid, cv=rsk, scoring='accuracy', verbose=2, n_jobs=4)

In [22]:
# Fit the grid search to the data
grid_search.fit(X, y)
grid_search.best_params_

Fitting 5 folds for each of 54 candidates, totalling 270 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done 136 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 270 out of 270 | elapsed:    0.2s finished


{'n_neighbors': 11, 'p': 2}

In [23]:
grid_search.best_score_

0.826923076923077

In [24]:
pd.DataFrame(grid_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_neighbors,param_p,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.000645,0.000177,0.001271,0.000201,3,1,"{'n_neighbors': 3, 'p': 1}",0.615385,0.923077,0.75,0.833333,0.916667,0.807692,0.115028,7
1,0.000591,0.000177,0.002809,0.003074,3,2,"{'n_neighbors': 3, 'p': 2}",0.692308,0.923077,0.666667,0.916667,0.916667,0.823077,0.117544,4
2,0.00052,6.2e-05,0.001273,0.000317,4,1,"{'n_neighbors': 4, 'p': 1}",0.615385,0.923077,0.666667,0.75,0.833333,0.757692,0.111059,53
3,0.000807,0.000307,0.002673,0.002339,4,2,"{'n_neighbors': 4, 'p': 2}",0.692308,0.923077,0.583333,0.916667,0.916667,0.80641,0.14192,14
4,0.000486,3.9e-05,0.001148,0.000177,5,1,"{'n_neighbors': 5, 'p': 1}",0.538462,1.0,0.75,0.916667,0.833333,0.807692,0.158322,7
5,0.000624,0.000185,0.00139,0.000422,5,2,"{'n_neighbors': 5, 'p': 2}",0.615385,0.923077,0.666667,0.916667,0.916667,0.807692,0.137066,7
6,0.004353,0.007722,0.001576,0.000647,6,1,"{'n_neighbors': 6, 'p': 1}",0.615385,0.846154,0.75,0.916667,0.833333,0.792308,0.103092,26
7,0.000874,0.000472,0.002629,0.002159,6,2,"{'n_neighbors': 6, 'p': 2}",0.615385,0.846154,0.583333,1.0,0.916667,0.792308,0.165211,26
8,0.000722,0.000355,0.001271,0.000391,7,1,"{'n_neighbors': 7, 'p': 1}",0.538462,0.846154,0.833333,0.916667,0.833333,0.79359,0.131271,22
9,0.000735,0.000364,0.001388,0.000411,7,2,"{'n_neighbors': 7, 'p': 2}",0.615385,0.846154,0.75,0.916667,1.0,0.825641,0.133444,2
