# Novel Biomarker Hyperparameter Tuning

Author: Olatomiwa Bifarin<br>
Department of Biochemistry and Molecular Biology<br>
University of Georgia<br>
Edison Lab<br>

Last edited: 16FEB2021 

_This is a static version of a Jupyter notebook, and work (documentation) is still in progress_ 

**Notes**: 


<a id="0"></a>

In [19]:
# Global seed
import random  
random.seed(42)

#import os
#os.environ['PYTHONHASHSEED']=str(42)

import pandas as pd
import numpy as np
np.random.seed(42)


#To ignore warning
import warnings
warnings.filterwarnings('ignore')

# More sharp and legible graphics
%config InlineBackend.figure_format = 'retina'

# Sklearn module
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

Import test cohort

In [20]:
testcohort = pd.read_excel('data/testcohort.xlsx', index_col=0)

testcohort.rename(columns={2102: 'Dibutylamine', 
                           6262: 'Mannitol hippurate', 
                           6578: '2-mercaptobenzothiazole'}, 
                  inplace=True)

In [21]:
final_features = {2102, 6262, 6578}
final_features_ID = {'Dibutylamine', 
                     'Mannitol hippurate', 
                     '2-mercaptobenzothiazole'}

In [22]:
final_features

{2102, 6262, 6578}

In [23]:
# Import MS_labels
MS_labels = pd.read_excel('data/MS_labels.xlsx', index_col=0)

In [24]:
MS_labels[MS_labels.ID.isin(final_features)]

Unnamed: 0,ID,Mode,RT [min],Name,Formula
2101,2102,positive,3.449,"N,N-Diisopropylethylamine (DIPEA)",C8 H19 N
6261,6262,negative,2.667,6262,C10 H20 N9 O5 P
6577,6578,negative,0.832,6578,C6 H N O5


In [25]:
MLfeatures = testcohort[list(final_features_ID)]
MLfeatures =(MLfeatures - MLfeatures.mean(axis=0))/MLfeatures.std(axis=0) #autoscaling

Define features and labels.

In [26]:
dfgrp = testcohort.filter(['Groups'], axis=1)
#convert strings (RCC, Control) to integers
dfgroup = dfgrp['Groups'].map({'Control': 0, 'RCC': 1}) 
X = MLfeatures.values
y = dfgroup.values

### Grid Search: Random Forest

[Method Reference: towardsdatascience.com](https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74) <br>
[GridSearchCV sklearn Documentation](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html)

In [27]:
from sklearn.model_selection import GridSearchCV
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [10, 20, 30],
    'max_features': ['auto', 'sqrt', 'log2'],
    'min_samples_leaf': [1, 2, 3, 4, 5],
    'min_samples_split': [2, 4, 6, 8],
    'n_estimators': [50, 100, 150, 200]
}
# Create a based model
rf = RandomForestClassifier(random_state=42)

# Create a custom CV so we can seed with random state
rsk = model_selection.StratifiedKFold(n_splits=5, random_state=42)

# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = rsk, scoring = 'accuracy', n_jobs = 4, verbose = 2)

In [28]:
# Fit the grid search to the data
grid_search.fit(X, y)
grid_search.best_params_

Fitting 5 folds for each of 720 candidates, totalling 3600 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    2.3s
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed:    8.2s
[Parallel(n_jobs=4)]: Done 357 tasks      | elapsed:   18.1s
[Parallel(n_jobs=4)]: Done 640 tasks      | elapsed:   31.7s
[Parallel(n_jobs=4)]: Done 1005 tasks      | elapsed:   48.0s
[Parallel(n_jobs=4)]: Done 1450 tasks      | elapsed:  1.1min
[Parallel(n_jobs=4)]: Done 1977 tasks      | elapsed:  1.5min
[Parallel(n_jobs=4)]: Done 2584 tasks      | elapsed:  2.0min
[Parallel(n_jobs=4)]: Done 3273 tasks      | elapsed:  2.6min
[Parallel(n_jobs=4)]: Done 3600 out of 3600 | elapsed:  2.8min finished


{'bootstrap': True,
 'max_depth': 10,
 'max_features': 'auto',
 'min_samples_leaf': 4,
 'min_samples_split': 2,
 'n_estimators': 50}

In [29]:
grid_search.best_score_

0.9587044534412957

In [30]:
pd.DataFrame(grid_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_bootstrap,param_max_depth,param_max_features,param_min_samples_leaf,param_min_samples_split,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.067619,0.001265,0.005997,0.002722,True,10,auto,1,2,50,"{'bootstrap': True, 'max_depth': 10, 'max_feat...",0.948718,0.974359,0.948718,0.948718,0.921053,0.948313,0.016864,640
1,0.129232,0.008123,0.008138,0.000623,True,10,auto,1,2,100,"{'bootstrap': True, 'max_depth': 10, 'max_feat...",0.923077,0.948718,0.948718,0.948718,0.921053,0.938057,0.013073,694
2,0.188537,0.004910,0.013439,0.001184,True,10,auto,1,2,150,"{'bootstrap': True, 'max_depth': 10, 'max_feat...",0.923077,0.948718,0.948718,0.948718,0.921053,0.938057,0.013073,694
3,0.251673,0.004463,0.017840,0.001544,True,10,auto,1,2,200,"{'bootstrap': True, 'max_depth': 10, 'max_feat...",0.923077,0.948718,0.948718,0.948718,0.921053,0.938057,0.013073,694
4,0.063434,0.001622,0.004306,0.000433,True,10,auto,1,4,50,"{'bootstrap': True, 'max_depth': 10, 'max_feat...",0.923077,0.974359,0.948718,0.948718,0.947368,0.948448,0.016226,352
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
715,0.299785,0.019537,0.021638,0.004961,True,30,log2,5,6,200,"{'bootstrap': True, 'max_depth': 30, 'max_feat...",0.948718,0.974359,0.948718,0.923077,0.947368,0.948448,0.016226,352
716,0.073236,0.007066,0.006007,0.001138,True,30,log2,5,8,50,"{'bootstrap': True, 'max_depth': 30, 'max_feat...",0.974359,0.974359,0.974359,0.923077,0.947368,0.958704,0.020654,37
717,0.164370,0.008735,0.010568,0.001466,True,30,log2,5,8,100,"{'bootstrap': True, 'max_depth': 30, 'max_feat...",0.974359,0.974359,0.974359,0.923077,0.947368,0.958704,0.020654,37
718,0.201982,0.008997,0.012426,0.000565,True,30,log2,5,8,150,"{'bootstrap': True, 'max_depth': 30, 'max_feat...",0.948718,0.974359,0.974359,0.923077,0.947368,0.953576,0.019268,127


### Grid Search: SVM-RBF

In [31]:
from sklearn.model_selection import GridSearchCV

param_grid = {'kernel': ['rbf'], 'C': [0.1, 1, 10, 100],
         'gamma': [0.01, 0.03, 0.1, 0.3, 1.0]}

svm_cls = svm.SVC(random_state=42)

# Create a custom CV so we can seed with random state
rsk = model_selection.StratifiedKFold(n_splits=5, random_state=42)

grid_search = GridSearchCV(svm_cls, param_grid, cv=rsk, scoring='accuracy', verbose=2, n_jobs=4)

In [32]:
# Fit the grid search to the data
grid_search.fit(X, y)
grid_search.best_params_

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=4)]: Done  79 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.1s finished


{'C': 100, 'gamma': 0.1, 'kernel': 'rbf'}

In [33]:
grid_search.best_score_

0.9585695006747639

In [34]:
pd.DataFrame(grid_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_gamma,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.002214,0.000853,0.000817,0.000598,0.1,0.01,rbf,"{'C': 0.1, 'gamma': 0.01, 'kernel': 'rbf'}",0.717949,0.74359,0.74359,0.74359,0.736842,0.737112,0.009932,19
1,0.003318,0.002616,0.003865,0.005982,0.1,0.03,rbf,"{'C': 0.1, 'gamma': 0.03, 'kernel': 'rbf'}",0.717949,0.74359,0.74359,0.74359,0.736842,0.737112,0.009932,19
2,0.001545,0.000505,0.000512,0.000177,0.1,0.1,rbf,"{'C': 0.1, 'gamma': 0.1, 'kernel': 'rbf'}",0.820513,0.897436,0.897436,0.820513,0.868421,0.860864,0.034608,18
3,0.001103,0.000257,0.000384,6.9e-05,0.1,0.3,rbf,"{'C': 0.1, 'gamma': 0.3, 'kernel': 'rbf'}",0.871795,1.0,0.923077,0.820513,0.894737,0.902024,0.05942,15
4,0.001176,0.000238,0.000574,0.000408,0.1,1.0,rbf,"{'C': 0.1, 'gamma': 1.0, 'kernel': 'rbf'}",0.923077,1.0,0.923077,0.820513,0.894737,0.912281,0.057718,13
5,0.001195,0.00039,0.000439,0.000109,1.0,0.01,rbf,"{'C': 1, 'gamma': 0.01, 'kernel': 'rbf'}",0.820513,0.923077,0.897436,0.846154,0.842105,0.865857,0.038157,17
6,0.000958,4.9e-05,0.00068,0.000365,1.0,0.03,rbf,"{'C': 1, 'gamma': 0.03, 'kernel': 'rbf'}",0.846154,1.0,0.897436,0.846154,0.868421,0.891633,0.057363,16
7,0.001483,0.000899,0.000537,0.000279,1.0,0.1,rbf,"{'C': 1, 'gamma': 0.1, 'kernel': 'rbf'}",0.948718,1.0,0.897436,0.820513,0.894737,0.912281,0.059953,13
8,0.001943,0.001887,0.000527,9.1e-05,1.0,0.3,rbf,"{'C': 1, 'gamma': 0.3, 'kernel': 'rbf'}",0.948718,1.0,0.923077,0.897436,0.947368,0.94332,0.03398,7
9,0.004538,0.005775,0.000387,4.8e-05,1.0,1.0,rbf,"{'C': 1, 'gamma': 1.0, 'kernel': 'rbf'}",0.974359,1.0,0.923077,0.923077,0.921053,0.948313,0.032762,4


### Grid Search: Lin-SVM

In [35]:
from sklearn.model_selection import GridSearchCV

param_grid = {'kernel': ['linear'], 'C': [0.001, 0.01, 0.1, 1, 5, 10.]}

svm_cls = svm.SVC(random_state=42)

# Create a custom CV so we can seed with random state
rsk = model_selection.StratifiedKFold(n_splits=5, random_state=42)

grid_search = GridSearchCV(svm_cls, param_grid, cv=rsk, scoring='accuracy', verbose=2, n_jobs=4)

In [36]:
# Fit the grid search to the data
grid_search.fit(X, y)
grid_search.best_params_

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  23 out of  30 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:    0.0s finished


{'C': 1, 'kernel': 'linear'}

In [37]:
grid_search.best_score_

0.9483130904183537

In [38]:
pd.DataFrame(grid_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.00128,0.000325,0.00047,0.000111,0.001,linear,"{'C': 0.001, 'kernel': 'linear'}",0.717949,0.74359,0.74359,0.74359,0.736842,0.737112,0.009932,6
1,0.001061,0.000144,0.000385,4.4e-05,0.01,linear,"{'C': 0.01, 'kernel': 'linear'}",0.769231,0.871795,0.794872,0.871795,0.763158,0.81417,0.04824,5
2,0.000984,0.000123,0.000348,3.6e-05,0.1,linear,"{'C': 0.1, 'kernel': 'linear'}",0.923077,1.0,0.923077,0.871795,0.894737,0.922537,0.043241,4
3,0.001064,0.000385,0.000413,0.000182,1.0,linear,"{'C': 1, 'kernel': 'linear'}",0.948718,1.0,0.948718,0.923077,0.921053,0.948313,0.028467,1
4,0.001159,8.9e-05,0.000749,0.000347,5.0,linear,"{'C': 5, 'kernel': 'linear'}",0.974359,0.974359,0.923077,0.923077,0.921053,0.943185,0.025464,2
5,0.001502,0.000483,0.000346,3.3e-05,10.0,linear,"{'C': 10.0, 'kernel': 'linear'}",0.974359,0.974359,0.923077,0.897436,0.921053,0.938057,0.030982,3


### Grid Search: kNN

In [39]:
from sklearn.model_selection import GridSearchCV

param_grid = {'n_neighbors': list(range(3,30)), 'p': [1,2]}

knn_cls = KNeighborsClassifier()

# Create a custom CV so we can seed with random state
rsk = model_selection.StratifiedKFold(n_splits=5, random_state=42)

grid_search = GridSearchCV(knn_cls, param_grid, cv=rsk, scoring='accuracy', verbose=2, n_jobs=4)

In [40]:
# Fit the grid search to the data
grid_search.fit(X, y)
grid_search.best_params_

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 54 candidates, totalling 270 fits


[Parallel(n_jobs=4)]: Done 136 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 270 out of 270 | elapsed:    0.3s finished


{'n_neighbors': 3, 'p': 1}

In [41]:
grid_search.best_score_

0.9638326585695008

In [42]:
pd.DataFrame(grid_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_neighbors,param_p,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.001142,0.000823,0.002388,0.000346,3,1,"{'n_neighbors': 3, 'p': 1}",0.974359,1.0,0.974359,0.923077,0.947368,0.963833,0.026313,1
1,0.00114,0.001004,0.003323,0.001755,3,2,"{'n_neighbors': 3, 'p': 2}",0.974359,1.0,0.948718,0.923077,0.947368,0.958704,0.02626,6
2,0.001076,0.00072,0.003323,0.001418,4,1,"{'n_neighbors': 4, 'p': 1}",0.974359,1.0,0.948718,0.923077,0.947368,0.958704,0.02626,6
3,0.001235,0.001339,0.002163,0.000543,4,2,"{'n_neighbors': 4, 'p': 2}",0.974359,1.0,0.974359,0.923077,0.947368,0.963833,0.026313,1
4,0.000777,0.000282,0.004162,0.002826,5,1,"{'n_neighbors': 5, 'p': 1}",0.974359,1.0,0.948718,0.923077,0.947368,0.958704,0.02626,6
5,0.000554,7.5e-05,0.004365,0.004734,5,2,"{'n_neighbors': 5, 'p': 2}",0.974359,0.974359,0.974359,0.923077,0.947368,0.958704,0.020654,10
6,0.000608,0.000257,0.002226,0.000726,6,1,"{'n_neighbors': 6, 'p': 1}",0.974359,1.0,0.974359,0.923077,0.947368,0.963833,0.026313,1
7,0.00052,6.6e-05,0.002017,0.000153,6,2,"{'n_neighbors': 6, 'p': 2}",0.974359,1.0,0.974359,0.923077,0.947368,0.963833,0.026313,1
8,0.000568,0.000113,0.00338,0.002113,7,1,"{'n_neighbors': 7, 'p': 1}",0.974359,1.0,0.974359,0.923077,0.947368,0.963833,0.026313,1
9,0.000652,0.000197,0.00255,0.000334,7,2,"{'n_neighbors': 7, 'p': 2}",0.974359,0.974359,0.974359,0.923077,0.947368,0.958704,0.020654,10
