# NMR Biomarker Hyperparameter Tuning

Author: Olatomiwa Bifarin<br>
Department of Biochemistry and Molecular Biology<br>
University of Georgia<br>
Edison Lab<br>

Last edited: 07MAY2020 

_This is a static version of a Jupyter notebook, and work (documentation) is still in progress_ 

**Goals**: 
1. To merge NMR and MS Dataset. 

<a id="0"></a>

## Notebook Content

1.  [RCC Prediction in Model Cohort](#1)
2.  [RCC Prediction: Test Cohort](#2)
3.  [Chemical Information of Biomarkers](#3)


In [24]:
# Global seed
import random  
random.seed(42)

#import os
#os.environ['PYTHONHASHSEED']=str(42)

import pandas as pd
import numpy as np
np.random.seed(42)

import scipy
import statsmodels as sms
from statsmodels.stats import multitest
from statistics import mean

#To ignore warning
import warnings
warnings.filterwarnings('ignore')

# More sharp and legible graphics
%config InlineBackend.figure_format = 'retina'


# Sklearn module
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

# 1. RCC Prediction in Model Cohort
<a id="1"></a>

In [25]:
import sklearn
print('sklearn version', sklearn.__version__)

sklearn version 0.23.2


#### Export Model Cohort Data

In [26]:
modelcohort = pd.read_excel('data/modelcohort.xlsx', index_col=0)

In [27]:
# select NMR features from the dataframe
NMR_modelcohort = modelcohort.iloc[:,7105:]
NMR_modelcohort.columns

Index(['Groups', 'unk1', 'unk2', 'bile_acid1', 'bile_acid2', 'HIVA', 'lactate',
       'unk3', 'acetate', 'acetone', 'unk4', 'acetoacetate', 'unk5',
       'pyruvate', 'citrate', 'DMA', 'unk6', 'methylguanidine', 'unk7',
       'choline', 'scylloinositol', 'taurine', 'acetoacetate_4HPA', 'glycine',
       'mannitol_1', 'mannitol_2', 'creatine', 'glycolate',
       'hippurate_hydroxyhippurate', 'Tatrate', 'unk8', 'unk9', 'fumarate',
       '4HPA', 'hippurate_2', 'hippurate_3', 'aminohippurate',
       'indoxylsulfate', 'hippurate_4', 'hypoxanthine_1', 'hypoxanthine_2',
       'formate', 'unk10', 'Trigonelline_1', 'Trigonellinamide_1',
       'Trigonellinamide_2', 'Trigonelline_2', 'Trigonellinamide_3', 'unk11'],
      dtype='object')

### a. Feature Selection 

### (i)  Select features with greater than 2-fold changes

In [28]:
#NMRMS = modelcohort.drop(['Sample ID', 'Patient ID', 'Collection', 'Gender',
#                         'Race', 'BMI', 'Smoker', 'Age'], axis=1)
Control = NMR_modelcohort[(NMR_modelcohort['Groups'] == 'Control')]
RCC = NMR_modelcohort[(NMR_modelcohort['Groups'] == 'RCC')]

dfmean = pd.DataFrame({'Features':NMR_modelcohort.drop(['Groups'], axis=1).mean(axis=0).index, 
                       'Control':Control.drop(['Groups'], axis=1).mean(axis=0).values,
                       'RCC':RCC.drop(['Groups'], axis=1).mean(axis=0).values})
dfmean.shape

(48, 3)

In [29]:
dfmean;

In [30]:
cols = []
ctr_val = dfmean['Control']
rcc_val = dfmean['RCC']

for ctr, rcc in zip(ctr_val, rcc_val):
    ratio1 = ctr/rcc
    ratio2 = rcc/ctr
    if ratio1 > 2:
        feature = dfmean[dfmean['Control']==ctr]['Features'].values.tolist()
        cols.append(feature)
    elif ratio2 > 2:
        feature = dfmean[dfmean['Control']==ctr]['Features'].values.tolist()
        cols.append(feature)
xfold_feat = [val for sublist in cols for val in sublist] # flatten out list of list.
len(xfold_feat)

2

In [31]:
print(' The following metabolites has > 2 fold change in NMR', xfold_feat)

 The following metabolites has > 2 fold change in NMR ['scylloinositol', 'aminohippurate']


### (ii) _t_-Test Feature Selection 

_T-Test Function_

In [32]:
def Ttest(metabolites, dfControl, dfTreat, alpha=0.05, var=True):
    '''
    Function conducts a T-test for the metabolites differences between two groups with 
    Benjamini-Hocberg FDR correction
    
    Inputs: 
    metabolites = A list containing names of metabolites
    dfControl = A pandas dataframe containing the control group metabolites data
    dfTreat =  A pandas dataframe containing the treatment group metabolites data
    alpha = alpha for statistical significant judgment, default 0.05
    var = If True (default), perform a standard independent 2 sample test that assumes 
    equal population variances [1]. If False, perform Welch’s t-test, which does not 
    assume equal population variance
    
    Outputs: A pandas dataframe with p-values of numerical cohort characteristics. 
    
    '''
    ttest_dict = {}
    for metabolite in metabolites:
        statistic, pvalue =  scipy.stats.ttest_ind(dfControl[metabolite], 
                                               dfTreat[metabolite], 
                                               equal_var=var)
        ttest_dict[metabolite] = pvalue 
        # a dictionary containing name of metabolites and p value after t-test
    ttest = pd.DataFrame.from_dict(ttest_dict, orient='index') # the dictionary in pandas df
    ttest_list=list(ttest_dict.values()) #values (pvalues) of ttest result in a list
    reject, pval_corrected, _, _ = sms.stats.multitest.multipletests(ttest_list, 
                                                                 alpha=alpha, 
                                                                 method='fdr_bh')
    ttest_results = pd.DataFrame({'Metabolite': metabolites, 'T-test p-value': ttest_list, 
                              'FDR p-value': pval_corrected, 'Reject H0': reject})
    Table = ttest_results.sort_values(by=['FDR p-value'])
    return Table

In [33]:
# '''List of final metabolites set'''
metabolite_list = NMR_modelcohort.drop(['Groups'], axis=1).columns

# '''List of final metabolites set'''
ttest_result = Ttest(metabolite_list, Control, RCC, alpha=0.05, var=True)

# '''List of Statistically relevant metabolites'''
# Select metabolites with <0.05 FDR
stat_sig = ttest_result.loc[ttest_result['Reject H0'] == True]

# Select metabolites with <0.05 t-test
#stat_sig = ttest_result.loc[ttest_result['T-test p-value'] <= 0.05]
print("The total number of significant metabolites/Features is: ", len(stat_sig))

The total number of significant metabolites/Features is:  12


In [34]:
stat_sig

Unnamed: 0,Metabolite,T-test p-value,FDR p-value,Reject H0
34,hippurate_3,0.000145,0.002616,True
19,scylloinositol,0.000218,0.002616,True
33,hippurate_2,0.000203,0.002616,True
44,Trigonellinamide_2,0.000218,0.002616,True
43,Trigonellinamide_1,0.000281,0.002693,True
35,aminohippurate,0.000372,0.00298,True
5,lactate,0.000494,0.003387,True
37,hippurate_4,0.000644,0.003864,True
27,hippurate_hydroxyhippurate,0.000801,0.00427,True
46,Trigonellinamide_3,0.001945,0.009335,True


In [35]:
Feature = stat_sig['Metabolite'].values.tolist(); # stat MS significant metabolites
MLfeatures = NMR_modelcohort[Feature] # p<0.05 metabolites
#MLfeatures =(MLfeatures - MLfeatures.mean(axis=0))/MLfeatures.std(axis=0) #autoscaling

In [36]:
MLfeatures.shape

(62, 12)

In [37]:
# Drop scylloinositol and aminohippurate, quantitation not accurate
MLfeatures = MLfeatures.drop(columns=['scylloinositol', 'aminohippurate'])

In [38]:
MLfeatures.astype(float);

In [39]:
correlated_features = set()
correlation_matrix = MLfeatures.astype(float).corr()

for i in range(len(correlation_matrix.columns)):
    for j in range(i):
        if abs(correlation_matrix.iloc[i, j]) > 0.7:
            colname = correlation_matrix.columns[i]
            correlated_features.add(colname)

In [40]:
MLfeatures = MLfeatures.drop(correlated_features, axis=1)
MLfeatures.shape

(62, 4)

In [41]:
MLfeatures.columns

Index(['hippurate_3', 'Trigonellinamide_2', 'lactate', 'mannitol_1'], dtype='object')

In [42]:
MLfeatures = NMR_modelcohort.filter(['hippurate_2', 'Trigonellinamide_1',
                                     'lactate','mannitol_1'])

In [43]:
# replace hippurate_3 by hippurate_2 (7.567 7.524) more accurate
MLfeatures.columns

Index(['hippurate_2', 'Trigonellinamide_1', 'lactate', 'mannitol_1'], dtype='object')

In [44]:
#autoscaling
MLfeatures_scaled =(MLfeatures - MLfeatures.mean(axis=0))/MLfeatures.std(axis=0) 

### c. Machine Learning for RCC Prediction

_Machine Learning Model Function_

In [45]:
def ml_model(X, y, model, kfold=5):
    
    '''
    FUNCTION: 
    A  machine learning function under Kfold cross validation conditions  
    
    DEPENDENCY: 
    Various Sklearn packages
    from sklearn import metrics
    from sklearn.model_selection import train_test_split
    from sklearn.model_selection import KFold
    from sklearn.metrics import roc_curve, auc
    from sklearn.metrics import roc_auc_score
    
    INPUTS: 
    X = numpy.ndaarray: features
    y = numpy.ndaarray: Labels encoded as 0s and 1s. 
    model = machine learning model
    kfold = numeric. K-fold default is 5. 
    
    OUTPUT: 
    A list containing the following machine learning metrics: 
    Area Under the Curve(AUC), Accuracy (ACC), Precision (PRE), 
    Recall (REC), F1-core (FS), Matthew's Correlation Coefficient (MCC). 
    
    '''

    
    cv = model_selection.StratifiedKFold(n_splits=kfold, random_state=42, shuffle=True)
    # define 'list' of various metrics
    AUC, ACC, PRE, REC, FS, MCC, SPE = [], [], [], [], [], [], [] 
    performance = []
    
    for (train, test), i in zip(cv.split(X, y), range(kfold)):
        # training the model
        model.fit(X[train], y[train])
        y_pred = model.predict(X[test])
        
        #Retrieving metrics after testing
        modelprobs = model.predict_proba(X[test])
        fpr, tpr, thresholds = metrics.roc_curve(y[test], modelprobs[:,1])
        roc_auc = auc(fpr, tpr)
        accuracy = metrics.accuracy_score(y[test], y_pred)
        precision = metrics.precision_score(y[test], y_pred)
        recall = metrics.recall_score(y[test], y_pred)
        f1_score = metrics.f1_score(y[test], y_pred)
        mcc = metrics.matthews_corrcoef(y[test], y_pred)
        # coding specificity
        tn, fp, fn, tp = confusion_matrix(y[test], y_pred).ravel()
        specificity = tn / (tn+fp)
        # append metrics to list
        AUC.append(roc_auc)
        ACC.append(accuracy)
        PRE.append(precision)
        REC.append(recall)
        FS.append(f1_score)
        MCC.append(mcc)
        SPE.append(specificity)
        
    return AUC, ACC, PRE, REC, FS, MCC, SPE

_Machine Learning Metric Plot Function_

In [46]:
def MLmetric_plot(data_dict, y_label):
    '''
    FUNCTION: 
    To plot machine learning metrics from ml_model function.   
    
    DEPENDENCY: 
    Pandas, seaborn, 
    
    INPUTS: 
    data_dict = dictionary. Contains ML model name and metric scores
    y_label = String. Label for the y-axis 
    
    OUTPUT: 
    Figure
    
    '''
    
    fig = pd.DataFrame(data=data_dict)
    splot = sns.barplot(x='Model', y=y_label, data=fig);
    #splot = sns.swarmplot(data=ACCfig, orient='h', s=10, alpha=0.7, color=".25");
    for p in splot.patches:
        splot.annotate(format(p.get_height(), '.2f'), 
                              (p.get_x() + p.get_width() / 2., p.get_height()), 
                              ha = 'center', va = 'center', xytext = (6, 15), 
                              textcoords = 'offset points')
    return splot
#plt.savefig('testACC.svg', dpi=300)



Define features and labels.

In [47]:
dfgrp = NMR_modelcohort.filter(['Groups'], axis=1)
#convert strings (RCC, Control) to integers
dfgroup = dfgrp['Groups'].map({'Control': 0, 'RCC': 1}) 
X = MLfeatures_scaled.values
y = dfgroup.values

### Grid Search: Random Forest

[Method Reference: towardsdatascience.com](https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74) <br>
[GridSearchCV sklearn Documentation](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html)

In [48]:
from sklearn.model_selection import GridSearchCV
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [10, 20, 30],
    'max_features': ['auto', 'sqrt', 'log2'],
    'min_samples_leaf': [1, 2, 3, 4, 5],
    'min_samples_split': [2, 4, 6, 8],
    'n_estimators': [50, 100, 150, 200]
}
# Create a based model
rf = RandomForestClassifier(random_state=42)

# Create a custom CV so we can seed with random state
rsk = model_selection.StratifiedKFold(n_splits=5, random_state=42)

# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = rsk, scoring = 'accuracy', n_jobs = -1, verbose = 2)

In [49]:
# Fit the grid search to the data
grid_search.fit(X, y)
grid_search.best_params_

Fitting 5 folds for each of 720 candidates, totalling 3600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:    7.4s
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:   15.3s
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed:   26.7s
[Parallel(n_jobs=-1)]: Done 1005 tasks      | elapsed:   42.0s
[Parallel(n_jobs=-1)]: Done 1450 tasks      | elapsed:   59.7s
[Parallel(n_jobs=-1)]: Done 1977 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 2584 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 3273 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 3600 out of 3600 | elapsed:  2.5min finished


{'bootstrap': True,
 'max_depth': 10,
 'max_features': 'auto',
 'min_samples_leaf': 3,
 'min_samples_split': 2,
 'n_estimators': 50}

In [50]:
grid_search.best_score_

0.7935897435897437

In [51]:
pd.DataFrame(grid_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_bootstrap,param_max_depth,param_max_features,param_min_samples_leaf,param_min_samples_split,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.093430,0.004054,0.011264,0.003352,True,10,auto,1,2,50,"{'bootstrap': True, 'max_depth': 10, 'max_feat...",0.538462,0.615385,0.666667,0.916667,0.833333,0.714103,0.140067,712
1,0.146659,0.019261,0.008418,0.001497,True,10,auto,1,2,100,"{'bootstrap': True, 'max_depth': 10, 'max_feat...",0.615385,0.692308,0.750000,0.916667,0.833333,0.761538,0.105362,568
2,0.197610,0.032559,0.015487,0.003293,True,10,auto,1,2,150,"{'bootstrap': True, 'max_depth': 10, 'max_feat...",0.615385,0.692308,0.750000,0.916667,0.833333,0.761538,0.105362,568
3,0.332767,0.008411,0.020518,0.003623,True,10,auto,1,2,200,"{'bootstrap': True, 'max_depth': 10, 'max_feat...",0.615385,0.692308,0.750000,0.916667,0.833333,0.761538,0.105362,568
4,0.077733,0.010580,0.006138,0.003990,True,10,auto,1,4,50,"{'bootstrap': True, 'max_depth': 10, 'max_feat...",0.538462,0.615385,0.833333,0.833333,0.833333,0.730769,0.127948,658
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
715,0.225195,0.002883,0.014101,0.000299,True,30,log2,5,6,200,"{'bootstrap': True, 'max_depth': 30, 'max_feat...",0.692308,0.692308,0.833333,0.916667,0.750000,0.776923,0.086896,199
716,0.059049,0.000694,0.004077,0.000232,True,30,log2,5,8,50,"{'bootstrap': True, 'max_depth': 30, 'max_feat...",0.692308,0.692308,0.833333,0.916667,0.750000,0.776923,0.086896,199
717,0.118147,0.004051,0.008052,0.000543,True,30,log2,5,8,100,"{'bootstrap': True, 'max_depth': 30, 'max_feat...",0.692308,0.692308,0.833333,0.916667,0.750000,0.776923,0.086896,199
718,0.180377,0.002104,0.011239,0.000496,True,30,log2,5,8,150,"{'bootstrap': True, 'max_depth': 30, 'max_feat...",0.692308,0.692308,0.833333,0.916667,0.750000,0.776923,0.086896,199


### Grid Search: SVM-RBF

In [52]:
from sklearn.model_selection import GridSearchCV

param_grid = {'kernel': ['rbf'], 'C': [0.1, 1, 10, 100],
         'gamma': [0.01, 0.03, 0.1, 0.3, 1.0]}

svm_cls = svm.SVC(random_state=42)
grid_search = GridSearchCV(svm_cls, param_grid, cv=5, scoring='accuracy', verbose=2, n_jobs=4)

In [53]:
# Fit the grid search to the data
grid_search.fit(X, y)
grid_search.best_params_

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=4)]: Done  93 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.1s finished


{'C': 10, 'gamma': 0.3, 'kernel': 'rbf'}

In [54]:
grid_search.best_score_

0.8743589743589745

In [55]:
pd.DataFrame(grid_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_gamma,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.00098,0.000298,0.000314,5.9e-05,0.1,0.01,rbf,"{'C': 0.1, 'gamma': 0.01, 'kernel': 'rbf'}",0.461538,0.461538,0.916667,0.75,0.916667,0.701282,0.204992,17
1,0.00104,0.000203,0.000349,5.4e-05,0.1,0.03,rbf,"{'C': 0.1, 'gamma': 0.03, 'kernel': 'rbf'}",0.461538,0.461538,0.916667,0.75,0.916667,0.701282,0.204992,17
2,0.000806,7.8e-05,0.000397,7.3e-05,0.1,0.1,rbf,"{'C': 0.1, 'gamma': 0.1, 'kernel': 'rbf'}",0.769231,0.769231,0.916667,0.75,0.916667,0.824359,0.075695,10
3,0.000774,3.2e-05,0.000332,5.6e-05,0.1,0.3,rbf,"{'C': 0.1, 'gamma': 0.3, 'kernel': 'rbf'}",0.769231,0.769231,0.916667,0.666667,0.833333,0.791026,0.082451,16
4,0.000817,0.000145,0.000342,6.2e-05,0.1,1.0,rbf,"{'C': 0.1, 'gamma': 1.0, 'kernel': 'rbf'}",0.461538,0.461538,0.916667,0.666667,0.75,0.651282,0.174594,20
5,0.000864,0.000146,0.001016,0.001388,1.0,0.01,rbf,"{'C': 1, 'gamma': 0.01, 'kernel': 'rbf'}",0.769231,0.846154,0.916667,0.75,0.916667,0.839744,0.070571,6
6,0.00085,0.000207,0.000326,4e-05,1.0,0.03,rbf,"{'C': 1, 'gamma': 0.03, 'kernel': 'rbf'}",0.692308,0.923077,0.916667,0.833333,0.833333,0.839744,0.083284,6
7,0.000688,4.3e-05,0.000324,5.9e-05,1.0,0.1,rbf,"{'C': 1, 'gamma': 0.1, 'kernel': 'rbf'}",0.769231,0.923077,0.916667,0.75,0.916667,0.855128,0.078258,4
8,0.000805,0.000144,0.000362,0.000131,1.0,0.3,rbf,"{'C': 1, 'gamma': 0.3, 'kernel': 'rbf'}",0.769231,0.923077,0.916667,0.75,0.916667,0.855128,0.078258,4
9,0.00089,0.000171,0.000469,0.000134,1.0,1.0,rbf,"{'C': 1, 'gamma': 1.0, 'kernel': 'rbf'}",0.846154,0.769231,0.916667,0.666667,0.833333,0.80641,0.084109,15


### Grid Search: Lin-SVM

In [56]:
from sklearn.model_selection import GridSearchCV

param_grid = {'kernel': ['linear'], 'C': [0.001, 0.01, 0.1, 1, 5, 10.]}

svm_cls = svm.SVC(random_state=42)
grid_search = GridSearchCV(svm_cls, param_grid, cv=5, scoring='accuracy', verbose=2, n_jobs=4)

In [57]:
# Fit the grid search to the data
grid_search.fit(X, y)
grid_search.best_params_

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  23 out of  30 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:    0.0s finished


{'C': 1, 'kernel': 'linear'}

In [58]:
grid_search.best_score_

0.8243589743589744

In [59]:
pd.DataFrame(grid_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.001038,0.000249,0.000372,7.2e-05,0.001,linear,"{'C': 0.001, 'kernel': 'linear'}",0.461538,0.461538,0.916667,0.75,0.833333,0.684615,0.189614,6
1,0.00092,8.7e-05,0.000344,5.2e-05,0.01,linear,"{'C': 0.01, 'kernel': 'linear'}",0.692308,0.846154,0.916667,0.75,0.833333,0.807692,0.0783,2
2,0.000805,9.9e-05,0.000304,3.1e-05,0.1,linear,"{'C': 0.1, 'kernel': 'linear'}",0.692308,0.846154,0.916667,0.75,0.833333,0.807692,0.0783,2
3,0.001192,0.000335,0.000479,0.000297,1.0,linear,"{'C': 1, 'kernel': 'linear'}",0.615385,0.923077,0.916667,0.75,0.916667,0.824359,0.123277,1
4,0.000949,0.00031,0.000309,2.8e-05,5.0,linear,"{'C': 5, 'kernel': 'linear'}",0.615385,0.846154,0.916667,0.75,0.833333,0.792308,0.103092,4
5,0.000954,8.9e-05,0.000337,6e-05,10.0,linear,"{'C': 10.0, 'kernel': 'linear'}",0.615385,0.846154,0.916667,0.75,0.833333,0.792308,0.103092,4


### Grid Search: kNN

In [60]:
from sklearn.model_selection import GridSearchCV

param_grid = {'n_neighbors': list(range(3,30)), 'p': [1,2]}

knn_cls = KNeighborsClassifier()
              
grid_search = GridSearchCV(knn_cls, param_grid, cv=5, scoring='accuracy', verbose=2, n_jobs=4)

In [61]:
# Fit the grid search to the data
grid_search.fit(X, y)
grid_search.best_params_

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 54 candidates, totalling 270 fits


[Parallel(n_jobs=4)]: Done 136 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 270 out of 270 | elapsed:    0.2s finished


{'n_neighbors': 5, 'p': 1}

In [62]:
grid_search.best_score_

0.8551282051282051

In [63]:
pd.DataFrame(grid_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_neighbors,param_p,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.000625,0.000142,0.001225,0.000164,3,1,"{'n_neighbors': 3, 'p': 1}",0.692308,0.846154,0.666667,0.666667,0.833333,0.741026,0.081246,53
1,0.004352,0.005295,0.008426,0.007398,3,2,"{'n_neighbors': 3, 'p': 2}",0.769231,0.769231,0.666667,0.75,0.75,0.741026,0.038161,51
2,0.002042,0.002813,0.001284,6.8e-05,4,1,"{'n_neighbors': 4, 'p': 1}",0.692308,0.615385,0.75,0.75,0.833333,0.728205,0.072137,54
3,0.001273,0.001463,0.001242,0.000155,4,2,"{'n_neighbors': 4, 'p': 2}",0.692308,0.769231,0.75,0.75,0.833333,0.758974,0.045255,48
4,0.001576,0.001981,0.003336,0.003293,5,1,"{'n_neighbors': 5, 'p': 1}",0.846154,0.846154,0.916667,0.75,0.916667,0.855128,0.061298,1
5,0.000738,0.000182,0.003196,0.003881,5,2,"{'n_neighbors': 5, 'p': 2}",0.846154,0.769231,0.916667,0.75,0.833333,0.823077,0.059391,19
6,0.000577,0.000125,0.003795,0.004125,6,1,"{'n_neighbors': 6, 'p': 1}",0.846154,0.692308,0.833333,0.75,0.833333,0.791026,0.060079,41
7,0.000561,9e-05,0.001295,0.000256,6,2,"{'n_neighbors': 6, 'p': 2}",0.846154,0.692308,0.916667,0.75,0.833333,0.807692,0.0783,26
8,0.000727,0.000364,0.003719,0.003837,7,1,"{'n_neighbors': 7, 'p': 1}",0.846154,0.846154,0.916667,0.75,0.833333,0.838462,0.053078,9
9,0.000508,4.5e-05,0.001326,0.000483,7,2,"{'n_neighbors': 7, 'p': 2}",0.846154,0.846154,0.916667,0.75,0.833333,0.838462,0.053078,9
