In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB,MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, VotingClassifier, StackingClassifier, RandomForestClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import SCORERS, accuracy_score, auc, f1_score, precision_score, recall_score, roc_curve, confusion_matrix

# Import Data

## Read Data

Reading and importing data for training and evaluation

In [2]:
X = pd.read_csv('X.csv')
X = X.set_index('ID')
y = pd.read_csv('y.csv')
y = y.set_index('ID')

In [3]:
X.describe()

Unnamed: 0,age,latitude,longitude,date_confirmation,chronic_disease_binary,travel_history_binary,sex_female,sex_male,geo_resolution_admin0,geo_resolution_admin1,geo_resolution_admin2,geo_resolution_admin3,geo_resolution_point
count,15783.0,15783.0,15783.0,15783.0,15783.0,15783.0,15783.0,15783.0,15783.0,15783.0,15783.0,15783.0,15783.0
mean,40.181195,16.561408,86.221862,102.503073,0.006843,0.053919,0.266806,0.392574,0.039409,0.050117,0.245264,0.001394,0.663815
std,16.68,6.933532,31.203998,26.0202,0.08244,0.225864,0.442304,0.488339,0.194573,0.218194,0.430257,0.03731,0.472418
min,0.0,-34.9289,-123.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,28.4,13.08362,74.73832,85.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,38.0,14.5958,80.28252,110.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,51.4,19.68333,120.9772,124.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
max,101.0,49.25,153.4,137.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Scale Data

Scaling data in order to be ready for some models as they not take negative feature inputs. Also increasing accuracy in some other models

In [3]:
scaler = MinMaxScaler()
X = scaler.fit_transform(X, y)


# Search Best Hyper Parameters For Base Models

This section is a try for hyper parameter tuning. 

Each subsection contains 3 different steps for each learning method:

1. Define and cross validate the method with variable hyper parameter input which is given in nested "for loop"s and save scores and hyper paramters in two seperate lists with same order
2. Convert lists to pandas dataframes, specify their columns (hyper parameters and score types) and join these lists together
3. Sort list in descending order by its test accuracy score column

At the end of each subsection the best hyper paramters and fold numbers are noted

## Logistic Regression

In [12]:
LR_scores = []
LR_params = []
for c in [0.01, 0.1, 1]:

    for sol_func in ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']:
    
        if sol_func == 'newton-cg':
            pen = ['l2', 'none']
        elif sol_func == 'lbfgs':
            pen = ['l2', 'none']
        elif sol_func == 'liblinear':
            pen = ['l1', 'l2']
        elif sol_func == 'sag':
            pen = ['l2', 'none']
        elif sol_func == 'saga':
            pen = ['l2', 'none']

        for p in pen:

            for folds in range(2, 11):
                LR_score = cross_validate(LogisticRegression(C=c, solver=sol_func, penalty=p),X,y, 
                scoring = [
                    'accuracy', 'f1_weighted', 'precision_weighted',
                    'recall_weighted'], 
                    cv=folds, n_jobs=-1, return_train_score=True, )
                scores = []
                for i in LR_score:
                    scores.append(LR_score[i].mean())    
                LR_scores.append((scores[:]))    
                LR_params.append((c, sol_func, p, folds))

In [13]:
%store LR_scores
%store LR_params

Stored 'LR_scores' (list)
Stored 'LR_params' (list)


In [14]:
LR_scores_df = pd.DataFrame(LR_scores, columns = ['fit_time', 'score_time', 'test_accuracy',
 'train_accuracy', 'test_f1_weighted', 'train_f1_weighted', 'test_precision_weighted',
  'train_precision_weighted', 'test_recall_weighted', 'train_recall_weighted'])
LR_params_df = pd.DataFrame(LR_params, columns = ['C', 'Solver', 'Penalty', 'Folds'])
LR_scores_df = LR_params_df.join(LR_scores_df, how='left')
LR_scores_df

Unnamed: 0,C,Solver,Penalty,Folds,fit_time,score_time,test_accuracy,train_accuracy,test_f1_weighted,train_f1_weighted,test_precision_weighted,train_precision_weighted,test_recall_weighted,train_recall_weighted
0,0.01,newton-cg,l2,2,0.620121,0.203869,0.779191,0.789328,0.752762,0.764901,0.749376,0.756400,0.779191,0.789328
1,0.01,newton-cg,l2,3,0.727551,0.148572,0.785909,0.801337,0.763112,0.783815,0.764639,0.788820,0.785909,0.801337
2,0.01,newton-cg,l2,4,1.140295,0.134416,0.786345,0.802677,0.762879,0.786491,0.761238,0.789143,0.786345,0.802677
3,0.01,newton-cg,l2,5,1.216649,0.113528,0.797368,0.804251,0.776814,0.789009,0.784187,0.792108,0.797368,0.804251
4,0.01,newton-cg,l2,6,1.135722,0.072045,0.801868,0.804131,0.781505,0.789002,0.790762,0.792092,0.801868,0.804131
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
265,1.00,saga,none,6,2.470136,0.072914,0.842872,0.897295,0.835656,0.895406,0.871478,0.899661,0.842872,0.897295
266,1.00,saga,none,7,2.830445,0.064000,0.853381,0.896006,0.845301,0.894112,0.881069,0.898344,0.853381,0.896006
267,1.00,saga,none,8,3.647385,0.101312,0.857239,0.895502,0.847908,0.893634,0.888948,0.898030,0.857239,0.895502
268,1.00,saga,none,9,3.394830,0.057767,0.848440,0.895766,0.838445,0.893912,0.877357,0.898266,0.848440,0.895766


In [18]:
LR_scores_df.sort_values(by='test_accuracy',ascending=False)

Unnamed: 0,C,Solver,Penalty,Folds,fit_time,score_time,test_accuracy,train_accuracy,test_f1_weighted,train_f1_weighted,test_precision_weighted,train_precision_weighted,test_recall_weighted,train_recall_weighted
186,1.00,newton-cg,l2,8,2.369932,0.073155,0.862116,0.887247,0.851900,0.884517,0.887869,0.889957,0.862116,0.887247
258,1.00,saga,l2,8,1.488236,0.058591,0.862116,0.887247,0.851900,0.884517,0.887869,0.889957,0.862116,0.887247
240,1.00,sag,l2,8,0.617168,0.058591,0.862116,0.887220,0.851900,0.884493,0.887869,0.889928,0.862116,0.887220
204,1.00,lbfgs,l2,8,1.772072,0.060541,0.861673,0.887193,0.851521,0.884478,0.887538,0.890024,0.861673,0.887193
222,1.00,liblinear,l1,8,8.461740,0.051712,0.857935,0.886940,0.847876,0.883424,0.884498,0.890233,0.857935,0.886940
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
153,0.10,sag,none,2,1.480045,0.192058,0.761078,0.922828,0.757195,0.921780,0.802943,0.924564,0.761078,0.922828
63,0.01,sag,none,2,0.678450,0.150154,0.760888,0.922828,0.756941,0.921780,0.802518,0.924564,0.760888,0.922828
99,0.10,newton-cg,none,2,3.520716,0.168031,0.760381,0.922828,0.756555,0.921782,0.802779,0.924552,0.760381,0.922828
9,0.01,newton-cg,none,2,2.171811,0.148428,0.760381,0.922828,0.756555,0.921782,0.802779,0.924552,0.760381,0.922828


C = 1

Solver = newton-cg

penalty= L2

Folds = 8

## Decision Tree

In [305]:
DT_scores = []
DT_params = []

for crit in ["gini", "entropy"]:
    for sp in ["best", "random"]:
        for md in [10,None]:
            for mss in [2,10]:
                for msl in [2,10]:
                    for mf in ["auto", "sqrt", "log2"]:
                        for mln in [None, 10000]:
                            for mid in [1, 0]:
                                for ca in [1, 0]:

                                    for folds in range(2, 11):
                                        print({'criterion' : crit, 'splitter' : sp, 'max depth' : md,
                                         'min_samples_split' : mss, 'min_samples_leaf' : msl, 
                                         'max_features' : mf,'max_leaf_nodes' : mln, 'min_impurity_decrease' : mid,
                                        'ccp_alpha' : ca, 'Folds' : folds})
                                        DT_score = cross_validate(DecisionTreeClassifier(criterion=crit, 
                                            splitter=sp, max_depth=md, min_samples_split=mss,
                                            min_samples_leaf=msl, max_features=mf, max_leaf_nodes=mln, 
                                            min_impurity_decrease=mid,ccp_alpha=ca),X,y, 
                                            scoring = [
                                                'accuracy', 'f1_weighted', 'precision_weighted',
                                                'recall_weighted'], 
                                            cv=folds, n_jobs=-1, return_train_score=True)
                                        scores = []
                                        for i in DT_score:
                                            scores.append(DT_score[i].mean())    
                                        DT_scores.append((scores[:]))    
                                        DT_params.append((crit, sp, md, mss, msl, mf, mln, mid, ca, folds))

{'criterion': 'gini', 'splitter': 'best', 'max depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_impurity_decrease': 1, 'ccp_alpha': 1, 'Folds': 2}
{'criterion': 'gini', 'splitter': 'best', 'max depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_impurity_decrease': 1, 'ccp_alpha': 1, 'Folds': 3}
{'criterion': 'gini', 'splitter': 'best', 'max depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_impurity_decrease': 1, 'ccp_alpha': 1, 'Folds': 4}
{'criterion': 'gini', 'splitter': 'best', 'max depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_impurity_decrease': 1, 'ccp_alpha': 1, 'Folds': 5}
{'criterion': 'gini', 'splitter': 'best', 'max depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_i

In [310]:
%store DT_scores
%store DT_params

Stored 'DT_scores' (list)
Stored 'DT_params' (list)


In [308]:
DT_scores_df = pd.DataFrame(DT_scores, columns = ['fit_time', 'score_time', 'test_accuracy',
 'train_accuracy', 'test_f1_weighted', 'train_f1_weighted', 'test_precision_weighted',
  'train_precision_weighted', 'test_recall_weighted', 'train_recall_weighted'])
DT_params_df = pd.DataFrame(DT_params, columns = ['criterion', 'splitter', 'max_depth',
 'min_samples_split', 'min_samples_leaf', 'max_features', 'max_leaf_nodes', 
 'min_impurity_decrease', 'ccp_alpha', 'Folds'])
DT_scores_df = DT_params_df.join(DT_scores_df, how='left')
DT_scores_df

Unnamed: 0,criterion,splitter,max_depth,min_samples_split,min_samples_leaf,max_features,max_leaf_nodes,min_impurity_decrease,ccp_alpha,Folds,fit_time,score_time,test_accuracy,train_accuracy,test_f1_weighted,train_f1_weighted,test_precision_weighted,train_precision_weighted,test_recall_weighted,train_recall_weighted
0,gini,best,10.0,2,2,auto,,1,1,2,0.019988,0.271832,0.377495,0.377495,0.206901,0.206901,0.142502,0.142502,0.377495,0.377495
1,gini,best,10.0,2,2,auto,,1,1,3,0.026671,0.153668,0.377495,0.377495,0.206901,0.206901,0.142502,0.142502,0.377495,0.377495
2,gini,best,10.0,2,2,auto,,1,1,4,0.034225,0.138664,0.377495,0.377495,0.206901,0.206901,0.142502,0.142502,0.377495,0.377495
3,gini,best,10.0,2,2,auto,,1,1,5,0.034336,0.097720,0.377495,0.377495,0.206901,0.206901,0.142502,0.142502,0.377495,0.377495
4,gini,best,10.0,2,2,auto,,1,1,6,0.038975,0.083282,0.377495,0.377495,0.206901,0.206901,0.142502,0.142502,0.377495,0.377495
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6907,entropy,random,,10,10,log2,10000.0,0,0,6,0.034481,0.070310,0.723613,0.774897,0.705535,0.766865,0.734570,0.795868,0.723613,0.774897
6908,entropy,random,,10,10,log2,10000.0,0,0,7,0.036469,0.067339,0.792045,0.825181,0.780175,0.821195,0.816375,0.829750,0.792045,0.825181
6909,entropy,random,,10,10,log2,10000.0,0,0,8,0.041014,0.062499,0.772590,0.795602,0.745011,0.781853,0.780224,0.824314,0.772590,0.795602
6910,entropy,random,,10,10,log2,10000.0,0,0,9,0.037943,0.051873,0.785528,0.820621,0.767326,0.813918,0.821906,0.831800,0.785528,0.820621


In [309]:
DT_scores_df.sort_values(by='test_accuracy',ascending=False)

Unnamed: 0,criterion,splitter,max_depth,min_samples_split,min_samples_leaf,max_features,max_leaf_nodes,min_impurity_decrease,ccp_alpha,Folds,fit_time,score_time,test_accuracy,train_accuracy,test_f1_weighted,train_f1_weighted,test_precision_weighted,train_precision_weighted,test_recall_weighted,train_recall_weighted
3922,entropy,best,10.0,10,2,auto,,0,0,9,0.052049,0.062498,0.860415,0.927295,0.850093,0.927143,0.889882,0.930478,0.860415,0.927295
3775,entropy,best,10.0,2,10,sqrt,,0,0,6,0.050404,0.097181,0.858774,0.911537,0.851851,0.911211,0.879154,0.916221,0.858774,0.911537
5289,entropy,random,10.0,2,2,sqrt,,0,0,8,0.039061,0.058591,0.858377,0.880585,0.848872,0.879294,0.881209,0.889299,0.858377,0.880585
680,gini,best,10.0,10,10,auto,,0,0,7,0.049599,0.057370,0.854839,0.918372,0.846454,0.917779,0.882143,0.920880,0.854839,0.918372
753,gini,best,10.0,10,10,sqrt,,0,0,8,0.046874,0.060545,0.853501,0.912781,0.844729,0.912338,0.879689,0.915830,0.853501,0.912781
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4683,entropy,best,,2,10,log2,,1,1,5,0.031364,0.095000,0.377495,0.377495,0.206901,0.206901,0.142502,0.142502,0.377495,0.377495
2244,gini,random,10.0,10,2,sqrt,,1,0,5,0.033795,0.101613,0.377495,0.377495,0.206901,0.206901,0.142502,0.142502,0.377495,0.377495
6240,entropy,random,,2,2,log2,10000.0,1,0,5,0.027230,0.093747,0.377495,0.377495,0.206901,0.206901,0.142502,0.142502,0.377495,0.377495
2235,gini,random,10.0,10,2,sqrt,,1,1,5,0.033776,0.129320,0.377495,0.377495,0.206901,0.206901,0.142502,0.142502,0.377495,0.377495


criterion = entropy

splitter = best

max_depth = 10

min_sample_split = 10

min_sample_leaf = 2

max_features = auto

max_leaf_nodes = None

min_impurity_decrease = 0

ccp_alpha = 0

Folds = 9

## Random Forest

parameters from Decision Tree used

In [311]:
RF_scores = []
RF_params = []

for ne in [1,10,100,1000]:
        for bs in [True, False]:
            for os in [True, False]:
                for ws in [True, False]:
                    for cw in [None, "balanced", "balanced_subsample"]:
                        for folds in range(2, 11):
                            print(ne, bs, os, ws, cw, folds)
                            RF_score = cross_validate(RandomForestClassifier(n_estimators=ne, criterion='entropy', max_depth=10, min_samples_split=10,
                            min_samples_leaf=2, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0,ccp_alpha=0, bootstrap=bs, 
                            oob_score=os, warm_start=ws, class_weight=cw, n_jobs=-1),X,y, 
                            scoring = [
                                'accuracy', 'f1_weighted', 'precision_weighted',
                                'recall_weighted'], 
                            cv=folds, n_jobs=-1, return_train_score=True)
                            scores = []
                            for i in RF_score:
                                scores.append(RF_score[i].mean())    
                            RF_scores.append((scores[:]))    
                            RF_params.append((ne, bs, os, ws, cw, folds))

1 True True True None 2
1 True True True None 3
1 True True True None 4
1 True True True None 5
1 True True True None 6
1 True True True None 7
1 True True True None 8
1 True True True None 9
1 True True True None 10
1 True True True balanced 2
1 True True True balanced 3
1 True True True balanced 4
1 True True True balanced 5
1 True True True balanced 6
1 True True True balanced 7
1 True True True balanced 8
1 True True True balanced 9
1 True True True balanced 10
1 True True True balanced_subsample 2
1 True True True balanced_subsample 3
1 True True True balanced_subsample 4
1 True True True balanced_subsample 5
1 True True True balanced_subsample 6
1 True True True balanced_subsample 7
1 True True True balanced_subsample 8
1 True True True balanced_subsample 9
1 True True True balanced_subsample 10
1 True True False None 2
1 True True False None 3
1 True True False None 4
1 True True False None 5
1 True True False None 6
1 True True False None 7
1 True True False None 8
1 True True 

In [312]:
%store RF_scores
%store RF_params

Stored 'RF_scores' (list)
Stored 'RF_params' (list)


In [314]:
RF_scores_df = pd.DataFrame(RF_scores, columns = ['fit_time', 'score_time', 'test_accuracy',
 'train_accuracy', 'test_f1_weighted', 'train_f1_weighted', 'test_precision_weighted',
  'train_precision_weighted', 'test_recall_weighted', 'train_recall_weighted'])
RF_params_df = pd.DataFrame(RF_params, columns = ['n_estimators', 'bootstrap', 'oob_score',
 'warm_start', 'class_weight','Folds'])
RF_scores_df = RF_params_df.join(RF_scores_df, how='left')
RF_scores_df

Unnamed: 0,n_estimators,bootstrap,oob_score,warm_start,class_weight,Folds,fit_time,score_time,test_accuracy,train_accuracy,test_f1_weighted,train_f1_weighted,test_precision_weighted,train_precision_weighted,test_recall_weighted,train_recall_weighted
0,1,True,True,True,,2,0.056966,0.185384,0.744668,0.919026,0.740960,0.918496,0.789960,0.920684,0.744668,0.919026
1,1,True,True,True,,3,0.052968,0.204206,0.739023,0.930400,0.715575,0.930152,0.794822,0.931583,0.739023,0.930400
2,1,True,True,True,,4,0.064209,0.171394,0.789140,0.920335,0.771394,0.919958,0.833793,0.922549,0.789140,0.920335
3,1,True,True,True,,5,0.064360,0.113729,0.724328,0.903678,0.700763,0.903487,0.769583,0.907724,0.724328,0.903678
4,1,True,True,True,,6,0.069124,0.102436,0.793018,0.923690,0.782884,0.923496,0.837975,0.925635,0.793018,0.923690
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
859,1000,False,False,False,balanced_subsample,6,17.544270,1.057277,0.849400,0.938478,0.839513,0.938056,0.881367,0.945891,0.849400,0.938478
860,1000,False,False,False,balanced_subsample,7,18.918446,1.517386,0.861618,0.937770,0.851888,0.937352,0.896655,0.945169,0.861618,0.937770
861,1000,False,False,False,balanced_subsample,8,21.545465,1.683517,0.887461,0.937084,0.879468,0.936646,0.917568,0.944731,0.887461,0.937084
862,1000,False,False,False,balanced_subsample,9,20.146926,1.344874,0.870170,0.936957,0.860382,0.936532,0.894410,0.944531,0.870170,0.936957


In [315]:
RF_scores_df.sort_values(by='test_accuracy',ascending=False)

Unnamed: 0,n_estimators,bootstrap,oob_score,warm_start,class_weight,Folds,fit_time,score_time,test_accuracy,train_accuracy,test_f1_weighted,train_f1_weighted,test_precision_weighted,train_precision_weighted,test_recall_weighted,train_recall_weighted
717,1000,True,False,True,balanced,8,16.824650,1.305822,0.890502,0.933844,0.883010,0.933397,0.919990,0.941785,0.890502,0.933844
528,100,True,False,False,balanced,8,1.580087,0.216370,0.890438,0.933436,0.882971,0.932989,0.920413,0.941392,0.890438,0.933436
663,1000,True,True,True,balanced,8,21.789305,0.786068,0.890375,0.933988,0.882939,0.933537,0.919793,0.941940,0.890375,0.933988
672,1000,True,True,True,balanced_subsample,8,28.565428,0.831238,0.890312,0.934034,0.882924,0.933583,0.920061,0.941965,0.890312,0.934034
753,1000,True,False,False,balanced_subsample,8,26.991982,1.424887,0.889931,0.933907,0.882455,0.933457,0.919612,0.941848,0.889931,0.933907
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
805,1000,False,True,False,balanced_subsample,6,0.056129,0.000000,,,,,,,,
806,1000,False,True,False,balanced_subsample,7,0.060106,0.000000,,,,,,,,
807,1000,False,True,False,balanced_subsample,8,0.063333,0.000000,,,,,,,,
808,1000,False,True,False,balanced_subsample,9,0.064802,0.000000,,,,,,,,


n_estimators = 1000

bootstrap = True

oob_score = False

warm_start = True

class_weight = balanced

Folds = 8

## SVM

In [321]:
SVM_scores = []
SVM_params = []

for c in [0.1, 1]:
    for ker in ['linear', 'poly', 'rbf']:
        for deg in [3,5]:
            for prob in [True, False]:
                    for cw in [None, 'balanced']:
                        for folds in range(2, 11):
                            print(c, ker, deg, prob, cw, folds)
                            SVM_score = cross_validate(SVC(C=c, kernel=ker, degree=deg, gamma='auto', probability=prob, 
                            cache_size=1000, class_weight=cw),X,y, 
                            scoring = [
                                'accuracy', 'f1_weighted', 'precision_weighted',
                                'recall_weighted'], 
                            cv=folds, n_jobs=-1, return_train_score=True)
                            scores = []
                            for i in SVM_score:
                                scores.append(SVM_score[i].mean())    
                            SVM_scores.append((scores[:]))    
                            SVM_params.append((c, ker, deg, prob, cw, folds))


0.1 linear 3 True None 2
0.1 linear 3 True None 3
0.1 linear 3 True None 4
0.1 linear 3 True None 5
0.1 linear 3 True None 6
0.1 linear 3 True None 7
0.1 linear 3 True None 8
0.1 linear 3 True None 9
0.1 linear 3 True None 10
0.1 linear 3 True balanced 2
0.1 linear 3 True balanced 3
0.1 linear 3 True balanced 4
0.1 linear 3 True balanced 5
0.1 linear 3 True balanced 6
0.1 linear 3 True balanced 7
0.1 linear 3 True balanced 8
0.1 linear 3 True balanced 9
0.1 linear 3 True balanced 10
0.1 linear 3 False None 2
0.1 linear 3 False None 3
0.1 linear 3 False None 4
0.1 linear 3 False None 5
0.1 linear 3 False None 6
0.1 linear 3 False None 7
0.1 linear 3 False None 8
0.1 linear 3 False None 9
0.1 linear 3 False None 10
0.1 linear 3 False balanced 2
0.1 linear 3 False balanced 3
0.1 linear 3 False balanced 4
0.1 linear 3 False balanced 5
0.1 linear 3 False balanced 6
0.1 linear 3 False balanced 7
0.1 linear 3 False balanced 8
0.1 linear 3 False balanced 9
0.1 linear 3 False balanced 10
0.1 li

In [322]:
%store SVM_scores
%store SVM_params

Stored 'SVM_scores' (list)
Stored 'SVM_params' (list)


In [323]:
SVM_scores_df = pd.DataFrame(SVM_scores, columns = ['fit_time', 'score_time', 'test_accuracy',
 'train_accuracy', 'test_f1_weighted', 'train_f1_weighted', 'test_precision_weighted',
  'train_precision_weighted', 'test_recall_weighted', 'train_recall_weighted'])
SVM_params_df = pd.DataFrame(SVM_params, columns = ['C', 'kernel', 'degree', 
'probability', 'class_weight','Folds'])
SVM_scores_df = SVM_params_df.join(SVM_scores_df, how='left')
SVM_scores_df

Unnamed: 0,C,kernel,degree,probability,class_weight,Folds,fit_time,score_time,test_accuracy,train_accuracy,test_f1_weighted,train_f1_weighted,test_precision_weighted,train_precision_weighted,test_recall_weighted,train_recall_weighted
0,0.1,linear,3,True,,2,6.760841,1.502577,0.770387,0.883673,0.775914,0.880831,0.809865,0.890420,0.770387,0.883673
1,0.1,linear,3,True,,3,16.269325,1.298868,0.772540,0.864474,0.774799,0.861054,0.811527,0.876582,0.772540,0.864474
2,0.1,linear,3,True,,4,32.288443,1.798693,0.786918,0.872310,0.783398,0.868637,0.824873,0.881789,0.786918,0.872310
3,0.1,linear,3,True,,5,34.087247,1.251852,0.810998,0.878619,0.806832,0.874384,0.845188,0.887779,0.810998,0.878619
4,0.1,linear,3,True,,6,26.846653,0.902697,0.843692,0.879288,0.835429,0.874629,0.864744,0.889133,0.843692,0.879288
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
427,1.0,rbf,5,False,balanced,6,7.674609,3.495804,0.848953,0.880644,0.849451,0.882062,0.896865,0.903544,0.848953,0.880644
428,1.0,rbf,5,False,balanced,7,8.161315,3.163936,0.862630,0.881127,0.862920,0.882800,0.915956,0.904317,0.862630,0.881127
429,1.0,rbf,5,False,balanced,8,8.617588,3.153142,0.870417,0.882360,0.866256,0.884211,0.925220,0.905800,0.870417,0.882360
430,1.0,rbf,5,False,balanced,9,8.833202,2.833536,0.869601,0.883664,0.863329,0.885399,0.927107,0.906588,0.869601,0.883664


In [324]:
SVM_scores_df.sort_values(by='test_accuracy',ascending=False)

Unnamed: 0,C,kernel,degree,probability,class_weight,Folds,fit_time,score_time,test_accuracy,train_accuracy,test_f1_weighted,train_f1_weighted,test_precision_weighted,train_precision_weighted,test_recall_weighted,train_recall_weighted
269,1.0,linear,5,True,balanced,10,28.944092,0.443861,0.877788,0.891177,0.868219,0.891536,0.882192,0.908931,0.877788,0.891177
287,1.0,linear,5,False,balanced,10,5.479085,0.458522,0.877788,0.891177,0.868219,0.891536,0.882192,0.908931,0.877788,0.891177
251,1.0,linear,3,False,balanced,10,5.553058,0.460660,0.877788,0.891177,0.868219,0.891536,0.882192,0.908931,0.877788,0.891177
233,1.0,linear,3,True,balanced,10,29.663815,0.449419,0.877788,0.891177,0.868219,0.891536,0.882192,0.908931,0.877788,0.891177
431,1.0,rbf,5,False,balanced,10,9.007696,2.401479,0.874494,0.883947,0.866916,0.885705,0.885957,0.906837,0.874494,0.883947
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125,0.1,poly,5,True,balanced,10,260.292638,1.876507,0.075003,0.074526,0.023869,0.023845,0.015911,0.015902,0.075003,0.074526
135,0.1,poly,5,False,balanced,2,9.823580,3.929570,0.026103,0.026042,0.001701,0.001692,0.000883,0.000878,0.026103,0.026042
117,0.1,poly,5,True,balanced,2,51.444690,3.992069,0.026103,0.026042,0.001701,0.001692,0.000883,0.000878,0.026103,0.026042
120,0.1,poly,5,True,balanced,5,191.083207,3.228030,0.024712,0.024615,0.001502,0.001496,0.000778,0.000775,0.024712,0.024615


C = 1

kernel = linear

degree = 5

probability = True

class_weight = balanced

Folds = 10

## GaussianNB

In [325]:
GNB_scores = []
GNB_params = []

for vs in [1,1e-1,1e-2,1e-3,1e-4,1e-5,1e-6,1e-7,1e-8,1e-9,1e-10,0]:
    for folds in range(2, 11):
        print(vs, folds)
        GNB_score = cross_validate(GaussianNB(var_smoothing=vs),X,y, 
        scoring = [
            'accuracy', 'f1_weighted', 'precision_weighted',
            'recall_weighted'], 
        cv=folds, n_jobs=-1, return_train_score=True)
        scores = []
        for i in GNB_score:
            scores.append(GNB_score[i].mean())    
        GNB_scores.append((scores[:]))    
        GNB_params.append((vs, folds))



1 2
1 3
1 4
1 5
1 6
1 7
1 8
1 9
1 10
0.1 2
0.1 3
0.1 4
0.1 5
0.1 6
0.1 7
0.1 8
0.1 9
0.1 10
0.01 2
0.01 3
0.01 4
0.01 5
0.01 6
0.01 7
0.01 8
0.01 9
0.01 10
0.001 2
0.001 3
0.001 4
0.001 5
0.001 6
0.001 7
0.001 8
0.001 9
0.001 10
0.0001 2
0.0001 3
0.0001 4
0.0001 5
0.0001 6
0.0001 7
0.0001 8
0.0001 9
0.0001 10
1e-05 2
1e-05 3
1e-05 4
1e-05 5
1e-05 6
1e-05 7
1e-05 8
1e-05 9
1e-05 10
1e-06 2
1e-06 3
1e-06 4
1e-06 5
1e-06 6
1e-06 7
1e-06 8
1e-06 9
1e-06 10
1e-07 2
1e-07 3
1e-07 4
1e-07 5
1e-07 6
1e-07 7
1e-07 8
1e-07 9
1e-07 10
1e-08 2
1e-08 3
1e-08 4
1e-08 5
1e-08 6
1e-08 7
1e-08 8
1e-08 9
1e-08 10
1e-09 2
1e-09 3
1e-09 4
1e-09 5
1e-09 6
1e-09 7
1e-09 8
1e-09 9
1e-09 10
1e-10 2
1e-10 3
1e-10 4
1e-10 5
1e-10 6
1e-10 7
1e-10 8
1e-10 9
1e-10 10
0 2
0 3
0 4
0 5
0 6
0 7
0 8
0 9
0 10


In [326]:
%store GNB_scores
%store GNB_params

Stored 'GNB_scores' (list)
Stored 'GNB_params' (list)


In [327]:
GNB_scores_df = pd.DataFrame(GNB_scores, columns = ['fit_time', 'score_time', 'test_accuracy',
 'train_accuracy', 'test_f1_weighted', 'train_f1_weighted', 'test_precision_weighted',
  'train_precision_weighted', 'test_recall_weighted', 'train_recall_weighted'])
GNB_params_df = pd.DataFrame(GNB_params, columns = ['var_smoothing','Folds'])
GNB_scores_df = GNB_params_df.join(GNB_scores_df, how='left')
GNB_scores_df

Unnamed: 0,var_smoothing,Folds,fit_time,score_time,test_accuracy,train_accuracy,test_f1_weighted,train_f1_weighted,test_precision_weighted,train_precision_weighted,test_recall_weighted,train_recall_weighted
0,1.0,2,0.035478,0.224862,0.796171,0.803965,0.783053,0.791803,0.795507,0.815767,0.796171,0.803965
1,1.0,3,0.028284,0.214159,0.792562,0.799341,0.775818,0.786013,0.786295,0.798776,0.792562,0.799341
2,1.0,4,0.042967,0.125993,0.791540,0.792687,0.772558,0.778062,0.778144,0.786894,0.791540,0.792687
3,1.0,5,0.055645,0.119466,0.792046,0.792482,0.771765,0.777849,0.781223,0.787067,0.792046,0.792482
4,1.0,6,0.047635,0.079118,0.792303,0.792536,0.771987,0.778013,0.782212,0.786745,0.792303,0.792536
...,...,...,...,...,...,...,...,...,...,...,...,...
103,0.0,6,0.049635,0.092110,0.011912,0.011912,0.000280,0.000280,0.000142,0.000142,0.011912,0.011912
104,0.0,7,0.046399,0.073812,0.011912,0.011912,0.000280,0.000280,0.000142,0.000142,0.011912,0.011912
105,0.0,8,0.048219,0.069207,0.011912,0.011912,0.000281,0.000280,0.000142,0.000142,0.011912,0.011912
106,0.0,9,0.052410,0.062739,0.011912,0.011912,0.000280,0.000280,0.000142,0.000142,0.011912,0.011912


In [329]:
GNB_scores_df.sort_values(by='test_accuracy',ascending=False)

Unnamed: 0,var_smoothing,Folds,fit_time,score_time,test_accuracy,train_accuracy,test_f1_weighted,train_f1_weighted,test_precision_weighted,train_precision_weighted,test_recall_weighted,train_recall_weighted
0,1.0,2,0.035478,0.224862,0.796171,0.803965,0.783053,0.791803,0.795507,0.815767,0.796171,0.803965
1,1.0,3,0.028284,0.214159,0.792562,0.799341,0.775818,0.786013,0.786295,0.798776,0.792562,0.799341
4,1.0,6,0.047635,0.079118,0.792303,0.792536,0.771987,0.778013,0.782212,0.786745,0.792303,0.792536
3,1.0,5,0.055645,0.119466,0.792046,0.792482,0.771765,0.777849,0.781223,0.787067,0.792046,0.792482
7,1.0,9,0.054632,0.068515,0.792041,0.792625,0.765413,0.778134,0.771993,0.786138,0.792041,0.792625
...,...,...,...,...,...,...,...,...,...,...,...,...
100,0.0,3,0.029980,0.149574,0.011912,0.011912,0.000280,0.000280,0.000142,0.000142,0.011912,0.011912
107,0.0,10,0.049668,0.060962,0.011912,0.011912,0.000281,0.000280,0.000142,0.000142,0.011912,0.011912
106,0.0,9,0.052410,0.062739,0.011912,0.011912,0.000280,0.000280,0.000142,0.000142,0.011912,0.011912
103,0.0,6,0.049635,0.092110,0.011912,0.011912,0.000280,0.000280,0.000142,0.000142,0.011912,0.011912


var_smoothing = 1

Folds = 2

## MultinomialNB

In [331]:
MNB_scores = []
MNB_params = []

for a in [1,1e-1,1e-2,1e-3,1e-4,1e-5,1e-6,1e-7,1e-8,1e-9,1e-10,0]:
    for fp in [True, False]:
        for folds in range(2, 11):
            print(a, fp,folds)
            MNB_score = cross_validate(MultinomialNB(alpha=a, fit_prior=fp),X,y, 
            scoring = [
                'accuracy', 'f1_weighted', 'precision_weighted',
                'recall_weighted'], 
            cv=folds, n_jobs=-1, return_train_score=True)
            scores = []
            for i in MNB_score:
                scores.append(MNB_score[i].mean())    
            MNB_scores.append((scores[:]))    
            MNB_params.append((a, fp, folds))



1 True 2
1 True 3
1 True 4
1 True 5
1 True 6
1 True 7
1 True 8
1 True 9
1 True 10
1 False 2
1 False 3
1 False 4
1 False 5
1 False 6
1 False 7
1 False 8
1 False 9
1 False 10
0.1 True 2
0.1 True 3
0.1 True 4
0.1 True 5
0.1 True 6
0.1 True 7
0.1 True 8
0.1 True 9
0.1 True 10
0.1 False 2
0.1 False 3
0.1 False 4
0.1 False 5
0.1 False 6
0.1 False 7
0.1 False 8
0.1 False 9
0.1 False 10
0.01 True 2
0.01 True 3
0.01 True 4
0.01 True 5
0.01 True 6
0.01 True 7
0.01 True 8
0.01 True 9
0.01 True 10
0.01 False 2
0.01 False 3
0.01 False 4
0.01 False 5
0.01 False 6
0.01 False 7
0.01 False 8
0.01 False 9
0.01 False 10
0.001 True 2
0.001 True 3
0.001 True 4
0.001 True 5
0.001 True 6
0.001 True 7
0.001 True 8
0.001 True 9
0.001 True 10
0.001 False 2
0.001 False 3
0.001 False 4
0.001 False 5
0.001 False 6
0.001 False 7
0.001 False 8
0.001 False 9
0.001 False 10
0.0001 True 2
0.0001 True 3
0.0001 True 4
0.0001 True 5
0.0001 True 6
0.0001 True 7
0.0001 True 8
0.0001 True 9
0.0001 True 10
0.0001 False 2
0.00

In [335]:
%store MNB_scores
%store MNB_params

Stored 'MNB_scores' (list)
Stored 'MNB_params' (list)


In [339]:
MNB_scores_df = pd.DataFrame(MNB_scores, columns = ['fit_time', 'score_time', 'test_accuracy',
 'train_accuracy', 'test_f1_weighted', 'train_f1_weighted', 'test_precision_weighted',
  'train_precision_weighted', 'test_recall_weighted', 'train_recall_weighted'])
MNB_params_df = pd.DataFrame(MNB_params, columns = ['alpha', 'fit_prior','Folds'])
MNB_scores_df = MNB_params_df.join(MNB_scores_df, how='left')
MNB_scores_df

Unnamed: 0,alpha,fit_prior,Folds,fit_time,score_time,test_accuracy,train_accuracy,test_f1_weighted,train_f1_weighted,test_precision_weighted,train_precision_weighted,test_recall_weighted,train_recall_weighted
0,1.0,True,2,0.039475,0.171894,0.788378,0.813912,0.788062,0.804306,0.813837,0.825649,0.788378,0.813912
1,1.0,True,3,0.052633,0.141579,0.767028,0.812171,0.765391,0.802678,0.808673,0.818782,0.767028,0.812171
2,1.0,True,4,0.074703,0.134416,0.788752,0.808379,0.771937,0.796404,0.784208,0.807981,0.788752,0.808379
3,1.0,True,5,0.070557,0.098139,0.776525,0.810349,0.760633,0.798855,0.782211,0.810229,0.776525,0.810349
4,1.0,True,6,0.097273,0.115429,0.799018,0.805879,0.781306,0.793470,0.793198,0.804479,0.799018,0.805879
...,...,...,...,...,...,...,...,...,...,...,...,...,...
211,0.0,False,6,0.068460,0.065943,0.770069,0.770437,0.777426,0.778336,0.845538,0.827237,0.770069,0.770437
212,0.0,False,7,0.072129,0.060266,0.770316,0.770629,0.776826,0.778519,0.848934,0.827178,0.770316,0.770629
213,0.0,False,8,0.074214,0.062500,0.770182,0.770485,0.773212,0.778512,0.846224,0.827101,0.770182,0.770485
214,0.0,False,9,0.074757,0.050348,0.770382,0.770402,0.770520,0.778482,0.840138,0.826991,0.770382,0.770402


In [340]:
MNB_scores_df.sort_values(by='test_accuracy',ascending=False)

Unnamed: 0,alpha,fit_prior,Folds,fit_time,score_time,test_accuracy,train_accuracy,test_f1_weighted,train_f1_weighted,test_precision_weighted,train_precision_weighted,test_recall_weighted,train_recall_weighted
76,1.000000e-04,True,6,0.070829,0.072915,0.800221,0.806133,0.782445,0.793829,0.795731,0.808393,0.800221,0.806133
148,1.000000e-08,True,6,0.064299,0.065103,0.800221,0.806133,0.782445,0.793829,0.795731,0.808393,0.800221,0.806133
184,1.000000e-10,True,6,0.101618,0.094468,0.800221,0.806133,0.782445,0.793829,0.795731,0.808393,0.800221,0.806133
130,1.000000e-07,True,6,0.061516,0.065102,0.800221,0.806133,0.782445,0.793829,0.795731,0.808393,0.800221,0.806133
202,0.000000e+00,True,6,0.062228,0.067709,0.800221,0.806133,0.782445,0.793829,0.795731,0.808393,0.800221,0.806133
...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,1.000000e-08,True,3,0.052082,0.123838,0.766901,0.812013,0.765142,0.802201,0.804448,0.822067,0.766901,0.812013
181,1.000000e-10,True,3,0.046873,0.122724,0.766901,0.812013,0.765142,0.802201,0.804448,0.822067,0.766901,0.812013
91,1.000000e-05,True,3,0.052085,0.117745,0.766901,0.812013,0.765142,0.802201,0.804448,0.822067,0.766901,0.812013
37,1.000000e-02,True,3,0.072953,0.188884,0.766901,0.812013,0.765142,0.802201,0.804448,0.822067,0.766901,0.812013


alpha = 1e-4

fit_prior = True

Folds = 6

## KNN

In [341]:
KNN_scores = []
KNN_params = []

for nn in [2,6,10]:
    for w in ['uniform', 'distance']:
        for alg in ['auto', 'ball_tree', 'kd_tree']:
            for ls in [10,30,50]:
                for P in [1,2,3]:
                    for metr in ['minkowski', 'euclidean']:
                        for folds in range(2, 11):
                            print(nn, w, alg, ls, P, metr, folds)
                            KNN_score = cross_validate(KNeighborsClassifier(n_neighbors=nn, weights=w, algorithm=alg, leaf_size=ls, p=P, metric=metr, n_jobs=-1),X,y, 
                            scoring = [
                                'accuracy', 'f1_weighted', 'precision_weighted',
                                'recall_weighted'], 
                            cv=folds, n_jobs=-1, return_train_score=True)
                            scores = []
                            for i in KNN_score:
                                scores.append(KNN_score[i].mean())    
                            KNN_scores.append((scores[:]))    
                            KNN_params.append((nn, w, alg, ls, P, metr, folds))


2 uniform auto 10 1 minkowski 2
2 uniform auto 10 1 minkowski 3
2 uniform auto 10 1 minkowski 4
2 uniform auto 10 1 minkowski 5
2 uniform auto 10 1 minkowski 6
2 uniform auto 10 1 minkowski 7
2 uniform auto 10 1 minkowski 8
2 uniform auto 10 1 minkowski 9
2 uniform auto 10 1 minkowski 10
2 uniform auto 10 1 euclidean 2
2 uniform auto 10 1 euclidean 3
2 uniform auto 10 1 euclidean 4
2 uniform auto 10 1 euclidean 5
2 uniform auto 10 1 euclidean 6
2 uniform auto 10 1 euclidean 7
2 uniform auto 10 1 euclidean 8
2 uniform auto 10 1 euclidean 9
2 uniform auto 10 1 euclidean 10
2 uniform auto 10 2 minkowski 2
2 uniform auto 10 2 minkowski 3
2 uniform auto 10 2 minkowski 4
2 uniform auto 10 2 minkowski 5
2 uniform auto 10 2 minkowski 6
2 uniform auto 10 2 minkowski 7
2 uniform auto 10 2 minkowski 8
2 uniform auto 10 2 minkowski 9
2 uniform auto 10 2 minkowski 10
2 uniform auto 10 2 euclidean 2
2 uniform auto 10 2 euclidean 3
2 uniform auto 10 2 euclidean 4
2 uniform auto 10 2 euclidean 5
2 uni

In [342]:
%store KNN_scores
%store KNN_params

Stored 'KNN_scores' (list)
Stored 'KNN_params' (list)


In [343]:
KNN_scores_df = pd.DataFrame(KNN_scores, columns = ['fit_time', 'score_time', 'test_accuracy',
 'train_accuracy', 'test_f1_weighted', 'train_f1_weighted', 'test_precision_weighted',
  'train_precision_weighted', 'test_recall_weighted', 'train_recall_weighted'])
KNN_params_df = pd.DataFrame(KNN_params, columns = ['n_neighbors', 'weights', 'algorithm', 'leaf_size', 'p','metric','Folds'])
KNN_scores_df = KNN_params_df.join(KNN_scores_df, how='left')
KNN_scores_df

Unnamed: 0,n_neighbors,weights,algorithm,leaf_size,p,metric,Folds,fit_time,score_time,test_accuracy,train_accuracy,test_f1_weighted,train_f1_weighted,test_precision_weighted,train_precision_weighted,test_recall_weighted,train_recall_weighted
0,2,uniform,auto,10,1,minkowski,2,0.128420,1.390010,0.717615,0.965786,0.709132,0.965543,0.797932,0.967985,0.717615,0.965786
1,2,uniform,auto,10,1,minkowski,3,0.190059,0.729390,0.729519,0.964234,0.726086,0.964131,0.804225,0.966778,0.729519,0.964234
2,2,uniform,auto,10,1,minkowski,4,0.359527,0.841483,0.720146,0.961604,0.710877,0.961494,0.789466,0.964469,0.720146,0.961604
3,2,uniform,auto,10,1,minkowski,5,0.341188,0.615022,0.739850,0.960353,0.730337,0.960203,0.796190,0.963355,0.739850,0.960353
4,2,uniform,auto,10,1,minkowski,6,0.347285,0.540834,0.769441,0.958728,0.762740,0.958605,0.807817,0.962080,0.769441,0.958728
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2911,10,distance,kd_tree,50,3,euclidean,6,0.280375,0.385405,0.831722,0.993803,0.825170,0.993806,0.861204,0.993838,0.831722,0.993803
2912,10,distance,kd_tree,50,3,euclidean,7,0.279009,0.328858,0.837227,0.993516,0.829292,0.993519,0.872001,0.993553,0.837227,0.993516
2913,10,distance,kd_tree,50,3,euclidean,8,0.320302,0.312492,0.854262,0.993402,0.846861,0.993405,0.884285,0.993439,0.854262,0.993402
2914,10,distance,kd_tree,50,3,euclidean,9,0.350683,0.236104,0.838747,0.993387,0.828768,0.993390,0.864473,0.993424,0.838747,0.993387


In [344]:
KNN_scores_df.sort_values(by='test_accuracy',ascending=False)

Unnamed: 0,n_neighbors,weights,algorithm,leaf_size,p,metric,Folds,fit_time,score_time,test_accuracy,train_accuracy,test_f1_weighted,train_f1_weighted,test_precision_weighted,train_precision_weighted,test_recall_weighted,train_recall_weighted
2652,10,distance,ball_tree,30,1,minkowski,8,0.306184,1.149168,0.855529,0.993402,0.847414,0.993405,0.886153,0.993439,0.855529,0.993402
2706,10,distance,ball_tree,50,1,minkowski,8,0.260837,1.129181,0.855529,0.993402,0.847414,0.993405,0.886153,0.993439,0.855529,0.993402
2598,10,distance,ball_tree,10,1,minkowski,8,0.647492,1.659109,0.855403,0.993402,0.847281,0.993405,0.886052,0.993439,0.855403,0.993402
2436,10,distance,auto,10,1,minkowski,8,0.374501,0.392885,0.855403,0.993402,0.847276,0.993405,0.886063,0.993439,0.855403,0.993402
2814,10,distance,kd_tree,30,1,minkowski,8,0.332019,0.333976,0.855403,0.993402,0.847291,0.993405,0.886056,0.993439,0.855403,0.993402
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
846,2,distance,kd_tree,10,3,minkowski,2,0.119987,1.201598,0.701078,0.992650,0.693906,0.992435,0.786769,0.992783,0.701078,0.992650
738,2,distance,ball_tree,30,3,minkowski,2,0.139342,8.474092,0.701078,0.992714,0.693906,0.992498,0.786769,0.992841,0.701078,0.992714
576,2,distance,auto,30,3,minkowski,2,0.098434,1.704453,0.701078,0.992714,0.693906,0.992498,0.786769,0.992841,0.701078,0.992714
900,2,distance,kd_tree,30,3,minkowski,2,0.129985,1.569740,0.701078,0.992714,0.693906,0.992498,0.786769,0.992841,0.701078,0.992714


n_neighbors = 10

weights = distance

algorithm = ball_tree

leaf_size = 30

p = 1

metric = minkowski

Folds = 8

# Search Best Hyper Parameters For Ensemble Methods

Trying cross validation and hyper paramter tuning on ensemble methods with mentioned base estimators (base methods above) and also doing the same process for some ensemble methods that don't need/have base estimators

## Base Models

Define base models with hyper paramteres acquired from previous section which resulted in best scores, in hope to achieve the best possible results in ensemble methods as well

In [4]:
LR = LogisticRegression(C=1, solver='newton-cg', penalty='l2',n_jobs=-1)

In [5]:
DT = DecisionTreeClassifier(criterion='entropy', splitter='best', max_depth=10, min_samples_split=10,
    min_samples_leaf=2, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0,ccp_alpha=0)

In [6]:
RF = RandomForestClassifier(n_estimators=1000,criterion='entropy', max_depth=10, min_samples_split=10,
    min_samples_leaf=2, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0,ccp_alpha=0, 
    bootstrap=True, oob_score=False, warm_start=True, class_weight='balanced', n_jobs=-1)

In [7]:
SVM = SVC(C=1, kernel='linear', degree=5, gamma='auto', probability=True, 
                            cache_size=1000, class_weight='balanced')

In [8]:
GNB = GaussianNB(var_smoothing=1)

In [9]:
MNB = MultinomialNB(alpha=1e-4, fit_prior=True)

In [10]:
KNN = KNeighborsClassifier(n_neighbors=10, weights='distance', algorithm='ball_tree', 
        leaf_size=30, p=1, metric='minkowski', n_jobs=-1)

## Ada Boost

Each subsection contains the same step as previous section for cross validating and hyper paramter tuning in order to find best state and get best results

At the end of each subsection the best hyper paramters and fold numbers are noted

### LR

In [20]:
LR_Ada_scores = []
LR_Ada_params = []

for ne in [10,100,1000]:
    for alg in ['SAMME', 'SAMME.R']:
        for lr in [1, 0.1, 0.001]:
            print(ne, alg, lr)
            LR_Ada_score = cross_validate(AdaBoostClassifier(base_estimator=LR, n_estimators=ne, algorithm=alg, learning_rate=lr),X,y, 
            scoring = [
                'accuracy', 'f1_weighted', 'precision_weighted',
                'recall_weighted'], 
            cv=8, n_jobs=-1, return_train_score=True)
            scores = []
            for i in LR_Ada_score:
                scores.append(LR_Ada_score[i].mean())    
            LR_Ada_scores.append((scores[:]))    
            LR_Ada_params.append((ne, alg, lr))

10 SAMME 1
10 SAMME 0.1
10 SAMME 0.001
10 SAMME.R 1
10 SAMME.R 0.1
10 SAMME.R 0.001
100 SAMME 1
100 SAMME 0.1
100 SAMME 0.001
100 SAMME.R 1
100 SAMME.R 0.1
100 SAMME.R 0.001
1000 SAMME 1
1000 SAMME 0.1
1000 SAMME 0.001
1000 SAMME.R 1
1000 SAMME.R 0.1
1000 SAMME.R 0.001


In [21]:
%store LR_Ada_scores
%store LR_Ada_params

Stored 'LR_Ada_scores' (list)
Stored 'LR_Ada_params' (list)


In [22]:
LR_Ada_scores_df = pd.DataFrame(LR_Ada_scores, columns = ['fit_time', 'score_time', 'test_accuracy',
 'train_accuracy', 'test_f1_weighted', 'train_f1_weighted', 'test_precision_weighted',
  'train_precision_weighted', 'test_recall_weighted', 'train_recall_weighted'])
LR_Ada_params_df = pd.DataFrame(LR_Ada_params, columns = ['n_estimators', 'algorithm', 'learning_rate'])
LR_Ada_scores_df = LR_Ada_params_df.join(LR_Ada_scores_df, how='left')
LR_Ada_scores_df

Unnamed: 0,n_estimators,algorithm,learning_rate,fit_time,score_time,test_accuracy,train_accuracy,test_f1_weighted,train_f1_weighted,test_precision_weighted,train_precision_weighted,test_recall_weighted,train_recall_weighted
0,10,SAMME,1.0,5.273135,0.080886,0.700996,0.697015,0.647899,0.650917,0.631467,0.623066,0.700996,0.697015
1,10,SAMME,0.1,7.207203,0.090607,0.770814,0.765914,0.732701,0.73469,0.72464,0.714382,0.770814,0.765914
2,10,SAMME,0.001,7.535879,0.079832,0.377495,0.377495,0.206901,0.206901,0.142502,0.142502,0.377495,0.377495
3,10,SAMME.R,1.0,4.732174,0.078301,0.539307,0.541675,0.445989,0.453443,0.606884,0.638703,0.539307,0.541675
4,10,SAMME.R,0.1,5.501116,0.073023,0.377241,0.377748,0.207576,0.208567,0.177241,0.220025,0.377241,0.377748
5,10,SAMME.R,0.001,7.239038,0.084374,0.377495,0.377495,0.206901,0.206901,0.142502,0.142502,0.377495,0.377495
6,100,SAMME,1.0,37.627559,0.152789,0.700362,0.76662,0.674306,0.751719,0.70646,0.761205,0.700362,0.76662
7,100,SAMME,0.1,48.524604,0.207853,0.779622,0.803296,0.749954,0.785218,0.75661,0.791234,0.779622,0.803296
8,100,SAMME,0.001,59.569029,0.181679,0.377495,0.377631,0.206901,0.207269,0.142502,0.168193,0.377495,0.377631
9,100,SAMME.R,1.0,29.048547,0.215973,0.799831,0.813434,0.780278,0.80232,0.791323,0.812135,0.799831,0.813434


In [23]:
LR_Ada_scores_df.sort_values(by='test_accuracy',ascending=False)

Unnamed: 0,n_estimators,algorithm,learning_rate,fit_time,score_time,test_accuracy,train_accuracy,test_f1_weighted,train_f1_weighted,test_precision_weighted,train_precision_weighted,test_recall_weighted,train_recall_weighted
9,100,SAMME.R,1.0,29.048547,0.215973,0.799831,0.813434,0.780278,0.80232,0.791323,0.812135,0.799831,0.813434
16,1000,SAMME.R,0.1,314.298974,1.765081,0.799641,0.813406,0.780048,0.802275,0.791049,0.812057,0.799641,0.813406
13,1000,SAMME,0.1,62.383877,0.237319,0.781411,0.811451,0.766059,0.802821,0.805041,0.817878,0.781411,0.811451
7,100,SAMME,0.1,48.524604,0.207853,0.779622,0.803296,0.749954,0.785218,0.75661,0.791234,0.779622,0.803296
1,10,SAMME,0.1,7.207203,0.090607,0.770814,0.765914,0.732701,0.73469,0.72464,0.714382,0.770814,0.765914
15,1000,SAMME.R,1.0,270.255959,1.71408,0.757207,0.782189,0.73753,0.765496,0.776134,0.791722,0.757207,0.782189
12,1000,SAMME,1.0,73.061468,0.290739,0.732941,0.758565,0.706486,0.740745,0.735745,0.75169,0.732941,0.758565
14,1000,SAMME,0.001,538.714581,1.109837,0.713795,0.735103,0.681546,0.710222,0.693571,0.697268,0.713795,0.735103
0,10,SAMME,1.0,5.273135,0.080886,0.700996,0.697015,0.647899,0.650917,0.631467,0.623066,0.700996,0.697015
6,100,SAMME,1.0,37.627559,0.152789,0.700362,0.76662,0.674306,0.751719,0.70646,0.761205,0.700362,0.76662


n_estimators = 100

algorithm = SAMME.R

learning_rate = 1

### DT

In [22]:
DT_Ada_scores = []
DT_Ada_params = []

for ne in [10,100,1000]:
    for alg in ['SAMME', 'SAMME.R']:
        for lr in [1, 0.1, 0.001]:
            print(ne, alg, lr)
            DT_Ada_score = cross_validate(AdaBoostClassifier(base_estimator=DT, n_estimators=ne, algorithm=alg, learning_rate=lr),X,y, 
            scoring = [
                'accuracy', 'f1_weighted', 'precision_weighted',
                'recall_weighted'], 
            cv=9, n_jobs=-1, return_train_score=True)
            scores = []
            for i in DT_Ada_score:
                scores.append(DT_Ada_score[i].mean())    
            DT_Ada_scores.append((scores[:]))    
            DT_Ada_params.append((ne, alg, lr))

10 SAMME 1
10 SAMME 0.1
10 SAMME 0.001
10 SAMME.R 1
10 SAMME.R 0.1
10 SAMME.R 0.001
100 SAMME 1
100 SAMME 0.1
100 SAMME 0.001
100 SAMME.R 1
100 SAMME.R 0.1
100 SAMME.R 0.001
1000 SAMME 1
1000 SAMME 0.1
1000 SAMME 0.001
1000 SAMME.R 1
1000 SAMME.R 0.1
1000 SAMME.R 0.001


In [23]:
%store DT_Ada_scores
%store DT_Ada_params

Stored 'DT_Ada_scores' (list)
Stored 'DT_Ada_params' (list)


In [24]:
DT_Ada_scores_df = pd.DataFrame(DT_Ada_scores, columns = ['fit_time', 'score_time', 'test_accuracy',
 'train_accuracy', 'test_f1_weighted', 'train_f1_weighted', 'test_precision_weighted',
  'train_precision_weighted', 'test_recall_weighted', 'train_recall_weighted'])
DT_Ada_params_df = pd.DataFrame(DT_Ada_params, columns = ['n_estimators', 'algorithm', 'learning_rate'])
DT_Ada_scores_df = DT_Ada_params_df.join(DT_Ada_scores_df, how='left')
DT_Ada_scores_df

Unnamed: 0,n_estimators,algorithm,learning_rate,fit_time,score_time,test_accuracy,train_accuracy,test_f1_weighted,train_f1_weighted,test_precision_weighted,train_precision_weighted,test_recall_weighted,train_recall_weighted
0,10,SAMME,1.0,0.618504,0.075192,0.777923,0.951379,0.764737,0.951342,0.808487,0.953587,0.777923,0.951379
1,10,SAMME,0.1,0.588216,0.084451,0.836587,0.948837,0.827147,0.948615,0.852002,0.951277,0.836587,0.948837
2,10,SAMME,0.001,0.522776,0.072451,0.848375,0.942652,0.837842,0.942378,0.865096,0.946426,0.848375,0.942652
3,10,SAMME.R,1.0,0.743089,0.079782,0.762966,0.918329,0.752667,0.918513,0.798048,0.92205,0.762966,0.918329
4,10,SAMME.R,0.1,0.606944,0.075195,0.745296,0.958381,0.726795,0.958216,0.794563,0.961052,0.745296,0.958381
5,10,SAMME.R,0.001,0.60723,0.068681,0.777921,0.952892,0.76424,0.952692,0.813592,0.956569,0.777921,0.952892
6,100,SAMME,1.0,5.708479,0.259308,0.765759,0.991272,0.753774,0.991276,0.801821,0.991334,0.765759,0.991272
7,100,SAMME,0.1,5.387372,0.169443,0.776528,0.976462,0.762432,0.976398,0.808709,0.976906,0.776528,0.976462
8,100,SAMME,0.001,5.644511,0.188958,0.864467,0.944402,0.854769,0.944122,0.882358,0.948558,0.864467,0.944402
9,100,SAMME.R,1.0,5.887483,0.161839,0.780904,0.963014,0.769057,0.963035,0.814635,0.963198,0.780904,0.963014


In [25]:
DT_Ada_scores_df.sort_values(by='test_accuracy',ascending=False)

Unnamed: 0,n_estimators,algorithm,learning_rate,fit_time,score_time,test_accuracy,train_accuracy,test_f1_weighted,train_f1_weighted,test_precision_weighted,train_precision_weighted,test_recall_weighted,train_recall_weighted
8,100,SAMME,0.001,5.644511,0.188958,0.864467,0.944402,0.854769,0.944122,0.882358,0.948558,0.864467,0.944402
2,10,SAMME,0.001,0.522776,0.072451,0.848375,0.942652,0.837842,0.942378,0.865096,0.946426,0.848375,0.942652
14,1000,SAMME,0.001,45.465209,1.133533,0.847298,0.9529,0.835832,0.952678,0.867324,0.955155,0.847298,0.9529
1,10,SAMME,0.1,0.588216,0.084451,0.836587,0.948837,0.827147,0.948615,0.852002,0.951277,0.836587,0.948837
11,100,SAMME.R,0.001,5.698281,0.1664,0.781213,0.961565,0.762863,0.961408,0.816036,0.96452,0.781213,0.961565
9,100,SAMME.R,1.0,5.887483,0.161839,0.780904,0.963014,0.769057,0.963035,0.814635,0.963198,0.780904,0.963014
0,10,SAMME,1.0,0.618504,0.075192,0.777923,0.951379,0.764737,0.951342,0.808487,0.953587,0.777923,0.951379
5,10,SAMME.R,0.001,0.60723,0.068681,0.777921,0.952892,0.76424,0.952692,0.813592,0.956569,0.777921,0.952892
7,100,SAMME,0.1,5.387372,0.169443,0.776528,0.976462,0.762432,0.976398,0.808709,0.976906,0.776528,0.976462
12,1000,SAMME,1.0,46.77028,1.288088,0.773869,0.992682,0.762952,0.99268,0.80652,0.992691,0.773869,0.992682


n_estimators = 100

agorithm = SAMME

learning_rate = 0.001

### RF

In [27]:
RF_Ada_scores = []
RF_Ada_params = []

for ne in [1,10]:
    for alg in ['SAMME', 'SAMME.R']:
        for lr in [1, 0.1, 0.001]:
            print(ne, alg, lr)
            RF_Ada_score = cross_validate(AdaBoostClassifier(base_estimator=RF, n_estimators=ne, algorithm=alg, learning_rate=lr),X,y, 
            scoring = [
                'accuracy', 'f1_weighted', 'precision_weighted',
                'recall_weighted'], 
            cv=8, n_jobs=-1, return_train_score=True)
            scores = []
            for i in RF_Ada_score:
                scores.append(RF_Ada_score[i].mean())    
            RF_Ada_scores.append((scores[:]))    
            RF_Ada_params.append((ne, alg, lr))

1 SAMME 1
1 SAMME 0.1
1 SAMME 0.001
1 SAMME.R 1
1 SAMME.R 0.1
1 SAMME.R 0.001
10 SAMME 1
10 SAMME 0.1
10 SAMME 0.001
10 SAMME.R 1
10 SAMME.R 0.1
10 SAMME.R 0.001


In [28]:
%store RF_Ada_scores
%store RF_Ada_params

Stored 'RF_Ada_scores' (list)
Stored 'RF_Ada_params' (list)


In [29]:
RF_Ada_scores_df = pd.DataFrame(RF_Ada_scores, columns = ['fit_time', 'score_time', 'test_accuracy',
 'train_accuracy', 'test_f1_weighted', 'train_f1_weighted', 'test_precision_weighted',
  'train_precision_weighted', 'test_recall_weighted', 'train_recall_weighted'])
RF_Ada_params_df = pd.DataFrame(RF_Ada_params, columns = ['n_estimators', 'algorithm', 'learning_rate'])
RF_Ada_scores_df = RF_Ada_params_df.join(RF_Ada_scores_df, how='left')
RF_Ada_scores_df

Unnamed: 0,n_estimators,algorithm,learning_rate,fit_time,score_time,test_accuracy,train_accuracy,test_f1_weighted,train_f1_weighted,test_precision_weighted,train_precision_weighted,test_recall_weighted,train_recall_weighted
0,1,SAMME,1.0,21.786599,1.325059,0.890058,0.933952,0.882642,0.933508,0.919572,0.941869,0.890058,0.933952
1,1,SAMME,0.1,21.422696,1.434743,0.889171,0.934088,0.881675,0.933642,0.919202,0.942002,0.889171,0.934088
2,1,SAMME,0.001,21.566233,1.620253,0.889615,0.933807,0.882205,0.933362,0.919398,0.941739,0.889615,0.933807
3,1,SAMME.R,1.0,20.364333,1.459941,0.889931,0.933907,0.882442,0.93346,0.919726,0.941876,0.889931,0.933907
4,1,SAMME.R,0.1,19.553875,1.325825,0.889678,0.933943,0.882241,0.933497,0.919372,0.941872,0.889678,0.933943
5,1,SAMME.R,0.001,19.382167,1.39864,0.888854,0.933816,0.88133,0.933372,0.919064,0.941746,0.888854,0.933816
6,10,SAMME,1.0,215.50304,11.172272,0.732177,0.898389,0.707197,0.896443,0.815475,0.921184,0.732177,0.898389
7,10,SAMME,0.1,217.608604,13.86608,0.860661,0.954834,0.851322,0.954604,0.893515,0.957754,0.860661,0.954834
8,10,SAMME,0.001,224.13515,12.77029,0.889234,0.934151,0.881733,0.933703,0.919224,0.942062,0.889234,0.934151
9,10,SAMME.R,1.0,234.414782,12.091669,0.855023,0.952082,0.842752,0.951872,0.887737,0.953949,0.855023,0.952082


In [30]:
RF_Ada_scores_df.sort_values(by='test_accuracy',ascending=False)

Unnamed: 0,n_estimators,algorithm,learning_rate,fit_time,score_time,test_accuracy,train_accuracy,test_f1_weighted,train_f1_weighted,test_precision_weighted,train_precision_weighted,test_recall_weighted,train_recall_weighted
0,1,SAMME,1.0,21.786599,1.325059,0.890058,0.933952,0.882642,0.933508,0.919572,0.941869,0.890058,0.933952
3,1,SAMME.R,1.0,20.364333,1.459941,0.889931,0.933907,0.882442,0.93346,0.919726,0.941876,0.889931,0.933907
4,1,SAMME.R,0.1,19.553875,1.325825,0.889678,0.933943,0.882241,0.933497,0.919372,0.941872,0.889678,0.933943
2,1,SAMME,0.001,21.566233,1.620253,0.889615,0.933807,0.882205,0.933362,0.919398,0.941739,0.889615,0.933807
11,10,SAMME.R,0.001,245.758058,14.632749,0.889298,0.934133,0.881842,0.933688,0.919298,0.942078,0.889298,0.934133
8,10,SAMME,0.001,224.13515,12.77029,0.889234,0.934151,0.881733,0.933703,0.919224,0.942062,0.889234,0.934151
1,1,SAMME,0.1,21.422696,1.434743,0.889171,0.934088,0.881675,0.933642,0.919202,0.942002,0.889171,0.934088
5,1,SAMME.R,0.001,19.382167,1.39864,0.888854,0.933816,0.88133,0.933372,0.919064,0.941746,0.888854,0.933816
10,10,SAMME.R,0.1,232.757006,13.118943,0.887017,0.936088,0.878951,0.935657,0.917305,0.943701,0.887017,0.936088
7,10,SAMME,0.1,217.608604,13.86608,0.860661,0.954834,0.851322,0.954604,0.893515,0.957754,0.860661,0.954834


n_estimators = 1

agorithm = SAMME

learning_rate = 1

### SVM

In [11]:
SVM_Ada_scores = []
SVM_Ada_params = []

for ne in [1,10]:
    for alg in ['SAMME', 'SAMME.R']:
        for lr in [1, 0.1]:
            print(ne, alg, lr)
            SVM_Ada_score = cross_validate(AdaBoostClassifier(base_estimator=SVM, n_estimators=ne, algorithm=alg, learning_rate=lr),X,y, 
            scoring = [
                'accuracy', 'f1_weighted', 'precision_weighted',
                'recall_weighted'], 
            cv=10, n_jobs=-1, return_train_score=True)
            scores = []
            for i in SVM_Ada_score:
                scores.append(SVM_Ada_score[i].mean())    
            SVM_Ada_scores.append((scores[:]))    
            SVM_Ada_params.append((ne, alg, lr))

1 SAMME 1
1 SAMME 0.1
1 SAMME.R 1
1 SAMME.R 0.1
10 SAMME 1
10 SAMME 0.1
10 SAMME.R 1
10 SAMME.R 0.1


In [12]:
%store SVM_Ada_scores
%store SVM_Ada_params

Stored 'SVM_Ada_scores' (list)
Stored 'SVM_Ada_params' (list)


In [13]:
SVM_Ada_scores_df = pd.DataFrame(SVM_Ada_scores, columns = ['fit_time', 'score_time', 'test_accuracy',
 'train_accuracy', 'test_f1_weighted', 'train_f1_weighted', 'test_precision_weighted',
  'train_precision_weighted', 'test_recall_weighted', 'train_recall_weighted'])
SVM_Ada_params_df = pd.DataFrame(SVM_Ada_params, columns = ['n_estimators', 'algorithm', 'learning_rate'])
SVM_Ada_scores_df = SVM_Ada_params_df.join(SVM_Ada_scores_df, how='left')
SVM_Ada_scores_df

Unnamed: 0,n_estimators,algorithm,learning_rate,fit_time,score_time,test_accuracy,train_accuracy,test_f1_weighted,train_f1_weighted,test_precision_weighted,train_precision_weighted,test_recall_weighted,train_recall_weighted
0,1,SAMME,1.0,301.906273,1.054751,,,,,,,,
1,1,SAMME,0.1,300.112588,1.050154,,,,,,,,
2,1,SAMME.R,1.0,305.884237,1.972787,0.377495,0.377495,0.206901,0.206901,0.142502,0.142502,0.377495,0.377495
3,1,SAMME.R,0.1,299.629185,1.945103,0.377495,0.377495,0.206901,0.206901,0.142502,0.142502,0.377495,0.377495
4,10,SAMME,1.0,482.5375,1.61714,,,,,,,,
5,10,SAMME,0.1,464.066595,1.410896,,,,,,,,
6,10,SAMME.R,1.0,2134.735897,16.094684,0.686941,0.667484,0.650493,0.63188,0.651711,0.633631,0.686941,0.667484
7,10,SAMME.R,0.1,2102.802457,16.471385,0.818097,0.833337,0.80312,0.828216,0.818559,0.842945,0.818097,0.833337


In [14]:
SVM_Ada_scores_df.sort_values(by='test_accuracy',ascending=False)

Unnamed: 0,n_estimators,algorithm,learning_rate,fit_time,score_time,test_accuracy,train_accuracy,test_f1_weighted,train_f1_weighted,test_precision_weighted,train_precision_weighted,test_recall_weighted,train_recall_weighted
7,10,SAMME.R,0.1,2102.802457,16.471385,0.818097,0.833337,0.80312,0.828216,0.818559,0.842945,0.818097,0.833337
6,10,SAMME.R,1.0,2134.735897,16.094684,0.686941,0.667484,0.650493,0.63188,0.651711,0.633631,0.686941,0.667484
2,1,SAMME.R,1.0,305.884237,1.972787,0.377495,0.377495,0.206901,0.206901,0.142502,0.142502,0.377495,0.377495
3,1,SAMME.R,0.1,299.629185,1.945103,0.377495,0.377495,0.206901,0.206901,0.142502,0.142502,0.377495,0.377495
0,1,SAMME,1.0,301.906273,1.054751,,,,,,,,
1,1,SAMME,0.1,300.112588,1.050154,,,,,,,,
4,10,SAMME,1.0,482.5375,1.61714,,,,,,,,
5,10,SAMME,0.1,464.066595,1.410896,,,,,,,,


n_estimators = 10

agorithm = SAMME.R

learning_rate = 0.1

### GNB

In [15]:
GNB_Ada_scores = []
GNB_Ada_params = []

for ne in [10,100,1000]:
    for alg in ['SAMME', 'SAMME.R']:
        for lr in [1, 0.1, 0.001]:
            print(ne, alg, lr)
            GNB_Ada_score = cross_validate(AdaBoostClassifier(base_estimator=GNB, n_estimators=ne, algorithm=alg, learning_rate=lr),X,y, 
            scoring = [
                'accuracy', 'f1_weighted', 'precision_weighted',
                'recall_weighted'], 
            cv=2, n_jobs=-1, return_train_score=True)
            scores = []
            for i in GNB_Ada_score:
                scores.append(GNB_Ada_score[i].mean())    
            GNB_Ada_scores.append((scores[:]))    
            GNB_Ada_params.append((ne, alg, lr))

10 SAMME 1
10 SAMME 0.1
10 SAMME 0.001
10 SAMME.R 1
10 SAMME.R 0.1
10 SAMME.R 0.001
100 SAMME 1
100 SAMME 0.1
100 SAMME 0.001
100 SAMME.R 1
100 SAMME.R 0.1
100 SAMME.R 0.001
1000 SAMME 1
1000 SAMME 0.1
1000 SAMME 0.001
1000 SAMME.R 1
1000 SAMME.R 0.1
1000 SAMME.R 0.001


In [16]:
%store GNB_Ada_scores
%store GNB_Ada_params

Stored 'GNB_Ada_scores' (list)
Stored 'GNB_Ada_params' (list)


In [17]:
GNB_Ada_scores_df = pd.DataFrame(GNB_Ada_scores, columns = ['fit_time', 'score_time', 'test_accuracy',
 'train_accuracy', 'test_f1_weighted', 'train_f1_weighted', 'test_precision_weighted',
  'train_precision_weighted', 'test_recall_weighted', 'train_recall_weighted'])
GNB_Ada_params_df = pd.DataFrame(GNB_Ada_params, columns = ['n_estimators', 'algorithm', 'learning_rate'])
GNB_Ada_scores_df = GNB_Ada_params_df.join(GNB_Ada_scores_df, how='left')
GNB_Ada_scores_df

Unnamed: 0,n_estimators,algorithm,learning_rate,fit_time,score_time,test_accuracy,train_accuracy,test_f1_weighted,train_f1_weighted,test_precision_weighted,train_precision_weighted,test_recall_weighted,train_recall_weighted
0,10,SAMME,1.0,0.531234,0.511493,0.717924,0.78984,0.705654,0.7695,0.753189,0.797311,0.717924,0.78984
1,10,SAMME,0.1,0.367179,0.312489,0.781599,0.826838,0.767146,0.816898,0.785224,0.827467,0.781599,0.826838
2,10,SAMME,0.001,0.251294,0.234367,0.796044,0.803965,0.782949,0.791803,0.795373,0.815767,0.796044,0.803965
3,10,SAMME.R,1.0,0.379383,0.359365,0.744473,0.849586,0.740592,0.84172,0.764563,0.841409,0.744473,0.849586
4,10,SAMME.R,0.1,0.303865,0.296865,0.799846,0.810048,0.789077,0.799888,0.803589,0.819877,0.799846,0.810048
5,10,SAMME.R,0.001,0.364889,0.296866,0.796108,0.803965,0.783023,0.791803,0.795457,0.815767,0.796108,0.803965
6,100,SAMME,1.0,2.70833,0.999965,0.636004,0.75911,0.608173,0.752193,0.664579,0.768592,0.636004,0.75911
7,100,SAMME,0.1,2.831377,1.484329,0.71261,0.879238,0.698995,0.878724,0.751585,0.881019,0.71261,0.879238
8,100,SAMME,0.001,2.69626,1.046844,0.796044,0.794714,0.783325,0.78225,0.795953,0.810385,0.796044,0.794714
9,100,SAMME.R,1.0,3.632235,1.656197,0.572271,0.688523,0.539295,0.700741,0.635675,0.825788,0.572271,0.688523


In [18]:
GNB_Ada_scores_df.sort_values(by='test_accuracy',ascending=False)

Unnamed: 0,n_estimators,algorithm,learning_rate,fit_time,score_time,test_accuracy,train_accuracy,test_f1_weighted,train_f1_weighted,test_precision_weighted,train_precision_weighted,test_recall_weighted,train_recall_weighted
4,10,SAMME.R,0.1,0.303865,0.296865,0.799846,0.810048,0.789077,0.799888,0.803589,0.819877,0.799846,0.810048
17,1000,SAMME.R,0.001,32.65721,14.562068,0.799782,0.810555,0.788997,0.800385,0.803311,0.820223,0.799782,0.810555
11,100,SAMME.R,0.001,3.609038,1.75595,0.796234,0.804028,0.783254,0.791871,0.795622,0.815931,0.796234,0.804028
5,10,SAMME.R,0.001,0.364889,0.296866,0.796108,0.803965,0.783023,0.791803,0.795457,0.815767,0.796108,0.803965
8,100,SAMME,0.001,2.69626,1.046844,0.796044,0.794714,0.783325,0.78225,0.795953,0.810385,0.796044,0.794714
2,10,SAMME,0.001,0.251294,0.234367,0.796044,0.803965,0.782949,0.791803,0.795373,0.815767,0.796044,0.803965
14,1000,SAMME,0.001,25.491431,9.046605,0.78369,0.827154,0.770284,0.817246,0.790491,0.828374,0.78369,0.827154
1,10,SAMME,0.1,0.367179,0.312489,0.781599,0.826838,0.767146,0.816898,0.785224,0.827467,0.781599,0.826838
10,100,SAMME.R,0.1,3.118365,1.195271,0.74498,0.849649,0.741286,0.841654,0.765412,0.841297,0.74498,0.849649
3,10,SAMME.R,1.0,0.379383,0.359365,0.744473,0.849586,0.740592,0.84172,0.764563,0.841409,0.744473,0.849586


n_estimators = 10

agorithm = SAMME.R

learning_rate = 0.1

### MNB

In [19]:
MNB_Ada_scores = []
MNB_Ada_params = []

for ne in [10,100,1000]:
    for alg in ['SAMME', 'SAMME.R']:
        for lr in [1, 0.1, 0.001]:
            print(ne, alg, lr)
            MNB_Ada_score = cross_validate(AdaBoostClassifier(base_estimator=MNB, n_estimators=ne, algorithm=alg, learning_rate=lr),X,y, 
            scoring = [
                'accuracy', 'f1_weighted', 'precision_weighted',
                'recall_weighted'], 
            cv=6, n_jobs=-1, return_train_score=True)
            scores = []
            for i in MNB_Ada_score:
                scores.append(MNB_Ada_score[i].mean())    
            MNB_Ada_scores.append((scores[:]))    
            MNB_Ada_params.append((ne, alg, lr))

10 SAMME 1
10 SAMME 0.1
10 SAMME 0.001
10 SAMME.R 1
10 SAMME.R 0.1
10 SAMME.R 0.001
100 SAMME 1
100 SAMME 0.1
100 SAMME 0.001
100 SAMME.R 1
100 SAMME.R 0.1
100 SAMME.R 0.001
1000 SAMME 1
1000 SAMME 0.1
1000 SAMME 0.001
1000 SAMME.R 1
1000 SAMME.R 0.1
1000 SAMME.R 0.001


In [20]:
%store MNB_Ada_scores
%store MNB_Ada_params

Stored 'MNB_Ada_scores' (list)
Stored 'MNB_Ada_params' (list)


In [21]:
MNB_Ada_scores_df = pd.DataFrame(MNB_Ada_scores, columns = ['fit_time', 'score_time', 'test_accuracy',
 'train_accuracy', 'test_f1_weighted', 'train_f1_weighted', 'test_precision_weighted',
  'train_precision_weighted', 'test_recall_weighted', 'train_recall_weighted'])
MNB_Ada_params_df = pd.DataFrame(MNB_Ada_params, columns = ['n_estimators', 'algorithm', 'learning_rate'])
MNB_Ada_scores_df = MNB_Ada_params_df.join(MNB_Ada_scores_df, how='left')
MNB_Ada_scores_df

Unnamed: 0,n_estimators,algorithm,learning_rate,fit_time,score_time,test_accuracy,train_accuracy,test_f1_weighted,train_f1_weighted,test_precision_weighted,train_precision_weighted,test_recall_weighted,train_recall_weighted
0,10,SAMME,1.0,0.76582,0.091144,0.753909,0.775936,0.740423,0.765465,0.770689,0.781914,0.753909,0.775936
1,10,SAMME,0.1,0.647577,0.080728,0.803962,0.815776,0.795171,0.811019,0.807968,0.814166,0.803962,0.815776
2,10,SAMME,0.001,0.653939,0.080729,0.798448,0.805575,0.780718,0.793068,0.792512,0.803893,0.798448,0.805575
3,10,SAMME.R,1.0,0.780036,0.10156,0.787489,0.806462,0.774445,0.796992,0.795286,0.808237,0.787489,0.806462
4,10,SAMME.R,0.1,0.837555,0.098874,0.803199,0.806247,0.785104,0.792997,0.795631,0.803237,0.803199,0.806247
5,10,SAMME.R,0.001,0.796726,0.088539,0.798004,0.805487,0.780188,0.792861,0.791863,0.803761,0.798004,0.805487
6,100,SAMME,1.0,3.636808,0.124881,0.714442,0.768004,0.702369,0.766825,0.742317,0.780249,0.714442,0.768004
7,100,SAMME,0.1,5.836179,0.138427,0.791241,0.84335,0.780476,0.840971,0.815207,0.843685,0.791241,0.84335
8,100,SAMME,0.001,6.51173,0.174174,0.799398,0.805778,0.781995,0.793519,0.795904,0.803764,0.799398,0.805778
9,100,SAMME.R,1.0,7.773206,0.304678,0.678662,0.703502,0.675575,0.71691,0.768428,0.779475,0.678662,0.703502


In [22]:
MNB_Ada_scores_df.sort_values(by='test_accuracy',ascending=False)

Unnamed: 0,n_estimators,algorithm,learning_rate,fit_time,score_time,test_accuracy,train_accuracy,test_f1_weighted,train_f1_weighted,test_precision_weighted,train_precision_weighted,test_recall_weighted,train_recall_weighted
14,1000,SAMME,0.001,64.483182,1.039033,0.805674,0.817778,0.798566,0.813066,0.817063,0.813999,0.805674,0.817778
1,10,SAMME,0.1,0.647577,0.080728,0.803962,0.815776,0.795171,0.811019,0.807968,0.814166,0.803962,0.815776
17,1000,SAMME.R,0.001,77.623185,2.390554,0.803262,0.806298,0.785253,0.79304,0.797447,0.803448,0.803262,0.806298
4,10,SAMME.R,0.1,0.837555,0.098874,0.803199,0.806247,0.785104,0.792997,0.795631,0.803237,0.803199,0.806247
8,100,SAMME,0.001,6.51173,0.174174,0.799398,0.805778,0.781995,0.793519,0.795904,0.803764,0.799398,0.805778
11,100,SAMME.R,0.001,7.74196,0.299469,0.799081,0.805385,0.781078,0.792781,0.79247,0.803583,0.799081,0.805385
2,10,SAMME,0.001,0.653939,0.080729,0.798448,0.805575,0.780718,0.793068,0.792512,0.803893,0.798448,0.805575
5,10,SAMME.R,0.001,0.796726,0.088539,0.798004,0.805487,0.780188,0.792861,0.791863,0.803761,0.798004,0.805487
13,1000,SAMME,0.1,6.536266,0.151037,0.792509,0.843616,0.781589,0.841251,0.816243,0.843988,0.792509,0.843616
7,100,SAMME,0.1,5.836179,0.138427,0.791241,0.84335,0.780476,0.840971,0.815207,0.843685,0.791241,0.84335


n_estimators = 1000

agorithm = SAMME

learning_rate = 0.001

### KNN

In [27]:
KNN_Ada_scores = []
KNN_Ada_params = []

for ne in [10,100,1000]:
    for alg in ['SAMME', 'SAMME.R']:
        for lr in [1, 0.1, 0.001]:
            print(ne, alg, lr)
            KNN_Ada_score = cross_validate(AdaBoostClassifier(base_estimator=KNN, n_estimators=ne, algorithm=alg, learning_rate=lr),X,y, 
            scoring = [
                'accuracy', 'f1_weighted', 'precision_weighted',
                'recall_weighted'], 
            cv=8, n_jobs=-1, return_train_score=True)
            scores = []
            for i in KNN_Ada_score:
                scores.append(KNN_Ada_score[i].mean())    
            KNN_Ada_scores.append((scores[:]))    
            KNN_Ada_params.append((ne, alg, lr))

10 SAMME 1
10 SAMME 0.1
10 SAMME 0.001
10 SAMME.R 1
10 SAMME.R 0.1
10 SAMME.R 0.001
100 SAMME 1
100 SAMME 0.1
100 SAMME 0.001
100 SAMME.R 1
100 SAMME.R 0.1
100 SAMME.R 0.001
1000 SAMME 1
1000 SAMME 0.1
1000 SAMME 0.001
1000 SAMME.R 1
1000 SAMME.R 0.1
1000 SAMME.R 0.001


In [28]:
%store KNN_Ada_scores
%store KNN_Ada_params

Stored 'KNN_Ada_scores' (list)
Stored 'KNN_Ada_params' (list)


In [29]:
KNN_Ada_scores_df = pd.DataFrame(KNN_Ada_scores, columns = ['fit_time', 'score_time', 'test_accuracy',
 'train_accuracy', 'test_f1_weighted', 'train_f1_weighted', 'test_precision_weighted',
  'train_precision_weighted', 'test_recall_weighted', 'train_recall_weighted'])
KNN_Ada_params_df = pd.DataFrame(KNN_Ada_params, columns = ['n_estimators', 'algorithm', 'learning_rate'])
KNN_Ada_scores_df = KNN_Ada_params_df.join(KNN_Ada_scores_df, how='left')
KNN_Ada_scores_df

Unnamed: 0,n_estimators,algorithm,learning_rate,fit_time,score_time,test_accuracy,train_accuracy,test_f1_weighted,train_f1_weighted,test_precision_weighted,train_precision_weighted,test_recall_weighted,train_recall_weighted
0,10,SAMME,1.0,0.010868,0.0,,,,,,,,
1,10,SAMME,0.1,0.011368,0.0,,,,,,,,
2,10,SAMME,0.001,0.013365,0.0,,,,,,,,
3,10,SAMME.R,1.0,0.008743,0.0,,,,,,,,
4,10,SAMME.R,0.1,0.014365,0.0,,,,,,,,
5,10,SAMME.R,0.001,0.008743,0.0,,,,,,,,
6,100,SAMME,1.0,0.010492,0.0,,,,,,,,
7,100,SAMME,0.1,0.015239,0.0,,,,,,,,
8,100,SAMME,0.001,0.012616,0.0,,,,,,,,
9,100,SAMME.R,1.0,0.010992,0.0,,,,,,,,


In [26]:
KNN_Ada_scores_df.sort_values(by='test_accuracy',ascending=False)

Unnamed: 0,n_estimators,algorithm,learning_rate,fit_time,score_time,test_accuracy,train_accuracy,test_f1_weighted,train_f1_weighted,test_precision_weighted,train_precision_weighted,test_recall_weighted,train_recall_weighted
0,10,SAMME,1.0,0.009766,0.0,,,,,,,,
1,10,SAMME,0.1,0.011718,0.0,,,,,,,,
2,10,SAMME,0.001,0.003906,0.0,,,,,,,,
3,10,SAMME.R,1.0,0.007813,0.0,,,,,,,,
4,10,SAMME.R,0.1,0.009766,0.0,,,,,,,,
5,10,SAMME.R,0.001,0.005859,0.0,,,,,,,,
6,100,SAMME,1.0,0.007813,0.0,,,,,,,,
7,100,SAMME,0.1,0.00586,0.0,,,,,,,,
8,100,SAMME,0.001,0.015626,0.0,,,,,,,,
9,100,SAMME.R,1.0,0.009766,0.0,,,,,,,,


Incompatible with Adaboost

Compared to DecisionTreeClassifier, KNeighborsClassifier does not have the attributes classes_/n_classes_, which is requirement as stated in AdaBoostClassifier docs: Support for sample weighting is required, as well as proper classes_ and n_classes_ attributes.

https://stackoverflow.com/questions/38651913/how-do-you-combine-knn-and-adaboost-with-sklearn

## XGBoost

Cross validation and hyper paramter tuning for XGBoost (Extreme Gradient Boosting), an implementation of gradient boosted decision trees designed for speed and performance which provides a regularizing gradient boosting.

In [5]:
XGB_scores = []
XGB_params = []

for ne in [1000]: #10,50,100
    for md in [10000]: #100,1000,100000
        for lr in [0.001]: #1
            for b in ['gbtree']: #, 'gblinear', 'dart'
                for tm in ['approx']: #'auto', 'exact',  'hist'
                    for g in [1]: # 0.1,
                        for mcw in [0.1]: # ,1
                            for mds in [1]: #0.1,
                                for ra in [1]: #0.1,
                                    for rl in [1]: #0.1,
                                        #for folds in range(7,11): best folds = 8
                                        print(ne, md, lr, b, tm, g, mcw, mds, ra, rl, 8)
                                        XGB_score = cross_validate(XGBClassifier(n_estimators=ne, max_depth=md,
                                        learning_rate=lr, booster=b, tree_method=tm, gamma=g, min_child_weight=mcw, max_delta_step=mds, reg_alpha=ra, reg_lambda=rl),X,y, 
                                        scoring = [
                                            'accuracy', 'f1_weighted', 'precision_weighted',
                                            'recall_weighted'], 
                                        cv=8, n_jobs=-1, return_train_score=True)
                                        scores = []
                                        for i in XGB_score:
                                            scores.append(XGB_score[i].mean())    
                                        XGB_scores.append((scores[:]))    
                                        XGB_params.append((ne, md, lr, b, tm, g, mcw, mds, ra, rl, 8))
                                        print('scores:')
                                        print(scores)

1000 100000 0.001 gbtree approx 1 0.1 1 1 1 8
scores:
[617.4948163330555, 0.7242419421672821, 0.8483066658510583, 0.9542000904577578, 0.8381928992372267, 0.9540290038542578, 0.877415820515235, 0.9560648106565403, 0.8483066658510583, 0.9542000904577578]


In [8]:
%store XGB_scores
%store XGB_params

Stored 'XGB_scores' (list)
Stored 'XGB_params' (list)


#### Note

The cells below found the best number of folds but didn't represent it, so anothe cell was created to show the best number of folds

In [None]:
XGB_scores_df = pd.DataFrame(XGB_scores, columns = ['fit_time', 'score_time', 'test_accuracy',
 'train_accuracy', 'test_f1_weighted', 'train_f1_weighted', 'test_precision_weighted',
  'train_precision_weighted', 'test_recall_weighted', 'train_recall_weighted'])
XGB_params_df = pd.DataFrame(XGB_params, columns = ['n_estimators', 'max_depth', 'learning_rate', 'booster', 
'tree_method', 'gamma', 'min_child_weight', 'max_delta_step', 'reg_alpha(l1)', 'reg_lambda(l2)', 'Folds'])
XGB_scores_df = XGB_params_df.join(XGB_scores_df, how='left')
XGB_scores_df

Unnamed: 0,n_estimators,max_depth,learning_rate,booster,tree_method,gamma,min_child_weight,max_delta_step,reg_alpha(l1),reg_lambda(l2),...,fit_time,score_time,test_accuracy,train_accuracy,test_f1_weighted,train_f1_weighted,test_precision_weighted,train_precision_weighted,test_recall_weighted,train_recall_weighted
0,1000,10000,0.001,gbtree,approx,1,0.1,1,1,1,...,509.318529,0.573453,0.834248,0.954677,0.823064,0.954503,0.866889,0.956612,0.834248,0.954677
1,1000,10000,0.001,gbtree,approx,1,0.1,1,1,1,...,525.679808,0.657914,0.848307,0.9542,0.838193,0.954029,0.877416,0.956065,0.848307,0.9542
2,1000,10000,0.001,gbtree,approx,1,0.1,1,1,1,...,563.194405,0.795028,0.799197,0.953756,0.787779,0.953575,0.824981,0.955747,0.799197,0.953756
3,1000,10000,0.001,gbtree,approx,1,0.1,1,1,1,...,532.949064,0.450795,0.812022,0.953586,0.800138,0.953405,0.829582,0.955505,0.812022,0.953586


In [10]:
XGB_scores_df.sort_values(by='test_accuracy',ascending=False)

Unnamed: 0,n_estimators,max_depth,learning_rate,booster,tree_method,gamma,min_child_weight,max_delta_step,reg_alpha(l1),reg_lambda(l2),...,fit_time,score_time,test_accuracy,train_accuracy,test_f1_weighted,train_f1_weighted,test_precision_weighted,train_precision_weighted,test_recall_weighted,train_recall_weighted
1,1000,10000,0.001,gbtree,approx,1,0.1,1,1,1,...,525.679808,0.657914,0.848307,0.9542,0.838193,0.954029,0.877416,0.956065,0.848307,0.9542
0,1000,10000,0.001,gbtree,approx,1,0.1,1,1,1,...,509.318529,0.573453,0.834248,0.954677,0.823064,0.954503,0.866889,0.956612,0.834248,0.954677
3,1000,10000,0.001,gbtree,approx,1,0.1,1,1,1,...,532.949064,0.450795,0.812022,0.953586,0.800138,0.953405,0.829582,0.955505,0.812022,0.953586
2,1000,10000,0.001,gbtree,approx,1,0.1,1,1,1,...,563.194405,0.795028,0.799197,0.953756,0.787779,0.953575,0.824981,0.955747,0.799197,0.953756


In [11]:
XGB_scores_df.sort_values(by='test_accuracy',ascending=False)['Folds']

1     8
0     7
3    10
2     9
Name: Folds, dtype: int64

#### Note

Following cell will provide the results with cv=8 (best number of folds) alongside other changes in parameters.

In [9]:
XGB_scores_df = pd.DataFrame(XGB_scores, columns = ['fit_time', 'score_time', 'test_accuracy',
 'train_accuracy', 'test_f1_weighted', 'train_f1_weighted', 'test_precision_weighted',
  'train_precision_weighted', 'test_recall_weighted', 'train_recall_weighted'])
XGB_params_df = pd.DataFrame(XGB_params, columns = ['n_estimators', 'max_depth', 'learning_rate', 'booster', 
'tree_method', 'gamma', 'min_child_weight', 'max_delta_step', 'reg_alpha(l1)', 'reg_lambda(l2)', 'Folds'])
XGB_scores_df = XGB_params_df.join(XGB_scores_df, how='left')
XGB_scores_df

Unnamed: 0,n_estimators,max_depth,learning_rate,booster,tree_method,gamma,min_child_weight,max_delta_step,reg_alpha(l1),reg_lambda(l2),...,fit_time,score_time,test_accuracy,train_accuracy,test_f1_weighted,train_f1_weighted,test_precision_weighted,train_precision_weighted,test_recall_weighted,train_recall_weighted
0,1000,100000,0.001,gbtree,approx,1,0.1,1,1,1,...,617.494816,0.724242,0.848307,0.9542,0.838193,0.954029,0.877416,0.956065,0.848307,0.9542


n_estimators = 1000

max_depth = 100000

learning_rate = 0.001

booster = gbtree

tree_method = approx

gamma = 1

min_child_weight = 0.1

max_delta_step = 1

reg_alpha = 1

reg_delta = 1

## Bagging

Each subsection contains the same step as previous section for cross validating and hyper paramter tuning in order to find best state and get best results

At the end of each subsection the best hyper paramters and fold numbers are noted

### LR

In [24]:
LR_bag_scores = []
LR_bag_params = []

for ne in [10,100]:
    for ms in [1,5,7]:
        for mf in [0.2, 0.5, 0.7, 1]:
            for bs in [True, False]:
                for bsf in [True, False]:
                    for os in [True, False]:
                        for ws in [True, False]:
                            print(ne, ms, mf, bs, bsf, os, ws)
                            LR_bag_score = cross_validate(BaggingClassifier(base_estimator=LR, n_estimators=ne, max_samples=ms, max_features=mf, bootstrap=bs,
                            bootstrap_features=bsf, oob_score=os, warm_start=ws, n_jobs=-1),X,y, 
                            scoring = [
                                'accuracy', 'f1_weighted', 'precision_weighted',
                                'recall_weighted'], 
                            cv=8, n_jobs=-1, return_train_score=True)
                            scores = []
                            for i in LR_bag_score:
                                scores.append(LR_bag_score[i].mean())    
                            LR_bag_scores.append((scores[:]))    
                            LR_bag_params.append((ne, ms, mf, bs, bsf, os, ws))

10 1 0.2 True True True True
10 1 0.2 True True True False
10 1 0.2 True True False True
10 1 0.2 True True False False
10 1 0.2 True False True True
10 1 0.2 True False True False
10 1 0.2 True False False True
10 1 0.2 True False False False
10 1 0.2 False True True True
10 1 0.2 False True True False
10 1 0.2 False True False True
10 1 0.2 False True False False
10 1 0.2 False False True True
10 1 0.2 False False True False
10 1 0.2 False False False True
10 1 0.2 False False False False
10 1 0.5 True True True True
10 1 0.5 True True True False
10 1 0.5 True True False True
10 1 0.5 True True False False
10 1 0.5 True False True True
10 1 0.5 True False True False
10 1 0.5 True False False True
10 1 0.5 True False False False
10 1 0.5 False True True True
10 1 0.5 False True True False
10 1 0.5 False True False True
10 1 0.5 False True False False
10 1 0.5 False False True True
10 1 0.5 False False True False
10 1 0.5 False False False True
10 1 0.5 False False False False
10 1 0.7

In [25]:
%store LR_bag_scores
%store LR_bag_params

Stored 'LR_bag_scores' (list)
Stored 'LR_bag_params' (list)


In [26]:
LR_bag_scores_df = pd.DataFrame(LR_bag_scores, columns = ['fit_time', 'score_time', 'test_accuracy',
 'train_accuracy', 'test_f1_weighted', 'train_f1_weighted', 'test_precision_weighted',
  'train_precision_weighted', 'test_recall_weighted', 'train_recall_weighted'])
LR_bag_params_df = pd.DataFrame(LR_bag_params, columns = ['n_estimators', 'max_samples', 'max_features', 'bootstrap', 'bootstrap_features', 'oob_score', 'warm_start'])
LR_bag_scores_df = LR_bag_params_df.join(LR_bag_scores_df, how='left')
LR_bag_scores_df

Unnamed: 0,n_estimators,max_samples,max_features,bootstrap,bootstrap_features,oob_score,warm_start,fit_time,score_time,test_accuracy,train_accuracy,test_f1_weighted,train_f1_weighted,test_precision_weighted,train_precision_weighted,test_recall_weighted,train_recall_weighted
0,10,1,0.2,True,True,True,True,0.038664,0.000000,,,,,,,,
1,10,1,0.2,True,True,True,False,2.856069,0.156528,0.341630,0.341724,0.175902,0.175984,0.118966,0.119030,0.341630,0.341724
2,10,1,0.2,True,True,False,True,2.807216,0.198321,0.320913,0.321042,0.157707,0.157811,0.105047,0.105125,0.320913,0.321042
3,10,1,0.2,True,True,False,False,2.890552,0.106418,0.316357,0.316262,0.154198,0.154126,0.102539,0.102488,0.316357,0.316262
4,10,1,0.2,True,False,True,True,0.037360,0.000000,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
379,100,7,1.0,False,True,False,False,46.484637,0.280964,0.377495,0.377495,0.206901,0.206901,0.142502,0.142502,0.377495,0.377495
380,100,7,1.0,False,False,True,True,0.031248,0.000000,,,,,,,,
381,100,7,1.0,False,False,True,False,0.033203,0.000000,,,,,,,,
382,100,7,1.0,False,False,False,True,46.068578,0.328115,0.377495,0.377495,0.206901,0.206901,0.142502,0.142502,0.377495,0.377495


In [27]:
LR_bag_scores_df.sort_values(by='test_accuracy',ascending=False)

Unnamed: 0,n_estimators,max_samples,max_features,bootstrap,bootstrap_features,oob_score,warm_start,fit_time,score_time,test_accuracy,train_accuracy,test_f1_weighted,train_f1_weighted,test_precision_weighted,train_precision_weighted,test_recall_weighted,train_recall_weighted
358,100,7,0.7,True,False,False,True,67.912027,0.230001,0.665321,0.672830,0.636890,0.647290,0.641327,0.640128,0.665321,0.672830
359,100,7,0.7,True,False,False,False,67.685569,0.186869,0.648975,0.651958,0.598325,0.605536,0.591550,0.593543,0.648975,0.651958
357,100,7,0.7,True,False,True,False,69.839676,0.186296,0.614120,0.627421,0.561410,0.574436,0.560577,0.580686,0.614120,0.627421
367,100,7,0.7,False,False,False,False,67.341791,0.327391,0.605830,0.627437,0.546761,0.576302,0.551206,0.576784,0.605830,0.627437
362,100,7,0.7,False,True,False,True,66.379109,0.303562,0.602029,0.598817,0.536755,0.538516,0.545310,0.556927,0.602029,0.598817
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
372,100,7,1.0,True,False,True,True,0.035155,0.000000,,,,,,,,
376,100,7,1.0,False,True,True,True,0.029296,0.000000,,,,,,,,
377,100,7,1.0,False,True,True,False,0.031249,0.000000,,,,,,,,
380,100,7,1.0,False,False,True,True,0.031248,0.000000,,,,,,,,


### DT

In [30]:
DT_bag_scores = []
DT_bag_params = []

for ne in [10,100]:
    for ms in [1,5,7]:
        for mf in [0.2, 0.5, 0.7, 1]:
            for bs in [True, False]:
                for bsf in [True, False]:
                    for os in [True, False]:
                        for ws in [True, False]:
                            print(ne, ms, mf, bs, bsf, os, ws)
                            DT_bag_score = cross_validate(BaggingClassifier(base_estimator=DT, n_estimators=ne, max_samples=ms, max_features=mf, bootstrap=bs,
                            bootstrap_features=bsf, oob_score=os, warm_start=ws, n_jobs=-1),X,y, 
                            scoring = [
                                'accuracy', 'f1_weighted', 'precision_weighted',
                                'recall_weighted'], 
                            cv=9, n_jobs=-1, return_train_score=True)
                            scores = []
                            for i in DT_bag_score:
                                scores.append(DT_bag_score[i].mean())    
                            DT_bag_scores.append((scores[:]))    
                            DT_bag_params.append((ne, ms, mf, bs, bsf, os, ws))

10 1 0.2 True True True True
10 1 0.2 True True True False
10 1 0.2 True True False True
10 1 0.2 True True False False
10 1 0.2 True False True True
10 1 0.2 True False True False
10 1 0.2 True False False True
10 1 0.2 True False False False
10 1 0.2 False True True True
10 1 0.2 False True True False
10 1 0.2 False True False True
10 1 0.2 False True False False
10 1 0.2 False False True True
10 1 0.2 False False True False
10 1 0.2 False False False True
10 1 0.2 False False False False
10 1 0.5 True True True True
10 1 0.5 True True True False
10 1 0.5 True True False True
10 1 0.5 True True False False
10 1 0.5 True False True True
10 1 0.5 True False True False
10 1 0.5 True False False True
10 1 0.5 True False False False
10 1 0.5 False True True True
10 1 0.5 False True True False
10 1 0.5 False True False True
10 1 0.5 False True False False
10 1 0.5 False False True True
10 1 0.5 False False True False
10 1 0.5 False False False True
10 1 0.5 False False False False
10 1 0.7

In [32]:
%store DT_bag_scores
%store DT_bag_params

Stored 'DT_bag_scores' (list)
Stored 'DT_bag_params' (list)


In [33]:
DT_bag_scores_df = pd.DataFrame(DT_bag_scores, columns = ['fit_time', 'score_time', 'test_accuracy',
 'train_accuracy', 'test_f1_weighted', 'train_f1_weighted', 'test_precision_weighted',
  'train_precision_weighted', 'test_recall_weighted', 'train_recall_weighted'])
DT_bag_params_df = pd.DataFrame(DT_bag_params, columns = ['n_estimators', 'max_samples', 'max_features', 'bootstrap', 'bootstrap_features', 'oob_score', 'warm_start'])
DT_bag_scores_df = DT_bag_params_df.join(DT_bag_scores_df, how='left')
DT_bag_scores_df

Unnamed: 0,n_estimators,max_samples,max_features,bootstrap,bootstrap_features,oob_score,warm_start,fit_time,score_time,test_accuracy,train_accuracy,test_f1_weighted,train_f1_weighted,test_precision_weighted,train_precision_weighted,test_recall_weighted,train_recall_weighted
0,10,1,0.2,True,True,True,True,0.038531,0.000000,,,,,,,,
1,10,1,0.2,True,True,True,False,0.201652,0.122923,0.336440,0.336501,0.171291,0.171337,0.115421,0.115454,0.336440,0.336501
2,10,1,0.2,True,True,False,True,0.104046,0.087945,0.349933,0.349910,0.182683,0.182664,0.123973,0.123959,0.349933,0.349910
3,10,1,0.2,True,True,False,False,0.102935,0.085947,0.332191,0.332280,0.168013,0.168084,0.113074,0.113127,0.332191,0.332280
4,10,1,0.2,True,False,True,True,0.038753,0.000000,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
379,100,7,1.0,False,True,False,False,0.614820,0.142427,0.377495,0.377495,0.206901,0.206901,0.142502,0.142502,0.377495,0.377495
380,100,7,1.0,False,False,True,True,0.051315,0.000000,,,,,,,,
381,100,7,1.0,False,False,True,False,0.041669,0.000000,,,,,,,,
382,100,7,1.0,False,False,False,True,0.744875,0.247155,0.377495,0.377495,0.206901,0.206901,0.142502,0.142502,0.377495,0.377495


In [34]:
DT_bag_scores_df.sort_values(by='test_accuracy',ascending=False)

Unnamed: 0,n_estimators,max_samples,max_features,bootstrap,bootstrap_features,oob_score,warm_start,fit_time,score_time,test_accuracy,train_accuracy,test_f1_weighted,train_f1_weighted,test_precision_weighted,train_precision_weighted,test_recall_weighted,train_recall_weighted
383,100,7,1.0,False,False,False,False,0.729028,0.327728,0.377495,0.377495,0.206901,0.206901,0.142502,0.142502,0.377495,0.377495
267,100,5,0.2,False,True,False,False,0.817914,0.187846,0.377495,0.377495,0.206901,0.206901,0.142502,0.142502,0.377495,0.377495
263,100,5,0.2,True,False,False,False,0.600879,0.152599,0.377495,0.377495,0.206901,0.206901,0.142502,0.142502,0.377495,0.377495
262,100,5,0.2,True,False,False,True,0.594469,0.173734,0.377495,0.377495,0.206901,0.206901,0.142502,0.142502,0.377495,0.377495
261,100,5,0.2,True,False,True,False,1.436013,0.244075,0.377495,0.377495,0.206901,0.206901,0.142502,0.142502,0.377495,0.377495
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
372,100,7,1.0,True,False,True,True,0.047955,0.000000,,,,,,,,
376,100,7,1.0,False,True,True,True,0.048670,0.000000,,,,,,,,
377,100,7,1.0,False,True,True,False,0.036531,0.000000,,,,,,,,
380,100,7,1.0,False,False,True,True,0.051315,0.000000,,,,,,,,


### RF

In [12]:
RF_bag_scores = []
RF_bag_params = []

for ne in [1,10]:
    for ms in [1,5,7]:
        for mf in [0.2, 0.5, 0.7, 1]:
            for bs in [True, False]:
                for bsf in [True, False]:
                    for os in [True, False]:
                        for ws in [True, False]:
                            print(ne, ms, mf, bs, bsf, os, ws)
                            RF_bag_score = cross_validate(BaggingClassifier(base_estimator=RF, n_estimators=ne, max_samples=ms, max_features=mf, bootstrap=bs,
                            bootstrap_features=bsf, oob_score=os, warm_start=ws, n_jobs=-1),X,y, 
                            scoring = [
                                'accuracy', 'f1_weighted', 'precision_weighted',
                                'recall_weighted'], 
                            cv=8, n_jobs=-1, return_train_score=True)
                            scores = []
                            for i in RF_bag_score:
                                scores.append(RF_bag_score[i].mean())    
                            RF_bag_scores.append((scores[:]))    
                            RF_bag_params.append((ne, ms, mf, bs, bsf, os, ws))

1 1 0.2 True True True True
1 1 0.2 True True True False
1 1 0.2 True True False True
1 1 0.2 True True False False
1 1 0.2 True False True True
1 1 0.2 True False True False
1 1 0.2 True False False True
1 1 0.2 True False False False
1 1 0.2 False True True True
1 1 0.2 False True True False
1 1 0.2 False True False True
1 1 0.2 False True False False
1 1 0.2 False False True True
1 1 0.2 False False True False
1 1 0.2 False False False True
1 1 0.2 False False False False
1 1 0.5 True True True True
1 1 0.5 True True True False
1 1 0.5 True True False True
1 1 0.5 True True False False
1 1 0.5 True False True True
1 1 0.5 True False True False
1 1 0.5 True False False True
1 1 0.5 True False False False
1 1 0.5 False True True True
1 1 0.5 False True True False
1 1 0.5 False True False True
1 1 0.5 False True False False
1 1 0.5 False False True True
1 1 0.5 False False True False
1 1 0.5 False False False True
1 1 0.5 False False False False
1 1 0.7 True True True True
1 1 0.7 True

In [13]:
%store RF_bag_scores
%store RF_bag_params

Stored 'RF_bag_scores' (list)
Stored 'RF_bag_params' (list)


In [14]:
RF_bag_scores_df = pd.DataFrame(RF_bag_scores, columns = ['fit_time', 'score_time', 'test_accuracy',
 'train_accuracy', 'test_f1_weighted', 'train_f1_weighted', 'test_precision_weighted',
  'train_precision_weighted', 'test_recall_weighted', 'train_recall_weighted'])
RF_bag_params_df = pd.DataFrame(RF_bag_params, columns = ['n_estimators', 'max_samples', 'max_features', 'bootstrap', 'bootstrap_features', 'oob_score', 'warm_start'])
RF_bag_scores_df = RF_bag_params_df.join(RF_bag_scores_df, how='left')
RF_bag_scores_df

Unnamed: 0,n_estimators,max_samples,max_features,bootstrap,bootstrap_features,oob_score,warm_start,fit_time,score_time,test_accuracy,train_accuracy,test_f1_weighted,train_f1_weighted,test_precision_weighted,train_precision_weighted,test_recall_weighted,train_recall_weighted
0,1,1,0.2,True,True,True,True,0.035477,0.000000,,,,,,,,
1,1,1,0.2,True,True,True,False,8.130874,1.183772,0.290754,0.290847,0.132239,0.132305,0.085928,0.085973,0.290754,0.290847
2,1,1,0.2,True,True,False,True,5.946465,0.941921,0.306783,0.306768,0.146824,0.146807,0.097267,0.097252,0.306783,0.306768
3,1,1,0.2,True,True,False,False,5.863017,1.043733,0.251600,0.251600,0.111959,0.111944,0.072543,0.072531,0.251600,0.251600
4,1,1,0.2,True,False,True,True,0.050717,0.000000,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
379,10,7,1.0,False,True,False,False,52.852351,4.939382,0.337005,0.336954,0.172343,0.172306,0.116423,0.116398,0.337005,0.336954
380,10,7,1.0,False,False,True,True,0.036810,0.000000,,,,,,,,
381,10,7,1.0,False,False,True,False,0.032391,0.000000,,,,,,,,
382,10,7,1.0,False,False,False,True,57.732752,5.806140,0.346514,0.346457,0.179681,0.179631,0.121676,0.121638,0.346514,0.346457


In [15]:
RF_bag_scores_df.sort_values(by='test_accuracy',ascending=False)

Unnamed: 0,n_estimators,max_samples,max_features,bootstrap,bootstrap_features,oob_score,warm_start,fit_time,score_time,test_accuracy,train_accuracy,test_f1_weighted,train_f1_weighted,test_precision_weighted,train_precision_weighted,test_recall_weighted,train_recall_weighted
310,10,5,1.0,True,False,False,True,52.969147,5.040185,0.377495,0.377495,0.206901,0.206901,0.142502,0.142502,0.377495,0.377495
46,1,1,0.7,False,False,False,True,5.871139,0.901820,0.362353,0.362406,0.194113,0.194155,0.132901,0.132932,0.362353,0.362406
226,10,1,0.7,True,True,False,True,57.735293,5.331232,0.356841,0.356803,0.188748,0.188722,0.128611,0.128594,0.356841,0.356803
321,10,7,0.2,True,True,True,False,72.839633,6.881461,0.356841,0.356803,0.188748,0.188722,0.128611,0.128594,0.356841,0.356803
293,10,5,0.7,True,False,True,False,66.950862,5.767405,0.356778,0.356812,0.188697,0.188729,0.128574,0.128599,0.356778,0.356812
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
372,10,7,1.0,True,False,True,True,0.033874,0.000000,,,,,,,,
376,10,7,1.0,False,True,True,True,0.038336,0.000000,,,,,,,,
377,10,7,1.0,False,True,True,False,0.036130,0.000000,,,,,,,,
380,10,7,1.0,False,False,True,True,0.036810,0.000000,,,,,,,,


### SVM

In [16]:
SVM_bag_scores = []
SVM_bag_params = []

for ne in [10,100]:
    for ms in [1,5,7]:
        for mf in [0.2, 0.5, 0.7, 1]:
            for bs in [True, False]:
                for bsf in [True, False]:
                    for os in [True, False]:
                        for ws in [True, False]:
                            print(ne, ms, mf, bs, bsf, os, ws)
                            SVM_bag_score = cross_validate(BaggingClassifier(base_estimator=SVM, n_estimators=ne, max_samples=ms, max_features=mf, bootstrap=bs,
                            bootstrap_features=bsf, oob_score=os, warm_start=ws, n_jobs=-1),X,y, 
                            scoring = [
                                'accuracy', 'f1_weighted', 'precision_weighted',
                                'recall_weighted'], 
                            cv=10, n_jobs=-1, return_train_score=True)
                            scores = []
                            for i in SVM_bag_score:
                                scores.append(SVM_bag_score[i].mean())    
                            SVM_bag_scores.append((scores[:]))    
                            SVM_bag_params.append((ne, ms, mf, bs, bsf, os, ws))

10 1 0.2 True True True True
10 1 0.2 True True True False
10 1 0.2 True True False True
10 1 0.2 True True False False
10 1 0.2 True False True True
10 1 0.2 True False True False
10 1 0.2 True False False True
10 1 0.2 True False False False
10 1 0.2 False True True True
10 1 0.2 False True True False
10 1 0.2 False True False True
10 1 0.2 False True False False
10 1 0.2 False False True True
10 1 0.2 False False True False
10 1 0.2 False False False True
10 1 0.2 False False False False
10 1 0.5 True True True True
10 1 0.5 True True True False
10 1 0.5 True True False True
10 1 0.5 True True False False
10 1 0.5 True False True True
10 1 0.5 True False True False
10 1 0.5 True False False True
10 1 0.5 True False False False
10 1 0.5 False True True True
10 1 0.5 False True True False
10 1 0.5 False True False True
10 1 0.5 False True False False
10 1 0.5 False False True True
10 1 0.5 False False True False
10 1 0.5 False False False True
10 1 0.5 False False False False
10 1 0.7

In [17]:
%store SVM_bag_scores
%store SVM_bag_params

Stored 'SVM_bag_scores' (list)
Stored 'SVM_bag_params' (list)


In [18]:
SVM_bag_scores_df = pd.DataFrame(SVM_bag_scores, columns = ['fit_time', 'score_time', 'test_accuracy',
 'train_accuracy', 'test_f1_weighted', 'train_f1_weighted', 'test_precision_weighted',
  'train_precision_weighted', 'test_recall_weighted', 'train_recall_weighted'])
SVM_bag_params_df = pd.DataFrame(SVM_bag_params, columns = ['n_estimators', 'max_samples', 'max_features', 'bootstrap', 'bootstrap_features', 'oob_score', 'warm_start'])
SVM_bag_scores_df = SVM_bag_params_df.join(SVM_bag_scores_df, how='left')
SVM_bag_scores_df

Unnamed: 0,n_estimators,max_samples,max_features,bootstrap,bootstrap_features,oob_score,warm_start,fit_time,score_time,test_accuracy,train_accuracy,test_f1_weighted,train_f1_weighted,test_precision_weighted,train_precision_weighted,test_recall_weighted,train_recall_weighted
0,10,1,0.2,True,True,True,True,0.040270,0.000000,,,,,,,,
1,10,1,0.2,True,True,True,False,0.096517,0.000000,,,,,,,,
2,10,1,0.2,True,True,False,True,0.091929,0.000000,,,,,,,,
3,10,1,0.2,True,True,False,False,0.098081,0.000000,,,,,,,,
4,10,1,0.2,True,False,True,True,0.038021,0.000000,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
379,100,7,1.0,False,True,False,False,1.241021,0.038048,,,,,,,,
380,100,7,1.0,False,False,True,True,0.032995,0.000000,,,,,,,,
381,100,7,1.0,False,False,True,False,0.034496,0.000000,,,,,,,,
382,100,7,1.0,False,False,False,True,1.220787,0.056980,,,,,,,,


In [19]:
SVM_bag_scores_df.sort_values(by='test_accuracy',ascending=False)

Unnamed: 0,n_estimators,max_samples,max_features,bootstrap,bootstrap_features,oob_score,warm_start,fit_time,score_time,test_accuracy,train_accuracy,test_f1_weighted,train_f1_weighted,test_precision_weighted,train_precision_weighted,test_recall_weighted,train_recall_weighted
0,10,1,0.2,True,True,True,True,0.040270,0.000000,,,,,,,,
1,10,1,0.2,True,True,True,False,0.096517,0.000000,,,,,,,,
2,10,1,0.2,True,True,False,True,0.091929,0.000000,,,,,,,,
3,10,1,0.2,True,True,False,False,0.098081,0.000000,,,,,,,,
4,10,1,0.2,True,False,True,True,0.038021,0.000000,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
379,100,7,1.0,False,True,False,False,1.241021,0.038048,,,,,,,,
380,100,7,1.0,False,False,True,True,0.032995,0.000000,,,,,,,,
381,100,7,1.0,False,False,True,False,0.034496,0.000000,,,,,,,,
382,100,7,1.0,False,False,False,True,1.220787,0.056980,,,,,,,,


### GNB

In [20]:
GNB_bag_scores = []
GNB_bag_params = []

for ne in [10,100]:
    for ms in [1,5,7]:
        for mf in [0.2, 0.5, 0.7, 1]:
            for bs in [True, False]:
                for bsf in [True, False]:
                    for os in [True, False]:
                        for ws in [True, False]:
                            print(ne, ms, mf, bs, bsf, os, ws)
                            GNB_bag_score = cross_validate(BaggingClassifier(base_estimator=GNB, n_estimators=ne, max_samples=ms, max_features=mf, bootstrap=bs,
                            bootstrap_features=bsf, oob_score=os, warm_start=ws, n_jobs=-1),X,y, 
                            scoring = [
                                'accuracy', 'f1_weighted', 'precision_weighted',
                                'recall_weighted'], 
                            cv=2, n_jobs=-1, return_train_score=True)
                            scores = []
                            for i in GNB_bag_score:
                                scores.append(GNB_bag_score[i].mean())    
                            GNB_bag_scores.append((scores[:]))    
                            GNB_bag_params.append((ne, ms, mf, bs, bsf, os, ws))

10 1 0.2 True True True True
10 1 0.2 True True True False
10 1 0.2 True True False True
10 1 0.2 True True False False
10 1 0.2 True False True True
10 1 0.2 True False True False
10 1 0.2 True False False True
10 1 0.2 True False False False
10 1 0.2 False True True True
10 1 0.2 False True True False
10 1 0.2 False True False True
10 1 0.2 False True False False
10 1 0.2 False False True True
10 1 0.2 False False True False
10 1 0.2 False False False True
10 1 0.2 False False False False
10 1 0.5 True True True True
10 1 0.5 True True True False
10 1 0.5 True True False True
10 1 0.5 True True False False
10 1 0.5 True False True True
10 1 0.5 True False True False
10 1 0.5 True False False True
10 1 0.5 True False False False
10 1 0.5 False True True True
10 1 0.5 False True True False
10 1 0.5 False True False True
10 1 0.5 False True False False
10 1 0.5 False False True True
10 1 0.5 False False True False
10 1 0.5 False False False True
10 1 0.5 False False False False
10 1 0.7

In [21]:
%store GNB_bag_scores
%store GNB_bag_params

Stored 'GNB_bag_scores' (list)
Stored 'GNB_bag_params' (list)


In [22]:
GNB_bag_scores_df = pd.DataFrame(GNB_bag_scores, columns = ['fit_time', 'score_time', 'test_accuracy',
 'train_accuracy', 'test_f1_weighted', 'train_f1_weighted', 'test_precision_weighted',
  'train_precision_weighted', 'test_recall_weighted', 'train_recall_weighted'])
GNB_bag_params_df = pd.DataFrame(GNB_bag_params, columns = ['n_estimators', 'max_samples', 'max_features', 'bootstrap', 'bootstrap_features', 'oob_score', 'warm_start'])
GNB_bag_scores_df = GNB_bag_params_df.join(GNB_bag_scores_df, how='left')
GNB_bag_scores_df

Unnamed: 0,n_estimators,max_samples,max_features,bootstrap,bootstrap_features,oob_score,warm_start,fit_time,score_time,test_accuracy,train_accuracy,test_f1_weighted,train_f1_weighted,test_precision_weighted,train_precision_weighted,test_recall_weighted,train_recall_weighted
0,10,1,0.2,True,True,True,True,0.017500,0.0,,,,,,,,
1,10,1,0.2,True,True,True,False,0.039652,0.0,,,,,,,,
2,10,1,0.2,True,True,False,True,0.029995,0.0,,,,,,,,
3,10,1,0.2,True,True,False,False,0.029995,0.0,,,,,,,,
4,10,1,0.2,True,False,True,True,0.014998,0.0,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
379,100,7,1.0,False,True,False,False,0.032495,0.0,,,,,,,,
380,100,7,1.0,False,False,True,True,0.015000,0.0,,,,,,,,
381,100,7,1.0,False,False,True,False,0.017581,0.0,,,,,,,,
382,100,7,1.0,False,False,False,True,0.032496,0.0,,,,,,,,


In [23]:
GNB_bag_scores_df.sort_values(by='test_accuracy',ascending=False)

Unnamed: 0,n_estimators,max_samples,max_features,bootstrap,bootstrap_features,oob_score,warm_start,fit_time,score_time,test_accuracy,train_accuracy,test_f1_weighted,train_f1_weighted,test_precision_weighted,train_precision_weighted,test_recall_weighted,train_recall_weighted
0,10,1,0.2,True,True,True,True,0.017500,0.0,,,,,,,,
1,10,1,0.2,True,True,True,False,0.039652,0.0,,,,,,,,
2,10,1,0.2,True,True,False,True,0.029995,0.0,,,,,,,,
3,10,1,0.2,True,True,False,False,0.029995,0.0,,,,,,,,
4,10,1,0.2,True,False,True,True,0.014998,0.0,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
379,100,7,1.0,False,True,False,False,0.032495,0.0,,,,,,,,
380,100,7,1.0,False,False,True,True,0.015000,0.0,,,,,,,,
381,100,7,1.0,False,False,True,False,0.017581,0.0,,,,,,,,
382,100,7,1.0,False,False,False,True,0.032496,0.0,,,,,,,,


### MNB

In [24]:
MNB_bag_scores = []
MNB_bag_params = []

for ne in [10,100]:
    for ms in [1,5,7]:
        for mf in [0.2, 0.5, 0.7, 1]:
            for bs in [True, False]:
                for bsf in [True, False]:
                    for os in [True, False]:
                        for ws in [True, False]:
                            print(ne, ms, mf, bs, bsf, os, ws)
                            MNB_bag_score = cross_validate(BaggingClassifier(base_estimator=MNB, n_estimators=ne, max_samples=ms, max_features=mf, bootstrap=bs,
                            bootstrap_features=bsf, oob_score=os, warm_start=ws, n_jobs=-1),X,y, 
                            scoring = [
                                'accuracy', 'f1_weighted', 'precision_weighted',
                                'recall_weighted'], 
                            cv=6, n_jobs=-1, return_train_score=True)
                            scores = []
                            for i in MNB_bag_score:
                                scores.append(MNB_bag_score[i].mean())    
                            MNB_bag_scores.append((scores[:]))    
                            MNB_bag_params.append((ne, ms, mf, bs, bsf, os, ws))

10 1 0.2 True True True True
10 1 0.2 True True True False
10 1 0.2 True True False True
10 1 0.2 True True False False
10 1 0.2 True False True True
10 1 0.2 True False True False
10 1 0.2 True False False True
10 1 0.2 True False False False
10 1 0.2 False True True True
10 1 0.2 False True True False
10 1 0.2 False True False True
10 1 0.2 False True False False
10 1 0.2 False False True True
10 1 0.2 False False True False
10 1 0.2 False False False True
10 1 0.2 False False False False
10 1 0.5 True True True True
10 1 0.5 True True True False
10 1 0.5 True True False True
10 1 0.5 True True False False
10 1 0.5 True False True True
10 1 0.5 True False True False
10 1 0.5 True False False True
10 1 0.5 True False False False
10 1 0.5 False True True True
10 1 0.5 False True True False
10 1 0.5 False True False True
10 1 0.5 False True False False
10 1 0.5 False False True True
10 1 0.5 False False True False
10 1 0.5 False False False True
10 1 0.5 False False False False
10 1 0.7

In [25]:
%store MNB_bag_scores
%store MNB_bag_params

Stored 'MNB_bag_scores' (list)
Stored 'MNB_bag_params' (list)


In [26]:
MNB_bag_scores_df = pd.DataFrame(MNB_bag_scores, columns = ['fit_time', 'score_time', 'test_accuracy',
 'train_accuracy', 'test_f1_weighted', 'train_f1_weighted', 'test_precision_weighted',
  'train_precision_weighted', 'test_recall_weighted', 'train_recall_weighted'])
MNB_bag_params_df = pd.DataFrame(MNB_bag_params, columns = ['n_estimators', 'max_samples', 'max_features', 'bootstrap', 'bootstrap_features', 'oob_score', 'warm_start'])
MNB_bag_scores_df = MNB_bag_params_df.join(MNB_bag_scores_df, how='left')
MNB_bag_scores_df

Unnamed: 0,n_estimators,max_samples,max_features,bootstrap,bootstrap_features,oob_score,warm_start,fit_time,score_time,test_accuracy,train_accuracy,test_f1_weighted,train_f1_weighted,test_precision_weighted,train_precision_weighted,test_recall_weighted,train_recall_weighted
0,10,1,0.2,True,True,True,True,0.035058,0.000000,,,,,,,,
1,10,1,0.2,True,True,True,False,1.843953,0.106807,0.309706,0.309649,0.148704,0.148656,0.098460,0.098424,0.309706,0.309649
2,10,1,0.2,True,True,False,True,0.105353,0.118434,0.349865,0.349922,0.182625,0.182675,0.123928,0.123967,0.349865,0.349922
3,10,1,0.2,True,True,False,False,0.101611,0.114253,0.343667,0.343558,0.177856,0.177767,0.120521,0.120455,0.343667,0.343558
4,10,1,0.2,True,False,True,True,0.028331,0.000000,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
379,100,7,1.0,False,True,False,False,0.724096,0.281738,0.377495,0.377495,0.206901,0.206901,0.142502,0.142502,0.377495,0.377495
380,100,7,1.0,False,False,True,True,0.034162,0.000000,,,,,,,,
381,100,7,1.0,False,False,True,False,0.029997,0.000000,,,,,,,,
382,100,7,1.0,False,False,False,True,0.708719,0.273726,0.377495,0.377495,0.206901,0.206901,0.142502,0.142502,0.377495,0.377495


In [27]:
MNB_bag_scores_df.sort_values(by='test_accuracy',ascending=False)

Unnamed: 0,n_estimators,max_samples,max_features,bootstrap,bootstrap_features,oob_score,warm_start,fit_time,score_time,test_accuracy,train_accuracy,test_f1_weighted,train_f1_weighted,test_precision_weighted,train_precision_weighted,test_recall_weighted,train_recall_weighted
274,100,5,0.5,True,True,False,True,0.840640,0.284594,0.769430,0.767534,0.735718,0.737097,0.729937,0.723454,0.769430,0.767534
366,100,7,0.7,False,False,False,True,0.832620,0.292027,0.767213,0.767851,0.733259,0.737381,0.728198,0.723505,0.767213,0.767851
343,100,7,0.5,True,False,False,False,0.876658,0.282080,0.767086,0.768941,0.733032,0.738369,0.727047,0.723609,0.767086,0.768941
363,100,7,0.7,False,True,False,False,0.905176,0.307536,0.766770,0.767725,0.732828,0.737162,0.727548,0.722701,0.766770,0.767725
357,100,7,0.7,True,False,True,False,2.113945,0.226100,0.766770,0.767636,0.732684,0.737190,0.727169,0.723469,0.766770,0.767636
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
372,100,7,1.0,True,False,True,True,0.029996,0.000000,,,,,,,,
376,100,7,1.0,False,True,True,True,0.028330,0.000000,,,,,,,,
377,100,7,1.0,False,True,True,False,0.029997,0.000000,,,,,,,,
380,100,7,1.0,False,False,True,True,0.034162,0.000000,,,,,,,,


### KNN

In [28]:
KNN_bag_scores = []
KNN_bag_params = []

for ne in [10,100]:
    for ms in [1,5,7]:
        for mf in [0.2, 0.5, 0.7, 1]:
            for bs in [True, False]:
                for bsf in [True, False]:
                    for os in [True, False]:
                        for ws in [True, False]:
                            print(ne, ms, mf, bs, bsf, os, ws)
                            KNN_bag_score = cross_validate(BaggingClassifier(base_estimator=KNN, n_estimators=ne, max_samples=ms, max_features=mf, bootstrap=bs,
                            bootstrap_features=bsf, oob_score=os, warm_start=ws, n_jobs=-1),X,y, 
                            scoring = [
                                'accuracy', 'f1_weighted', 'precision_weighted',
                                'recall_weighted'], 
                            cv=8, n_jobs=-1, return_train_score=True)
                            scores = []
                            for i in KNN_bag_score:
                                scores.append(KNN_bag_score[i].mean())    
                            KNN_bag_scores.append((scores[:]))    
                            KNN_bag_params.append((ne, ms, mf, bs, bsf, os, ws))

10 1 0.2 True True True True
10 1 0.2 True True True False
10 1 0.2 True True False True
10 1 0.2 True True False False
10 1 0.2 True False True True
10 1 0.2 True False True False
10 1 0.2 True False False True
10 1 0.2 True False False False
10 1 0.2 False True True True
10 1 0.2 False True True False
10 1 0.2 False True False True
10 1 0.2 False True False False
10 1 0.2 False False True True
10 1 0.2 False False True False
10 1 0.2 False False False True
10 1 0.2 False False False False
10 1 0.5 True True True True
10 1 0.5 True True True False
10 1 0.5 True True False True
10 1 0.5 True True False False
10 1 0.5 True False True True
10 1 0.5 True False True False
10 1 0.5 True False False True
10 1 0.5 True False False False
10 1 0.5 False True True True
10 1 0.5 False True True False
10 1 0.5 False True False True
10 1 0.5 False True False False
10 1 0.5 False False True True
10 1 0.5 False False True False
10 1 0.5 False False False True
10 1 0.5 False False False False
10 1 0.7

In [29]:
%store KNN_bag_scores
%store KNN_bag_params

Stored 'KNN_bag_scores' (list)
Stored 'KNN_bag_params' (list)


In [30]:
KNN_bag_scores_df = pd.DataFrame(KNN_bag_scores, columns = ['fit_time', 'score_time', 'test_accuracy',
 'train_accuracy', 'test_f1_weighted', 'train_f1_weighted', 'test_precision_weighted',
  'train_precision_weighted', 'test_recall_weighted', 'train_recall_weighted'])
KNN_bag_params_df = pd.DataFrame(KNN_bag_params, columns = ['n_estimators', 'max_samples', 'max_features', 'bootstrap', 'bootstrap_features', 'oob_score', 'warm_start'])
KNN_bag_scores_df = KNN_bag_params_df.join(KNN_bag_scores_df, how='left')
KNN_bag_scores_df

Unnamed: 0,n_estimators,max_samples,max_features,bootstrap,bootstrap_features,oob_score,warm_start,fit_time,score_time,test_accuracy,train_accuracy,test_f1_weighted,train_f1_weighted,test_precision_weighted,train_precision_weighted,test_recall_weighted,train_recall_weighted
0,10,1,0.2,True,True,True,True,0.035621,0.000000,,,,,,,,
1,10,1,0.2,True,True,True,False,0.081241,0.000000,,,,,,,,
2,10,1,0.2,True,True,False,True,0.075034,0.017338,,,,,,,,
3,10,1,0.2,True,True,False,False,0.074170,0.016497,,,,,,,,
4,10,1,0.2,True,False,True,True,0.035620,0.000000,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
379,100,7,1.0,False,True,False,False,0.318141,0.014055,,,,,,,,
380,100,7,1.0,False,False,True,True,0.032583,0.000000,,,,,,,,
381,100,7,1.0,False,False,True,False,0.035938,0.000000,,,,,,,,
382,100,7,1.0,False,False,False,True,0.319669,0.016138,,,,,,,,


In [31]:
KNN_bag_scores_df.sort_values(by='test_accuracy',ascending=False)

Unnamed: 0,n_estimators,max_samples,max_features,bootstrap,bootstrap_features,oob_score,warm_start,fit_time,score_time,test_accuracy,train_accuracy,test_f1_weighted,train_f1_weighted,test_precision_weighted,train_precision_weighted,test_recall_weighted,train_recall_weighted
0,10,1,0.2,True,True,True,True,0.035621,0.000000,,,,,,,,
1,10,1,0.2,True,True,True,False,0.081241,0.000000,,,,,,,,
2,10,1,0.2,True,True,False,True,0.075034,0.017338,,,,,,,,
3,10,1,0.2,True,True,False,False,0.074170,0.016497,,,,,,,,
4,10,1,0.2,True,False,True,True,0.035620,0.000000,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
379,100,7,1.0,False,True,False,False,0.318141,0.014055,,,,,,,,
380,100,7,1.0,False,False,True,True,0.032583,0.000000,,,,,,,,
381,100,7,1.0,False,False,True,False,0.035938,0.000000,,,,,,,,
382,100,7,1.0,False,False,False,True,0.319669,0.016138,,,,,,,,


## Voting

Cross validating and hyper paramter tuning for voitng ensemble method in order to find best state and get best results

In [40]:
estimators = [('LR', LR), ('DT', DT), ('RF', RF), ('SVM', SVM), ('GNB', GNB), ('MNB', MNB), ('KNN', KNN)]

In [41]:
VT_scores = []
VT_params = []

for v in ['hard', 'soft']:
    for ft in [True, False]:
        for folds in range(2, 11):
            print(v,ft,folds)
            VT_score = cross_validate(VotingClassifier(estimators=estimators, voting=v, n_jobs=-1, flatten_transform=ft),X,y, 
            scoring = [
                'accuracy', 'f1_weighted', 'precision_weighted',
                'recall_weighted'], 
            cv=folds, n_jobs=-1, return_train_score=True)
            scores = []
            for i in VT_score:
                scores.append(VT_score[i].mean())    
            VT_scores.append((scores[:]))    
            VT_params.append((v,ft,folds))

hard True 2
hard True 3
hard True 4
hard True 5
hard True 6
hard True 7
hard True 8
hard True 9
hard True 10
hard False 2
hard False 3
hard False 4
hard False 5
hard False 6
hard False 7
hard False 8
hard False 9
hard False 10
soft True 2
soft True 3
soft True 4
soft True 5
soft True 6
soft True 7
soft True 8
soft True 9
soft True 10
soft False 2
soft False 3
soft False 4
soft False 5
soft False 6
soft False 7
soft False 8
soft False 9
soft False 10


In [42]:
%store VT_scores
%store VT_params

Stored 'VT_scores' (list)
Stored 'VT_params' (list)


In [43]:
VT_scores_df = pd.DataFrame(VT_scores, columns = ['fit_time', 'score_time', 'test_accuracy',
 'train_accuracy', 'test_f1_weighted', 'train_f1_weighted', 'test_precision_weighted',
  'train_precision_weighted', 'test_recall_weighted', 'train_recall_weighted'])
VT_params_df = pd.DataFrame(VT_params, columns = ['voting', 'flatten_transform', 'Folds'])
VT_scores_df = VT_params_df.join(VT_scores_df, how='left')
VT_scores_df

Unnamed: 0,voting,flatten_transform,Folds,fit_time,score_time,test_accuracy,train_accuracy,test_f1_weighted,train_f1_weighted,test_precision_weighted,train_precision_weighted,test_recall_weighted,train_recall_weighted
0,hard,True,2,13.785469,3.440377,0.811762,0.935943,0.816803,0.935318,0.853557,0.941943,0.811762,0.935943
1,hard,True,3,25.510899,3.073997,0.808592,0.922733,0.811801,0.921809,0.854894,0.931529,0.808592,0.922733
2,hard,True,4,43.256304,2.736312,0.820565,0.919639,0.81834,0.918658,0.862542,0.929328,0.820565,0.919639
3,hard,True,5,45.046535,3.08915,0.838053,0.918757,0.832245,0.917614,0.870841,0.928328,0.838053,0.918757
4,hard,True,6,46.308089,2.358212,0.861625,0.91909,0.855599,0.917928,0.887032,0.928819,0.861625,0.91909
5,hard,True,7,50.333081,2.195788,0.873337,0.916397,0.86687,0.91521,0.900119,0.926167,0.873337,0.916397
6,hard,True,8,56.101878,2.909955,0.884419,0.916583,0.877056,0.915356,0.91463,0.926572,0.884419,0.916583
7,hard,True,9,54.468092,2.245733,0.876506,0.916991,0.867584,0.915785,0.907176,0.926784,0.876506,0.916991
8,hard,True,10,53.741335,2.302879,0.875507,0.91738,0.864981,0.916205,0.90571,0.927091,0.875507,0.91738
9,hard,False,2,11.721766,3.129572,0.794783,0.936197,0.799617,0.935562,0.838902,0.942126,0.794783,0.936197


In [44]:
VT_scores_df.sort_values(by='test_accuracy',ascending=False)

Unnamed: 0,voting,flatten_transform,Folds,fit_time,score_time,test_accuracy,train_accuracy,test_f1_weighted,train_f1_weighted,test_precision_weighted,train_precision_weighted,test_recall_weighted,train_recall_weighted
24,soft,True,8,58.774853,2.513449,0.890121,0.936984,0.882778,0.936621,0.918197,0.943971,0.890121,0.936984
33,soft,False,8,53.262255,2.600145,0.889868,0.936441,0.883202,0.936106,0.916116,0.943689,0.889868,0.936441
15,hard,False,8,61.422474,3.66519,0.888664,0.916248,0.88143,0.915045,0.91965,0.926356,0.888664,0.916248
6,hard,True,8,56.101878,2.909955,0.884419,0.916583,0.877056,0.915356,0.91463,0.926572,0.884419,0.916583
35,soft,False,10,51.54479,2.017156,0.87893,0.936373,0.868267,0.936049,0.908128,0.943462,0.87893,0.936373
25,soft,True,9,53.910619,2.17056,0.87657,0.935286,0.867243,0.93487,0.907329,0.942526,0.87657,0.935286
7,hard,True,9,54.468092,2.245733,0.876506,0.916991,0.867584,0.915785,0.907176,0.926784,0.876506,0.916991
14,hard,False,7,50.781003,2.241604,0.875745,0.917854,0.869325,0.916679,0.905065,0.927621,0.875745,0.917854
34,soft,False,9,51.726411,2.012766,0.875683,0.937021,0.866717,0.93669,0.907296,0.94388,0.875683,0.937021
8,hard,True,10,53.741335,2.302879,0.875507,0.91738,0.864981,0.916205,0.90571,0.927091,0.875507,0.91738
