# 3. Modelling and Hyperparameter Tuning

## 3.1 Imports

In [22]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, StratifiedKFold

import pickle

from Classification import Classification
from Ensemble import Ensemble

I have created two .py files, Classification.py and Ensemble.py, whose classes contain functions that simplify the modelling process and keep this modelling notebook tidy.

In [23]:
sns.set_context('poster')

In [24]:
# Load the prepared training features and labels from their CSV files.
x_train, y_train = (
    pd.read_csv(csv_path)
    for csv_path in ('3.x_train_data.csv', '3.y_train_data.csv')
)

In [25]:
# Sanity-check the dimensions of the loaded features and labels (one shape per line).
print(x_train.shape, y_train.shape, sep='\n')

(7524, 139)
(7524, 1)


## 3.2 Train and Validation Split

I did another data split, this time into Train and Validation data, in preparation for using GridSearch Cross Validation. I also chose Stratified 5-fold as my cross-validation strategy.

In [26]:
# Hold out 25% of the loaded training data as a validation set, using a fixed seed.
# NOTE(review): this rebinds x_train and y_train, so the cell is not idempotent. After it
# runs, y_train is the split of y_train['score'] rather than the loaded frame, and running
# the cell a second time would index that result with ['score'] again. Re-run the loading
# cell before re-running this one.
# NOTE(review): no `stratify=` argument is passed, so this hold-out split is not stratified
# even though the cross-validation folds are — confirm this is intended.
x_train, x_val, y_train, y_val = train_test_split(x_train,y_train['score'],test_size=.25,random_state=42)

In [27]:
# Stratified 5-fold splitter, shuffled with a fixed seed; passed to every get_scores call.
skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

## 3.3 Decision Tree (Baseline)

### 3.3.1 1st Attempt

In [28]:
# Coarse first-pass grid for the decision tree: minimum leaf size and maximum depth.
params = {
    'min_samples_leaf': [3, 5, 10, 15, 30, 50, 100],
    'max_depth': [3, 4, 5, 6, 7, 8, 9],
}

In [29]:
# Build the project's Classification helper (imported from Classification.py, source not
# shown here) for a model labelled 'Decision Tree', handing it the train/validation split.
# The same construct-then-get_scores pattern is repeated for every model in this notebook.
dec_tree_1 = Classification('Decision Tree',x_train,x_val,y_train,y_val)

In [30]:
# Evaluate the model over the `params` grid using the `skf` folds. As the recorded output
# shows, this displays a train/validation accuracy table, the best hyperparameters found,
# and a per-class precision/recall/f1 table.
dec_tree_1.get_scores(params,skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,Decision Tree,0.437356,0.37799,0.059366


The best hyperparameters are:  {'max_depth': 9, 'min_samples_leaf': 5} 



Unnamed: 0,1,2,3,4,5
precision,0.369458,0.341463,0.322751,0.357977,0.480114
recall,0.787402,0.072539,0.324468,0.256983,0.444737
f1-score,0.502934,0.119658,0.323607,0.299187,0.461749


### 3.3.2 2nd Attempt

In [31]:
params = {'min_samples_leaf':[3,4,5],
          'max_depth':[7,8,9,10]}

In [32]:
dec_tree_2 = Classification('Decision Tree',x_train,x_val,y_train,y_val)

In [33]:
dec_tree_2.get_scores(params,skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,Decision Tree,0.454368,0.380117,0.074251


The best hyperparameters are:  {'max_depth': 10, 'min_samples_leaf': 4} 



Unnamed: 0,1,2,3,4,5
precision,0.371147,0.346939,0.324484,0.350746,0.482192
recall,0.790026,0.088083,0.292553,0.26257,0.463158
f1-score,0.505034,0.140496,0.307692,0.300319,0.472483


## 3.4 Random Forest

### 3.4.1 1st Attempt

In [34]:
params = {'min_samples_leaf':[3,5,10,15,30,50,100],
          'max_depth':[3,5,7,9,11,13,15]}

In [35]:
ran_for_1 = Classification('Random Forest',x_train,x_val,y_train,y_val)

In [36]:
ran_for_1.get_scores(params,skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,Random Forest,0.60039,0.443381,0.157009


The best hyperparameters are:  {'max_depth': 13, 'min_samples_leaf': 5} 



Unnamed: 0,1,2,3,4,5
precision,0.514634,0.358209,0.444444,0.413105,0.476395
recall,0.553806,0.373057,0.297872,0.405028,0.584211
f1-score,0.533502,0.365482,0.356688,0.409027,0.524823


### 3.4.2 2nd Attempt

In [37]:
params = {'min_samples_leaf':[7,8,9,10,11,12,13,14],
          'max_depth':[13,14,15,16,17,18]}

In [38]:
ran_for_2 = Classification('Random Forest',x_train,x_val,y_train,y_val)

In [39]:
ran_for_2.get_scores(params,skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,Random Forest,0.579656,0.437002,0.142655


The best hyperparameters are:  {'max_depth': 17, 'min_samples_leaf': 10} 



Unnamed: 0,1,2,3,4,5
precision,0.488739,0.328841,0.461847,0.408824,0.480084
recall,0.569554,0.316062,0.305851,0.388268,0.602632
f1-score,0.526061,0.322325,0.368,0.398281,0.534422


### 3.4.3 3rd Attempt

In [40]:
params = {'min_samples_leaf':[7,8,9,10,11,12,13,14],
          'max_depth':[17,18,19,20,21,22]}

In [41]:
ran_for_3 = Classification('Random Forest',x_train,x_val,y_train,y_val)

In [42]:
ran_for_3.get_scores(params,skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,Random Forest,0.608719,0.446039,0.162679


The best hyperparameters are:  {'max_depth': 22, 'min_samples_leaf': 9} 



Unnamed: 0,1,2,3,4,5
precision,0.516129,0.376731,0.453488,0.384615,0.473469
recall,0.587927,0.352332,0.31117,0.363128,0.610526
f1-score,0.549693,0.364123,0.369085,0.373563,0.533333


## 3.5 Logistic Regression

### 3.5.1 1st Attempt

In [43]:
params = {'penalty':['l1','l2'],
          'C':[0.01,0.05,0.1,0.5,1,5,10]}

In [44]:
log_reg_1 = Classification('Logistic Regression',x_train,x_val,y_train,y_val)

In [45]:
log_reg_1.get_scores(params,skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,Logistic Regression,0.556619,0.497608,0.059011


The best hyperparameters are:  {'C': 1, 'penalty': 'l2'} 



Unnamed: 0,1,2,3,4,5
precision,0.560364,0.425287,0.463127,0.435967,0.579897
recall,0.645669,0.38342,0.417553,0.446927,0.592105
f1-score,0.6,0.40327,0.439161,0.441379,0.585938


### 3.5.2 2nd Attempt

In [46]:
# Finer grid of regularisation strengths for logistic regression.
# Fix: the last entries were written as `0,7`, which put C=0 and C=7 into the grid instead
# of the intended 0.7. C=0 is not a valid value for scikit-learn's LogisticRegression
# (C must be a positive float), assuming the Classification helper forwards this grid to it.
# The recorded outputs for this attempt predate the fix; re-run the cells to refresh them.
params = {
    'penalty': ['l1', 'l2'],
    'C': [0.3, 0.4, 0.5, 0.6, 0.7],
}

In [47]:
log_reg_2 = Classification('Logistic Regression',x_train,x_val,y_train,y_val)

In [48]:
log_reg_2.get_scores(params,skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,Logistic Regression,0.55591,0.493886,0.062024


The best hyperparameters are:  {'C': 0.6, 'penalty': 'l2'} 



Unnamed: 0,1,2,3,4,5
precision,0.557823,0.417143,0.463636,0.437326,0.566085
recall,0.645669,0.378238,0.406915,0.438547,0.597368
f1-score,0.59854,0.396739,0.433428,0.437936,0.581306


## 3.6 Support Vector Machines

### 3.6.1 1st Attempt

In [49]:
params = {'kernel':['poly'],
          'degree':[2,3]}

In [50]:
svm_1 = Classification('SVM',x_train,x_val,y_train,y_val)

In [51]:
svm_1.get_scores(params,skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,SVM,0.812511,0.475279,0.337232


The best hyperparameters are:  {'degree': 2, 'kernel': 'poly'} 



Unnamed: 0,1,2,3,4,5
precision,0.540481,0.394009,0.419048,0.448276,0.564607
recall,0.648294,0.443005,0.351064,0.399441,0.528947
f1-score,0.589499,0.417073,0.382055,0.422452,0.546196


### 3.6.2 2nd Attempt

In [52]:
params = {'C':[0.1,0.3,0.5],
          'kernel':['linear'],
          'gamma':['scale','auto']}

In [53]:
svm_2 = Classification('SVM',x_train,x_val,y_train,y_val)

In [54]:
svm_2.get_scores(params,skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,SVM,0.558745,0.496544,0.062201


The best hyperparameters are:  {'C': 0.3, 'gamma': 'scale', 'kernel': 'linear'} 



Unnamed: 0,1,2,3,4,5
precision,0.565517,0.403465,0.467532,0.451429,0.580729
recall,0.645669,0.42228,0.382979,0.441341,0.586842
f1-score,0.602941,0.412658,0.421053,0.446328,0.58377


### 3.6.3 3rd Attempt

In [55]:
# Fine grid of C values around 0.3 for the linear-kernel SVM.
# Fix: `0,30` put C=0 and C=30 into the grid instead of the intended 0.30. C=0 is not a
# valid value for scikit-learn's SVC (C must be strictly positive), assuming the
# Classification helper forwards this grid to it.
# The recorded outputs for this attempt predate the fix; re-run the cells to refresh them.
params = {
    'C': [0.28, 0.29, 0.30, 0.31, 0.32, 0.33],
    'kernel': ['linear'],
    'gamma': ['scale'],
}

In [56]:
svm_3 = Classification('SVM',x_train,x_val,y_train,y_val)

In [57]:
svm_3.get_scores(params,skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,SVM,0.561404,0.498671,0.062733


The best hyperparameters are:  {'C': 0.33, 'gamma': 'scale', 'kernel': 'linear'} 



Unnamed: 0,1,2,3,4,5
precision,0.56682,0.410319,0.467532,0.453521,0.583554
recall,0.645669,0.432642,0.382979,0.449721,0.578947
f1-score,0.603681,0.421185,0.421053,0.451613,0.581242


### 3.6.4 4th Attempt

In [58]:
params = {'C':[0.32,0.33,0.34,0.35],
          'kernel':['linear'],
          'gamma':['scale']}

In [59]:
svm_4 = Classification('SVM',x_train,x_val,y_train,y_val)

In [60]:
svm_4.get_scores(params,skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,SVM,0.561404,0.498671,0.062733


The best hyperparameters are:  {'C': 0.33, 'gamma': 'scale', 'kernel': 'linear'} 



Unnamed: 0,1,2,3,4,5
precision,0.56682,0.410319,0.467532,0.453521,0.583554
recall,0.645669,0.432642,0.382979,0.449721,0.578947
f1-score,0.603681,0.421185,0.421053,0.451613,0.581242


## 3.7 Gaussian Naive Bayes

In [61]:
gnb_1 = Classification('Naive Bayes',x_train,x_val,y_train,y_val)

In [62]:
# An empty grid is passed, so no hyperparameters are varied for Naive Bayes; accordingly
# the recorded output has no "best hyperparameters" line, only the score tables.
gnb_1.get_scores({},skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,Naive Bayes,0.409888,0.405635,0.004253


Unnamed: 0,1,2,3,4,5
precision,0.498195,0.48913,0.716049,0.272727,0.338389
recall,0.724409,0.11658,0.154255,0.075419,0.939474
f1-score,0.590374,0.188285,0.253829,0.118162,0.497561


## 3.8 KNN

### 3.8.1 1st Attempt

In [63]:
params = {'n_neighbors':[5,10,50,100,200,300]}

In [64]:
knn_1 = Classification('KNN',x_train,x_val,y_train,y_val)

In [65]:
knn_1.get_scores(params,skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,KNN,0.493532,0.472089,0.021442


The best hyperparameters are:  {'n_neighbors': 200} 



Unnamed: 0,1,2,3,4,5
precision,0.506958,0.401099,0.564103,0.404959,0.509091
recall,0.669291,0.378238,0.234043,0.410615,0.663158
f1-score,0.576923,0.389333,0.330827,0.407767,0.576


### 3.8.2 2nd Attempt

In [66]:
params = {'n_neighbors':[190,200,210]}

In [67]:
knn_2 = Classification('KNN',x_train,x_val,y_train,y_val)

In [68]:
knn_2.get_scores(params,skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,KNN,0.493532,0.472089,0.021442


The best hyperparameters are:  {'n_neighbors': 200} 



Unnamed: 0,1,2,3,4,5
precision,0.506958,0.401099,0.564103,0.404959,0.509091
recall,0.669291,0.378238,0.234043,0.410615,0.663158
f1-score,0.576923,0.389333,0.330827,0.407767,0.576


### 3.8.3 3rd Attempt

In [69]:
params = {'n_neighbors':[198,199,200,201,202]}

In [70]:
knn_3 = Classification('KNN',x_train,x_val,y_train,y_val)

In [71]:
knn_3.get_scores(params,skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,KNN,0.493532,0.472089,0.021442


The best hyperparameters are:  {'n_neighbors': 200} 



Unnamed: 0,1,2,3,4,5
precision,0.506958,0.401099,0.564103,0.404959,0.509091
recall,0.669291,0.378238,0.234043,0.410615,0.663158
f1-score,0.576923,0.389333,0.330827,0.407767,0.576


## 3.9 Adaboost (log_reg_1)

### 3.9.1 1st Attempt

In [72]:
params = {'learning_rate':[0.1,1,10]}

In [73]:
adaboost_1 = Ensemble('AdaBoost',log_reg_1.best_model,x_train,x_val,y_train,y_val)

In [74]:
adaboost_1.get_scores(params,skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,AdaBoost,0.520468,0.475279,0.045189


The best hyperparameters are:  {'learning_rate': 0.1} 



Unnamed: 0,1,2,3,4,5
precision,0.648084,0.383529,0.410417,0.426702,0.602606
recall,0.488189,0.42228,0.523936,0.455307,0.486842
f1-score,0.556886,0.401973,0.46028,0.440541,0.538574


### 3.9.2 2nd Attempt

In [75]:
params = {'n_estimators':[18,19,20,21],
          'learning_rate':[0.08,0.09,0.1,0.11,0.12]}

In [76]:
adaboost_2 = Ensemble('AdaBoost',log_reg_1.best_model,x_train,x_val,y_train,y_val)

In [77]:
adaboost_2.get_scores(params,skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,AdaBoost,0.51586,0.481659,0.034202


The best hyperparameters are:  {'learning_rate': 0.09, 'n_estimators': 19} 



Unnamed: 0,1,2,3,4,5
precision,0.661922,0.397094,0.420043,0.428934,0.58642
recall,0.488189,0.42487,0.523936,0.472067,0.5
f1-score,0.561934,0.410513,0.466272,0.449468,0.539773


## 3.10 XGBoost (log_reg_1)

### 3.10.1 1st Attempt

In [78]:
params = {'eta':[0.001,0.005,0.1,0.5],
          'min_child_weight':[1,5,10]}

In [79]:
xgboost_1 = Ensemble('XGBoost',log_reg_1.best_model,x_train,x_val,y_train,y_val)

In [80]:
xgboost_1.get_scores(params,skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,XGBoost,0.856814,0.473153,0.383661


The best hyperparameters are:  {'eta': 0.001, 'min_child_weight': 5} 



Unnamed: 0,1,2,3,4,5
precision,0.528604,0.379888,0.426087,0.450867,0.556962
recall,0.606299,0.352332,0.390957,0.435754,0.578947
f1-score,0.564792,0.365591,0.407767,0.443182,0.567742


### 3.10.2 2nd Attempt

In [81]:
params = {'eta':[0.0001,0.0005,0.001],
          'min_child_weight':[5]}

In [82]:
xgboost_2 = Ensemble('XGBoost',log_reg_1.best_model,x_train,x_val,y_train,y_val)

In [83]:
xgboost_2.get_scores(params,skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,XGBoost,0.856814,0.473153,0.383661


The best hyperparameters are:  {'eta': 0.0001, 'min_child_weight': 5} 



Unnamed: 0,1,2,3,4,5
precision,0.528604,0.379888,0.426087,0.450867,0.556962
recall,0.606299,0.352332,0.390957,0.435754,0.578947
f1-score,0.564792,0.365591,0.407767,0.443182,0.567742


## 3.11 Voting (Adaboost(log_reg_1)/XGBoost(log_reg_1))

In [84]:
params = {'voting':['hard','soft']}

In [85]:
adaboost_best = ('ada', adaboost_2.best_model)
xgboost_best = ('xgb', xgboost_2.best_model)

estimators = [adaboost_best,xgboost_best]

In [86]:
voting = Ensemble('Voting',estimators,x_train,x_val,y_train,y_val)

In [87]:
voting.get_scores(params,skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,Voting,0.705653,0.482722,0.222931


The best hyperparameters are:  {'voting': 'hard'} 



Unnamed: 0,1,2,3,4,5
precision,0.534884,0.395288,0.429612,0.466472,0.616236
recall,0.664042,0.391192,0.470745,0.446927,0.439474
f1-score,0.592506,0.393229,0.449239,0.456491,0.513057


## 3.12 Stacking (Adaboost(log_reg_1)/XGBoost(log_reg_1))

In [88]:
adaboost_best = ('ada', adaboost_2.best_model)
xgboost_best = ('xgb', xgboost_2.best_model)

estimators = [adaboost_best,xgboost_best]

In [89]:
stacking = Ensemble('Stacking',estimators,x_train,x_val,y_train,y_val)

In [90]:
stacking.get_scores({},skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,Stacking,0.758639,0.487507,0.271132


Unnamed: 0,1,2,3,4,5
precision,0.578283,0.395225,0.423267,0.463127,0.578082
recall,0.60105,0.38601,0.454787,0.438547,0.555263
f1-score,0.589447,0.390564,0.438462,0.450502,0.566443


## 3.12 All Models Compared

For the majority of models I created, I applied hyperparameter tuning, where I started with a broad range of hyperparameters, and tuned for optimal train accuracy and validation accuracy. 

In [91]:
# Stack the score table of each selected model into one comparison frame.
# ignore_index=True renumbers the rows 0..n-1. Without it every row keeps the label 0
# from its single-row source table, so the index cannot be used to pick out one model.
all_models = pd.concat(
    [
        dec_tree_1.scores_table,
        ran_for_3.scores_table,
        log_reg_1.scores_table,
        svm_4.scores_table,
        gnb_1.scores_table,
        knn_3.scores_table,
        adaboost_2.scores_table,
        xgboost_1.scores_table,
        voting.scores_table,
        stacking.scores_table,
    ],
    axis=0,
    ignore_index=True,
)

In [92]:
all_models

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,Decision Tree,0.437356,0.37799,0.059366
0,Random Forest,0.608719,0.446039,0.162679
0,Logistic Regression,0.556619,0.497608,0.059011
0,SVM,0.561404,0.498671,0.062733
0,Naive Bayes,0.409888,0.405635,0.004253
0,KNN,0.493532,0.472089,0.021442
0,AdaBoost,0.51586,0.481659,0.034202
0,XGBoost,0.856814,0.473153,0.383661
0,Voting,0.705653,0.482722,0.222931
0,Stacking,0.758639,0.487507,0.271132


Initially, I thought the validation accuracy was low for most of the models I created. However, these models are attempting to classify into 5 different classes, so a validation accuracy of 0.45 or greater seems very reasonable (with roughly balanced classes, random guessing would be correct only about 0.2 of the time).

## 3.13 Saving Models

I have saved all the models using the pickle library's dump function.

In [98]:
# Pickle each selected model's `best_model` to models/<model_type>.pkl.
# The `models` directory must already exist.
# - A forward slash replaces the original 'models\{...}': backslash-brace is an invalid
#   escape sequence in a Python string literal (it triggers a warning) and a backslash is
#   only a directory separator on Windows. The forward slash works on every platform.
# - `with` closes each file as soon as it is written, instead of leaving the handle open.
for model in [dec_tree_1,ran_for_3,log_reg_1,svm_4,gnb_1,knn_3,adaboost_2,xgboost_1,voting,stacking]:
    with open(f'models/{model.model_type}.pkl', 'wb') as model_file:
        pickle.dump(model.best_model, model_file)