# 3. Modelling and Hyperparameter Tuning

## 3.1 Imports

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, StratifiedKFold

import pickle

from Classification import Classification
from Ensemble import Ensemble



I have created two .py files, Classification.py and Ensemble.py, whose classes contain functions that simplify the modelling process and keep this modelling notebook tidy.

In [2]:
# Use seaborn's 'poster' plotting context for any figures drawn in this notebook.
sns.set_context('poster')

In [3]:
# Read the training features and the training labels from their CSV files.
train_paths = ('Data/3.x_train_data.csv', 'Data/3.y_train_data.csv')
x_train, y_train = (pd.read_csv(path) for path in train_paths)

In [4]:
# Sanity check: features and labels should have the same number of rows.
for frame in (x_train, y_train):
    print(frame.shape)

(7524, 138)
(7524, 1)


## 3.2 Train and Validation Split

I split the data again, this time into train and validation sets, in preparation for GridSearch cross-validation. I also chose stratified 5-fold as my cross-validation strategy.

In [5]:
# Hold out 25% of the rows as a validation set; labels come from the 'score' column.
# NOTE(review): this rebinds x_train / y_train, so the cell is not idempotent -- re-running it
# without reloading the CSVs would split the already-reduced training set again.
# NOTE(review): the split is not stratified (no `stratify=`) although CV below is; confirm intended.
x_train, x_val, y_train, y_val = train_test_split(x_train,y_train['score'],test_size=.25,random_state=42)

In [6]:
# Shuffled, seeded stratified 5-fold splitter shared by every get_scores call below.
skf = StratifiedKFold(n_splits=5,random_state=42,shuffle=True)

## 3.3 Decision Tree (Baseline)

### 3.3.1 1st Attempt

In [7]:
# Broad first-pass grid for the decision tree: minimum leaf size and maximum depth.
params = dict(
    min_samples_leaf=[3, 5, 10, 15, 30, 50, 100],
    max_depth=list(range(3, 10)),
)

In [8]:
# Project helper (Classification.py) given the model name plus the train/validation split.
dec_tree_1 = Classification('Decision Tree',x_train,x_val,y_train,y_val)

In [9]:
# Search `params` with the `skf` folds; the recorded output shows train/validation accuracy,
# the best hyperparameters and per-class precision/recall/F1.
dec_tree_1.get_scores(params,skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,Decision Tree,0.405458,0.349282,0.056176


The best hyperparameters are:  {'max_depth': 9, 'min_samples_leaf': 15} 



Unnamed: 0,1,2,3,4,5
precision,0.520548,0.268468,0.237443,0.368231,0.479592
recall,0.299213,0.38601,0.276596,0.284916,0.494737
f1-score,0.38,0.316684,0.255528,0.32126,0.487047


### 3.3.2 2nd Attempt

In [10]:
# Finer decision-tree grid: small leaf sizes and depths from 7 up to 10.
params = dict(
    min_samples_leaf=[3, 4, 5],
    max_depth=list(range(7, 11)),
)

In [11]:
# Separate wrapper for the second decision-tree search, so dec_tree_1 keeps its own results.
dec_tree_2 = Classification('Decision Tree',x_train,x_val,y_train,y_val)

In [12]:
# Run the refined decision-tree search and display its score tables.
dec_tree_2.get_scores(params,skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,Decision Tree,0.438596,0.350346,0.088251


The best hyperparameters are:  {'max_depth': 10, 'min_samples_leaf': 4} 



Unnamed: 0,1,2,3,4,5
precision,0.514644,0.276158,0.24024,0.342105,0.452607
recall,0.322835,0.417098,0.212766,0.290503,0.502632
f1-score,0.396774,0.332301,0.22567,0.314199,0.476309


## 3.4 Random Forest

### 3.4.1 1st Attempt

In [13]:
# Broad first-pass grid for the random forest: leaf size and odd depths from 3 to 15.
params = dict(
    min_samples_leaf=[3, 5, 10, 15, 30, 50, 100],
    max_depth=list(range(3, 16, 2)),
)

In [14]:
# Random-forest wrapper for the first search, using the same train/validation split.
ran_for_1 = Classification('Random Forest',x_train,x_val,y_train,y_val)

In [15]:
# Run the first random-forest search over `params` with the `skf` folds and display the scores.
ran_for_1.get_scores(params,skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,Random Forest,0.590289,0.469963,0.120326


The best hyperparameters are:  {'max_depth': 15, 'min_samples_leaf': 10} 



Unnamed: 0,1,2,3,4,5
precision,0.485605,0.391892,0.399209,0.458564,0.552339
recall,0.664042,0.300518,0.268617,0.463687,0.652632
f1-score,0.560976,0.340176,0.321145,0.461111,0.598311


### 3.4.2 2nd Attempt

In [16]:
# Second random-forest grid: leaf sizes 7-14 and depths 13-18.
params = dict(
    min_samples_leaf=list(range(7, 15)),
    max_depth=list(range(13, 19)),
)

In [17]:
# Separate wrapper for the second random-forest search.
ran_for_2 = Classification('Random Forest',x_train,x_val,y_train,y_val)

In [18]:
# Run the second random-forest search and display its score tables.
ran_for_2.get_scores(params,skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,Random Forest,0.592061,0.473153,0.118908


The best hyperparameters are:  {'max_depth': 16, 'min_samples_leaf': 9} 



Unnamed: 0,1,2,3,4,5
precision,0.498188,0.43038,0.363636,0.441734,0.558036
recall,0.721785,0.264249,0.265957,0.455307,0.657895
f1-score,0.589496,0.327448,0.30722,0.448418,0.603865


### 3.4.3 3rd Attempt

In [19]:
# Third random-forest grid: same leaf sizes (7-14), deeper trees (17-22).
params = dict(
    min_samples_leaf=list(range(7, 15)),
    max_depth=list(range(17, 23)),
)

In [20]:
# Wrapper for the third random-forest search; this is the one used in the final comparison.
ran_for_3 = Classification('Random Forest',x_train,x_val,y_train,y_val)

In [21]:
# Run the third random-forest search and display its score tables.
ran_for_3.get_scores(params,skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,Random Forest,0.57434,0.471026,0.103314


The best hyperparameters are:  {'max_depth': 20, 'min_samples_leaf': 13} 



Unnamed: 0,1,2,3,4,5
precision,0.491557,0.423221,0.391129,0.446575,0.536325
recall,0.687664,0.292746,0.257979,0.455307,0.660526
f1-score,0.573304,0.346095,0.310897,0.450899,0.591981


## 3.5 Logistic Regression

### 3.5.1 1st Attempt

In [22]:
# Broad first-pass grid for logistic regression: penalty type and C from 0.01 to 10.
params = dict(
    penalty=['l1', 'l2'],
    C=[0.01, 0.05, 0.1, 0.5, 1, 5, 10],
)

In [23]:
# Logistic-regression wrapper; its `best_model` is reused later by the AdaBoost/XGBoost cells.
log_reg_1 = Classification('Logistic Regression',x_train,x_val,y_train,y_val)

In [24]:
# Run the first logistic-regression search over `params` with the `skf` folds.
log_reg_1.get_scores(params,skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,Logistic Regression,0.56353,0.49176,0.07177


The best hyperparameters are:  {'C': 0.5, 'penalty': 'l2'} 



Unnamed: 0,1,2,3,4,5
precision,0.549398,0.426036,0.426036,0.445013,0.588972
recall,0.598425,0.373057,0.382979,0.486034,0.618421
f1-score,0.572864,0.39779,0.403361,0.464619,0.603338


### 3.5.2 2nd Attempt

In [25]:
# Narrow the C search around the first attempt's best value (0.5).
# Fix: the list previously ended `0.6,0,7`, a comma typo for 0.7 that put C=0
# (invalid -- C must be positive) and C=7 in the grid.
params = {'penalty':['l1','l2'],
          'C':[0.3,0.4,0.5,0.6,0.7]}

In [26]:
# Separate wrapper for the second logistic-regression search.
log_reg_2 = Classification('Logistic Regression',x_train,x_val,y_train,y_val)

In [27]:
# Run the refined logistic-regression search and display its score tables.
log_reg_2.get_scores(params,skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,Logistic Regression,0.564948,0.49176,0.073188


The best hyperparameters are:  {'C': 0.6, 'penalty': 'l2'} 



Unnamed: 0,1,2,3,4,5
precision,0.551313,0.425595,0.421365,0.44473,0.59
recall,0.606299,0.370466,0.37766,0.48324,0.621053
f1-score,0.5775,0.396122,0.398317,0.463186,0.605128


## 3.6 Support Vector Machines

### 3.6.1 1st Attempt

In [28]:
# First SVM grid: polynomial kernel of degree 2 or 3.
params = dict(
    kernel=['poly'],
    degree=[2, 3],
)

In [29]:
# SVM wrapper for the polynomial-kernel search.
svm_1 = Classification('SVM',x_train,x_val,y_train,y_val)

In [30]:
# Run the polynomial-kernel search; the recorded output shows a large train/validation gap.
svm_1.get_scores(params,skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,SVM,0.814106,0.480595,0.333511


The best hyperparameters are:  {'degree': 2, 'kernel': 'poly'} 



Unnamed: 0,1,2,3,4,5
precision,0.530837,0.39886,0.409605,0.444126,0.597855
recall,0.632546,0.362694,0.385638,0.432961,0.586842
f1-score,0.577246,0.379919,0.39726,0.438472,0.592297


### 3.6.2 2nd Attempt

In [31]:
# Second SVM grid: linear kernel with a few C values and both gamma settings.
params = dict(
    C=[0.1, 0.3, 0.5],
    kernel=['linear'],
    gamma=['scale', 'auto'],
)

In [32]:
# Separate wrapper for the linear-kernel SVM search.
svm_2 = Classification('SVM',x_train,x_val,y_train,y_val)

In [33]:
# Run the linear-kernel search and display its score tables.
svm_2.get_scores(params,skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,SVM,0.57115,0.490165,0.080985


The best hyperparameters are:  {'C': 0.5, 'gamma': 'scale', 'kernel': 'linear'} 



Unnamed: 0,1,2,3,4,5
precision,0.551163,0.420168,0.401813,0.442708,0.612137
recall,0.622047,0.388601,0.353723,0.47486,0.610526
f1-score,0.584464,0.403769,0.376238,0.458221,0.611331


### 3.6.3 3rd Attempt

In [34]:
# Fine-grained C search for the linear SVM, in 0.01 steps from 0.28 to 0.33.
# Fix: the list previously read `0.29,0,30,0.31`, a comma typo for 0.30 that put C=0
# (invalid -- C must be strictly positive) and C=30 in the grid.
# NOTE(review): per the scikit-learn docs gamma only applies to rbf/poly/sigmoid kernels,
# so it should have no effect with kernel='linear' -- confirm before dropping it.
params = {'C':[0.28,0.29,0.30,0.31,0.32,0.33],
          'kernel':['linear'],
          'gamma':['scale']}

In [35]:
# Separate wrapper for the third SVM search.
svm_3 = Classification('SVM',x_train,x_val,y_train,y_val)

In [36]:
# Run the third SVM search over `params` with the `skf` folds and display the scores.
svm_3.get_scores(params,skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,SVM,0.566897,0.489633,0.077264


The best hyperparameters are:  {'C': 0.28, 'gamma': 'scale', 'kernel': 'linear'} 



Unnamed: 0,1,2,3,4,5
precision,0.550351,0.424658,0.396923,0.441176,0.607692
recall,0.616798,0.401554,0.343085,0.460894,0.623684
f1-score,0.581683,0.412783,0.368046,0.45082,0.615584


### 3.6.4 4th Attempt

In [37]:
# Fourth SVM grid: linear kernel, C from 0.32 to 0.35.
params = dict(
    C=[0.32, 0.33, 0.34, 0.35],
    kernel=['linear'],
    gamma=['scale'],
)

In [38]:
# Wrapper for the fourth SVM search; this is the SVM used in the final comparison.
svm_4 = Classification('SVM',x_train,x_val,y_train,y_val)

In [39]:
# Run the fourth SVM search and display its score tables.
svm_4.get_scores(params,skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,SVM,0.570441,0.490696,0.079745


The best hyperparameters are:  {'C': 0.32, 'gamma': 'scale', 'kernel': 'linear'} 



Unnamed: 0,1,2,3,4,5
precision,0.553191,0.420765,0.398176,0.442667,0.613402
recall,0.614173,0.398964,0.348404,0.463687,0.626316
f1-score,0.58209,0.409574,0.371631,0.452933,0.619792


## 3.7 Gaussian Naive Bayes

In [40]:
# Naive Bayes wrapper using the same train/validation split as the other models.
gnb_1 = Classification('Naive Bayes',x_train,x_val,y_train,y_val)

In [41]:
# Empty grid: nothing is tuned here, and the recorded output has no best-hyperparameters line.
gnb_1.get_scores({},skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,Naive Bayes,0.390218,0.38756,0.002658


Unnamed: 0,1,2,3,4,5
precision,0.509554,0.435294,0.625,0.292308,0.320447
recall,0.629921,0.095855,0.159574,0.053073,0.981579
f1-score,0.56338,0.157113,0.254237,0.089835,0.483161


## 3.8 KNN

### 3.8.1 1st Attempt

In [42]:
# Broad first-pass grid for KNN: neighbourhood sizes from 5 up to 300.
params = dict(n_neighbors=[5, 10, 50, 100, 200, 300])

In [43]:
# KNN wrapper for the first search.
knn_1 = Classification('KNN',x_train,x_val,y_train,y_val)

In [44]:
# Run the first KNN search; the recorded best value (300) is the largest one in the grid.
knn_1.get_scores(params,skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,KNN,0.507177,0.478469,0.028708


The best hyperparameters are:  {'n_neighbors': 300} 



Unnamed: 0,1,2,3,4,5
precision,0.501859,0.406639,0.460526,0.42663,0.533597
recall,0.708661,0.253886,0.279255,0.438547,0.710526
f1-score,0.587595,0.3126,0.347682,0.432507,0.609481


### 3.8.2 2nd Attempt

In [45]:
# Second KNN grid: neighbourhood sizes around 200.
# NOTE(review): the first attempt reported 300 as best, yet this grid centres on 200 -- confirm intent.
params = dict(n_neighbors=list(range(190, 211, 10)))

In [46]:
# Separate wrapper for the second KNN search.
knn_2 = Classification('KNN',x_train,x_val,y_train,y_val)

In [47]:
# Run the second KNN search and display its score tables.
knn_2.get_scores(params,skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,KNN,0.509658,0.475279,0.034379


The best hyperparameters are:  {'n_neighbors': 210} 



Unnamed: 0,1,2,3,4,5
precision,0.50478,0.412451,0.465116,0.421751,0.520629
recall,0.692913,0.274611,0.265957,0.444134,0.697368
f1-score,0.584071,0.329705,0.338409,0.432653,0.596175


### 3.8.3 3rd Attempt

In [48]:
# Third KNN grid: every neighbourhood size from 198 to 202.
params = dict(n_neighbors=list(range(198, 203)))

In [49]:
# Wrapper for the third KNN search; this is the KNN used in the final comparison.
knn_3 = Classification('KNN',x_train,x_val,y_train,y_val)

In [50]:
# Run the third KNN search and display its score tables.
knn_3.get_scores(params,skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,KNN,0.51019,0.477937,0.032252


The best hyperparameters are:  {'n_neighbors': 202} 



Unnamed: 0,1,2,3,4,5
precision,0.504798,0.409449,0.468468,0.431267,0.522417
recall,0.690289,0.26943,0.276596,0.446927,0.705263
f1-score,0.583149,0.325,0.347826,0.438957,0.600224


## 3.9 Adaboost (log_reg_1)

### 3.9.1 1st Attempt

In [51]:
# Coarse AdaBoost grid: learning rate across three orders of magnitude.
params = dict(learning_rate=[0.1, 1, 10])

In [52]:
# Project helper (Ensemble.py) given the tuned logistic regression from attempt 1
# (presumably as AdaBoost's base estimator, per the section title -- verify in Ensemble.py).
adaboost_1 = Ensemble('AdaBoost',log_reg_1.best_model,x_train,x_val,y_train,y_val)

In [53]:
# Run the first AdaBoost search over `params` with the `skf` folds and display the scores.
adaboost_1.get_scores(params,skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,AdaBoost,0.519936,0.483254,0.036683


The best hyperparameters are:  {'learning_rate': 0.1} 



Unnamed: 0,1,2,3,4,5
precision,0.638225,0.410798,0.401408,0.412587,0.648208
recall,0.490814,0.453368,0.454787,0.494413,0.523684
f1-score,0.554896,0.431034,0.426434,0.449809,0.57933


### 3.9.2 2nd Attempt

In [54]:
# Finer AdaBoost grid: 18-21 estimators and learning rates around 0.1.
params = dict(
    n_estimators=list(range(18, 22)),
    learning_rate=[0.08, 0.09, 0.1, 0.11, 0.12],
)

In [55]:
# Second AdaBoost wrapper; its best_model later feeds the voting and stacking ensembles.
adaboost_2 = Ensemble('AdaBoost',log_reg_1.best_model,x_train,x_val,y_train,y_val)

In [56]:
# Run the refined AdaBoost search and display its score tables.
adaboost_2.get_scores(params,skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,AdaBoost,0.528088,0.486443,0.041645


The best hyperparameters are:  {'learning_rate': 0.08, 'n_estimators': 20} 



Unnamed: 0,1,2,3,4,5
precision,0.641844,0.421446,0.38914,0.425581,0.644172
recall,0.475066,0.437824,0.457447,0.511173,0.552632
f1-score,0.546003,0.429479,0.420538,0.464467,0.594901


## 3.10 XGBoost (log_reg_1)

### 3.10.1 1st Attempt

In [57]:
# First XGBoost grid: learning rate (eta) and minimum child weight.
params = dict(
    eta=[0.001, 0.005, 0.1, 0.5],
    min_child_weight=[1, 5, 10],
)

In [58]:
# XGBoost wrapper; log_reg_1.best_model is passed as the second argument, as in the AdaBoost cells
# (how Ensemble uses it for XGBoost is not visible here -- verify in Ensemble.py).
xgboost_1 = Ensemble('XGBoost',log_reg_1.best_model,x_train,x_val,y_train,y_val)

In [59]:
# Run the first XGBoost search; the recorded output shows a large train/validation gap.
xgboost_1.get_scores(params,skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,XGBoost,0.800106,0.493355,0.306752


The best hyperparameters are:  {'eta': 0.001, 'min_child_weight': 10} 



Unnamed: 0,1,2,3,4,5
precision,0.553922,0.403409,0.403409,0.465565,0.6133
recall,0.593176,0.367876,0.37766,0.472067,0.655263
f1-score,0.572877,0.384824,0.39011,0.468793,0.633588


### 3.10.2 2nd Attempt

In [60]:
# Second XGBoost grid: smaller eta values with min_child_weight fixed at 5.
# NOTE(review): the first attempt reported min_child_weight=10 as best, but 5 is fixed here -- confirm.
params = dict(
    eta=[0.0001, 0.0005, 0.001],
    min_child_weight=[5],
)

In [61]:
# Second XGBoost wrapper; its best_model later feeds the voting and stacking ensembles.
xgboost_2 = Ensemble('XGBoost',log_reg_1.best_model,x_train,x_val,y_train,y_val)

In [62]:
# Run the second XGBoost search and display its score tables.
xgboost_2.get_scores(params,skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,XGBoost,0.852561,0.480064,0.372497


The best hyperparameters are:  {'eta': 0.0001, 'min_child_weight': 5} 



Unnamed: 0,1,2,3,4,5
precision,0.541766,0.403409,0.41369,0.434316,0.581047
recall,0.595801,0.367876,0.369681,0.452514,0.613158
f1-score,0.5675,0.384824,0.390449,0.443228,0.596671


## 3.11 Voting (Adaboost(log_reg_1)/XGBoost(log_reg_1))

In [63]:
# Compare hard (majority) and soft (probability-averaged) voting.
params = dict(voting=['hard', 'soft'])

In [64]:
# (name, model) pairs for the voting ensemble, built from the second AdaBoost and XGBoost searches.
# NOTE(review): xgboost_2 is used here, while the final comparison table reports xgboost_1 -- confirm.
adaboost_best, xgboost_best = ('ada', adaboost_2.best_model), ('xgb', xgboost_2.best_model)
estimators = [adaboost_best, xgboost_best]

In [65]:
# Voting ensemble wrapper; the second argument is the list of (name, model) pairs.
voting = Ensemble('Voting',estimators,x_train,x_val,y_train,y_val)

In [66]:
# Search over the voting mode with the `skf` folds and display the score tables.
voting.get_scores(params,skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,Voting,0.707248,0.49176,0.215488


The best hyperparameters are:  {'voting': 'hard'} 



Unnamed: 0,1,2,3,4,5
precision,0.542484,0.42033,0.416873,0.440104,0.686347
recall,0.653543,0.396373,0.446809,0.472067,0.489474
f1-score,0.592857,0.408,0.431322,0.455526,0.571429


## 3.12 Stacking (Adaboost(log_reg_1)/XGBoost(log_reg_1))

In [67]:
# Rebuild the same (name, model) pairs used for voting, so this section runs on its own.
estimators = [('ada', adaboost_2.best_model), ('xgb', xgboost_2.best_model)]
adaboost_best, xgboost_best = estimators

In [68]:
# Stacking ensemble wrapper over the same (name, model) pairs.
stacking = Ensemble('Stacking',estimators,x_train,x_val,y_train,y_val)

In [69]:
# Empty grid: nothing is tuned for stacking.
# NOTE(review): the recorded output contains logistic-regression convergence warnings
# (iteration limit reached); consider scaling the features or raising max_iter.
stacking.get_scores({},skf)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,Stacking,0.752791,0.501329,0.251462


Unnamed: 0,1,2,3,4,5
precision,0.586902,0.426966,0.404092,0.455764,0.631868
recall,0.611549,0.393782,0.420213,0.47486,0.605263
f1-score,0.598972,0.409704,0.411995,0.465116,0.61828


## 3.12 All Models Compared

For the majority of models I created, I applied hyperparameter tuning, where I started with a broad range of hyperparameters, and tuned for optimal train accuracy and validation accuracy. 

In [70]:
# Stack the one-row score table of each selected model into a single comparison frame.
# ignore_index=True gives the rows a fresh 0..n-1 index; without it every row kept the
# label 0 from its source table, so label-based lookups such as .loc[0] matched all rows.
all_models = pd.concat([dec_tree_1.scores_table,
                        ran_for_3.scores_table,
                        log_reg_1.scores_table,
                        svm_4.scores_table,
                        gnb_1.scores_table,
                        knn_3.scores_table,
                        adaboost_2.scores_table,
                        xgboost_1.scores_table,
                        voting.scores_table,
                        stacking.scores_table],
                        axis=0,
                        ignore_index=True)

In [71]:
# Display the combined comparison table.
all_models

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,Decision Tree,0.405458,0.349282,0.056176
0,Random Forest,0.57434,0.471026,0.103314
0,Logistic Regression,0.56353,0.49176,0.07177
0,SVM,0.570441,0.490696,0.079745
0,Naive Bayes,0.390218,0.38756,0.002658
0,KNN,0.51019,0.477937,0.032252
0,AdaBoost,0.528088,0.486443,0.041645
0,XGBoost,0.800106,0.493355,0.306752
0,Voting,0.707248,0.49176,0.215488
0,Stacking,0.752791,0.501329,0.251462


In [72]:
# Write the comparison table to CSV without the index column.
all_models.to_csv('Data/4.all_models.csv',index=False)

Initially, I thought the validation accuracy was low for most of the models I created, but considering that these models are attempting to classify into 5 different classes, 0.45 and greater seems very reasonable (random guessing would score about 0.2).

## 3.13 Saving Models

I have saved all the models using the pickle library's dump function.

In [73]:
# Pickle each selected model wrapper to Models/<model_type>.pkl.
# NOTE(review): assumes the Models/ directory exists and that model_type is unique per
# wrapper (otherwise later files overwrite earlier ones) -- verify against the helper classes.
for model in [dec_tree_1,ran_for_3,log_reg_1,svm_4,gnb_1,knn_3,adaboost_2,xgboost_1,voting,stacking]:
    # The context manager closes the file even if pickling raises; the bare open() never did.
    with open(f'Models/{model.model_type}.pkl', 'wb') as model_file:
        pickle.dump(model, model_file)