In [2]:
# --- (5) Hyperparameter Tuning for Classification Models --- #
"""
Hyperparameters are model configuration properties that define a model and remain constant during training
They are part of the model design and do not change

Model Inputs - train data (this trains the parameters)
Model Parameters - found during training (these are learned, e.g., model coefficient and intercept)
Model Hyperparameters - part of the model design (e.g., depth of tree, k neighbors)

Grid search is a scikit utility that creates a grid of possible values for each hyperparameter, each cell is a candidate model
gridsearchcv evaluates each candidate model (using cross validation)
It is computationally very expensive (also actual cost in cloud can be expensive)
Does not differentiate between important and trivial hyperparameters

Alternatively, random search of hyperparameter space can be done
"""

'\nHyperparameters are model configuration properties that define a model and remain constant during training\nThey are part of the model design and do not change\n\nModel Inputs - train data (this trains the parameters)\nModel Parameters - found during training (these are learned, e.g., model coefficient and intercept)\nModel Hyperparameters - part of the model design (e.g., depth of tree, k neighbors)\n\nGrid search is a scikit utility that creates a grid of possible values for each hyperparameter, each cell is a candidate model\ngridsearchcv evaluates each candidate model (using cross validation)\nIt is computationally very expensive (also actual cost in cloud can be expensive)\nDoes not differentiate between important and trivial hyperparameters\n\nAlternatively, random search of hyperparameter space can be done\n'

In [3]:
# --- Hyperparameter Tuning a Decision Tree Classifier Using Grid Search --- #

In [4]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

In [8]:
titanic_df = pd.read_csv('datasets/titanic/processed.csv')

titanic_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,1,2,1,2.0,1,1,26.0,0,0,1
1,0,3,1,21.0,0,0,7.925,0,0,1
2,0,3,1,44.0,0,0,8.05,0,0,1
3,1,3,0,22.0,0,0,7.75,0,0,1
4,0,3,0,45.0,1,4,27.9,0,0,1


In [9]:
X = titanic_df.drop('Survived', axis=1)

Y = titanic_df['Survived']

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

In [10]:
def summarize_classification(y_test, y_pred):
    
    acc = accuracy_score(y_test, y_pred, normalize=True)
    num_acc = accuracy_score(y_test, y_pred, normalize=False)
    
    prec = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    
    print("Test data count: ", len(y_test))
    print("accuracy_count: ", num_acc)
    print("accuracy_score: ", acc)
    print("precision_score: ", prec)
    print("recall_score: ", recall)
    print()

In [11]:
# Grid Search Cross Validation
from sklearn.model_selection import GridSearchCV

# We will find the max depth that gives us the best model (one model for each max_depth value)
parameters = {'max_depth': [2, 4, 5, 7, 9, 10]}

# cv=3: Use 3-fold cross validation to find the best model - split the dataset into 3 parts
    # 2 used to train, 1 used to evaluate (cross validation)
# return_train_score=True compares models using the default scoring mechanism for this estimator (i.e., accuracy)
    # can be configured to use a different scoring mechanism
grid_search = GridSearchCV(DecisionTreeClassifier(), parameters, cv=3, return_train_score=True)
grid_search.fit(x_train, y_train)

grid_search.best_params_

{'max_depth': 5}

In [15]:
# Print results from each model
for i in range(6):
    print("Parameters: ", grid_search.cv_results_['params'][i])    
    print("Mean Test Score: ", grid_search.cv_results_['mean_test_score'][i])  
    print("Rank: ", grid_search.cv_results_['rank_test_score'][i])

Parameters:  {'max_depth': 2}
Mean Test Score:  0.7803211733036294
Rank:  4
Parameters:  {'max_depth': 4}
Mean Test Score:  0.8084099136730716
Rank:  2
Parameters:  {'max_depth': 5}
Mean Test Score:  0.8119465329991646
Rank:  1
Parameters:  {'max_depth': 7}
Mean Test Score:  0.7926111575234382
Rank:  3
Parameters:  {'max_depth': 9}
Mean Test Score:  0.7750487329434698
Rank:  6
Parameters:  {'max_depth': 10}
Mean Test Score:  0.780293325907361
Rank:  5


In [16]:
# Build model using best params
decision_tree_model = DecisionTreeClassifier(max_depth=grid_search.best_params_['max_depth']).fit(x_train, y_train)

In [17]:
y_pred = decision_tree_model.predict(x_test)

In [18]:
summarize_classification(y_test, y_pred)

Test data count:  143
accuracy_count:  115
accuracy_score:  0.8041958041958042
precision_score:  0.8717948717948718
recall_score:  0.5964912280701754



In [19]:
# --- Hyperparameter Tuning a Logistic Regression Classifier Using Grid Search --- #

In [20]:
parameters = {
    'penalty': ['l1', 'l2'],
    'C': [0.1, 0.4, 0.8, 1, 2, 5]
}

grid_search = GridSearchCV(LogisticRegression(solver='liblinear'), parameters, cv=3, return_train_score=True)
grid_search.fit(x_train, y_train)

grid_search.best_params_

{'C': 0.8, 'penalty': 'l1'}

In [22]:
# 12 models given 2 params where 1 param has 2 options and the other has 6 options
for i in range(12):
    print("Parameters: ", grid_search.cv_results_['params'][i])    
    print("Mean Test Score: ", grid_search.cv_results_['mean_test_score'][i])  
    print("Rank: ", grid_search.cv_results_['rank_test_score'][i])

Parameters:  {'C': 0.1, 'penalty': 'l1'}
Mean Test Score:  0.7662396732572172
Rank:  12
Parameters:  {'C': 0.1, 'penalty': 'l2'}
Mean Test Score:  0.7838299452334541
Rank:  7
Parameters:  {'C': 0.4, 'penalty': 'l1'}
Mean Test Score:  0.78378353290634
Rank:  10
Parameters:  {'C': 0.4, 'penalty': 'l2'}
Mean Test Score:  0.7785110925461803
Rank:  11
Parameters:  {'C': 0.8, 'penalty': 'l1'}
Mean Test Score:  0.7890652557319223
Rank:  1
Parameters:  {'C': 0.8, 'penalty': 'l2'}
Mean Test Score:  0.785547201336675
Rank:  3
Parameters:  {'C': 1, 'penalty': 'l1'}
Mean Test Score:  0.7873015873015873
Rank:  2
Parameters:  {'C': 1, 'penalty': 'l2'}
Mean Test Score:  0.785547201336675
Rank:  3
Parameters:  {'C': 2, 'penalty': 'l1'}
Mean Test Score:  0.785547201336675
Rank:  3
Parameters:  {'C': 2, 'penalty': 'l2'}
Mean Test Score:  0.785547201336675
Rank:  3
Parameters:  {'C': 5, 'penalty': 'l1'}
Mean Test Score:  0.7837928153717627
Rank:  8
Parameters:  {'C': 5, 'penalty': 'l2'}
Mean Test Score: 

In [23]:
logistic_model = LogisticRegression(solver='liblinear', penalty=grid_search.best_params_['penalty'], C=grid_search.best_params_['C']).fit(x_train, y_train)

In [24]:
y_pred = logistic_model.predict(x_test)

In [25]:
summarize_classification(y_test, y_pred)

Test data count:  143
accuracy_count:  118
accuracy_score:  0.8251748251748252
precision_score:  0.8076923076923077
recall_score:  0.7368421052631579

