# Classification Hyperparameter Tuning

- Stephen W. Thomas
- Used for MMA 869, MMAI 869, and GMMA 869

In [1]:
import datetime
print(datetime.datetime.now())

2022-11-13 17:22:34.842729


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
import sklearn
print('The scikit-learn version is {}.'.format(sklearn.__version__))

The scikit-learn version is 1.0.2.


In [4]:
# Create the 'out' directory to store output images
import os
if not os.path.exists('out'):
    os.makedirs('out')

# Load Data

In [5]:
df = pd.read_csv('https://raw.githubusercontent.com/stepthom/869_course/main/data/GermanCredit.csv')
df['Class'] = df['Class'].map({'Good': 1, 'Bad': 0})
df.head()
X = df.drop(['Class'], axis=1)
y = df[['Class']]

Unnamed: 0,Duration,Amount,InstallmentRatePercentage,ResidenceDuration,Age,NumberExistingCredits,NumberPeopleMaintenance,Telephone,ForeignWorker,Class,...,OtherInstallmentPlans.Bank,OtherInstallmentPlans.Stores,OtherInstallmentPlans.None,Housing.Rent,Housing.Own,Housing.ForFree,Job.UnemployedUnskilled,Job.UnskilledResident,Job.SkilledEmployee,Job.Management.SelfEmp.HighlyQualified
0,6,1169,4,4,67,2,1,0,1,1,...,0,0,1,0,1,0,0,0,1,0
1,48,5951,2,2,22,1,1,1,1,0,...,0,0,1,0,1,0,0,0,1,0
2,12,2096,2,3,49,1,2,1,1,1,...,0,0,1,0,1,0,0,1,0,0
3,42,7882,2,4,45,1,2,1,1,1,...,0,0,1,0,0,1,0,0,1,0
4,24,4870,3,4,53,2,2,1,1,0,...,0,0,1,0,0,1,0,0,1,0


In [6]:
# Helper function to print out the results of hyperparmater tuning in a nice table.

def cv_results_to_df(cv_results):
    results = pd.DataFrame(list(cv_results['params']))
    #results['mean_fit_time'] = cv_results['mean_fit_time']
    #results['mean_score_time'] = cv_results['mean_score_time']
    #results['mean_train_score'] = cv_results['mean_train_score']
    #results['std_train_score'] = cv_results['std_train_score']
    results['mean_val_score'] = cv_results['mean_test_score']
    #results['std_val_score'] = cv_results['std_test_score']
    results['rank_val_score'] = cv_results['rank_test_score']

    results = results.sort_values(['mean_val_score'], ascending=False)
    return results

# Decision Trees

In [7]:
clf = DecisionTreeClassifier(criterion='entropy', max_depth=5, min_samples_split=5, random_state=0)

scores = cross_val_score(clf, X, y, cv=10, scoring="accuracy")
print("Mean Accuracy: {:.4f}".format(np.mean(scores)))

Mean Accuracy: 0.6970


In [8]:
clf2 = DecisionTreeClassifier(criterion='entropy', max_depth=10, min_samples_split=5, random_state=0)

scores = cross_val_score(clf2, X, y, cv=10, scoring="accuracy")
print("Mean Accuracy: {:.4f}".format(np.mean(scores)))

Mean Accuracy: 0.7010


In [9]:
clf3 = DecisionTreeClassifier(criterion='entropy', max_depth=10, min_samples_split=6, random_state=0)

scores = cross_val_score(clf3, X, y, cv=10, scoring="accuracy")
print("Mean Accuracy: {:.4f}".format(np.mean(scores)))

Mean Accuracy: 0.7040


## Hyperparameter Tuning

### Grid Search

In [10]:
from sklearn.model_selection import GridSearchCV

clf = DecisionTreeClassifier(random_state=42)

params = {'criterion': ('gini', 'entropy'), 
          'splitter': ('best', 'random'), 
          'class_weight': ('balanced', None), 
          'max_depth': [2, 5, 10, 20], 
          'min_samples_leaf': [1, 5, 10],
          'max_features':[0.25, 0.5, 0.75, 1.0]}

search = GridSearchCV(clf, params, scoring='f1_macro', cv=10, verbose=1)
search = search.fit(X, y)

Fitting 10 folds for each of 384 candidates, totalling 3840 fits


In [11]:
search.best_params_

{'class_weight': 'balanced',
 'criterion': 'gini',
 'max_depth': 10,
 'max_features': 1.0,
 'min_samples_leaf': 1,
 'splitter': 'best'}

In [12]:
search.best_score_

0.6631045623915302

In [13]:
cv_results_to_df(search.cv_results_)

Unnamed: 0,class_weight,criterion,max_depth,max_features,min_samples_leaf,splitter,mean_val_score,rank_val_score
66,balanced,gini,10,1.00,1,best,0.663105,1
339,,entropy,10,0.25,5,random,0.662809,2
145,balanced,entropy,10,0.25,1,random,0.659962,3
280,,gini,20,0.75,10,best,0.658818,4
364,,entropy,20,0.25,10,best,0.657111,5
...,...,...,...,...,...,...,...,...
196,,gini,2,0.25,10,best,0.411765,379
194,,gini,2,0.25,5,best,0.411765,379
288,,entropy,2,0.25,1,best,0.411765,379
290,,entropy,2,0.25,5,best,0.411765,379


### Random Search

In [14]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

clf = DecisionTreeClassifier(random_state=42)

params = {"criterion": ["gini", "entropy"],
          "splitter": ["best", "random"],
          "class_weight": ['balanced', None], 
          "max_depth": randint(2, 21),
          "min_samples_leaf": randint(1, 11),
          "max_features": uniform(0.0, 1.0)}

search = RandomizedSearchCV(clf, param_distributions=params, n_iter=1000, scoring='f1_macro', cv=10, verbose=1)
search = search.fit(X, y)

Fitting 10 folds for each of 1000 candidates, totalling 10000 fits


In [16]:
search.best_params_

{'class_weight': None,
 'criterion': 'gini',
 'max_depth': 8,
 'max_features': 0.3305176264926438,
 'min_samples_leaf': 7,
 'splitter': 'random'}

In [15]:
search.best_score_

0.6752725802591557

In [16]:
cv_results_to_df(search.cv_results_)

Unnamed: 0,class_weight,criterion,max_depth,max_features,min_samples_leaf,splitter,mean_val_score,rank_val_score
976,,entropy,7,0.336996,9,random,0.675273,1
409,,entropy,14,0.133708,8,best,0.674951,2
666,balanced,entropy,12,0.951757,3,best,0.673843,3
860,,entropy,15,0.332030,8,random,0.671735,4
345,,entropy,19,0.387415,8,best,0.666677,5
...,...,...,...,...,...,...,...,...
974,,gini,2,0.197313,4,random,0.411765,991
132,,entropy,3,0.010523,6,best,0.411765,991
153,,entropy,2,0.195254,6,best,0.411765,991
114,,entropy,2,0.234172,3,best,0.411765,991


# Halving Grid Search

In [17]:
from sklearn.experimental import enable_halving_search_cv # noqa
from sklearn.model_selection import HalvingGridSearchCV

clf = DecisionTreeClassifier(random_state=42)

params = {'criterion': ('gini', 'entropy'), 
          'splitter': ('best', 'random'), 
          'class_weight': ('balanced', None), 
          'max_depth': [2, 5, 10, 20], 
          'min_samples_leaf': [1, 5, 10],
          'max_features':[0.25, 0.5, 0.75, 1.0]}

search = HalvingGridSearchCV(clf, params, scoring='f1_macro', cv=10, verbose=1)
search = search.fit(X, y)

  y = column_or_1d(y, warn=True)


n_iterations: 3
n_required_iterations: 6
n_possible_iterations: 3
min_resources_: 40
max_resources_: 1000
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 384
n_resources: 40
Fitting 10 folds for each of 384 candidates, totalling 3840 fits
----------
iter: 1
n_candidates: 128
n_resources: 120
Fitting 10 folds for each of 128 candidates, totalling 1280 fits
----------
iter: 2
n_candidates: 43
n_resources: 360
Fitting 10 folds for each of 43 candidates, totalling 430 fits


In [18]:
search.best_params_

{'class_weight': None,
 'criterion': 'entropy',
 'max_depth': 5,
 'max_features': 0.5,
 'min_samples_leaf': 10,
 'splitter': 'random'}

In [19]:
search.best_score_

0.6710221942836194

In [20]:
cv_results_to_df(search.cv_results_)

Unnamed: 0,class_weight,criterion,max_depth,max_features,min_samples_leaf,splitter,mean_val_score,rank_val_score
336,,entropy,10,0.25,1,best,0.796190,1
360,,entropy,20,0.25,1,best,0.796190,1
338,,entropy,10,0.25,5,best,0.765714,3
362,,entropy,20,0.25,5,best,0.765714,3
314,,entropy,5,0.25,5,best,0.765714,3
...,...,...,...,...,...,...,...,...
121,balanced,entropy,5,0.25,1,random,0.408571,551
182,balanced,entropy,20,0.75,5,best,0.382381,552
158,balanced,entropy,10,0.75,5,best,0.382381,552
134,balanced,entropy,5,0.75,5,best,0.382381,552
