# Hyper Parameter Tuning

Using hyper parameter to pick the best version of algorithm

We will use credit card default data from https://www.kaggle.com/datasets/uciml/default-of-credit-card-clients-dataset

References

- https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html#sklearn.model_selection.GridSearchCV
- https://www.analyticsvidhya.com/blog/2021/06/tune-hyperparameters-with-gridsearchcv/


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

pd.options.display.float_format = '{:,.2f}'.format

## Step-1: Download Data

In [2]:
import os
import urllib.request

data_url = 'https://raw.githubusercontent.com/elephantscale/datasets/master/credit-card-default/default.csv'
data_location = os.path.basename(data_url)

if not os.path.exists (data_location):
    print("Downloading : ", data_url)
    urllib.request.urlretrieve(data_url, data_location)
print('data_location:', data_location)

Downloading :  https://raw.githubusercontent.com/elephantscale/datasets/master/credit-card-default/default.csv
data_location: default.csv


In [3]:
data = pd.read_csv(data_location)
data.sample(10)

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default
24663,24664,210000,1,2,2,41,0,0,0,2,...,94537,97968,100387,5000,10000,0,5000,4000,5000,1
6186,6187,200000,1,1,2,33,-2,-2,-2,-2,...,-18,-18,-18,0,0,0,0,0,0,1
22702,22703,500000,2,2,2,34,-1,-1,-1,0,...,17194,9030,2180,5606,17501,2194,1030,2180,2012,0
18734,18735,50000,2,2,2,25,0,0,0,2,...,29187,31064,30487,1757,3237,0,2300,0,1100,0
8655,8656,110000,2,2,1,33,1,2,0,0,...,9986,9981,11714,0,5000,500,400,2000,0,0
20998,20999,200000,1,2,1,32,-1,-1,0,0,...,72739,389,389,90001,3935,3759,390,390,390,0
28697,28698,330000,2,5,1,34,0,0,0,0,...,121814,11383,13090,25000,15000,7000,3000,2000,90000,0
11685,11686,330000,2,2,2,27,0,0,0,0,...,201240,186444,158521,11012,11012,6548,6497,6037,5141,0
8759,8760,20000,2,1,2,22,0,0,0,0,...,7000,10199,1500,1336,1000,140,3199,1500,780,0
14432,14433,140000,1,2,1,41,0,0,0,0,...,117188,113368,119194,12682,13265,17188,10000,15000,0,0


## Step-2: EDA

In [4]:
## Check data skew
data['default'].value_counts()

0    23364
1     6636
Name: default, dtype: int64

In [5]:
data['default'].value_counts(normalize=True)

0   0.78
1   0.22
Name: default, dtype: float64

## Step-3: Clean up 

TODO

## Step-4: Shape data

In [6]:
label_col = 'default'

feature_columns = data.columns
## TODO : drop 'ID' and 'default' columns
feature_columns = feature_columns.drop (['ID', 'default'])
#print (feature_columns)

In [7]:
X = data[feature_columns]
y = data[[label_col]]

print (X.shape)
print (y.shape)

(30000, 23)
(30000, 1)


## Step-5: Build a Parameter Grid

In [8]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression

algo = LogisticRegression()

# find out parameters
algo.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [9]:
from sklearn.model_selection import ParameterGrid

# build a param-grid
param_grid =  {
                   # 'penalty' : ['l1', 'l2'],
                    'C' : [0.5, 1.0],
                    'solver' : ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
                     'max_iter' : [50, 100, 200, 300],
              }

Let's try a different algorithm

In [13]:
# ## Bulid a param grid for RandomForest

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import ParameterGrid

algo = RandomForestClassifier()
algo.get_params()

# # build a param-grid
param_grid =  {
                    'n_estimators' : (50, 100, 150),
                    'max_depth' : (10,15,20,25),
               }

## Step-6: Grid Search

In [14]:
%%time 

from sklearn.model_selection import GridSearchCV

grid_search = GridSearchCV(estimator=algo, 
                             param_grid=param_grid, 
                             cv = 5, 
                             scoring='accuracy',
                             return_train_score=True,
                             n_jobs = -1)

grid_search_results = grid_search.fit (X,np.ravel(y))

CPU times: user 6.49 s, sys: 171 ms, total: 6.66 s
Wall time: 5min 30s


## Step-7: Get Grid Search Results

In [15]:
# get best parameters

best_model = grid_search_results.best_estimator_
print ("Best model : ", best_model)
print ("Best hyper params : ", grid_search_results.best_params_)
print ("Best score : ", grid_search_results.best_score_)

Best model :  RandomForestClassifier(max_depth=10)
Best hyper params :  {'max_depth': 10, 'n_estimators': 100}
Best score :  0.8201


In [16]:
# internal details

grid_search_results.cv_results_

{'mean_fit_time': array([ 3.80462718,  7.20232177, 10.6932693 ,  5.57554231,  9.11996531,
        14.19701781,  5.37169967, 11.23725386, 16.71335874,  5.81881337,
        11.85713692, 17.59400334]),
 'std_fit_time': array([0.79238479, 0.66756855, 0.71868014, 0.74015548, 0.48626647,
        0.37899028, 0.29356166, 0.06973172, 0.51061849, 0.52463225,
        0.07691828, 0.69850624]),
 'mean_score_time': array([0.05817523, 0.13293343, 0.18885069, 0.10584493, 0.18336763,
        0.24039087, 0.13422008, 0.17504835, 0.2742353 , 0.10177617,
        0.19726377, 0.28374362]),
 'std_score_time': array([0.00073489, 0.04653756, 0.02687341, 0.0434862 , 0.0672318 ,
        0.03731836, 0.05630165, 0.00493766, 0.01000784, 0.00311801,
        0.00643994, 0.01045992]),
 'param_max_depth': masked_array(data=[10, 10, 10, 15, 15, 15, 20, 20, 20, 25, 25, 25],
              mask=[False, False, False, False, False, False, False, False,
                    False, False, False, False],
        fill_value='?',
 