# Hyper Parameter Tuning

Using hyper parameter to pick the best version of algorithm

We will use credit card default data from https://www.kaggle.com/datasets/uciml/default-of-credit-card-clients-dataset

References

- https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html#sklearn.model_selection.GridSearchCV
- https://www.analyticsvidhya.com/blog/2021/06/tune-hyperparameters-with-gridsearchcv/


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

pd.options.display.float_format = '{:,.2f}'.format

## Step-1: Download Data

In [2]:
import os
import urllib.request

data_url = 'https://raw.githubusercontent.com/elephantscale/datasets/master/credit-card-default/default.csv'
data_location = os.path.basename(data_url)

if not os.path.exists (data_location):
    print("Downloading : ", data_url)
    urllib.request.urlretrieve(data_url, data_location)
print('data_location:', data_location)

Downloading :  https://raw.githubusercontent.com/elephantscale/datasets/master/credit-card-default/default.csv
data_location: default.csv


In [3]:
data = pd.read_csv(data_location)
data.sample(10)

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default
8382,8383,160000,2,1,1,34,1,-2,-2,-1,...,708,560,0,0,0,708,560,0,560,0
10693,10694,370000,1,3,2,27,-1,0,0,0,...,103773,137735,234740,11378,36133,18773,37735,100000,5600,0
25764,25765,360000,2,1,2,32,0,0,0,0,...,241226,241673,233157,10130,12649,8916,7725,8900,6956,0
22338,22339,120000,2,2,1,25,0,0,0,0,...,70420,62549,47683,4000,3000,11299,1943,2000,1684,0
13700,13701,10000,2,2,2,28,0,0,0,0,...,9862,4854,1430,1228,1000,197,1597,1430,1184,0
2114,2115,70000,2,2,1,44,2,2,2,2,...,14123,13754,14724,1500,0,1200,0,1200,0,1
20448,20449,180000,1,1,2,26,-1,-1,-1,-1,...,770,4159,4159,9500,10122,770,4159,0,5243,0
29047,29048,120000,2,3,1,65,0,0,0,0,...,61253,57695,2530,2740,2407,1421,0,0,840,0
19622,19623,80000,2,1,1,37,-1,0,-1,-1,...,18449,2655,20808,10000,25919,18449,2655,20808,0,1
12923,12924,260000,2,1,1,61,1,-2,-2,-2,...,-20320,-20320,-20320,17000,2631,0,0,0,0,0


## Step-2: EDA

In [4]:
## Check data skew
data['default'].value_counts()

0    23364
1     6636
Name: default, dtype: int64

In [5]:
data['default'].value_counts(normalize=True)

0   0.78
1   0.22
Name: default, dtype: float64

## Step-3: Clean up 

TODO

## Step-4: Shape data

In [6]:
label_col = 'default'

feature_columns = data.columns
## TODO : drop 'ID' and 'default' columns
feature_columns = feature_columns.drop (['ID', 'default'])
#print (feature_columns)

In [7]:
X = data[feature_columns]
y = data[[label_col]]

print (X.shape)
print (y.shape)

(30000, 23)
(30000, 1)


## Step-5: Build a Parameter Grid

In [8]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression

algo = LogisticRegression()

# find out parameters
algo.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [9]:
from sklearn.model_selection import ParameterGrid

# build a param-grid
param_grid =  {
                   # 'penalty' : ['l1', 'l2'],
                    'C' : [0.5, 1.0],
                    'solver' : ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
                     'max_iter' : [50, 100, 200, 300],
              }

Let's try a different algorithm

In [None]:
# ## Bulid a param grid for RandomForest

# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import ParameterGrid

# algo = RandomForestClassifier()
# algo.get_params()

# # build a param-grid
# param_grid =  {
#                    'n_estimators' : (50, 100, 150),
#                    'max_depth' : (10,15,20,25),
#               }

## Step-6: Grid Search

In [10]:
%%time 

from sklearn.model_selection import GridSearchCV

grid_search = GridSearchCV(estimator=algo, 
                             param_grid=param_grid, 
                             cv = 5, 
                             scoring='accuracy',
                             return_train_score=True,
                             n_jobs = -1)

grid_search_results = grid_search.fit (X,np.ravel(y))



CPU times: user 5.99 s, sys: 2.04 s, total: 8.03 s
Wall time: 2min 36s




## Step-7: Get Grid Search Results

In [11]:
# get best parameters

best_model = grid_search_results.best_estimator_
print ("Best model : ", best_model)
print ("Best hyper params : ", grid_search_results.best_params_)
print ("Best score : ", grid_search_results.best_score_)

Best model :  LogisticRegression(C=0.5, max_iter=300, solver='newton-cg')
Best hyper params :  {'C': 0.5, 'max_iter': 300, 'solver': 'newton-cg'}
Best score :  0.8099000000000001


In [12]:
# internal details

grid_search_results.cv_results_

{'mean_fit_time': array([0.98196201, 0.16895113, 0.79770575, 0.5090766 , 0.59977846,
        1.44155169, 0.21885309, 1.1955719 , 1.03318768, 1.17140656,
        3.96505637, 0.40793943, 0.8437736 , 2.33997178, 2.46143255,
        4.32705507, 0.4383595 , 0.83528194, 3.37656021, 3.81579413,
        0.63417616, 0.10402627, 0.71079969, 0.50865407, 0.57763143,
        1.67013402, 0.24380088, 0.73025837, 1.0421967 , 1.18917694,
        3.78207521, 0.37487926, 0.73912153, 2.42321229, 2.25571022,
        3.87173038, 0.37170234, 0.73226285, 3.40880384, 3.7215775 ]),
 'std_fit_time': array([0.15118899, 0.05557217, 0.08568664, 0.0222679 , 0.06416883,
        0.04025766, 0.00939477, 0.39019736, 0.07157194, 0.03098448,
        0.33287396, 0.03109578, 0.05814437, 0.29204572, 0.2401904 ,
        0.34514983, 0.0525884 , 0.04739396, 0.55076044, 0.55849116,
        0.04914781, 0.00658303, 0.08792713, 0.0246306 , 0.05151897,
        0.30076576, 0.06887129, 0.09860196, 0.04386064, 0.06857593,
        0.742