In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

pd.options.display.float_format = '{:,.2f}'.format

In [2]:
import os
import urllib.request

data_url = 'https://raw.githubusercontent.com/elephantscale/datasets/master/credit-card-default/default2.csv'
data_location = os.path.basename(data_url)

if not os.path.exists (data_location):
    print("Downloading : ", data_url)
    urllib.request.urlretrieve(data_url, data_location)
print('data_location:', data_location)
data = pd.read_csv(data_location)
data.sample(10)

Downloading :  https://raw.githubusercontent.com/elephantscale/datasets/master/credit-card-default/default2.csv
data_location: default2.csv


Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default
26565,26566,210000,2,3,1,54,0,0,0,0,...,99763,99793,101879,3500,3500,4500,3600,3694,3728,0
14757,14758,30000,2,3,1,24,2,0,0,0,...,29621,27956,29484,2506,1748,1003,1022,2000,1000,0
4037,4038,150000,2,3,1,49,-1,-1,-1,-1,...,7642,8009,10800,23979,7838,7642,367,10800,5068,0
18230,18231,300000,1,3,1,56,0,0,0,0,...,99368,101600,560,4052,6122,1987,2232,560,0,0
18983,18984,120000,2,2,1,33,0,0,0,0,...,87364,87742,87943,3204,3214,4001,3254,3325,3500,0
26375,26376,100000,2,1,1,43,1,-2,-2,-2,...,0,0,0,0,0,0,0,0,0,0
15311,15312,240000,2,3,2,37,1,-2,-1,-1,...,2500,580,490,0,2489,2500,580,490,177,0
1979,1980,500000,2,1,1,35,0,0,0,0,...,48322,21593,13866,2504,10004,5178,1047,2019,1004,1
23434,23435,350000,2,1,1,36,-2,-2,-2,-2,...,2195,0,37800,0,1342,2195,0,37800,1000,0
12903,12904,100000,1,3,2,27,0,0,0,0,...,64691,49332,50366,4146,2556,2802,1786,1850,2000,0


In [3]:
## Check data skew
data['default'].value_counts()
data['default'].value_counts(normalize=True)

0   0.78
1   0.22
Name: default, dtype: float64

In [4]:
label_col = 'default'

feature_columns = data.columns
## TODO : drop 'ID' and 'default' columns
feature_columns = feature_columns.drop (['ID', 'default'])
#print (feature_columns)
X = data[feature_columns].values
y = data[[label_col]].values

print (X.shape)
print (y.shape)

(30000, 23)
(30000, 1)


In [5]:
# ## Bulid a param grid for RandomForest

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import ParameterGrid

rf = RandomForestClassifier()
print(rf.get_params())

# # build a param-grid
param_grid =  {
                    'n_estimators' : (50, 100, 150),
                    'max_depth' : (10,15,20,25),
               }

{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


In [6]:
%%time 

from sklearn.model_selection import GridSearchCV

grid_search = GridSearchCV(estimator=rf, 
                             param_grid=param_grid, 
                             cv = 5, 
                             scoring='accuracy',
                             return_train_score=True,
                             n_jobs = -1)

grid_search_results = grid_search.fit (X,y.ravel())

CPU times: user 2.9 s, sys: 110 ms, total: 3.01 s
Wall time: 2min 27s


In [7]:
# get best parameters

best_model = grid_search_results.best_estimator_
print ("Best model : ", best_model)
print ("Best hyper params : ", grid_search_results.best_params_)
print ("Best score : ", grid_search_results.best_score_)
# internal details

grid_search_results.cv_results_

Best model :  RandomForestClassifier(max_depth=10, n_estimators=50)
Best hyper params :  {'max_depth': 10, 'n_estimators': 50}
Best score :  0.8199333333333332


{'mean_fit_time': array([ 3.15055428,  6.06860819,  9.19327822,  4.45563383,  8.09551039,
        12.37597866,  4.81967158,  9.41386948, 14.45945787,  5.18042235,
        10.09518342, 14.25236578]),
 'std_fit_time': array([0.05981788, 0.09753196, 0.18613213, 0.39575028, 0.03777851,
        0.27893698, 0.26319214, 0.06507673, 0.25490671, 0.18722812,
        0.16339132, 1.76770792]),
 'mean_score_time': array([0.06633468, 0.1112155 , 0.17086535, 0.07839527, 0.15179291,
        0.22678227, 0.09057031, 0.18031096, 0.26572447, 0.09699321,
        0.22303572, 0.26899714]),
 'std_score_time': array([0.00661016, 0.00295229, 0.00723445, 0.00712367, 0.00402587,
        0.00549038, 0.0041279 , 0.0060357 , 0.00399706, 0.00354132,
        0.05680116, 0.02594676]),
 'param_max_depth': masked_array(data=[10, 10, 10, 15, 15, 15, 20, 20, 20, 25, 25, 25],
              mask=[False, False, False, False, False, False, False, False,
                    False, False, False, False],
        fill_value='?',
 

In [8]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import ParameterGrid

algo = LogisticRegression(max_iter=500)

# find out parameters
print(algo.get_params())

# build a param-grid
param_grid =  {
                    'penalty' : ['l1', 'l2'],
                    'C' : [0.5, 1.0, 1.5],
                    'solver' : ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'max_iter': [50,100,200,500,1000]
              }

{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 500, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}


In [9]:
%%time 

from sklearn.model_selection import GridSearchCV

grid_search = GridSearchCV(estimator=algo, 
                             param_grid=param_grid, 
                             cv = 5, 
                             scoring='accuracy',
                             return_train_score=True,
                             n_jobs = -1)

grid_search_results = grid_search.fit (X,y.ravel())

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

CPU times: user 4.03 s, sys: 346 ms, total: 4.38 s
Wall time: 10min 12s


In [10]:
# get best parameters

best_model = grid_search_results.best_estimator_
print ("Best model : ", best_model)
print ("Best hyper params : ", grid_search_results.best_params_)
print ("Best score : ", grid_search_results.best_score_)
# internal details

grid_search_results.cv_results_

Best model :  LogisticRegression(C=1.5, penalty='l1', solver='liblinear')
Best hyper params :  {'C': 1.5, 'max_iter': 100, 'penalty': 'l1', 'solver': 'liblinear'}
Best score :  0.8099000000000001


{'mean_fit_time': array([4.35881615e-03, 3.81851196e-03, 8.03040600e-01, 5.59520721e-03,
        1.46377029e+00, 1.54708786e+00, 2.25059319e-01, 1.21974754e+00,
        1.13865447e+00, 1.12466998e+00, 5.30686378e-03, 5.76372147e-03,
        7.97983694e-01, 4.09607887e-03, 3.17761488e+00, 3.58897820e+00,
        4.67558098e-01, 1.42545767e+00, 1.79657059e+00, 2.04843192e+00,
        4.21447754e-03, 4.15420532e-03, 8.71366310e-01, 5.88488579e-03,
        5.38081298e+00, 8.80148997e+00, 8.70781898e-01, 1.75449314e+00,
        3.43515210e+00, 3.87818761e+00, 5.47995567e-03, 5.45063019e-03,
        8.36724567e-01, 4.14175987e-03, 1.34637706e+01, 8.50127225e+00,
        8.87172556e-01, 1.38111844e+00, 8.47997322e+00, 9.71671467e+00,
        5.51228523e-03, 5.26204109e-03, 7.32898378e-01, 5.67789078e-03,
        2.63600817e+01, 7.77342925e+00, 8.52341986e-01, 1.28653331e+00,
        1.67623563e+01, 1.90428264e+01, 5.72104454e-03, 5.82718849e-03,
        6.79952812e-01, 4.37417030e-03, 1.39560

In [11]:
"""Use this in multiparam cv
Best model :  LogisticRegression(C=1.5, max_iter=200, solver='newton-cg')
Best hyper params :  {'C': 1.5, 'max_iter': 200, 'penalty': 'l2', 'solver': 'newton-cg'}
Best score :  0.8099000000000001"""

"Use this in multiparam cv\nBest model :  LogisticRegression(C=1.5, max_iter=200, solver='newton-cg')\nBest hyper params :  {'C': 1.5, 'max_iter': 200, 'penalty': 'l2', 'solver': 'newton-cg'}\nBest score :  0.8099000000000001"