In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

pd.options.display.float_format = '{:,.2f}'.format

In [2]:
import os
import urllib.request

data_url = 'https://raw.githubusercontent.com/elephantscale/datasets/master/credit-card-default/default2.csv'
data_location = os.path.basename(data_url)

if not os.path.exists (data_location):
    print("Downloading : ", data_url)
    urllib.request.urlretrieve(data_url, data_location)
print('data_location:', data_location)
data = pd.read_csv(data_location)
data.sample(10)

Downloading :  https://raw.githubusercontent.com/elephantscale/datasets/master/credit-card-default/default2.csv
data_location: default2.csv


Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default
20662,20663,130000,1,1,1,42,0,0,0,0,...,19960,81985,80189,1288,1700,2000,63000,2885,3000,0
23810,23811,240000,2,2,1,50,-2,-2,-2,-2,...,12208,790,2009,1086,7041,12208,1109,2009,1077,0
20282,20283,50000,2,1,1,50,2,2,2,2,...,41316,41998,42674,1960,2000,1700,1662,1493,0,1
26683,26684,30000,1,1,2,24,0,0,0,2,...,30248,30354,29088,1800,3300,0,1500,1500,1300,0
20659,20660,330000,1,2,1,32,2,2,2,2,...,64877,66398,72113,6000,2400,0,2600,8758,2700,0
16582,16583,180000,1,1,2,26,0,0,-2,-2,...,0,0,0,0,0,0,0,0,0,1
16218,16219,120000,2,1,2,45,0,0,0,0,...,89737,95451,97379,3079,3185,3251,7200,3600,3900,0
24433,24434,280000,1,1,2,47,-2,-2,-2,-2,...,1000,5507,5348,251,4250,1000,5507,5348,433,0
16797,16798,110000,1,2,2,32,0,0,0,0,...,20799,-1961,-1961,1575,1377,1007,0,0,66296,0
19790,19791,520000,2,1,2,35,0,0,0,0,...,55758,57911,54152,5000,7000,5000,5004,5000,5000,0


In [3]:
## Check data skew
data['default'].value_counts()
data['default'].value_counts(normalize=True)

0   0.78
1   0.22
Name: default, dtype: float64

In [4]:
label_col = 'default'

feature_columns = data.columns
## TODO : drop 'ID' and 'default' columns
feature_columns = feature_columns.drop (['ID', 'default'])
#print (feature_columns)
X = data[feature_columns].values
y = data[[label_col]].values

print (X.shape)
print (y.shape)

(30000, 23)
(30000, 1)


In [5]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import ParameterGrid

algo = LogisticRegression(max_iter=500)

# find out parameters
print(algo.get_params())

# build a param-grid
param_grid =  {
                    'penalty' : ['l1', 'l2'],
                    'C' : [0.5, 1.0, 1.5],
                    'solver' : ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'max_iter': [50,100,200,500,1000]
              }

{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 500, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}


In [6]:
%%time 

from sklearn.model_selection import GridSearchCV

grid_search = GridSearchCV(estimator=algo, 
                             param_grid=param_grid, 
                             cv = 5, 
                             scoring='accuracy',
                             return_train_score=True,
                             n_jobs = -1)

grid_search_results = grid_search.fit (X,y.ravel())

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

CPU times: user 12.2 s, sys: 9.2 s, total: 21.4 s
Wall time: 8min 27s




In [7]:
# get best parameters

best_model = grid_search_results.best_estimator_
print ("Best model : ", best_model)
print ("Best hyper params : ", grid_search_results.best_params_)
print ("Best score : ", grid_search_results.best_score_)
# internal details

grid_search_results.cv_results_

Best model :  LogisticRegression(C=1.5, max_iter=200, solver='newton-cg')
Best hyper params :  {'C': 1.5, 'max_iter': 200, 'penalty': 'l2', 'solver': 'newton-cg'}
Best score :  0.8099000000000001


{'mean_fit_time': array([4.85262871e-03, 3.08933258e-03, 8.75681305e-01, 3.46860886e-03,
        9.19174576e-01, 1.26949139e+00, 1.90827084e-01, 1.41793032e+00,
        8.45028591e-01, 9.18437481e-01, 2.35466957e-03, 2.35228539e-03,
        8.98895454e-01, 3.91840935e-03, 1.92917495e+00, 2.63388796e+00,
        4.45323229e-01, 1.44725752e+00, 1.76248140e+00, 1.67835941e+00,
        2.17690468e-03, 2.11772919e-03, 8.93573236e-01, 3.46999168e-03,
        3.69845719e+00, 6.25170689e+00, 9.79533100e-01, 1.51927929e+00,
        3.20249057e+00, 3.29479799e+00, 3.21512222e-03, 3.28226089e-03,
        8.28665590e-01, 2.53181458e-03, 9.18815928e+00, 6.76446667e+00,
        1.02883759e+00, 1.47630630e+00, 7.72828336e+00, 8.54779615e+00,
        3.20858955e-03, 3.19857597e-03, 8.45303249e-01, 3.21507454e-03,
        1.82074837e+01, 5.77608562e+00, 9.28364038e-01, 1.43617120e+00,
        1.57970027e+01, 1.67887982e+01, 3.16720009e-03, 3.20844650e-03,
        8.42020702e-01, 3.20072174e-03, 9.15878

In [8]:
"""Use this in multiparam cv
Best model :  LogisticRegression(C=1.5, max_iter=200, solver='newton-cg')
Best hyper params :  {'C': 1.5, 'max_iter': 200, 'penalty': 'l2', 'solver': 'newton-cg'}
Best score :  0.8099000000000001"""

"Use this in multiparam cv\nBest model :  LogisticRegression(C=1.5, max_iter=200, solver='newton-cg')\nBest hyper params :  {'C': 1.5, 'max_iter': 200, 'penalty': 'l2', 'solver': 'newton-cg'}\nBest score :  0.8099000000000001"