In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

pd.options.display.float_format = '{:,.2f}'.format

In [2]:
import os
import urllib.request

data_url = 'https://raw.githubusercontent.com/elephantscale/datasets/master/credit-card-default/default2.csv'
data_location = os.path.basename(data_url)

if not os.path.exists (data_location):
    print("Downloading : ", data_url)
    urllib.request.urlretrieve(data_url, data_location)
print('data_location:', data_location)
data = pd.read_csv(data_location)
data.sample(10)

Downloading :  https://raw.githubusercontent.com/elephantscale/datasets/master/credit-card-default/default2.csv
data_location: default2.csv


Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default
10975,10976,200000,2,2,2,24,0,0,0,0,...,29486,27166,25954,2649,3500,4008,1098,1500,1000,0
20149,20150,140000,2,2,1,37,0,0,0,0,...,62925,64280,67079,3000,3000,3000,3000,4000,4000,0
29363,29364,390000,1,2,2,29,-1,-1,-1,0,...,7151,-200,-200,6931,5985,7000,0,0,0,0
4995,4996,20000,2,2,1,36,0,0,0,0,...,19224,19050,19353,2000,1283,1003,1000,1002,1005,0
23911,23912,60000,1,1,2,23,0,0,0,0,...,19443,16597,18423,20109,5000,2000,1000,2000,1000,0
6066,6067,280000,2,1,1,41,2,-1,-1,-1,...,0,0,244,6859,486,0,0,244,0,0
7994,7995,100000,2,2,2,23,-1,-1,-1,-1,...,620,0,627,656,687,620,0,627,3620,1
7035,7036,240000,2,1,1,37,-1,-1,-2,-2,...,0,0,0,0,0,0,0,0,0,0
6344,6345,190000,2,2,2,27,2,2,2,0,...,7104,8973,10809,1000,1000,1000,2000,2000,2000,1
3580,3581,300000,1,1,1,54,-1,-1,-1,-1,...,718,468,366,555,0,718,468,366,0,1


In [3]:
## Check data skew
data['default'].value_counts()
data['default'].value_counts(normalize=True)

0   0.78
1   0.22
Name: default, dtype: float64

In [4]:
label_col = 'default'

feature_columns = data.columns
## TODO : drop 'ID' and 'default' columns
feature_columns = feature_columns.drop (['ID', 'default'])
#print (feature_columns)
X = data[feature_columns].values
y = data[[label_col]].values

print (X.shape)
print (y.shape)

(30000, 23)
(30000, 1)


In [5]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import ParameterGrid

algo = LogisticRegression(max_iter=500)

# find out parameters
print(algo.get_params())

# build a param-grid
param_grid =  {
                    'penalty' : ['l1', 'l2'],
                    'C' : [0.5, 1.0, 1.5],
                    'solver' : ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'max_iter': [50,100,200,500,1000]
              }

{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 500, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}


In [6]:
%%time 

from sklearn.model_selection import GridSearchCV

grid_search = GridSearchCV(estimator=algo, 
                             param_grid=param_grid, 
                             cv = 5, 
                             scoring='accuracy',
                             return_train_score=True,
                             n_jobs = -1)

grid_search_results = grid_search.fit (X,y.ravel())

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

CPU times: user 23.9 s, sys: 11.1 s, total: 35 s
Wall time: 10min 10s




In [7]:
# get best parameters

best_model = grid_search_results.best_estimator_
print ("Best model : ", best_model)
print ("Best hyper params : ", grid_search_results.best_params_)
print ("Best score : ", grid_search_results.best_score_)
# internal details

grid_search_results.cv_results_

Best model :  LogisticRegression(C=1.5, max_iter=200, solver='newton-cg')
Best hyper params :  {'C': 1.5, 'max_iter': 200, 'penalty': 'l2', 'solver': 'newton-cg'}
Best score :  0.8099000000000001


{'mean_fit_time': array([7.09075928e-03, 3.88245583e-03, 9.40529919e-01, 4.50167656e-03,
        1.34290347e+00, 1.70719290e+00, 2.79060984e-01, 1.47844729e+00,
        9.00476551e-01, 9.44703579e-01, 4.27913666e-03, 4.18558121e-03,
        7.90745163e-01, 4.34117317e-03, 2.60201554e+00, 3.72612367e+00,
        5.10265112e-01, 1.43778167e+00, 1.68437481e+00, 1.86926436e+00,
        4.59346771e-03, 4.38675880e-03, 6.85246325e-01, 4.20131683e-03,
        5.54244952e+00, 9.62347312e+00, 9.41152716e-01, 1.44456878e+00,
        3.45708251e+00, 3.66763372e+00, 4.22406197e-03, 4.23936844e-03,
        7.87655926e-01, 4.50606346e-03, 1.30153435e+01, 8.80601315e+00,
        9.69140863e-01, 1.52746410e+00, 8.50322332e+00, 9.22402768e+00,
        4.22506332e-03, 4.22196388e-03, 6.53403425e-01, 4.13312912e-03,
        2.54810806e+01, 8.13117037e+00, 8.70671415e-01, 1.33268623e+00,
        1.63237795e+01, 1.84236846e+01, 4.20093536e-03, 4.13374901e-03,
        5.82593822e-01, 4.20169830e-03, 1.32896