In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

pd.options.display.float_format = '{:,.2f}'.format

In [2]:
import os
import urllib.request

data_url = 'https://raw.githubusercontent.com/elephantscale/datasets/master/credit-card-default/default2.csv'
data_location = os.path.basename(data_url)

if not os.path.exists (data_location):
    print("Downloading : ", data_url)
    urllib.request.urlretrieve(data_url, data_location)
print('data_location:', data_location)

Downloading :  https://raw.githubusercontent.com/elephantscale/datasets/master/credit-card-default/default2.csv
data_location: default2.csv


In [3]:
data = pd.read_csv(data_location)
data.sample(10)

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default
16355,16356,200000,2,2,1,48,0,0,-2,-1,...,902,0,0,0,490,902,0,0,0,0
13778,13779,30000,2,2,2,25,2,2,2,2,...,25714,26353,26868,1700,1100,1000,1200,1100,1200,1
10135,10136,40000,2,1,2,24,-1,2,-1,-1,...,780,390,150,0,390,780,0,150,780,1
16274,16275,200000,2,1,2,31,-1,2,-1,2,...,390,390,390,0,2253,0,390,390,390,0
8023,8024,360000,1,5,1,31,0,0,0,0,...,83359,85171,87046,7500,3500,2500,2500,2600,3000,0
1465,1466,90000,1,2,1,27,0,0,0,0,...,59949,54029,48170,2580,2337,1634,1606,1371,620,0
17387,17388,140000,2,2,1,40,0,0,0,0,...,99004,88485,85218,5000,5000,5000,3000,3000,4000,0
28008,28009,240000,2,1,3,29,1,-2,-2,-2,...,0,0,0,0,0,0,0,0,0,0
6166,6167,30000,2,2,1,27,0,0,2,2,...,12718,12829,13083,3000,901,0,465,467,485,1
1560,1561,50000,2,2,1,48,0,0,0,0,...,14555,14860,15155,2526,1237,519,538,541,560,0


In [4]:
## Check data skew
data['default'].value_counts()
data['default'].value_counts(normalize=True)

0   0.78
1   0.22
Name: default, dtype: float64

In [5]:
label_col = 'default'

feature_columns = data.columns
## TODO : drop 'ID' and 'default' columns
feature_columns = feature_columns.drop (['ID', 'default'])
#print (feature_columns)
X = data[feature_columns].values
y = data[[label_col]].values

print (X.shape)
print (y.shape)

(30000, 23)
(30000, 1)


In [6]:
%%time 

from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression

cv_results = cross_validate(estimator=LogisticRegression(max_iter=500), 
                            X=X, 
                            y=y.ravel(), 
                            cv=5, 
                            scoring=['accuracy'],
                            return_train_score=True,
                            return_estimator=True,
                            n_jobs= -1)

CPU times: user 111 ms, sys: 85.1 ms, total: 196 ms
Wall time: 2.78 s


In [7]:
cv_results

{'fit_time': array([0.8729794 , 0.89039063, 0.79051065, 0.71387124, 0.55618525]),
 'score_time': array([0.00234723, 0.00237131, 0.00283766, 0.00249815, 0.00141811]),
 'estimator': [LogisticRegression(max_iter=500),
  LogisticRegression(max_iter=500),
  LogisticRegression(max_iter=500),
  LogisticRegression(max_iter=500),
  LogisticRegression(max_iter=500)],
 'test_accuracy': array([0.7785    , 0.77883333, 0.77883333, 0.77883333, 0.77883333]),
 'train_accuracy': array([0.77883333, 0.77870833, 0.77879167, 0.77870833, 0.778625  ])}

In [8]:
from numpy import mean

print ("average accuracy : ", mean(cv_results['test_accuracy']))

average accuracy :  0.7787666666666667


In [9]:
from sklearn.tree import DecisionTreeClassifier
cv_resultsDT = cross_validate(estimator=DecisionTreeClassifier(), 
                            X=X, 
                            y=y.ravel(), 
                            cv=5, 
                            scoring=['accuracy'],
                            return_train_score=True,
                            return_estimator=True,
                            n_jobs= -1)

In [10]:
cv_resultsDT

{'fit_time': array([0.85481644, 0.85362506, 0.83097363, 0.83690834, 0.57010913]),
 'score_time': array([0.00337887, 0.0037477 , 0.00360537, 0.0039432 , 0.00272083]),
 'estimator': [DecisionTreeClassifier(),
  DecisionTreeClassifier(),
  DecisionTreeClassifier(),
  DecisionTreeClassifier(),
  DecisionTreeClassifier()],
 'test_accuracy': array([0.71583333, 0.716     , 0.72233333, 0.7385    , 0.73433333]),
 'train_accuracy': array([0.999375  , 0.99958333, 0.99925   , 0.999375  , 0.999375  ])}

In [11]:
from numpy import mean

print ("average accuracy : ", mean(cv_resultsDT['test_accuracy']))

average accuracy :  0.7254


In [12]:
from sklearn.ensemble import RandomForestClassifier
cv_resultsRF = cross_validate(estimator=RandomForestClassifier(), 
                            X=X, 
                            y=y.ravel(), 
                            cv=5, 
                            scoring=['accuracy'],
                            return_train_score=True,
                            return_estimator=True,
                            n_jobs= -1)
cv_resultsRF

{'fit_time': array([10.4940536 , 10.44581676, 10.59675598, 10.68567467,  6.99031281]),
 'score_time': array([0.32402968, 0.24787521, 0.25932837, 0.20170116, 0.15892506]),
 'estimator': [RandomForestClassifier(),
  RandomForestClassifier(),
  RandomForestClassifier(),
  RandomForestClassifier(),
  RandomForestClassifier()],
 'test_accuracy': array([0.80433333, 0.80666667, 0.81933333, 0.828     , 0.82133333]),
 'train_accuracy': array([0.999375  , 0.99958333, 0.99925   , 0.99925   , 0.999375  ])}

In [13]:
from numpy import mean

print ("average accuracy : ", mean(cv_resultsRF['test_accuracy']))

average accuracy :  0.8159333333333333


In [14]:
%%time

import xgboost as xgb

cv_resultsXG = cross_validate(estimator=xgb.XGBClassifier(), 
                            X=X, 
                            y=y.ravel(), 
                            cv=5, 
                            scoring=['accuracy'],
                            return_train_score=True,
                            return_estimator=True,
                            n_jobs= -1)
cv_resultsXG

CPU times: user 416 ms, sys: 27.5 ms, total: 443 ms
Wall time: 18.3 s


{'fit_time': array([10.66167808, 10.59903932, 10.6652782 , 10.50067067,  6.1174593 ]),
 'score_time': array([0.05017638, 0.02904654, 0.04325628, 0.02923751, 0.01873016]),
 'estimator': [XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
                colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
                early_stopping_rounds=None, enable_categorical=False,
                eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
                importance_type=None, interaction_constraints='',
                learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
                max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
                missing=nan, monotone_constraints='()', n_estimators=100,
                n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
                reg_alpha=0, reg_lambda=1, ...),
  XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
                colsample_byle

In [15]:
from numpy import mean

print ("average accuracy : ", mean(cv_resultsXG['test_accuracy']))

average accuracy :  0.8141666666666667


In [16]:
from catboost import CatBoostClassifier

clf = CatBoostClassifier(
    iterations=5, 
    learning_rate=0.1, 
    #loss_function='CrossEntropy'
)

In [17]:
%%time

cv_resultsCB = cross_validate(estimator=clf, 
                            X=X, 
                            y=y.ravel(), 
                            cv=5, 
                            scoring=['accuracy'],
                            return_train_score=True,
                            return_estimator=True,
                            n_jobs= -1)
cv_resultsCB

CPU times: user 24.7 ms, sys: 5.42 ms, total: 30.1 ms
Wall time: 2.58 s


{'fit_time': array([0.84116745, 0.75206041, 0.69936132, 0.83662939, 0.35738826]),
 'score_time': array([0.14091372, 0.12333608, 0.12163544, 0.11134362, 0.05572724]),
 'estimator': [<catboost.core.CatBoostClassifier at 0x7f0643566f50>,
  <catboost.core.CatBoostClassifier at 0x7f0643566410>,
  <catboost.core.CatBoostClassifier at 0x7f0643566590>,
  <catboost.core.CatBoostClassifier at 0x7f0643566610>,
  <catboost.core.CatBoostClassifier at 0x7f0643566e90>],
 'test_accuracy': array([0.80683333, 0.80966667, 0.82433333, 0.82966667, 0.825     ]),
 'train_accuracy': array([0.823125  , 0.82266667, 0.82045833, 0.818875  , 0.81891667])}

In [18]:
from numpy import mean

print ("average accuracy : ", mean(cv_resultsCB['test_accuracy']))

average accuracy :  0.8190999999999999
