# Cross Validation 1 

Using CV to test an algorithm

We will use credit card default data from https://www.kaggle.com/datasets/uciml/default-of-credit-card-clients-dataset


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

pd.options.display.float_format = '{:,.2f}'.format

## Step-1: Download Data

In [3]:
import os
import urllib.request

data_url = 'https://raw.githubusercontent.com/elephantscale/datasets/master/credit-card-default/default.csv'
data_location = os.path.basename(data_url)

if not os.path.exists (data_location):
    print("Downloading : ", data_url)
    urllib.request.urlretrieve(data_url, data_location)
print('data_location:', data_location)

Downloading :  https://raw.githubusercontent.com/elephantscale/datasets/master/credit-card-default/default.csv
data_location: default.csv


In [4]:
data = pd.read_csv(data_location)
data.sample(10)

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default
13647,13648,120000,2,1,2,30,-1,2,-1,-1,...,0,165,0,0,165,0,165,0,0,1
14327,14328,20000,1,2,2,26,1,2,2,2,...,14952,15983,15576,1600,535,0,1272,0,1463,0
6518,6519,20000,1,3,2,22,0,0,0,0,...,16992,19025,35744,5109,1600,2400,3800,739,2000,1
1726,1727,50000,2,3,1,54,1,2,0,0,...,53936,51740,49043,0,2031,1918,1993,1614,1789,0
29014,29015,350000,2,1,2,32,-2,-1,-1,-1,...,10978,9326,13255,18970,5594,11024,9317,13321,4019,0
9492,9493,50000,1,3,1,38,1,5,4,3,...,47917,47020,48027,0,0,0,0,1943,1727,1
19204,19205,20000,2,2,2,24,0,0,0,2,...,19928,18711,19408,1500,3000,0,1000,1000,1500,0
21127,21128,500000,1,1,1,41,-1,-1,2,2,...,0,3280,2500,780,0,0,3280,0,0,0
5223,5224,90000,2,2,2,48,2,2,2,0,...,38041,42605,41396,9100,0,1170,5000,18000,5000,1
16101,16102,570000,2,1,2,32,0,0,0,0,...,36502,37669,39102,3000,4880,3000,3000,3000,3000,0


## Step-2: EDA

In [5]:
## Check data skew
data['default'].value_counts()

0    23364
1     6636
Name: default, dtype: int64

In [6]:
data['default'].value_counts(normalize=True)

0   0.78
1   0.22
Name: default, dtype: float64

## Step-3: Clean up 

TODO

## Step-4: Shape data

In [8]:
label_col = 'default'

feature_columns = data.columns
## TODO : drop 'ID' and 'default' columns
feature_columns = feature_columns.drop (['ID', 'default'])
print (feature_columns)

Index(['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0', 'PAY_2',
       'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
       'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6'],
      dtype='object')


In [9]:
X = data[feature_columns]
y = data[[label_col]]

print (X.shape)
print (y.shape)

(30000, 23)
(30000, 1)


## Step-5: Cross Validation

In [22]:
%%time 

from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

#algo = LogisticRegression(max_iter=500)
algo = KNeighborsClassifier()
#algo =  SVC()
#algo = RandomForestClassifier()

cv_results = cross_validate(estimator= algo, 
                            X=X, 
                            y=y, 
                            cv=5, 
                            scoring=['accuracy', 'precision', 'recall', 'f1', 'roc_auc'],
                            return_train_score=True,
                            return_estimator=True,
                            n_jobs= -1)

CPU times: user 449 ms, sys: 161 ms, total: 610 ms
Wall time: 2min 22s


In [23]:
cv_results

{'fit_time': array([0.02579618, 0.02352738, 0.01707411, 0.01445127, 0.02014852]),
 'score_time': array([13.8977046 , 11.58499575, 10.80774713, 10.82382011,  8.0532403 ]),
 'estimator': [KNeighborsClassifier(),
  KNeighborsClassifier(),
  KNeighborsClassifier(),
  KNeighborsClassifier(),
  KNeighborsClassifier()],
 'test_accuracy': array([0.7495    , 0.74966667, 0.752     , 0.76133333, 0.763     ]),
 'train_accuracy': array([0.82083333, 0.81725   , 0.817625  , 0.814     , 0.815375  ]),
 'test_precision': array([0.36349454, 0.35864297, 0.38613861, 0.41264559, 0.41410488]),
 'train_precision': array([0.68488628, 0.67500948, 0.68160561, 0.66306445, 0.66910632]),
 'test_recall': array([0.17545181, 0.16729465, 0.2057272 , 0.18688772, 0.17256971]),
 'train_recall': array([0.35173323, 0.33527971, 0.32944057, 0.32360143, 0.32718026]),
 'test_f1': array([0.23666836, 0.22816033, 0.26843658, 0.25726141, 0.24361702]),
 'train_f1': array([0.46477471, 0.44802416, 0.44419048, 0.43493671, 0.43946869]),

In [24]:
from numpy import mean

print ("accuracy range = ",  min(cv_results['test_accuracy']),  " to ", max(cv_results['test_accuracy']))
print ("average accuracy : ", mean(cv_results['test_accuracy']))

accuracy range =  0.7495  to  0.763
average accuracy :  0.7551
