In [1]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
import time
%matplotlib inline

In [2]:
df_raw = pd.read_csv('creditcard.csv')
print(df_raw['Class'].value_counts())

0    284315
1       492
Name: Class, dtype: int64


#### Class Imbalance

In [3]:
#downsample majority class
from sklearn.utils import resample

df_majority = df_raw[df_raw.Class==0]
df_minority = df_raw[df_raw.Class==1]
df_downsampled = resample(df_majority, replace=False, n_samples=492)
df = pd.concat([df_downsampled, df_minority])
print(df.Class.value_counts())

1    492
0    492
Name: Class, dtype: int64


In [7]:
from sklearn.model_selection import train_test_split
y = df['Class']
X = df.loc[:, ~df.columns.isin(['Class'])]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)
print(y_train.value_counts())
print(y_test.value_counts())

1    395
0    392
Name: Class, dtype: int64
0    100
1     97
Name: Class, dtype: int64


In [8]:
from sklearn import ensemble
from sklearn.model_selection import GridSearchCV

params = [{'loss':['deviance','exponential'],
           'learning_rate':[0.01, 0.1, 1],
           'n_estimators':[125, 250, 500],
           'max_depth':[2, 3, 4]}]

clf = ensemble.GradientBoostingClassifier()
grid = GridSearchCV(estimator=clf, param_grid=params)

start_time = time.clock()
grid.fit(X_train, y_train)
print('\nBest parameters:\n', grid.best_params_)
print('\nBest score:\n', grid.best_score_)
print('\nruntime:\n',time.clock() - start_time, 'seconds')


Best parameters:
 {'learning_rate': 1, 'loss': 'exponential', 'max_depth': 3, 'n_estimators': 125}

Best score:
 0.9504447268106735

runtime:
 50.990227999999995 seconds


In [11]:
#train with best params
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score

clf2 = ensemble.GradientBoostingClassifier(loss='exponential',
                                           learning_rate=0.1,
                                           n_estimators=125,
                                           max_depth=3)
start_time = time.clock()
clf2.fit(X_train, y_train)
scores_clf2 = cross_val_score(clf2, X_train, y_train, cv=5)

#AUROC score
prob_y = clf2.predict_proba(X_train)
prob_y = [p[1] for p in prob_y]

print('score array:\n', scores_clf2)
print('\nscore array mean:\n', np.mean(scores_clf2))
print('\nAUROC score:\n', roc_auc_score(y_train, prob_y))
print('\nruntime:\n',time.clock() - start_time, 'seconds')

score array:
 [0.9556962  0.94303797 0.95541401 0.92356688 0.94904459]

score array mean:
 0.9453519309844391

AUROC score:
 1.0

runtime:
 1.3219640000000012 seconds


In [13]:
#run on test set
scores_clf2_test = cross_val_score(clf2, X_test, y_test, cv=5)
prob_y_test = clf2.predict_proba(X_test)
prob_y_test = [p[1] for p in prob_y_test]

print('score array:\n', scores_clf2_test)
print('\nscore array mean:\n', np.mean(scores_clf2_test))
print('\nAUROC score:\n', roc_auc_score(y_test, prob_y_test))
print('\nruntime:\n',time.clock() - start_time, 'seconds')

score array:
 [0.925      0.85       0.97435897 0.92307692 0.8974359 ]

score array mean:
 0.913974358974359

AUROC score:
 0.9702061855670102

runtime:
 2.194814000000001 seconds
