In [1]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
import time
%matplotlib inline

In [2]:
df_raw = pd.read_csv('creditcard.csv')
print(df_raw['Class'].value_counts())

0    284315
1       492
Name: Class, dtype: int64


#### Class Imbalance

In [19]:
#downsample majority class
from sklearn.utils import resample

df_majority = df_raw[df_raw.Class==0]
df_minority = df_raw[df_raw.Class==1]
df_downsampled = resample(df_majority, replace=False, n_samples=492)
df = pd.concat([df_downsampled, df_minority])
print(df.Class.value_counts())

1    492
0    492
Name: Class, dtype: int64


In [17]:
from sklearn import ensemble
from sklearn.model_selection import GridSearchCV

y = df['Class']
X = df.loc[:, ~df.columns.isin(['Class'])]

params = [{'loss':['deviance','exponential'],
           'learning_rate':[0.01, 0.1, 1, 10],
           'n_estimators':[250, 500, 750, 1000],
           'max_depth':[2, 3, 4]}]

clf = ensemble.GradientBoostingClassifier()
grid = GridSearchCV(estimator=clf, param_grid=params)

start_time = time.clock()
grid.fit(X, y)
print('\nBest parameters:\n', grid.best_params_)
print('\nBest score:\n', grid.best_score_)
print('\nruntime:\n',time.clock() - start_time, 'seconds')


Best parameters:
 {'learning_rate': 0.01, 'loss': 'exponential', 'max_depth': 3, 'n_estimators': 250}

Best score:
 0.9390243902439024

runtime:
 218.40638800000002 seconds


In [18]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score

clf2 = ensemble.GradientBoostingClassifier(loss='exponential',
                                           learning_rate=0.01,
                                           n_estimators=250,
                                           max_depth=3)
start_time = time.clock()
clf2.fit(X, y)
scores_clf2 = cross_val_score(clf2, X, y, cv=5)

#AUROC score
prob_y = clf2.predict_proba(X)
prob_y = [p[1] for p in prob_y]

print('score array:\n', scores_clf2)
print('\nscore array mean:\n', np.mean(scores_clf2))
print('\nAUROC score:\n', roc_auc_score(y, prob_y))
print('\nruntime:\n',time.clock() - start_time, 'seconds')

score array:
 [0.95959596 0.93939394 0.92346939 0.95408163 0.93877551]

score array mean:
 0.9430632859204288

AUROC score:
 0.9976328574261353

runtime:
 3.1496830000000102 seconds
