In [1]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
import time
%matplotlib inline

In [2]:
df_raw = pd.read_csv('creditcard.csv')
print(df_raw['Class'].value_counts())

0    284315
1       492
Name: Class, dtype: int64


#### Class Imbalance

In [3]:
#downsample majority class
from sklearn.utils import resample

df_majority = df_raw[df_raw.Class==0]
df_minority = df_raw[df_raw.Class==1]
df_downsampled = resample(df_majority, replace=False, n_samples=492)
df = pd.concat([df_downsampled, df_minority])
print(df.Class.value_counts())

1    492
0    492
Name: Class, dtype: int64


In [4]:
from sklearn.model_selection import train_test_split
y = df['Class']
X = df.loc[:, ~df.columns.isin(['Class'])]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)
print(y_train.value_counts())
print(y_test.value_counts())

0    394
1    393
Name: Class, dtype: int64
1    99
0    98
Name: Class, dtype: int64


In [5]:
from sklearn import ensemble
from sklearn.model_selection import GridSearchCV

params = [{'loss':['deviance','exponential'],
           'learning_rate':[0.01, 0.1, 1],
           'n_estimators':[125, 250, 500],
           'max_depth':[2, 3, 4]}]

clf = ensemble.GradientBoostingClassifier()
grid = GridSearchCV(estimator=clf, param_grid=params)

start_time = time.clock()
grid.fit(X_train, y_train)
print('\nBest parameters:\n', grid.best_params_)
print('\nBest score:\n', grid.best_score_)
print('\nruntime:\n',time.clock() - start_time, 'seconds')


Best parameters:
 {'learning_rate': 0.1, 'loss': 'exponential', 'max_depth': 3, 'n_estimators': 125}

Best score:
 0.9479034307496823

runtime:
 49.169527 seconds


In [6]:
#train with best params
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score

clf2 = ensemble.GradientBoostingClassifier(loss='exponential',
                                           learning_rate=0.1,
                                           n_estimators=125,
                                           max_depth=3)
start_time = time.clock()
clf2.fit(X_train, y_train)
scores_clf2 = cross_val_score(clf2, X_train, y_train, cv=10)

#AUROC score
prob_y = clf2.predict_proba(X_train)
prob_y = [p[1] for p in prob_y]

print('score array:\n', scores_clf2)
print('\nscore array mean:\n', np.mean(scores_clf2))
print('\nAUROC score:\n', roc_auc_score(y_train, prob_y))
print('\nruntime:\n',time.clock() - start_time, 'seconds')

score array:
 [0.9        0.925      0.9625     0.96202532 0.93589744 0.98717949
 0.93589744 0.93589744 0.93589744 0.94871795]

score array mean:
 0.9429012495942877

AUROC score:
 1.0

runtime:
 2.619609000000004 seconds


In [7]:
#run on test set
scores_clf2_test = cross_val_score(clf2, X_test, y_test, cv=10)
prob_y_test = clf2.predict_proba(X_test)
prob_y_test = [p[1] for p in prob_y_test]

print('score array:\n', scores_clf2_test)
print('\nscore array mean:\n', np.mean(scores_clf2_test))
print('\nAUROC score:\n', roc_auc_score(y_test, prob_y_test))
print('\nruntime:\n',time.clock() - start_time, 'seconds')

score array:
 [0.9        0.95       0.9        0.85       0.8        0.95
 0.9        1.         0.89473684 0.94444444]

score array mean:
 0.9089181286549708

AUROC score:
 0.976602762317048

runtime:
 3.520240000000001 seconds


__Result:__ shows some overfitting, try applying subsample parameter

In [8]:
params = [{'loss':['exponential'],
           'learning_rate':[1],
           'n_estimators':[125],
           'max_depth':[3],
           'subsample':[0.5, 0.6, 0.7, 0.8, 0.9]}]

clf3 = ensemble.GradientBoostingClassifier()
grid = GridSearchCV(estimator=clf3, param_grid=params)

start_time = time.clock()
grid.fit(X_train, y_train)
print('\nBest parameters:\n', grid.best_params_)
print('\nBest score:\n', grid.best_score_)
print('\nruntime:\n',time.clock() - start_time, 'seconds')


Best parameters:
 {'learning_rate': 1, 'loss': 'exponential', 'max_depth': 3, 'n_estimators': 125, 'subsample': 0.8}

Best score:
 0.9428208386277002

runtime:
 1.914601999999995 seconds


In [9]:
clf3 = ensemble.GradientBoostingClassifier(loss='exponential',
                                           learning_rate=1,
                                           n_estimators=125,
                                           max_depth=3,
                                           subsample=0.8)
start_time = time.clock()
clf3.fit(X_train, y_train)
scores_clf3 = cross_val_score(clf3, X_train, y_train, cv=10)

#AUROC score
prob_y = clf3.predict_proba(X_train)
prob_y = [p[1] for p in prob_y]

print('score array:\n', scores_clf3)
print('\nscore array mean:\n', np.mean(scores_clf3))
print('\nAUROC score:\n', roc_auc_score(y_train, prob_y))
print('\nruntime:\n',time.clock() - start_time, 'seconds')

score array:
 [0.925      0.9125     0.9625     0.94936709 0.93589744 0.98717949
 0.92307692 0.91025641 0.93589744 0.96153846]

score array mean:
 0.9403213242453748

AUROC score:
 1.0

runtime:
 1.7049020000000041 seconds


In [10]:
#run on test set
scores_clf3_test = cross_val_score(clf3, X_test, y_test, cv=10)
prob_y_test = clf3.predict_proba(X_test)
prob_y_test = [p[1] for p in prob_y_test]

print('score array:\n', scores_clf3_test)
print('\nscore array mean:\n', np.mean(scores_clf3_test))
print('\nAUROC score:\n', roc_auc_score(y_test, prob_y_test))
print('\nruntime:\n',time.clock() - start_time, 'seconds')

score array:
 [0.9        0.95       0.9        0.85       0.85       0.9
 0.95       1.         0.89473684 0.94444444]

score array mean:
 0.9139181286549709

AUROC score:
 0.9651618223046794

runtime:
 2.174100000000003 seconds


__Result:__ Overfitting slightly less but still there

### Feature Selection

In [17]:
from sklearn.feature_selection import SelectKBest, f_classif
import warnings
warnings.filterwarnings('ignore')

k=10
kbest = SelectKBest(f_classif, k=k) #instantiate
kbest.fit(X_train, y_train) #fit 
mask = kbest.get_support()
k_features = []
for bool, feature in zip(mask, X_train.columns):
    if bool:
        k_features.append(feature)
print('{} kbest features:'.format(k))
print(k_features)

X_kbest = df[k_features]

X_ktrain, X_ktest, y_train, y_test = train_test_split(X_kbest, y, test_size=.2)
print(y_train.value_counts())
print(y_test.value_counts())

10 kbest features:
['Time', 'V4', 'V8', 'V14', 'V16', 'V21', 'V23', 'V25', 'V28', 'Amount']
1    394
0    393
Name: Class, dtype: int64
0    99
1    98
Name: Class, dtype: int64


In [18]:
clf4 = ensemble.GradientBoostingClassifier(loss='exponential',
                                           learning_rate=1,
                                           n_estimators=125,
                                           max_depth=3,
                                           subsample=0.8)
start_time = time.clock()
clf4.fit(X_ktrain, y_train)
scores_clf4 = cross_val_score(clf4, X_ktrain, y_train, cv=10)

#AUROC score
prob_y = clf4.predict_proba(X_ktrain)
prob_y = [p[1] for p in prob_y]

print('score array:\n', scores_clf4)
print('\nscore array mean:\n', np.mean(scores_clf4))
print('\nAUROC score:\n', roc_auc_score(y_train, prob_y))
print('\nruntime:\n',time.clock() - start_time, 'seconds')

score array:
 [0.95       0.9125     0.9375     0.89873418 0.93589744 0.88461538
 0.93589744 0.87179487 0.92307692 0.93589744]

score array mean:
 0.9185913664394677

AUROC score:
 1.0

runtime:
 1.1496199999999988 seconds


In [19]:
scores_clf4_test = cross_val_score(clf4, X_ktest, y_test, cv=10)
prob_y_test = clf4.predict_proba(X_ktest)
prob_y_test = [p[1] for p in prob_y_test]

print('score array:\n', scores_clf4_test)
print('\nscore array mean:\n', np.mean(scores_clf4_test))
print('\nAUROC score:\n', roc_auc_score(y_test, prob_y_test))
print('\nruntime:\n',time.clock() - start_time, 'seconds')

score array:
 [1.         1.         0.9        0.95       0.9        0.95
 1.         0.85       0.94736842 0.94444444]

score array mean:
 0.9441812865497077

AUROC score:
 0.9820655534941248

runtime:
 1.549309000000008 seconds
