In [1]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
import time
%matplotlib inline

In [2]:
df_raw = pd.read_csv('creditcard.csv')
print(df_raw['Class'].value_counts())

0    284315
1       492
Name: Class, dtype: int64


#### Class Imbalance

In [3]:
#downsample majority class
from sklearn.utils import resample

df_majority = df_raw[df_raw.Class==0]
df_minority = df_raw[df_raw.Class==1]
df_downsampled = resample(df_majority, replace=False, n_samples=492)
df = pd.concat([df_downsampled, df_minority])
print(df.Class.value_counts())

1    492
0    492
Name: Class, dtype: int64


In [4]:
from sklearn.model_selection import train_test_split
y = df['Class']
X = df.loc[:, ~df.columns.isin(['Class'])]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)
print(y_train.value_counts())
print(y_test.value_counts())

1    397
0    390
Name: Class, dtype: int64
0    102
1     95
Name: Class, dtype: int64


### Gradient Boosting Classifier

In [5]:
from sklearn import ensemble
from sklearn.model_selection import GridSearchCV

params = [{'loss':['deviance','exponential'],
           'learning_rate':[0.01, 0.1, 1],
           'n_estimators':[125, 250, 500],
           'max_depth':[2, 3, 4]}]

clf = ensemble.GradientBoostingClassifier()
grid = GridSearchCV(estimator=clf, param_grid=params)

start_time = time.clock()
grid.fit(X_train, y_train)
print('\nBest parameters:\n', grid.best_params_)
print('\nBest score:\n', grid.best_score_)
print('\nruntime:\n',time.clock() - start_time, 'seconds')


Best parameters:
 {'learning_rate': 1, 'loss': 'exponential', 'max_depth': 3, 'n_estimators': 250}

Best score:
 0.951715374841169

runtime:
 48.602552 seconds


In [6]:
#train with best params
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score

clf2 = ensemble.GradientBoostingClassifier(loss='exponential',
                                           learning_rate=0.1,
                                           n_estimators=125,
                                           max_depth=3)
start_time = time.clock()
clf2.fit(X_train, y_train)
scores_clf2 = cross_val_score(clf2, X_train, y_train, cv=10)

#AUROC score
prob_y = clf2.predict_proba(X_train)
prob_y = [p[1] for p in prob_y]

print('score array:\n', scores_clf2)
print('\nscore array mean:\n', np.mean(scores_clf2))
print('\nAUROC score:\n', roc_auc_score(y_train, prob_y))
print('\nruntime:\n',time.clock() - start_time, 'seconds')

score array:
 [0.94936709 0.92405063 0.94936709 0.91139241 0.96202532 0.98734177
 0.92405063 0.96153846 0.97435897 0.93589744]

score array mean:
 0.9479389808503733

AUROC score:
 1.0

runtime:
 2.496974999999999 seconds


In [7]:
#run on test set
scores_clf2_test = cross_val_score(clf2, X_test, y_test, cv=10)
prob_y_test = clf2.predict_proba(X_test)
prob_y_test = [p[1] for p in prob_y_test]

print('score array:\n', scores_clf2_test)
print('\nscore array mean:\n', np.mean(scores_clf2_test))
print('\nAUROC score:\n', roc_auc_score(y_test, prob_y_test))
print('\nruntime:\n',time.clock() - start_time, 'seconds')

score array:
 [0.95238095 0.85714286 0.85       0.95       0.95       0.89473684
 0.89473684 0.84210526 1.         0.89473684]

score array mean:
 0.9085839598997494

AUROC score:
 0.9720330237358101

runtime:
 3.435350999999997 seconds


__Result:__ shows some overfitting, try applying subsample parameter

In [8]:
params = [{'loss':['exponential'],
           'learning_rate':[1],
           'n_estimators':[125],
           'max_depth':[3],
           'subsample':[0.5, 0.6, 0.7, 0.8, 0.9]}]

clf3 = ensemble.GradientBoostingClassifier()
grid = GridSearchCV(estimator=clf3, param_grid=params)

start_time = time.clock()
grid.fit(X_train, y_train)
print('\nBest parameters:\n', grid.best_params_)
print('\nBest score:\n', grid.best_score_)
print('\nruntime:\n',time.clock() - start_time, 'seconds')


Best parameters:
 {'learning_rate': 1, 'loss': 'exponential', 'max_depth': 3, 'n_estimators': 125, 'subsample': 0.5}

Best score:
 0.9428208386277002

runtime:
 1.9390590000000003 seconds


In [9]:
clf3 = ensemble.GradientBoostingClassifier(loss='exponential',
                                           learning_rate=1,
                                           n_estimators=125,
                                           max_depth=3,
                                           subsample=0.8)
start_time = time.clock()
clf3.fit(X_train, y_train)
scores_clf3 = cross_val_score(clf3, X_train, y_train, cv=10)

#AUROC score
prob_y = clf3.predict_proba(X_train)
prob_y = [p[1] for p in prob_y]

print('score array:\n', scores_clf3)
print('\nscore array mean:\n', np.mean(scores_clf3))
print('\nAUROC score:\n', roc_auc_score(y_train, prob_y))
print('\nruntime:\n',time.clock() - start_time, 'seconds')

score array:
 [0.94936709 0.92405063 0.94936709 0.89873418 0.94936709 0.96202532
 0.93670886 0.96153846 0.98717949 0.91025641]

score array mean:
 0.9428594612138917

AUROC score:
 1.0

runtime:
 1.7450700000000055 seconds


In [10]:
#run on test set
scores_clf3_test = cross_val_score(clf3, X_test, y_test, cv=10)
prob_y_test = clf3.predict_proba(X_test)
prob_y_test = [p[1] for p in prob_y_test]

print('score array:\n', scores_clf3_test)
print('\nscore array mean:\n', np.mean(scores_clf3_test))
print('\nAUROC score:\n', roc_auc_score(y_test, prob_y_test))
print('\nruntime:\n',time.clock() - start_time, 'seconds')

score array:
 [0.9047619  0.9047619  0.9        0.95       0.95       0.89473684
 0.94736842 0.84210526 1.         0.94736842]

score array mean:
 0.924110275689223

AUROC score:
 0.9699690402476779

runtime:
 2.2192819999999998 seconds


__Result:__ Overfitting slightly less but still there

### Feature Selection

In [11]:
from sklearn.feature_selection import SelectKBest, f_classif
import warnings
warnings.filterwarnings('ignore')

k=10
kbest = SelectKBest(f_classif, k=k) #instantiate
kbest.fit(X_train, y_train) #fit 
mask = kbest.get_support()
k_features = []
for bool, feature in zip(mask, X_train.columns):
    if bool:
        k_features.append(feature)
print('{} kbest features:'.format(k))
print(k_features)

X_kbest = df[k_features]

X_ktrain, X_ktest, y_train, y_test = train_test_split(X_kbest, y, test_size=.2)
print(y_train.value_counts())
print(y_test.value_counts())

10 kbest features:
['V2', 'V3', 'V4', 'V9', 'V10', 'V11', 'V12', 'V14', 'V16', 'V17']
0    402
1    385
Name: Class, dtype: int64
1    107
0     90
Name: Class, dtype: int64


In [12]:
clf4 = ensemble.GradientBoostingClassifier(loss='exponential',
                                           learning_rate=1,
                                           n_estimators=125,
                                           max_depth=3,
                                           subsample=0.8)
start_time = time.clock()
clf4.fit(X_ktrain, y_train)
scores_clf4 = cross_val_score(clf4, X_ktrain, y_train, cv=10)

#AUROC score
prob_y = clf4.predict_proba(X_ktrain)
prob_y = [p[1] for p in prob_y]

print('score array:\n', scores_clf4)
print('\nscore array mean:\n', np.mean(scores_clf4))
print('\nAUROC score:\n', roc_auc_score(y_train, prob_y))
print('\nruntime:\n',time.clock() - start_time, 'seconds')

score array:
 [0.95       0.95       0.96202532 0.94936709 0.89873418 0.93589744
 0.96153846 0.94871795 0.92307692 0.93589744]

score array mean:
 0.9415254787406686

AUROC score:
 1.0

runtime:
 1.0537500000000009 seconds


In [13]:
scores_clf4_test = cross_val_score(clf4, X_ktest, y_test, cv=10)
prob_y_test = clf4.predict_proba(X_ktest)
prob_y_test = [p[1] for p in prob_y_test]

print('score array:\n', scores_clf4_test)
print('\nscore array mean:\n', np.mean(scores_clf4_test))
print('\nAUROC score:\n', roc_auc_score(y_test, prob_y_test))
print('\nruntime:\n',time.clock() - start_time, 'seconds')

score array:
 [0.95       1.         0.9        1.         1.         0.95
 0.9        1.         0.94736842 0.89473684]

score array mean:
 0.9542105263157895

AUROC score:
 0.9771547248182761

runtime:
 1.4591670000000008 seconds


### Logistic Regression

In [14]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(C=.01, penalty='l1')
lr.fit(X_ktrain, y_train)

scores_lr = cross_val_score(lr, X_ktrain, y_train, cv=10)
prob_y = lr.predict_proba(X_ktrain)
prob_y = [p[1] for p in prob_y]

print('score array:\n', scores_lr)
print('\nscore array mean:\n', np.mean(scores_lr))
print('\nAUROC score:\n', roc_auc_score(y_train, prob_y))
print('\nruntime:\n',time.clock() - start_time, 'seconds')

score array:
 [0.9125     0.95       0.93670886 0.94936709 0.84810127 0.93589744
 0.94871795 0.91025641 0.8974359  0.94871795]

score array mean:
 0.9237702856215515

AUROC score:
 0.9755831233443174

runtime:
 1.6436940000000035 seconds


In [15]:
scores_lr_test = cross_val_score(lr, X_ktest, y_test, cv=10)
prob_y_test = lr.predict_proba(X_ktest)
prob_y_test = [p[1] for p in prob_y_test]

print('score array:\n', scores_lr_test)
print('\nscore array mean:\n', np.mean(scores_lr_test))
print('\nAUROC score:\n', roc_auc_score(y_test, prob_y_test))
print('\nruntime:\n',time.clock() - start_time, 'seconds')

score array:
 [0.7        0.85       0.85       0.85       0.9        0.7
 0.6        0.78947368 0.73684211 0.89473684]

score array mean:
 0.7871052631578948

AUROC score:
 0.9757009345794393

runtime:
 1.683790000000002 seconds


### Ridge Classifier

In [16]:
from sklearn.linear_model import RidgeClassifier
rclf = RidgeClassifier(alpha=.01, fit_intercept=False)
rclf.fit(X_ktrain,y_train)

scores_rclf = cross_val_score(rclf, X_ktrain, y_train, cv=10)
#prob_y = rclf.predict_proba(X_ktrain)
#prob_y = [p[1] for p in prob_y]

print('score array:\n', scores_rclf)
print('\nscore array mean:\n', np.mean(scores_rclf))
#print('\nAUROC score:\n', roc_auc_score(y_train, prob_y))
print('\nruntime:\n',time.clock() - start_time, 'seconds')

score array:
 [0.7625     0.675      0.7721519  0.70886076 0.6835443  0.75641026
 0.80769231 0.76923077 0.73076923 0.71794872]

score array mean:
 0.7384108244076598

runtime:
 1.8225029999999975 seconds


In [17]:
scores_rclf_test = cross_val_score(rclf, X_ktest, y_test, cv=10)

print('score array:\n', scores_rclf_test)
print('\nscore array mean:\n', np.mean(scores_rclf_test))
#print('\nAUROC score:\n', roc_auc_score(y_train, prob_y))
print('\nruntime:\n',time.clock() - start_time, 'seconds')

score array:
 [0.8        0.8        0.8        0.9        0.75       0.65
 0.65       0.78947368 0.73684211 0.68421053]

score array mean:
 0.7560526315789474

runtime:
 1.8727810000000034 seconds
