In [1]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
import time
%matplotlib inline

In [2]:
df_raw = pd.read_csv('creditcard.csv')
print(df_raw['Class'].value_counts())

0    284315
1       492
Name: Class, dtype: int64


#### Class Imbalance

In [3]:
#downsample majority class
from sklearn.utils import resample

df_majority = df_raw[df_raw.Class==0]
df_minority = df_raw[df_raw.Class==1]
df_downsampled = resample(df_majority, replace=False, n_samples=492)
df = pd.concat([df_downsampled, df_minority])
print(df.Class.value_counts())

1    492
0    492
Name: Class, dtype: int64


In [4]:
from sklearn.model_selection import train_test_split
y = df['Class']
X = df.loc[:, ~df.columns.isin(['Class'])]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)
print(y_train.value_counts())
print(y_test.value_counts())

1    397
0    390
Name: Class, dtype: int64
0    102
1     95
Name: Class, dtype: int64


### Gradient Boosting Classifier

In [6]:
from sklearn import ensemble
from sklearn.model_selection import GridSearchCV

params = [{'loss':['deviance','exponential'],
           'learning_rate':[0.01, 0.1, 1],
           'n_estimators':[125, 250, 500],
           'max_depth':[2, 3, 4]}]

clf = ensemble.GradientBoostingClassifier()
grid = GridSearchCV(estimator=clf, param_grid=params)

start_time = time.clock()
grid.fit(X_train, y_train)
print('\nBest parameters:\n', grid.best_params_)
print('\nBest score:\n', grid.best_score_)
print('\nruntime:\n',time.clock() - start_time, 'seconds')


Best parameters:
 {'learning_rate': 0.01, 'loss': 'exponential', 'max_depth': 2, 'n_estimators': 250}

Best score:
 0.9428208386277002

runtime:
 50.221919 seconds


¿ GridSearchCV often finds different parameters each time I run it, how should I handle this ?

In [7]:
#train with best params
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score

clf2 = ensemble.GradientBoostingClassifier(loss='exponential',
                                           learning_rate=0.01,
                                           n_estimators=250,
                                           max_depth=2)
start_time = time.clock()
clf2.fit(X_train, y_train)
scores_clf2 = cross_val_score(clf2, X_train, y_train, cv=10)

#AUROC score
prob_y = clf2.predict_proba(X_train)
prob_y = [p[1] for p in prob_y]

print('score array:\n', scores_clf2)
print('\nscore array mean:\n', np.mean(scores_clf2))
print('\nAUROC score:\n', roc_auc_score(y_train, prob_y))
print('\nruntime:\n',time.clock() - start_time, 'seconds')

score array:
 [0.97468354 0.91139241 0.96202532 0.93670886 0.97468354 0.91139241
 0.96202532 0.92307692 0.96153846 0.93589744]

score array mean:
 0.9453424212917885

AUROC score:
 0.9932958728928503

runtime:
 3.458393000000001 seconds


In [8]:
#run on test set
scores_clf2_test = cross_val_score(clf2, X_test, y_test, cv=10)
prob_y_test = clf2.predict_proba(X_test)
prob_y_test = [p[1] for p in prob_y_test]

print('score array:\n', scores_clf2_test)
print('\nscore array mean:\n', np.mean(scores_clf2_test))
print('\nAUROC score:\n', roc_auc_score(y_test, prob_y_test))
print('\nruntime:\n',time.clock() - start_time, 'seconds')

score array:
 [0.9047619  1.         0.8        0.95       0.9        1.
 0.94736842 0.89473684 0.89473684 0.94736842]

score array mean:
 0.9238972431077694

AUROC score:
 0.9883384932920537

runtime:
 4.716825 seconds


__Result:__ shows some overfitting (degree varies depending on output), try applying subsample parameter

In [9]:
params = [{'loss':['exponential'],
           'learning_rate':[.01],
           'n_estimators':[250],
           'max_depth':[2],
           'subsample':[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]}]

clf3 = ensemble.GradientBoostingClassifier()
grid = GridSearchCV(estimator=clf3, param_grid=params)

start_time = time.clock()
grid.fit(X_train, y_train)
print('\nBest parameters:\n', grid.best_params_)
print('\nBest score:\n', grid.best_score_)
print('\nruntime:\n',time.clock() - start_time, 'seconds')


Best parameters:
 {'learning_rate': 0.01, 'loss': 'exponential', 'max_depth': 2, 'n_estimators': 250, 'subsample': 0.4}

Best score:
 0.9453621346886912

runtime:
 5.990197000000009 seconds


In [12]:
clf3 = ensemble.GradientBoostingClassifier(loss='exponential',
                                           learning_rate=.01,
                                           n_estimators=250,
                                           max_depth=2,
                                           subsample=0.4)
start_time = time.clock()
clf3.fit(X_train, y_train)
scores_clf3 = cross_val_score(clf3, X_train, y_train, cv=10)

#AUROC score
prob_y = clf3.predict_proba(X_train)
prob_y = [p[1] for p in prob_y]

print('score array:\n', scores_clf3)
print('\nscore array mean:\n', np.mean(scores_clf3))
print('\nAUROC score:\n', roc_auc_score(y_train, prob_y))
print('\nruntime:\n',time.clock() - start_time, 'seconds')

score array:
 [0.96202532 0.91139241 0.96202532 0.93670886 0.98734177 0.92405063
 0.96202532 0.91025641 0.97435897 0.94871795]

score array mean:
 0.9478902953586499

AUROC score:
 0.9927920945553188

runtime:
 2.709973000000005 seconds


In [14]:
#run on test set
scores_clf3_test = cross_val_score(clf3, X_test, y_test, cv=10)
prob_y_test = clf3.predict_proba(X_test)
prob_y_test = [p[1] for p in prob_y_test]

print('score array:\n', scores_clf3_test)
print('\nscore array mean:\n', np.mean(scores_clf3_test))
print('\nAUROC score:\n', roc_auc_score(y_test, prob_y_test))
print('\nruntime:\n',time.clock() - start_time, 'seconds')

score array:
 [0.9047619  1.         0.8        0.95       0.95       1.
 0.94736842 0.89473684 0.89473684 0.89473684]

score array mean:
 0.9236340852130327

AUROC score:
 0.9886480908152735

runtime:
 5.153395000000003 seconds


__Result:__ Overfitting slightly less but still there

### Feature Selection

Features are already principle components

In [15]:
from sklearn.feature_selection import SelectKBest, f_classif
import warnings
warnings.filterwarnings('ignore')

k=10
kbest = SelectKBest(f_classif, k=k) #instantiate
kbest.fit(X_train, y_train) #fit 
mask = kbest.get_support()
k_features = []
for bool, feature in zip(mask, X_train.columns):
    if bool:
        k_features.append(feature)
print('{} kbest features:'.format(k))
print(k_features)

X_kbest = df[k_features]

X_ktrain, X_ktest, y_train, y_test = train_test_split(X_kbest, y, test_size=.2)
print(y_train.value_counts())
print(y_test.value_counts())

10 kbest features:
['V2', 'V3', 'V4', 'V9', 'V10', 'V11', 'V12', 'V14', 'V16', 'V17']
0    398
1    389
Name: Class, dtype: int64
1    103
0     94
Name: Class, dtype: int64


In [16]:
clf4 = ensemble.GradientBoostingClassifier(loss='exponential',
                                           learning_rate=1,
                                           n_estimators=125,
                                           max_depth=3,
                                           subsample=0.8)
start_time = time.clock()
clf4.fit(X_ktrain, y_train)
scores_clf4 = cross_val_score(clf4, X_ktrain, y_train, cv=10)

#AUROC score
prob_y = clf4.predict_proba(X_ktrain)
prob_y = [p[1] for p in prob_y]

print('score array:\n', scores_clf4)
print('\nscore array mean:\n', np.mean(scores_clf4))
print('\nAUROC score:\n', roc_auc_score(y_train, prob_y))
print('\nruntime:\n',time.clock() - start_time, 'seconds')

score array:
 [0.94936709 0.89873418 0.89873418 0.94936709 0.96202532 0.93670886
 0.96202532 0.94936709 0.93589744 0.92207792]

score array mean:
 0.9364304471899407

AUROC score:
 1.0

runtime:
 1.0831489999999917 seconds


In [17]:
scores_clf4_test = cross_val_score(clf4, X_ktest, y_test, cv=10)
prob_y_test = clf4.predict_proba(X_ktest)
prob_y_test = [p[1] for p in prob_y_test]

print('score array:\n', scores_clf4_test)
print('\nscore array mean:\n', np.mean(scores_clf4_test))
print('\nAUROC score:\n', roc_auc_score(y_test, prob_y_test))
print('\nruntime:\n',time.clock() - start_time, 'seconds')

score array:
 [0.95238095 0.95238095 0.9047619  1.         1.         0.94736842
 0.94736842 1.         0.78947368 0.89473684]

score array mean:
 0.9388471177944862

AUROC score:
 0.9806858087172071

runtime:
 1.4745650000000126 seconds


### Logistic Regression

In [18]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(C=.01, penalty='l1')
lr.fit(X_ktrain, y_train)

scores_lr = cross_val_score(lr, X_ktrain, y_train, cv=10)
prob_y = lr.predict_proba(X_ktrain)
prob_y = [p[1] for p in prob_y]

print('score array:\n', scores_lr)
print('\nscore array mean:\n', np.mean(scores_lr))
print('\nAUROC score:\n', roc_auc_score(y_train, prob_y))
print('\nruntime:\n',time.clock() - start_time, 'seconds')

score array:
 [0.92405063 0.89873418 0.92405063 0.92405063 0.94936709 0.87341772
 1.         0.92405063 0.88461538 0.88311688]

score array mean:
 0.9185453786719611

AUROC score:
 0.9755913242304066

runtime:
 1.7359280000000012 seconds


In [19]:
scores_lr_test = cross_val_score(lr, X_ktest, y_test, cv=10)
prob_y_test = lr.predict_proba(X_ktest)
prob_y_test = [p[1] for p in prob_y_test]

print('score array:\n', scores_lr_test)
print('\nscore array mean:\n', np.mean(scores_lr_test))
print('\nAUROC score:\n', roc_auc_score(y_test, prob_y_test))
print('\nruntime:\n',time.clock() - start_time, 'seconds')

score array:
 [0.85714286 0.66666667 0.71428571 0.75       0.84210526 0.84210526
 0.84210526 0.78947368 0.57894737 0.84210526]

score array mean:
 0.7724937343358396

AUROC score:
 0.9795496798182194

runtime:
 1.7764520000000061 seconds


### Ridge Classifier

In [20]:
from sklearn.linear_model import RidgeClassifier
rclf = RidgeClassifier(alpha=.01, fit_intercept=False)
rclf.fit(X_ktrain,y_train)

scores_rclf = cross_val_score(rclf, X_ktrain, y_train, cv=10)
#prob_y = rclf.predict_proba(X_ktrain)
#prob_y = [p[1] for p in prob_y]

print('score array:\n', scores_rclf)
print('\nscore array mean:\n', np.mean(scores_rclf))
#print('\nAUROC score:\n', roc_auc_score(y_train, prob_y))
print('\nruntime:\n',time.clock() - start_time, 'seconds')

score array:
 [0.78481013 0.65822785 0.73417722 0.69620253 0.78481013 0.6835443
 0.87341772 0.74683544 0.71794872 0.68831169]

score array mean:
 0.7368285722716102

runtime:
 1.9633680000000027 seconds


In [21]:
scores_rclf_test = cross_val_score(rclf, X_ktest, y_test, cv=10)

print('score array:\n', scores_rclf_test)
print('\nscore array mean:\n', np.mean(scores_rclf_test))
#print('\nAUROC score:\n', roc_auc_score(y_train, prob_y))
print('\nruntime:\n',time.clock() - start_time, 'seconds')

score array:
 [0.71428571 0.80952381 0.66666667 0.7        0.89473684 0.68421053
 0.73684211 0.84210526 0.57894737 0.73684211]

score array mean:
 0.7364160401002506

runtime:
 2.007283000000001 seconds
