In [1]:
import numpy as np
import pandas as pd
import sklearn
import imblearn
import time
%matplotlib inline

In [2]:
df_raw = pd.read_csv('creditcard.csv')
print('Class value counts:')
print(df_raw['Class'].value_counts())
print('\nPercent fraud: {}%'.format(
    ((df_raw['Class']==1).sum()/(df_raw['Class']==0).sum())*100))

Class value counts:
0    284315
1       492
Name: Class, dtype: int64

Percent fraud: 0.17304750013189596%


## Data Cleaning

- Dataset is huge and very imbalanced
- Variables are already principle components, perform some feature selection
- Address imbalance with under/oversampling techniques
- Need to select train & test sets that won't undersample the minority class

### Feature selection

In [3]:
from sklearn.feature_selection import SelectKBest, f_classif
import warnings
warnings.filterwarnings('ignore')

X = df_raw.loc[:, ~df_raw.columns.isin(['Class'])] #data
y = df_raw['Class'] #target

k=5
kbest = SelectKBest(f_classif, k=k) #instantiate
kbest.fit(X, y)

#unmask k features selected
mask = kbest.get_support()
k_features = []
for bool, feature in zip(mask, X.columns):
    if bool:
        k_features.append(feature)
print('{} kbest features:'.format(k))
print(k_features)

X_kbest = df_raw[k_features]

5 kbest features:
['V10', 'V12', 'V14', 'V16', 'V17']


### Create train & test sets

In [4]:
#put k features and outcomes together
df_kbest = pd.concat([X_kbest, df_raw['Class']], axis=1)

#separate majority and minority classes
df_class0 = df_kbest.loc[df_kbest['Class'] == 0]
df_class1 = df_kbest.loc[df_kbest['Class'] == 1]

#set new feature/target variables for each class for train_test_split
X_0 = df_class0.drop(['Class'], axis=1)
y_0 = pd.DataFrame(df_class0['Class'])
X_1 = df_class1.drop(['Class'], axis=1)
y_1 = pd.DataFrame(df_class1['Class'])

#check df classes
#print(df_class0['Class'].value_counts())
#print(df_class1['Class'].value_counts())

In [5]:
from sklearn.model_selection import train_test_split

#majority class
X_train0, X_test0, y_train0, y_test0 = train_test_split(X_0,
                                                        y_0,
                                                        test_size=0.2)
#minority class
X_train1, X_test1, y_train1, y_test1 = train_test_split(X_1,
                                                        y_1,
                                                        test_size=0.2)
#combine to create class proportional train & test sets
X_train = pd.concat([X_train0, X_train1])
X_test = pd.concat([X_test0, X_test1]) 
y_train = pd.concat([y_train0, y_train1])
y_test = pd.concat([y_test0, y_test1])

#check train & test class ratio against original data
print('df_raw fraud: {}%'.format(
    ((df_raw['Class']==1).sum()/(df_raw['Class']==0).sum())*100))
print('y_train fraud: {}%'.format(
    ((y_train['Class']==1).sum()/(y_train['Class']==0).sum())*100))
print('y_test fraud: {}%'.format(
    ((y_test['Class']==1).sum()/(y_test['Class']==0).sum())*100))

df_raw fraud: 0.17304750013189596%
y_train fraud: 0.1727837082109632%
y_test fraud: 0.17410266781562705%


### Class Imbalance
SKLearn is lacking in this area, supplement with __imblearn's random sampling methods__
- Cluster the records of the majority class
- __Under-sample:__ remove records from each cluster, thus seeking to preserve information
- __Over-sample:__ instead of creating exact copies of the minority class records, this introduces small variations into those copies, creating more diverse synthetic samples

In [6]:
from imblearn.under_sampling import RandomUnderSampler

#run random undersample on data
rus = RandomUnderSampler()
X_train_rus, y_train_rus = rus.fit_sample(X_train, y_train)
X_test_rus, y_test_rus = rus.fit_sample(X_test, y_test)

#check results
print('oversampled fraud: {}%'.format(
    len(y_train_rus[y_train_rus == 1]) / len(y_train_rus) * 100))
print(len(y_train_rus))
print(len(y_test_rus))

oversampled fraud: 50.0%
786
198


In [7]:
#rename oversampled variables names for consistency
X_train = X_train_rus
y_train = y_train_rus
X_test = X_test_rus
y_test = y_test_rus

## Random Forest Classifier

In [8]:
from sklearn import ensemble
from sklearn.model_selection import GridSearchCV

#parameter search
params = [{'n_estimators':[5, 10, 25, 50],
           'criterion':['entropy','gini'],
           'max_features':[1]}]

rfc = ensemble.RandomForestClassifier()
grid = GridSearchCV(estimator=rfc, param_grid=params, scoring='f1')

#start_time = time.clock()
grid.fit(X_train, y_train)
print('parameters:\n', grid.best_params_)
print('\nf1 score:\n', grid.best_score_)
#print('\nruntime:\n',time.clock() - start_time, 'seconds')

parameters:
 {'criterion': 'gini', 'max_features': 1, 'n_estimators': 50}

f1 score:
 0.9243074977073649


In [9]:
#train with params
rfc = ensemble.RandomForestClassifier(criterion='gini',
                                      max_features=1,
                                      n_estimators=50)
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=1, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

__To evaluate, calculate:__
- __Confusion matrix:__ rows are actual, columns are prediction
- __Type I Error (false positive):__ identify as 1 (fraud) but 0
- __Type II Error (false negative):__ identify as 0 but 1
- __Sensitivity (recall):__ percentage that 1 was correctly identified
- __Specificity (precision):__ percentage that 0 was correctly identified

In [10]:
from sklearn.metrics import confusion_matrix

#confusion matrix
y_pred_rfc = rfc.predict(X_train)
rfc_cm = confusion_matrix(y_train, y_pred_rfc)

#error
ti_rfc = rfc_cm[0,1]
tii_rfc = rfc_cm[1,0]
sens_rfc = rfc_cm[1,1] / (rfc_cm[1,0] + rfc_cm[1,1])
spec_rfc = rfc_cm[0,0] / (rfc_cm[0,0] + rfc_cm[0,1])

print('RFC train confusion matrix:\n',rfc_cm)
print('\nRFC train type i error:\n',ti_rfc)
print('\nRFC train type ii error:\n',tii_rfc)
print('\nRFC train sensitivity (recall):\n',sens_rfc)
print('\nRFC train specificity (precision):\n',spec_rfc)

RFC train confusion matrix:
 [[393   0]
 [  0 393]]

RFC train type i error:
 0

RFC train type ii error:
 0

RFC train sensitivity (recall):
 1.0

RFC train specificity (precision):
 1.0


In [11]:
#evaluate on test set
#confusion matrix
y_pred_rfc = rfc.predict(X_test)
rfc_cm = confusion_matrix(y_test, y_pred_rfc)

#error
ti_rfc = rfc_cm[0,1]
tii_rfc = rfc_cm[1,0]
sens_rfc = rfc_cm[1,1] / (rfc_cm[1,0] + rfc_cm[1,1])
spec_rfc = rfc_cm[0,0] / (rfc_cm[0,0] + rfc_cm[0,1])

print('RFC test confusion matrix:\n',rfc_cm)
print('\nRFC test type i error:\n',ti_rfc)
print('\nRFC test type ii error:\n',tii_rfc)
print('\nRFC test sensitivity (recall):\n',sens_rfc)
print('\nRFC test specificity (precision):\n',spec_rfc)

RFC test confusion matrix:
 [[96  3]
 [12 87]]

RFC test type i error:
 3

RFC test type ii error:
 12

RFC test sensitivity (recall):
 0.8787878787878788

RFC test specificity (precision):
 0.9696969696969697


__Result:__ RFC is definitely overfitting. Even though classes are perfectly balanced after undersampling, there may not be enough data left for RFC to properly estimate

## Gradient Boosting Classifier

In [18]:
#parameter search
params = [{'loss':['deviance','exponential'],
           'learning_rate':[0.01, 0.1, 1],
           'n_estimators':[125, 250, 500],
           'max_depth':[2, 3, 4],
           'subsample':[0.25, 0.5, 0.75, 1]}]

gbc = ensemble.GradientBoostingClassifier()
grid = GridSearchCV(estimator=gbc, param_grid=params, scoring='f1')

start_time = time.clock()
grid.fit(X_train, y_train)
print('parameters:\n', grid.best_params_)
print('\nf1 score:\n', grid.best_score_)
print('\nruntime:\n',time.clock() - start_time, 'seconds')

parameters:
 {'learning_rate': 0.01, 'loss': 'exponential', 'max_depth': 4, 'n_estimators': 500, 'subsample': 0.25}

f1 score:
 0.9257450304185104

runtime:
 105.267665 seconds


In [19]:
#train with best params
gbc = ensemble.GradientBoostingClassifier(loss='exponential',
                                          learning_rate=0.01,
                                          n_estimators=500,
                                          max_depth=4,
                                          subsample=0.25)
gbc.fit(X_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.01, loss='exponential', max_depth=4,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=500,
              presort='auto', random_state=None, subsample=0.25, verbose=0,
              warm_start=False)

In [20]:
#confusion matrix
y_pred_gbc = gbc.predict(X_train)
gbc_cm = confusion_matrix(y_train, y_pred_gbc)

#error
ti_gbc = gbc_cm[0,1]
tii_gbc = gbc_cm[1,0]
sens_gbc = gbc_cm[1,1] / (gbc_cm[1,0] + gbc_cm[1,1])
spec_gbc = gbc_cm[0,0] / (gbc_cm[0,0] + gbc_cm[0,1])

print('GBC train confusion matrix:\n',gbc_cm)
print('\nGBC train type i error:\n',ti_gbc)
print('\nGBC train type ii error:\n',tii_gbc)
print('\nGBC train sensitivity (recall):\n',sens_gbc)
print('\nGBC train specificity (precision):\n',spec_gbc)

GBC train confusion matrix:
 [[386   7]
 [ 37 356]]

GBC train type i error:
 7

GBC train type ii error:
 37

GBC train sensitivity (recall):
 0.905852417302799

GBC train specificity (precision):
 0.9821882951653944


In [21]:
#evaluate clf_gbc on test set
gbc.fit(X_test, y_test)

#confusion matrix
y_pred_gbc = gbc.predict(X_test)
gbc_cm = confusion_matrix(y_test, y_pred_gbc)

#error
ti_gbc = gbc_cm[0,1]
tii_gbc = gbc_cm[1,0]
sens_gbc = gbc_cm[1,1] / (gbc_cm[1,0] + gbc_cm[1,1])
spec_gbc = gbc_cm[0,0] / (gbc_cm[0,0] + gbc_cm[0,1])

print('GBC test confusion matrix:\n',gbc_cm)
print('\nGBC test type i error:\n',ti_gbc)
print('\nGBC test type ii error:\n',tii_gbc)
print('\nGBC test sensitivity (recall):\n',sens_gbc)
print('\nGBC test specificity (precision):\n',spec_gbc)

GBC test confusion matrix:
 [[99  0]
 [10 89]]

GBC test type i error:
 0

GBC test type ii error:
 10

GBC test sensitivity (recall):
 0.898989898989899

GBC test specificity (precision):
 1.0


__Result:__ This model is very inconsistent, often has higher test scores than train scores, could data limitations on tree based classifiers, I don't trust what this model is doing. Varying train/test results that are not very reproducible excludes GBC as best choice for this task.

## Support Vector Classifier

In [26]:
#parameter search
from sklearn.svm import SVC

params = [{'C':[0.01, 0.1, 1, 10],
           'kernel':['rbf','linear','poly']}]

svc = SVC()
grid = GridSearchCV(estimator=svc, param_grid=params, scoring='f1')

#start_time = time.clock()
grid.fit(X_train, y_train)
print('parameters:\n', grid.best_params_)
print('\nf1 score:\n', grid.best_score_)
#print('\nruntime:\n',time.clock() - start_time, 'seconds')

parameters:
 {'C': 1, 'kernel': 'rbf'}

f1 score:
 0.9193043016107612


In [27]:
#train with params
svc = SVC(C=1,kernel='rbf')
svc.fit(X_train, y_train)

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [28]:
#confusion matrix
y_pred_svc = svc.predict(X_train)
svc_cm = confusion_matrix(y_train, y_pred_svc)

#error
ti_svc = svc_cm[0,1]
tii_svc = svc_cm[1,0]
sens_svc = svc_cm[1,1] / (svc_cm[1,0] + svc_cm[1,1])
spec_svc = svc_cm[0,0] / (svc_cm[0,0] + svc_cm[0,1])

print('SVC train confusion matrix:\n',svc_cm)
print('\nSVC train type i error:\n',ti_svc)
print('\nSVC train type ii error:\n',tii_svc)
print('\nSVC train sensitivity (recall):\n',sens_svc)
print('\nSVC train specificity (precision):\n',spec_svc)

SVC train confusion matrix:
 [[389   4]
 [ 44 349]]

SVC train type i error:
 4

SVC train type ii error:
 44

SVC train sensitivity (recall):
 0.8880407124681934

SVC train specificity (precision):
 0.989821882951654


In [29]:
#confusion matrix
y_pred_svc = svc.predict(X_test)
svc_cm = confusion_matrix(y_test, y_pred_svc)

#error
ti_svc = svc_cm[0,1]
tii_svc = svc_cm[1,0]
sens_svc = svc_cm[1,1] / (svc_cm[1,0] + svc_cm[1,1])
spec_svc = svc_cm[0,0] / (svc_cm[0,0] + svc_cm[0,1])

print('SVC test confusion matrix:\n',svc_cm)
print('\nSVC test type i error:\n',ti_svc)
print('\nSVC test type ii error:\n',tii_svc)
print('\nSVC test sensitivity (recall):\n',sens_svc)
print('\nSVC test specificity (precision):\n',spec_svc)

SVC test confusion matrix:
 [[98  1]
 [12 87]]

SVC test type i error:
 1

SVC test type ii error:
 12

SVC test sensitivity (recall):
 0.8787878787878788

SVC test specificity (precision):
 0.98989898989899


__Result:__ most trustworthy results so far, decently accurate without much evidence of overfitting

## Logistic Regression

In [30]:
from sklearn.linear_model import LogisticRegression

#parameter search
params = [{'penalty':['l1','l2'],
           'C':[0.01, 0.1, 1, 10],
           'fit_intercept':[True, False],
           'max_iter':[50,100,200]}]

clf_lr = LogisticRegression()
grid = GridSearchCV(estimator=clf_lr, param_grid=params, scoring='f1')

#start_time = time.clock()
grid.fit(X_train, y_train)
print('parameters:\n', grid.best_params_)
print('\nf1 score:\n', grid.best_score_)
#print('\nruntime:\n',time.clock() - start_time, 'seconds')

parameters:
 {'C': 0.01, 'fit_intercept': True, 'max_iter': 50, 'penalty': 'l2'}

f1 score:
 0.9211407007018042


In [31]:
#train
clf_lr = LogisticRegression(penalty='l2',
                            C=0.01,
                            fit_intercept=True,
                            max_iter=50)
clf_lr.fit(X_train, y_train)

LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=50, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [32]:
#confusion matrix
y_pred_lr = clf_lr.predict(X_train)
lr_cm = confusion_matrix(y_train, y_pred_lr)

#error
ti_lr = lr_cm[0,1]
tii_lr = lr_cm[1,0]
sens_lr = lr_cm[1,1] / (lr_cm[1,0] + lr_cm[1,1])
spec_lr = lr_cm[0,0] / (lr_cm[0,0] + lr_cm[0,1])

print('LR train confusion matrix:\n',lr_cm)
print('\nLR train type i error:\n',ti_lr)
print('\nLR train type ii error:\n',tii_lr)
print('\nLR train sensitivity (recall):\n',sens_lr)
print('\nLR train specificity (precision):\n',spec_lr)

LR train confusion matrix:
 [[382  11]
 [ 47 346]]

LR train type i error:
 11

LR train type ii error:
 47

LR train sensitivity (recall):
 0.8804071246819338

LR train specificity (precision):
 0.9720101781170484


In [33]:
#test
y_pred_lr = clf_lr.predict(X_test)
lr_cm = confusion_matrix(y_test, y_pred_lr)

#error
ti_lr = lr_cm[0,1]
tii_lr = lr_cm[1,0]
sens_lr = lr_cm[1,1] / (lr_cm[1,0] + lr_cm[1,1])
spec_lr = lr_cm[0,0] / (lr_cm[0,0] + lr_cm[0,1])

print('LR test confusion matrix:\n',lr_cm)
print('\nLR test type i error:\n',ti_lr)
print('\nLR test type ii error:\n',tii_lr)
print('\nLR test sensitivity (recall):\n',sens_lr)
print('\nLR test specificity (precision):\n',spec_lr)

LR test confusion matrix:
 [[96  3]
 [10 89]]

LR test type i error:
 3

LR test type ii error:
 10

LR test sensitivity (recall):
 0.898989898989899

LR test specificity (precision):
 0.9696969696969697


__Result:__ LR with ridge parameter also gives results on the trustworthy spectrum, performance is similar to SVC though slightly less accurate. The test set will occasionally outperform train by about 1%, still prefer SVC for its slightly higher accuracy and more consistent/reproducable results.