In [1]:
import numpy as np
import pandas as pd
#import scipy
#import matplotlib.pyplot as plt
#import seaborn as sns
import sklearn
import imblearn
import time
%matplotlib inline

In [2]:
df_raw = pd.read_csv('creditcard.csv')
print(df_raw['Class'].value_counts())
print('Percent fraud: {}%'.format(
    ((df_raw['Class']==1).sum()/(df_raw['Class']==0).sum())*100))

0    284315
1       492
Name: Class, dtype: int64
Percent fraud: 0.17304750013189596%


## Data Cleaning

- Dataset is huge and very imbalanced
- Variables are already principle components, perform some feature selection
- Address imbalance with under/oversampling techniques
- Need to select train & test sets that won't undersample the minority class

### Sample dataset

In [3]:
#sample 10% of the full dataset
#keep the fraud ratio close using random state
#df_sample = df_raw.sample(frac=0.1, replace=True, random_state=6)
#print(df_sample.Class.value_counts())
#print('\nPercent Fraud:')
#print('\nSample df: {}%'.format(
#    ((df_sample['Class']==1).sum() / (df_sample['Class']==0).sum())*100))
#print('\nFull df: {}%'.format(
#    ((df_raw['Class']==1).sum() / (df_raw['Class']==0).sum())*100))

#y_sample = df_sample['Class'] #target
#X_sample = df_sample.loc[:, ~df_sample.columns.isin(['Class'])] #data

### Feature selection

In [15]:
from sklearn.feature_selection import SelectKBest, f_classif
import warnings
warnings.filterwarnings('ignore')

X = df_raw.loc[:, ~df_raw.columns.isin(['Class'])] #data
y = df_raw['Class'] #target

k=3
kbest = SelectKBest(f_classif, k=k) #instantiate
kbest.fit(X, y)

#unmask k features selected
mask = kbest.get_support()
k_features = []
for bool, feature in zip(mask, X.columns):
    if bool:
        k_features.append(feature)
print('{} kbest features:'.format(k))
print(k_features)

X_kbest = df_raw[k_features]
X_kbest.head()

3 kbest features:
['V12', 'V14', 'V17']


Unnamed: 0,V12,V14,V17
0,-0.617801,-0.311169,0.207971
1,1.065235,-0.143772,-0.114805
2,0.066084,-0.165946,1.109969
3,0.178228,-0.287924,-0.684093
4,0.538196,-1.11967,-0.237033


### Create train & test sets

In [16]:
#put k features and outcomes together
df_kbest = pd.concat([X_kbest, df_raw['Class']], axis=1)

#separate majority and minority classes
df_class0 = df_kbest.loc[df_kbest['Class'] == 0]
df_class1 = df_kbest.loc[df_kbest['Class'] == 1]

#set new feature/target variables for each class for train_test_split
X_0 = df_class0.drop(['Class'], axis=1)
y_0 = pd.DataFrame(df_class0['Class'])
X_1 = df_class1.drop(['Class'], axis=1)
y_1 = pd.DataFrame(df_class1['Class'])

In [17]:
from sklearn.model_selection import train_test_split

#majority class
X_train0, X_test0, y_train0, y_test0 = train_test_split(X_0,
                                                        y_0,
                                                        test_size=0.2)
#minority class
X_train1, X_test1, y_train1, y_test1 = train_test_split(X_1,
                                                        y_1,
                                                        test_size=0.2)
#combine to create class proportional train & test sets
X_train = pd.concat([X_train0, X_train1])
X_test = pd.concat([X_test0, X_test1]) 
y_train = pd.concat([y_train0, y_train1])
y_test = pd.concat([y_test0, y_test1])

#check train & test class ratio against original data
print('df_raw fraud: {}%'.format(
    ((df_raw['Class']==1).sum()/(df_raw['Class']==0).sum())*100))
print('y_train fraud: {}%'.format(
    ((y_train['Class']==1).sum()/(y_train['Class']==0).sum())*100))
print('y_test fraud: {}%'.format(
    ((y_test['Class']==1).sum()/(y_test['Class']==0).sum())*100))

df_raw fraud: 0.17304750013189596%
y_train fraud: 0.1727837082109632%
y_test fraud: 0.17410266781562705%


### Class Imbalance
SKLearn is lacking in this area, supplement with __imblearn's random sampling methods__
- Cluster the records of the majority class
- __Under-sample:__ remove records from each cluster, thus seeking to preserve information
- __Over-sample:__ instead of creating exact copies of the minority class records, this introduces small variations into those copies, creating more diverse synthetic samples

In [18]:
#undersample
from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler()
X_train_rus, y_train_rus = rus.fit_sample(X_train, y_train)
X_test_rus, y_test_rus = rus.fit_sample(X_test, y_test)

#easier variables names
X_train = X_train_rus
y_train = y_train_rus
X_test = X_test_rus
y_test = y_test_rus

## Gradient Boosting Classifier

In [19]:
from sklearn import ensemble
from sklearn.model_selection import GridSearchCV

#parameter search
params = [{'loss':['deviance','exponential'],
           'learning_rate':[0.01, 0.1, 1],
           'n_estimators':[125, 250, 500],
           'max_depth':[2, 3, 4],
           'subsample':[0.25, 0.5, 0.75, 1]}]

clf_gbc = ensemble.GradientBoostingClassifier()
grid = GridSearchCV(estimator=clf_gbc, param_grid=params)

start_time = time.clock()
grid.fit(X_train, y_train)
print('\nBest parameters:\n', grid.best_params_)
print('\nBest score:\n', grid.best_score_)
print('\nruntime:\n',time.clock() - start_time, 'seconds')


Best parameters:
 {'learning_rate': 0.01, 'loss': 'exponential', 'max_depth': 4, 'n_estimators': 500, 'subsample': 0.25}

Best score:
 0.9236641221374046

runtime:
 101.303015 seconds


In [20]:
#train with best params
clf_gbc = ensemble.GradientBoostingClassifier(loss='deviance',
                                              learning_rate=0.01,
                                              n_estimators=500,
                                              max_depth=4,
                                              subsample=0.25)
clf_gbc.fit(X_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.01, loss='deviance', max_depth=4,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=500,
              presort='auto', random_state=None, subsample=0.25, verbose=0,
              warm_start=False)

### Evaluating gradient boosting classifier
Calculate...
- __Confusion matrix:__ rows are actual, columns are prediction
- __Type I Error (false positive):__ identify as 1 (fraud) but 0
- __Type II Error (false negative):__ identify as 0 but 1
- __Sensitivity (recall):__ percentage that 1 was correctly identified
- __Specificity:__ percentage that 0 was correctly identified

In [21]:
from sklearn.metrics import confusion_matrix

#confusion matrix
y_pred_gbc = clf_gbc.predict(X_train)
gbc_cm = confusion_matrix(y_train, y_pred_gbc)

#error
ti_gbc = gbc_cm[0,1]
tii_gbc = gbc_cm[1,0]
sens_gbc = gbc_cm[1,1] / (gbc_cm[1,0] + gbc_cm[1,1])
spec_gbc = gbc_cm[0,0] / (gbc_cm[0,0] + gbc_cm[0,1])

print('GBC train confusion matrix:\n',gbc_cm)
print('\nGBC train type i error:\n',ti_gbc)
print('\nGBC train type ii error:\n',tii_gbc)
print('\nGBC train sensitivity (recall):\n',sens_gbc)
print('\nGBC train specificity:\n',spec_gbc)

GBC train confusion matrix:
 [[387   6]
 [ 34 359]]

GBC train type i error:
 6

GBC train type ii error:
 34

GBC train sensitivity (recall):
 0.9134860050890585

GBC train specificity:
 0.9847328244274809


In [22]:
#evaluate clf_gbc on test set
clf_gbc.fit(X_test, y_test)

#confusion matrix
y_pred_gbc = clf_gbc.predict(X_test)
gbc_cm = confusion_matrix(y_test, y_pred_gbc)

#error
ti_gbc = gbc_cm[0,1]
tii_gbc = gbc_cm[1,0]
sens_gbc = gbc_cm[1,1] / (gbc_cm[1,0] + gbc_cm[1,1])
spec_gbc = gbc_cm[0,0] / (gbc_cm[0,0] + gbc_cm[0,1])

print('GBC test confusion matrix:\n',gbc_cm)
print('\nGBC test type i error:\n',ti_gbc)
print('\nGBC test type ii error:\n',tii_gbc)
print('\nGBC test sensitivity (recall):\n',sens_gbc)
print('\nGBC test specificity:\n',spec_gbc)

GBC test confusion matrix:
 [[99  0]
 [ 3 96]]

GBC test type i error:
 0

GBC test type ii error:
 3

GBC test sensitivity (recall):
 0.9696969696969697

GBC test specificity:
 1.0


## Disregard below: old approach

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

params_lr = [{'penalty':['l1','l2'],
           'C':[0.01, 0.1, 1, 10],
           'fit_intercept':['True','False']}]

lr = LogisticRegression()
grid = GridSearchCV(estimator=lr, param_grid=params_lr)

start_time = time.clock()
grid.fit(X_train, y_train)
print('\nBest parameters:\n', grid.best_params_)
print('\nBest score:\n', grid.best_score_)
print('\nruntime:\n',time.clock() - start_time, 'seconds')

In [None]:
#train
from sklearn.linear_model import LogisticRegression
lr2 = LogisticRegression(C=.01, penalty='l1', fit_intercept=True)

start_time = time.clock()
lr2.fit(X_train, y_train)
scores_lr2 = cross_val_score(lr2, X_train, y_train, cv=cv)
prob_y = lr2.predict_proba(X_train)
prob_y = [p[1] for p in prob_y]

print('score array:\n', scores_lr2)
print('\nscore array mean:\n', np.mean(scores_lr2))
print('\nAUROC score:\n', roc_auc_score(y_train, prob_y))
print('\nruntime:\n',time.clock() - start_time, 'seconds')

In [None]:
#test
start_time = time.clock()
scores_lr2_test = cross_val_score(lr2, X_test, y_test, cv=cv)
prob_y = lr2.predict_proba(X_test)
prob_y = [p[1] for p in prob_y]

print('score array:\n', scores_lr2_test)
print('\nscore array mean:\n', np.mean(scores_lr2_test))
print('\nAUROC score:\n', roc_auc_score(y_test, prob_y))
print('\nruntime:\n',time.clock() - start_time, 'seconds')

__Question:__ model performs much better on test set than train set?

### Ridge Classifier

In [None]:
from sklearn.linear_model import RidgeClassifier

params_rclf = [{'alpha':[0.01, 0.1, 1, 10],
              'fit_intercept':['True','False']}]

rclf = RidgeClassifier()
grid = GridSearchCV(estimator=rclf, param_grid=params_rclf)

start_time = time.clock()
grid.fit(X_train, y_train)
print('\nBest parameters:\n', grid.best_params_)
print('\nBest score:\n', grid.best_score_)
print('\nruntime:\n',time.clock() - start_time, 'seconds')

In [None]:
#train
rclf2 = RidgeClassifier(alpha=10, fit_intercept=True)
rclf2.fit(X_train,y_train)

scores_rclf2 = cross_val_score(rclf2, X_train, y_train, cv=cv)
#Ridge has no AUROC
#prob_y = rclf.predict_proba(X_ktrain)
#prob_y = [p[1] for p in prob_y]

print('score array:\n', scores_rclf2)
print('\nscore array mean:\n', np.mean(scores_rclf2))
#print('\nAUROC score:\n', roc_auc_score(y_train, prob_y))
print('\nruntime:\n',time.clock() - start_time, 'seconds')

In [None]:
#test
start_time = time.clock()
scores_rclf2_test = cross_val_score(rclf2, X_test, y_test, cv=cv)

print('score array:\n', scores_rclf2_test)
print('\nscore array mean:\n', np.mean(scores_rclf2_test))
#print('\nAUROC score:\n', roc_auc_score(y_test, prob_y))
print('\nruntime:\n',time.clock() - start_time, 'seconds')

This model behaves similar to logistic regression

### Additional: Try undersampling with a Support Vector Classifier

In [None]:
y_rus = df_raw['Class'] #target
X_rus = df_raw.loc[:, ~df_raw.columns.isin(['Class'])] #data

print(len(X_rus), len(y_rus))

In [None]:
from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler()
X_rus, y_rus = rus.fit_sample(X_rus, y_rus)
print(len(X_rus), len(y_rus))

In [None]:
#parameter search
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

params_svc = [{'C':[0.01, 0.1, 1, 10],
               'kernel':['rbf','linear']}]

svc = SVC()
grid_svc = GridSearchCV(estimator=svc, param_grid=params_svc)

start_time = time.clock()
grid_svc.fit(X_rus, y_rus)
print('\nBest parameters:\n', grid_svc.best_params_)
print('\nBest score:\n', grid_svc.best_score_)
print('\nruntime:\n',time.clock() - start_time, 'seconds')

In [None]:
from sklearn.model_selection import cross_val_score

svc2 = SVC(C=1, kernel='linear')

start_time = time.clock()
svc2.fit(X_rus, y_rus)
scores_svc2 = cross_val_score(svc2, X_rus, y_rus, cv=5)

print('score array:\n', scores_svc2)
print('\nscore array mean:\n', np.mean(scores_svc2))
print('\nruntime:\n',time.clock() - start_time, 'seconds')