In [1]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
import imblearn
import time
%matplotlib inline

In [2]:
df_raw = pd.read_csv('creditcard.csv')
print(df_raw['Class'].value_counts())
print('Percent fraud: {}%'.format(
    ((df_raw['Class']==1).sum()/(df_raw['Class']==0).sum())*100))

0    284315
1       492
Name: Class, dtype: int64
Percent fraud: 0.17304750013189596%


## Data Cleaning

- Dataset is huge and very imbalanced
- Take a subset of data, keep ratio intact
- Components are already principle components, perform some feature selection
- Address imbalance with under/oversampling techniques

### Sample dataset

In [3]:
#sample 10% of the full dataset
#keep the fraud ratio close using random state
df_sample = df_raw.sample(frac=0.1, replace=True, random_state=6)
print(df_sample.Class.value_counts())
print('\nPercent Fraud:')
print('\nSample df: {}%'.format(
    ((df_sample['Class']==1).sum() / (df_sample['Class']==0).sum())*100))
print('\nFull df: {}%'.format(
    ((df_raw['Class']==1).sum() / (df_raw['Class']==0).sum())*100))

y_sample = df_sample['Class'] #target
X_sample = df_sample.loc[:, ~df_sample.columns.isin(['Class'])] #data

0    28432
1       49
Name: Class, dtype: int64

Percent Fraud:

Sample df: 0.17234102419808667%

Full df: 0.17304750013189596%


### Feature selection

In [4]:
from sklearn.feature_selection import SelectKBest, f_classif
import warnings
warnings.filterwarnings('ignore')

k=5
kbest = SelectKBest(f_classif, k=k) #instantiate
kbest.fit(X_sample, y_sample)
mask = kbest.get_support()
k_features = []
for bool, feature in zip(mask, X_sample.columns):
    if bool:
        k_features.append(feature)
print('{} kbest features:'.format(k))
print(k_features)

X_kbest = df_sample[k_features]

5 kbest features:
['V10', 'V12', 'V14', 'V16', 'V17']


In [5]:
from sklearn.model_selection import train_test_split

#random state to keep ratio intact
X_ktrain, X_ktest, y_train, y_test = train_test_split(X_kbest,
                                                      y_sample,
                                                      test_size=.2,
                                                      random_state=112)
print(y_train.value_counts())
print(y_test.value_counts())

0    22745
1       39
Name: Class, dtype: int64
0    5687
1      10
Name: Class, dtype: int64


### Class Imbalance
SKLearn's undersample removes a lot of data, try another method

__imblearn random sampling:__
- Cluster the records of the majority class
- Under-sample: remove records from each cluster, thus seeking to preserve information
- Over-sample: instead of creating exact copies of the minority class records, this introduces small variations into those copies, creating more diverse synthetic samples

In [6]:
#Oversample train sets to retain data
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler()
X_ros_train, y_ros_train = ros.fit_sample(X_ktrain, y_train)
print(len(X_ros_train), len(y_ros_train))

45490 45490


#### Final train & test sets

In [7]:
X_train = X_ros_train

X_test = X_ktest

y_train = y_ros_train

y_test = y_test

## Gradient Boosting Classifier

In [8]:
from sklearn import ensemble
from sklearn.model_selection import GridSearchCV
#parameter search

params = [{'loss':['deviance','exponential'],
           'learning_rate':[0.01, 0.1, 1],
           'n_estimators':[125, 250, 500],
           'max_depth':[2, 3]}]

clf = ensemble.GradientBoostingClassifier()
grid = GridSearchCV(estimator=clf, param_grid=params)

start_time = time.clock()
grid.fit(X_train, y_train)
print('\nBest parameters:\n', grid.best_params_)
print('\nBest score:\n', grid.best_score_)
print('\nruntime:\n',time.clock() - start_time, 'seconds')


Best parameters:
 {'learning_rate': 0.1, 'loss': 'deviance', 'max_depth': 2, 'n_estimators': 500}

Best score:
 0.9997362057595076

runtime:
 275.153378 seconds


__Question:__ GridSearchCV often finds different parameters each time I run it, how should I handle this?

In [9]:
#train with best params
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score

cv = 10
clf2 = ensemble.GradientBoostingClassifier(loss='deviance',
                                           learning_rate=0.1,
                                           n_estimators=500,
                                           max_depth=2)

start_time = time.clock()
clf2.fit(X_train, y_train)
scores_clf2 = cross_val_score(clf2, X_train, y_train, cv=cv)

#AUROC score
prob_y = clf2.predict_proba(X_train)
prob_y = [p[1] for p in prob_y]

print('score array:\n', scores_clf2)
print('\nscore array mean:\n', np.mean(scores_clf2))
print('\nAUROC score:\n', roc_auc_score(y_train, prob_y))
print('\nruntime:\n',time.clock() - start_time, 'seconds')

score array:
 [0.99978022 0.99956044 0.99978022 0.99978022 1.         0.99956025
 0.99956025 0.99978012 0.99978012 0.99956025]

score array mean:
 0.9997142083949472

AUROC score:
 0.999931185792031

runtime:
 65.88323100000002 seconds


In [10]:
#run on test set
start_time = time.clock()
scores_clf2_test = cross_val_score(clf2, X_test, y_test, cv=cv)
prob_y_test = clf2.predict_proba(X_test)
prob_y_test = [p[1] for p in prob_y_test]

print('score array:\n', scores_clf2_test)
print('\nscore array mean:\n', np.mean(scores_clf2_test))
print('\nAUROC score:\n', roc_auc_score(y_test, prob_y_test))
print('\nruntime:\n',time.clock() - start_time, 'seconds')

score array:
 [0.99824561 0.99824561 0.99824561 0.99824561 0.99824561 1.
 1.         0.99824253 0.99648506 0.99824253]

score array mean:
 0.9984198193198287

AUROC score:
 0.897705292772991

runtime:
 4.9366829999999595 seconds


__Question:__ AUROC Score from .999 to .897, overfitting? Array scores look fishy as well

### Apply subsample parameter

In [11]:
params = [{'loss':['deviance'],
           'learning_rate':[0.1],
           'n_estimators':[500],
           'max_depth':[2],
           'subsample':[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]}]

clf3 = ensemble.GradientBoostingClassifier()
grid = GridSearchCV(estimator=clf3, param_grid=params)

start_time = time.clock()
grid.fit(X_train, y_train)
print('\nBest parameters:\n', grid.best_params_)
print('\nBest score:\n', grid.best_score_)
print('\nruntime:\n',time.clock() - start_time, 'seconds')


Best parameters:
 {'learning_rate': 0.1, 'loss': 'deviance', 'max_depth': 2, 'n_estimators': 500, 'subsample': 0.8}

Best score:
 0.9997142229061332

runtime:
 134.902422 seconds


In [12]:
clf4 = ensemble.GradientBoostingClassifier(loss='deviance',
                                           learning_rate=0.1,
                                           n_estimators=500,
                                           max_depth=2,
                                           subsample=0.8)
start_time = time.clock()
clf4.fit(X_train, y_train)
scores_clf4 = cross_val_score(clf4, X_train, y_train, cv=cv)

#AUROC score
prob_y = clf4.predict_proba(X_train)
prob_y = [p[1] for p in prob_y]

print('score array:\n', scores_clf4)
print('\nscore array mean:\n', np.mean(scores_clf4))
print('\nAUROC score:\n', roc_auc_score(y_train, prob_y))
print('\nruntime:\n',time.clock() - start_time, 'seconds')

score array:
 [1.         0.99956044 0.99978022 0.99978022 1.         0.99956025
 0.99978012 0.99978012 0.99978012 0.99956025]

score array mean:
 0.9997581741038205

AUROC score:
 0.999931185792031

runtime:
 76.20387799999997 seconds


In [13]:
#run on test set
start_time = time.clock()
scores_clf4_test = cross_val_score(clf4, X_test, y_test, cv=cv)
prob_y_test = clf4.predict_proba(X_test)
prob_y_test = [p[1] for p in prob_y_test]

print('score array:\n', scores_clf4_test)
print('\nscore array mean:\n', np.mean(scores_clf4_test))
print('\nAUROC score:\n', roc_auc_score(y_test, prob_y_test))
print('\nruntime:\n',time.clock() - start_time, 'seconds')

score array:
 [0.99824561 0.99824561 1.         1.         0.99649123 1.
 1.         0.99824253 0.99824253 0.99824253]

score array mean:
 0.9987710048407488

AUROC score:
 0.8970195181994023

runtime:
 7.175768999999946 seconds


AUROC scores similar to model without subsample parameter

### Logistic Regression

In [14]:
from sklearn.linear_model import LogisticRegression

params_lr = [{'penalty':['l1','l2'],
           'C':[0.01, 0.1, 1, 10],
           'fit_intercept':['True','False']}]

lr = LogisticRegression()
grid = GridSearchCV(estimator=lr, param_grid=params_lr)

start_time = time.clock()
grid.fit(X_train, y_train)
print('\nBest parameters:\n', grid.best_params_)
print('\nBest score:\n', grid.best_score_)
print('\nruntime:\n',time.clock() - start_time, 'seconds')


Best parameters:
 {'C': 0.01, 'fit_intercept': 'True', 'penalty': 'l1'}

Best score:
 0.840976038689822

runtime:
 1.7522259999999505 seconds


In [15]:
#train
from sklearn.linear_model import LogisticRegression
lr2 = LogisticRegression(C=.01, penalty='l1', fit_intercept=True)

start_time = time.clock()
lr2.fit(X_train, y_train)
scores_lr2 = cross_val_score(lr2, X_train, y_train, cv=cv)
prob_y = lr2.predict_proba(X_train)
prob_y = [p[1] for p in prob_y]

print('score array:\n', scores_lr2)
print('\nscore array mean:\n', np.mean(scores_lr2))
print('\nAUROC score:\n', roc_auc_score(y_train, prob_y))
print('\nruntime:\n',time.clock() - start_time, 'seconds')

score array:
 [0.84       0.83538462 0.8432967  0.84571429 0.84153846 0.84498681
 0.83333333 0.83948989 0.84212841 0.84058927]

score array mean:
 0.8406461770419554

AUROC score:
 0.920612701604729

runtime:
 0.46249199999999746 seconds


In [16]:
#test
start_time = time.clock()
scores_lr2_test = cross_val_score(lr2, X_test, y_test, cv=cv)
prob_y = lr2.predict_proba(X_test)
prob_y = [p[1] for p in prob_y]

print('score array:\n', scores_lr2_test)
print('\nscore array mean:\n', np.mean(scores_lr2_test))
print('\nAUROC score:\n', roc_auc_score(y_test, prob_y))
print('\nruntime:\n',time.clock() - start_time, 'seconds')

score array:
 [0.99824561 0.99824561 0.99824561 0.99824561 0.99824561 0.99824561
 0.99824561 0.99824253 0.99824253 0.99824253]

score array mean:
 0.9982446890512751

AUROC score:
 0.9886759275540706

runtime:
 0.06200000000001182 seconds


__Question:__ model performs much better on test set than train set?

### Ridge Classifier

In [17]:
from sklearn.linear_model import RidgeClassifier

params_rclf = [{'alpha':[0.01, 0.1, 1, 10],
              'fit_intercept':['True','False']}]

rclf = RidgeClassifier()
grid = GridSearchCV(estimator=rclf, param_grid=params_rclf)

start_time = time.clock()
grid.fit(X_train, y_train)
print('\nBest parameters:\n', grid.best_params_)
print('\nBest score:\n', grid.best_score_)
print('\nruntime:\n',time.clock() - start_time, 'seconds')


Best parameters:
 {'alpha': 0.01, 'fit_intercept': 'True'}

Best score:
 0.8551549791162893

runtime:
 0.29482199999995373 seconds


In [18]:
#train
rclf2 = RidgeClassifier(alpha=10, fit_intercept=True)
rclf2.fit(X_train,y_train)

scores_rclf2 = cross_val_score(rclf2, X_train, y_train, cv=cv)
#Ridge has no AUROC
#prob_y = rclf.predict_proba(X_ktrain)
#prob_y = [p[1] for p in prob_y]

print('score array:\n', scores_rclf2)
print('\nscore array mean:\n', np.mean(scores_rclf2))
#print('\nAUROC score:\n', roc_auc_score(y_train, prob_y))
print('\nruntime:\n',time.clock() - start_time, 'seconds')

score array:
 [0.84989011 0.85538462 0.86       0.85626374 0.85516484 0.8564204
 0.8526825  0.8564204  0.85444151 0.85444151]

score array mean:
 0.8551109629157123

runtime:
 0.4044930000000022 seconds


In [19]:
#test
start_time = time.clock()
scores_rclf2_test = cross_val_score(rclf2, X_test, y_test, cv=cv)

print('score array:\n', scores_rclf2_test)
print('\nscore array mean:\n', np.mean(scores_rclf2_test))
#print('\nAUROC score:\n', roc_auc_score(y_test, prob_y))
print('\nruntime:\n',time.clock() - start_time, 'seconds')

score array:
 [0.99824561 0.99824561 0.99824561 0.99824561 0.99824561 1.
 1.         0.99824253 1.         1.        ]

score array mean:
 0.998947060093115

runtime:
 0.03697599999998147 seconds


This model behaves similar to logistic regression

### Additional: Try undersampling with a Support Vector Classifier

In [20]:
y_rus = df_raw['Class'] #target
X_rus = df_raw.loc[:, ~df_raw.columns.isin(['Class'])] #data

print(len(X_rus), len(y_rus))

284807 284807


In [21]:
from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler()
X_rus, y_rus = rus.fit_sample(X_rus, y_rus)
print(len(X_rus), len(y_rus))

984 984


In [22]:
#parameter search
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

params_svc = [{'C':[0.01, 0.1, 1, 10],
               'kernel':['rbf','linear']}]

svc = SVC()
grid_svc = GridSearchCV(estimator=svc, param_grid=params_svc)

start_time = time.clock()
grid_svc.fit(X_rus, y_rus)
print('\nBest parameters:\n', grid_svc.best_params_)
print('\nBest score:\n', grid_svc.best_score_)
print('\nruntime:\n',time.clock() - start_time, 'seconds')


Best parameters:
 {'C': 1, 'kernel': 'linear'}

Best score:
 0.8099593495934959

runtime:
 586.4183800000001 seconds


In [23]:
from sklearn.model_selection import cross_val_score

svc2 = SVC(C=1, kernel='linear')

start_time = time.clock()
svc2.fit(X_rus, y_rus)
scores_svc2 = cross_val_score(svc2, X_rus, y_rus, cv=5)

print('score array:\n', scores_svc2)
print('\nscore array mean:\n', np.mean(scores_svc2))
print('\nruntime:\n',time.clock() - start_time, 'seconds')

score array:
 [0.94949495 0.92929293 0.8622449  0.92857143 0.73979592]

score array mean:
 0.8818800247371676

runtime:
 367.17626599999994 seconds
