In [1]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
import time
%matplotlib inline

### Data Cleaning

In [2]:
df = pd.read_csv('creditcard.csv')
print(df['Class'].value_counts())
print(len(df.columns))

0    284315
1       492
Name: Class, dtype: int64
31


Target is very unbalanced, need to address this

#### Class Imbalance

In [13]:
#try sklearn upsample (randomly duplicating obvs from minority class)
from sklearn.utils import resample

#separate majority and minority classes
df_majority = df[df.Class==0]
df_minority = df[df.Class==1]

#upsample minority class
df_minority_upsampled = resample(df_minority, replace=True, n_samples=284315)

#combine with majority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])
print('Upsampled class counts:')
print(df_upsampled.Class.value_counts())

#set data and target
X_up = df_upsampled.drop('Class', 1)
y_up = df_upsampled['Class']

#set training and test sets
offset = int(X_up.shape[0] * 0.9)
X_up_train, y_up_train = X_up[:offset], y_up[:offset]
X_up_test, y_up_test = X_up[offset:], y_up[offset:]

Upsampled class counts:
1    284315
0    284315
Name: Class, dtype: int64


In [4]:
#also downsample (remove random obvs from majority class) to compare results
df_majority_downsampled = resample(df_majority, replace=False, n_samples=492)
df_downsampled = pd.concat([df_majority_downsampled, df_minority])
df_downsampled.Class.value_counts()

#set data and target
X_down = df_downsampled.drop('Class', 1)
y_down = df_downsampled['Class']

#### Feature Selection

In [6]:
#selectkbest on upsampled df
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectKBest, f_classif
import warnings
warnings.filterwarnings('ignore')

kbest = SelectKBest(f_classif, k=5) #instantiate
kbest.fit(X_up, y_up) #fit 
mask_up = kbest.get_support()
features_up = []
for bool, feature in zip(mask_up, X_up.columns):
    if bool:
        features_up.append(feature)
print('5 kbest features:')
print(features_up)

X_kbest = df_upsampled[features_up]
X_kbest_train, X_kbest_test = X_kbest[:offset], X_kbest[offset:]

5 kbest features:
['V4', 'V10', 'V11', 'V12', 'V14']


### Models to try:
- Random Forest
- Support Vector
- Gradient Boost

### Random Forest

In [8]:
from sklearn import ensemble
from sklearn.model_selection import cross_val_score

rfc = ensemble.RandomForestClassifier()
rfc.fit(X_up_train, y_up_train)
scores_train = cross_val_score(rfc, X_up_train, y_up_train, cv=5)
print(scores_train)

[0.99910116 1.         0.99985345 0.99996092 0.9039696 ]


In [9]:
scores_test = cross_val_score(rfc, X_up_test, y_up_test, cv=5)
print(scores_test)

[1. 1. 1. 1. 1.]


In [12]:
from sklearn.metrics import roc_auc_score
y_pred = rfc.predict(X_up_test)
print(roc_auc_score(y_up_test, y_pred))

ValueError: Only one class present in y_true. ROC AUC score is not defined in that case.

In [None]:
#RF on upsampled data, all features
from sklearn import ensemble
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score

start_time = time.clock()
rfc = ensemble.RandomForestClassifier().fit(X_up_train, y_up_train)
scores_rfc = cross_val_score(rfc, X_up_test, y_up_test, cv=5)

print('score array:\n', scores_rfc)
print('\nscore array mean:\n', np.mean(scores_rfc))
print('\nscore std dev:\n', np.std(scores_rfc))
print('\nruntime:\n',time.clock() - start_time, "seconds")

In [None]:
#AUROC score
#from sklearn.metrics import roc_auc_score
#prob_y = rfc.predict_proba(X_up)
#prob_y = [p[1] for p in prob_y]
#print('AUROC score:\n', roc_auc_score(y_up, prob_y))

In [None]:
#rfc on upsampled kbest features
start_time = time.clock()
rfc_kbest = ensemble.RandomForestClassifier().fit(X_kbest, y_up)
scores_rfc_kbest = cross_val_score(rfc_kbest, X_kbest, y_up, cv=5)

print('score array:\n', scores_rfc_kbest)
print('\nscore array mean:\n', np.mean(scores_rfc_kbest))
print('\nscore std dev:\n', np.std(scores_rfc_kbest))
print('\nruntime:\n',time.clock() - start_time, "seconds")

In [None]:
#RF on downsampled data
X_down = df_downsampled.drop('Class', 1)
Y_down = df_downsampled['Class']

start_time = time.clock()
rfc_down = ensemble.RandomForestClassifier().fit(X_down,Y_down)
scores_rfc_down = cross_val_score(rfc_down, X_down, Y_down, cv=5)

print('score array:\n', scores_rfc_down)
print('\nscore array mean:\n', np.mean(scores_rfc_down))
print('\nscore std dev:\n', np.std(scores_rfc_down))
print('\nruntime:\n',time.clock() - start_time, "seconds")

### Resample dataset

- SVC and Gradient Boost are taking way too long to run on upsampled data (>500,000 observations)
- Use df.sample() to take 10% of the data and upsample again

In [None]:
#random state is used for n_samples consistency with upsampling code
df10 = df.sample(frac=0.1, replace=True, random_state=1)
print(df10['Class'].value_counts())


#separate majority and minority classes
df10_majority = df10[df10.Class==0]
df10_minority = df10[df10.Class==1]

#upsample minority class
df10_minority_upsampled = resample(df10_minority, replace=True, n_samples=28433)

#combine with majority class
df10_upsampled = pd.concat([df10_majority, df10_minority_upsampled])
df10_upsampled.Class.value_counts()

In [None]:
#set target, data, train, & test sets
X10 = df10_upsampled.drop('Class', 1)
Y10 = df10_upsampled['Class']

offset = int(X10.shape[0] * .9)
X10_train, Y10_train = X10[:offset], Y10[:offset]
X10_test, Y10_test = X10[offset:], Y10[offset:]

### SVC

In [None]:
#SVC on upsampled data
#from sklearn.svm import SVC

#start_time = time.clock()
#svc = SVC().fit(X10,Y10)
#scores_svc = cross_val_score(svc, X10, Y10, cv=5)

#print('score array:\n', scores_svc)
#print('\nscore array mean:\n', np.mean(scores_svc))
#print('\nscore array std dev:\n', np.std(scores_svc))
#print('\nruntime:\n',time.clock() - start_time, "seconds")

In [None]:
#SVC on downsampled data
#start_time = time.clock()
#svc_down = SVC(C=.01, kernel='linear').fit(X_down, Y_down)
#scores_svc_down = cross_val_score(svc_down, X_down, Y_down, cv=5)

#print('score array:\n', scores_svc_down)
#print('\nscore array mean:\n', np.mean(scores_svc_down))
#print('\nscore std dev:\n', np.std(scores_svc_down))
#print('\nruntime:\n',time.clock() - start_time, "seconds")

### Gradient Boost

In [None]:
from sklearn.model_selection import GridSearchCV

params = [{'loss':['deviance'],
           'learning_rate':[0.01, 0.1, 1],
           'n_estimators':[250, 500, 750],
           'max_depth':[2, 3]}]

clf = ensemble.GradientBoostingClassifier()
grid = GridSearchCV(estimator=clf, param_grid=params)

start_time = time.clock()
grid.fit(X10, Y10)
print('\nBest parameters:\n', grid.best_params_)
print('\nBest score:\n', grid.best_score_)
print('\nruntime:\n',time.clock() - start_time, "seconds")

In [None]:
#set parameters, try simple first
params = {'n_estimators':500, 'max_depth': 2, 'loss':'deviance'}

#gradient boost on upsampled data
start_time = time.clock()
clf = ensemble.GradientBoostingClassifier(**params).fit(X10, Y10)
scores_clf = cross_val_score(clf, X10, Y10, cv=5)

print('score array:\n', scores_clf)
print('\nscore array mean:\n', np.mean(scores_clf))
print('\nscore std dev:\n', np.std(scores_clf))
print('\nruntime:\n',time.clock() - start_time, "seconds")

In [None]:
#gradient boost on downsampled data

start_time = time.clock()
clf_down = ensemble.GradientBoostingClassifier(**params).fit(X_down, Y_down)
scores_clf_down = cross_val_score(clf_down, X_down, Y_down, cv=5)

print('score array:\n', scores_clf_down)
print('\nscore array mean:\n', np.mean(scores_clf_down))
print('\nscore std dev:\n', np.std(scores_clf_down))
print('\nruntime:\n',time.clock() - start_time, "seconds")