In [1]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
import time
%matplotlib inline

In [2]:
# Load the credit-card transactions from the CSV in the working directory.
df = pd.read_csv('creditcard.csv')

# Print how many observations fall into each value of the target column.
print(df.Class.value_counts())

# Preview the first five rows as the cell's rendered output.
df.head(5)

0    284315
1       492
Name: Class, dtype: int64


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


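The target is highly imbalanced: only 492 of the 284,807 observations (about 0.17%) belong to class 1. This imbalance needs to be addressed before modelling.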

In [11]:
#try sklearn upsample (randomly duplicating observations from the minority class)
from sklearn.utils import resample

#separate majority (Class == 0) and minority (Class == 1) classes
df_majority = df[df.Class==0]
df_minority = df[df.Class==1]

#upsample minority class with replacement
#the target size is taken from the majority class instead of being hard-coded,
#and random_state makes the draw reproducible after a kernel restart
df_minority_upsampled = resample(df_minority,
                                 replace=True,
                                 n_samples=len(df_majority),
                                 random_state=1)

#combine with majority class
#NOTE(review): the duplicated minority rows can end up in both the training and
#validation folds when this frame is cross-validated, which inflates the scores
df_upsampled = pd.concat([df_majority, df_minority_upsampled])
df_upsampled.Class.value_counts()

1    284315
0    284315
Name: Class, dtype: int64

In [6]:
#also downsample (remove random observations from the majority class) to compare results
#the target size follows the minority class instead of a hard-coded count,
#and random_state makes the draw reproducible after a kernel restart
df_majority_downsampled = resample(df_majority,
                                   replace=False,
                                   n_samples=len(df_minority),
                                   random_state=1)

#combine the reduced majority class with the untouched minority class
df_downsampled = pd.concat([df_majority_downsampled, df_minority])
df_downsampled.Class.value_counts()

1    492
0    492
Name: Class, dtype: int64

### Models to try:
- Random Forest
- Support Vector
- Gradient Boost

### Random Forest

In [4]:
#RF on upsampled data
from sklearn import ensemble
from sklearn.model_selection import cross_val_score

#features are every column except the target; the keyword form replaces the
#positional axis argument, which newer pandas releases no longer accept
X = df_upsampled.drop(columns='Class')
Y = df_upsampled['Class']

#time.clock() was removed in Python 3.8; perf_counter() measures elapsed time
start_time = time.perf_counter()
#fixed seed so the forest, and therefore the scores, can be reproduced
rfc = ensemble.RandomForestClassifier(random_state=1).fit(X,Y)
#cross_val_score clones and refits the estimator on each of the 5 folds,
#so the fit above does not influence these scores
scores_rfc = cross_val_score(rfc, X, Y, cv=5)

print('score array:\n', scores_rfc)
print('\nscore array mean:\n', np.mean(scores_rfc))
print('\nscore std dev:\n', np.std(scores_rfc))
print('\nruntime:\n',time.perf_counter() - start_time, "seconds")

score array:
 [0.92585688 0.99999121 0.99990328 0.99999121 0.70970578]

score array mean:
 0.927089671666989

score std dev:
 0.11241740350754435

runtime:
 105.95232 seconds


__Result:__ OK accuracy, but when I run cross-validation I often get a single fold score that is much lower than the rest. Is this a reason for concern?

In [7]:
#RF on downsampled data
#features are every column except the target; the keyword form replaces the
#positional axis argument, which newer pandas releases no longer accept
X_down = df_downsampled.drop(columns='Class')
Y_down = df_downsampled['Class']

#time.clock() was removed in Python 3.8; perf_counter() measures elapsed time
start_time = time.perf_counter()
#fixed seed so the forest, and therefore the scores, can be reproduced
rfc_down = ensemble.RandomForestClassifier(random_state=1).fit(X_down,Y_down)
#cross_val_score clones and refits the estimator on each of the 5 folds,
#so the fit above does not influence these scores
scores_rfc_down = cross_val_score(rfc_down, X_down, Y_down, cv=5)

print('score array:\n', scores_rfc_down)
print('\nscore array mean:\n', np.mean(scores_rfc_down))
print('\nscore std dev:\n', np.std(scores_rfc_down))
print('\nruntime:\n',time.perf_counter() - start_time, "seconds")

score array:
 [0.95959596 0.91414141 0.90816327 0.95408163 0.8877551 ]

score array mean:
 0.9247474747474748

score std dev:
 0.027679847828726786

runtime:
 0.19017000000000905 seconds


__Result:__ similar mean accuracy to the upsampled model, and again one cross-validation score is noticeably lower than the rest, but the model runs much faster.

### Resample dataset

- SVC and Gradient Boost are taking way too long to run on upsampled data (>500,000 observations)
- Use df.sample() to take 10% of the data and upsample again

In [8]:
#take a 10% sample of the full data; random_state keeps the sample reproducible
#NOTE(review): replace=True means the sample itself may contain duplicated rows - confirm this is intended
df10 = df.sample(frac=0.1, replace=True, random_state=1)
print(df10['Class'].value_counts())


#separate majority and minority classes
df10_majority = df10[df10.Class==0]
df10_minority = df10[df10.Class==1]

#upsample minority class with replacement
#the target size is taken from the sampled majority class instead of being
#hard-coded, so it stays correct if the sample fraction or seed changes;
#random_state makes the draw reproducible
df10_minority_upsampled = resample(df10_minority,
                                   replace=True,
                                   n_samples=len(df10_majority),
                                   random_state=1)

#combine with majority class
df10_upsampled = pd.concat([df10_majority, df10_minority_upsampled])
df10_upsampled.Class.value_counts()

0    28433
1       48
Name: Class, dtype: int64


1    28433
0    28433
Name: Class, dtype: int64

In [9]:
#set target, data, train, & test sets
from sklearn.model_selection import train_test_split

#features are every column except the target; the keyword form replaces the
#positional axis argument, which newer pandas releases no longer accept
X10 = df10_upsampled.drop(columns='Class')
Y10 = df10_upsampled['Class']

#number of rows that go into the training set (90% of the data)
offset = int(X10.shape[0] * .9)

#df10_upsampled stacks all majority rows first and all upsampled minority rows
#after them, so slicing off the last 10% would produce a test set made up of
#the minority class only; a shuffled, stratified split keeps both classes in
#the same proportion in the train and test sets while preserving their sizes
X10_train, X10_test, Y10_train, Y10_test = train_test_split(
    X10, Y10, train_size=offset, stratify=Y10, random_state=1
)

### SVC

In [12]:
#SVC on upsampled data
from sklearn.svm import SVC

#time.clock() was removed in Python 3.8; perf_counter() measures elapsed time
start_time = time.perf_counter()
#SVC with default settings, fitted on the upsampled 10% sample
svc = SVC().fit(X10,Y10)
#cross_val_score clones and refits the estimator on each of the 5 folds,
#so the fit above does not influence these scores
scores_svc = cross_val_score(svc, X10, Y10, cv=5)

print('score array:\n', scores_svc)
print('\nscore array mean:\n', np.mean(scores_svc))
print('\nscore array std dev:\n', np.std(scores_svc))
print('\nruntime:\n',time.perf_counter() - start_time, "seconds")

score array:
 [1. 1. 1. 1. 1.]

score array mean:
 1.0

score array std dev:
 0.0

runtime:
 527.311423 seconds


In [16]:
#SVC on downsampled data
#time.clock() was removed in Python 3.8; perf_counter() measures elapsed time
start_time = time.perf_counter()
#linear kernel with a small C, fitted on the downsampled data
svc_down = SVC(C=.01, kernel='linear').fit(X_down, Y_down)
#cross_val_score clones and refits the estimator on each of the 5 folds,
#so the fit above does not influence these scores
scores_svc_down = cross_val_score(svc_down, X_down, Y_down, cv=5)

print('score array:\n', scores_svc_down)
print('\nscore array mean:\n', np.mean(scores_svc_down))
print('\nscore std dev:\n', np.std(scores_svc_down))
print('\nruntime:\n',time.perf_counter() - start_time, "seconds")

score array:
 [0.96969697 0.92424242 0.85714286 0.92857143 0.75510204]

score array mean:
 0.8869511440940012

score std dev:
 0.07515647585656969

runtime:
 191.11606700000004 seconds


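__Result:__ the SVC with the RBF kernel scores a perfect 1.0 on every fold of the upsampled data, which looks suspicious. Is this right? A likely cause is that the upsampled minority rows are exact duplicates, so copies of the same observation can appear in both the training and validation folds.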

### Gradient Boost

In [17]:
#set parameters, try simple first
#the loss is left at its default, which is the same objective the explicit
#'deviance' value selected; newer scikit-learn releases reject that spelling
params = {'n_estimators':500, 'max_depth': 2}

#gradient boost on upsampled data
#time.clock() was removed in Python 3.8; perf_counter() measures elapsed time
start_time = time.perf_counter()
clf = ensemble.GradientBoostingClassifier(**params).fit(X10, Y10)
#cross_val_score clones and refits the estimator on each of the 5 folds,
#so the fit above does not influence these scores
scores_clf = cross_val_score(clf, X10, Y10, cv=5)

print('score array:\n', scores_clf)
print('\nscore array mean:\n', np.mean(scores_clf))
print('\nscore std dev:\n', np.std(scores_clf))
print('\nruntime:\n',time.perf_counter() - start_time, "seconds")

score array:
 [0.99973624 0.99991208 0.99991208 0.99973619 0.99982413]

score array mean:
 0.999824144903239

score std dev:
 7.864824937925089e-05

runtime:
 170.55128100000002 seconds


In [19]:
#gradient boost on downsampled data (default hyperparameters)

#time.clock() was removed in Python 3.8; perf_counter() measures elapsed time
start_time = time.perf_counter()
clf_down = ensemble.GradientBoostingClassifier().fit(X_down, Y_down)
#cross_val_score clones and refits the estimator on each of the 5 folds,
#so the fit above does not influence these scores
scores_clf_down = cross_val_score(clf_down, X_down, Y_down, cv=5)

print('score array:\n', scores_clf_down)
print('\nscore array mean:\n', np.mean(scores_clf_down))
print('\nscore std dev:\n', np.std(scores_clf_down))
print('\nruntime:\n',time.perf_counter() - start_time, "seconds")

score array:
 [0.95959596 0.93939394 0.91836735 0.93877551 0.90816327]

score array mean:
 0.9328592042877759

score std dev:
 0.017958078152308248

runtime:
 1.288808999999901 seconds


__Result:__ on the upsampled (larger) data, Gradient Boost has very high accuracy and very low variance across folds. It is less accurate on the downsampled (smaller) data, but still more accurate than the Random Forest models.