In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.metrics import confusion_matrix
%matplotlib inline



Create a multi-layer perceptron neural network model to predict on a labeled dataset of your choosing. Compare this model to either a boosted tree or a random forest model and describe the relative tradeoffs between complexity and accuracy. Be sure to vary the hyperparameters of your MLP!

In [2]:
# Data comes from https://www.kaggle.com/dalpozz/creditcardfraud/data

df = pd.read_csv("./data/creditcard.csv")
df.shape

(284807, 31)

In [7]:
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [16]:
df.Class.value_counts() #unbalanced 

0    284315
1       492
Name: Class, dtype: int64

In [22]:
X = df[["V1", "Amount"]]
y = df.Class

In [23]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(hidden_layer_sizes=(100,))
mlp.fit(X, y)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [24]:
mlp.score(X, y)

0.99824793632178987

In [25]:
from sklearn.model_selection import cross_val_score
cross_val_score(mlp, X, y, cv=5)

array([ 0.998262  ,  0.997946  ,  0.99822686,  0.99827952,  0.99827952])

In [26]:
y_pred= mlp.predict(X)

In [27]:
confusion_matrix(y, y_pred) #type 2 error is high 

array([[284298,     17],
       [   482,     10]])

In [30]:
from imblearn.over_sampling import SMOTE  #we can use SMOTE to balance this by oversampling fraud 

In [31]:
sm = SMOTE(random_state=42)

In [32]:
X_res, y_res = sm.fit_sample(X, y)

In [36]:
print('Resampled dataset shape {}'.format(Counter(y_res)))

Resampled dataset shape Counter({0: 284315, 1: 284315})


In [37]:
mlp = MLPClassifier(hidden_layer_sizes=(100,))
mlp.fit(X_res, y_res)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [40]:
mlp.score(X_res, y_res)

0.84069781756150752

In [39]:
cross_val_score(mlp, X_res, y_res, cv=5)



array([ 0.82680302,  0.82952007,  0.83208765,  0.8479943 ,  0.84998153])

In [41]:
y_pred=mlp.predict(X_res)

In [42]:
confusion_matrix(y_res, y_pred)

array([[230923,  53392],
       [ 37192, 247123]])

In [12]:
# Question: neural networks need a lot of data, but here we only have 492 examples of fraud.
## How should we handle this? Should we re-balance our sample? If so, we'd only end with ~1,000 datapoints.

In [13]:
# Let's compare this to random forrest 
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(max_depth=3)
rf.fit(X,y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=3, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [14]:
rf.score(X, y)

0.99827251436937992

In [15]:
cross_val_score(rf, X, y, cv=5)

array([ 0.998262  ,  0.99822689,  0.99827952,  0.99827952,  0.99827952])

In [43]:
rf = RandomForestClassifier(max_depth=3)
rf.fit(X_res,y_res)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=3, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [44]:
rf.score(X_res, y_res)

0.80999595519054568

In [46]:
y_pre=rf.predict(X_res)

In [47]:
confusion_matrix(y_pred, y_res) 
#type 2 error is a lot worse than mlp, which is what a bank cares about in terms of fraud detection

array([[230923,  37192],
       [ 53392, 247123]])

In [None]:
# now do we want to use other fields as well? 

In [19]:
X = df.drop(['Class'], axis=1)
y = df.Class