# Fraud Detection
----

In [244]:
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from imblearn.ensemble import EasyEnsemble
from sklearn.metrics import confusion_matrix
%matplotlib inline

In [223]:
df = pd.read_csv('./data/creditcard.csv')

In [224]:
print(df.shape)
df.head()

(284807, 31)


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [225]:
# Set X and Y
X = df.loc[:, ~df.columns.isin(['Class'])]
Y = df['Class']

In [226]:
Y.value_counts()

0    284315
1       492
Name: Class, dtype: int64

## Normalize the data

In [227]:
cols_to_norm = ['V1','V2','V3','V4','V5',
                'V6','V7','V8','V9','V10',
                'V11','V12','V13','V14',
                'V15','V16','V17','V18',
                'V19','V20','V21','V22',
                'V23','V24','V25','V26','V27','V28',]
X[cols_to_norm] = X[cols_to_norm].apply(lambda x: (x - x.min()) / (x.max() - x.min()))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]


## Splits

In [228]:
# Train splits
X_train, X_temp, y_train, y_temp = train_test_split(X, Y, test_size=0.40, random_state=42, stratify=Y)

In [229]:
X_train.shape

(170884, 30)

In [230]:
# Dev and Test splits
X_dev, X_test, y_dev, y_test = train_test_split(X_temp, y_temp, test_size=0.50, random_state=42, stratify=y_temp)

In [231]:
print(X_dev.shape)
print(X_test.shape)

(56961, 30)
(56962, 30)


## Random Forests

In [233]:
rfc = ensemble.RandomForestClassifier(n_estimators=300, class_weight='balanced', max_features='log2', n_jobs=-1)

In [234]:
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='log2',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=300, n_jobs=-1, oob_score=False,
            random_state=None, verbose=0, warm_start=False)

In [236]:
# Train accuracy
print(f'Train Score: {rfc.score(X_train, y_train)}')
# Dev accuracy
print(f'Dev Score: {rfc.score(X_dev, y_dev)}')

Train Score: 1.0
Dev Score: 0.9993855444953564


In [237]:
y_pred_train = rfc.predict_proba(X_train)[:,1]

In [238]:
auc_train = roc_auc_score(y_train, y_pred_train)
print('AUC train', auc_train)

AUC train 1.0


In [239]:
y_pred_dev = rfc.predict_proba(X_dev)[:,1]

In [240]:
auc_dev = roc_auc_score(y_dev, y_pred_dev)
print('AUC dev', auc_dev)

AUC dev 0.9456117585876833


In [241]:
dev_pred = rfc.predict(X_dev)
dev_table = pd.crosstab(y_dev, dev_pred, margins=True)
print(pd.crosstab(y_dev, dev_pred))

col_0      0   1
Class           
0      56858   5
1         30  68


## Imbalanced Package - EasyEnsemble

In [255]:
ee = EasyEnsemble(random_state=0, n_splits = 10)
X_resampled, y_resampled = ee.fit_sample(X_train, y_train)
print(X_resampled.shape)

(10, 590, 30)


In [247]:
rfc_2 = ensemble.RandomForestClassifier(n_estimators=300, class_weight='balanced', max_features='log2', n_jobs=-1)

In [260]:
for x, y in zip(X_resampled, y_resampled):
    rfc_2.fit(x, y)

In [261]:
# Train accuracy
print(f'Train Score: {rfc_2.score(X_train, y_train)}')
# Dev accuracy
print(f'Dev Score: {rfc_2.score(X_dev, y_dev)}')

Train Score: 0.978002621661478
Dev Score: 0.9771071434841383


In [263]:
y_pred_dev = rfc_2.predict_proba(X_dev)[:,1]
auc_dev = roc_auc_score(y_dev, y_pred_dev)
print('AUC dev', auc_dev)

AUC dev 0.9830568423138033


In [264]:
dev_pred = rfc_2.predict(X_dev)
dev_table = pd.crosstab(y_dev, dev_pred, margins=True)
print(pd.crosstab(y_dev, dev_pred))

col_0      0     1
Class             
0      55571  1292
1         12    86


## Imbalance Package - BalancedBaggingClassifier

In [269]:
from imblearn.ensemble import BalancedBaggingClassifier
bbc = BalancedBaggingClassifier(base_estimator=ensemble.RandomForestClassifier(),
                                 ratio='auto',
                                 replacement=False,
                                 random_state=0)

bbc.fit(X_train, y_train) 
y_pred = bbc.predict(X_dev)
confusion_matrix(y_dev, y_pred)

array([[55662,  1201],
       [   15,    83]])

In [270]:
# Train accuracy
print(f'Train Score: {bbc.score(X_train, y_train)}')
# Dev accuracy
print(f'Dev Score: {bbc.score(X_dev, y_dev)}')

Train Score: 0.9793719716298775
Dev Score: 0.9786520601815277


In [272]:
# AUC
y_pred_train = bbc.predict_proba(X_train)[:,1]
auc_train = roc_auc_score(y_train, y_pred_train)
print('AUC train', auc_train)

AUC train 0.9978692667111189


In [274]:
# AUC
y_pred_dev = bbc.predict_proba(X_dev)[:,1]
auc_dev = roc_auc_score(y_dev, y_pred_dev)
print('AUC dev', auc_dev)

AUC dev 0.9790900937340625


In [275]:
dev_pred = bbc.predict(X_dev)
dev_table = pd.crosstab(y_dev, dev_pred, margins=True)
print(pd.crosstab(y_dev, dev_pred))

col_0      0     1
Class             
0      55662  1201
1         15    83


## Test set

In [278]:
y_pred_test = bbc.predict(X_test)
# Test accuracy
print(f'Test Score: {bbc.score(X_test, y_test)}')

y_pred_test = bbc.predict_proba(X_test)[:,1]
auc_test = roc_auc_score(y_test, y_pred_test)
print('AUC test', auc_test)
print('\n')

test_pred = bbc.predict(X_test)
test_table = pd.crosstab(y_test, test_pred, margins=True)
print(pd.crosstab(y_test, test_pred))

Test Score: 0.9785646571398476
AUC test 0.9752966415646893


col_0      0     1
Class             
0      55653  1210
1         11    88
