In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


# machine learning
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report,accuracy_score
from sklearn.ensemble import IsolationForest

In [2]:
df=pd.read_csv('creditcard.csv', sep=',')
df.describe()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,...,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0
mean,94813.859575,3.91956e-15,5.688174e-16,-8.769071e-15,2.782312e-15,-1.552563e-15,2.010663e-15,-1.694249e-15,-1.927028e-16,-3.137024e-15,...,1.537294e-16,7.959909e-16,5.36759e-16,4.458112e-15,1.453003e-15,1.699104e-15,-3.660161e-16,-1.206049e-16,88.349619,0.001727
std,47488.145955,1.958696,1.651309,1.516255,1.415869,1.380247,1.332271,1.237094,1.194353,1.098632,...,0.734524,0.7257016,0.6244603,0.6056471,0.5212781,0.482227,0.4036325,0.3300833,250.120109,0.041527
min,0.0,-56.40751,-72.71573,-48.32559,-5.683171,-113.7433,-26.16051,-43.55724,-73.21672,-13.43407,...,-34.83038,-10.93314,-44.80774,-2.836627,-10.2954,-2.604551,-22.56568,-15.43008,0.0,0.0
25%,54201.5,-0.9203734,-0.5985499,-0.8903648,-0.8486401,-0.6915971,-0.7682956,-0.5540759,-0.2086297,-0.6430976,...,-0.2283949,-0.5423504,-0.1618463,-0.3545861,-0.3171451,-0.3269839,-0.07083953,-0.05295979,5.6,0.0
50%,84692.0,0.0181088,0.06548556,0.1798463,-0.01984653,-0.05433583,-0.2741871,0.04010308,0.02235804,-0.05142873,...,-0.02945017,0.006781943,-0.01119293,0.04097606,0.0165935,-0.05213911,0.001342146,0.01124383,22.0,0.0
75%,139320.5,1.315642,0.8037239,1.027196,0.7433413,0.6119264,0.3985649,0.5704361,0.3273459,0.597139,...,0.1863772,0.5285536,0.1476421,0.4395266,0.3507156,0.2409522,0.09104512,0.07827995,77.165,0.0
max,172792.0,2.45493,22.05773,9.382558,16.87534,34.80167,73.30163,120.5895,20.00721,15.59499,...,27.20284,10.50309,22.52841,4.584549,7.519589,3.517346,31.6122,33.84781,25691.16,1.0


In [3]:
inliers = df[df.Class==0]
inliers = inliers.drop(['Class'], axis=1)
outliers = df[df.Class==1]
outliers = outliers.drop(['Class'], axis=1)
inliers_train, inliers_test = train_test_split(inliers, test_size=0.30, random_state=42)


In [4]:
model = IsolationForest(n_estimators=110,max_features=5,bootstrap=False,contamination=0.01,max_samples=500,verbose=1,n_jobs=4,random_state=42)
model.fit(df.drop(['Class'], axis=1))
inlier_pred_train=model.predict(inliers_train)
inlier_pred_test=model.predict(inliers_test)
outlier_pred_test=model.predict(outliers)

[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    0.9s remaining:    0.9s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    1.0s finished


In [5]:
print("Accuracy in Detecting Legit Cases:", list(inlier_pred_test).count(1)/inlier_pred_test.shape[0])
print("Accuracy in Detecting Fraud Cases:", list(outlier_pred_test).count(-1)/outlier_pred_test.shape[0])

Accuracy in Detecting Legit Cases: 0.9912656075971628
Accuracy in Detecting Fraud Cases: 0.6422764227642277


In [6]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(['Class'],axis=1), df.Class, test_size=0.3, random_state=42)

In [7]:
y_pred_train=model.predict(X_train)
y_pred_test=model.predict(X_test)
y_pred_train[y_pred_train==1]=0
y_pred_train[y_pred_train==-1]=1
y_pred_test[y_pred_test==1]=0
y_pred_test[y_pred_test==-1]=1


In [8]:
roc_auc_score(y_train,y_pred_train)

0.81158679070662398

In [9]:
roc_auc_score(y_test,y_pred_test)

0.83003398969397035

In [10]:
from sklearn.metrics import f1_score
f1_score(y_train,y_pred_train)

0.19214346712211786

In [11]:
f1_score(y_test,y_pred_test)

0.18218218218218218

In [12]:
print(classification_report(y_train,y_pred_train))


             precision    recall  f1-score   support

          0       1.00      0.99      1.00    199008
          1       0.11      0.63      0.19       356

avg / total       1.00      0.99      0.99    199364



In [13]:
print(classification_report(y_test,y_pred_test))


             precision    recall  f1-score   support

          0       1.00      0.99      1.00     85307
          1       0.11      0.67      0.18       136

avg / total       1.00      0.99      0.99     85443



In [16]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_train,y_pred_train))


[[197247   1761]
 [   131    225]]


In [17]:
confusion_matrix(y_test,y_pred_test)

array([[84535,   772],
       [   45,    91]])

In [18]:
outlier_pred_test[outlier_pred_test==-1]=1
outlier_pred_test[outlier_pred_test==1]=0

In [19]:
clf=IsolationForest(n_estimators=150,max_features=9,max_samples=5500,bootstrap=True,contamination=0.001,n_jobs=-1)

In [20]:
clf.fit(X_train)

IsolationForest(bootstrap=True, contamination=0.001, max_features=9,
        max_samples=5500, n_estimators=150, n_jobs=-1, random_state=None,
        verbose=0)

In [21]:
y_pred_train=clf.predict(X_train)
y_pred_test=clf.predict(X_test)

In [22]:
y_pred_train[y_pred_train==1]=0
y_pred_train[y_pred_train==-1]=1
y_pred_test[y_pred_test==1]=0
y_pred_test[y_pred_test==-1]=1

In [23]:
print(classification_report(y_train,y_pred_train))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00    199008
          1       0.40      0.22      0.28       356

avg / total       1.00      1.00      1.00    199364



In [24]:
print(classification_report(y_test,y_pred_test))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00     85307
          1       0.49      0.30      0.37       136

avg / total       1.00      1.00      1.00     85443



In [25]:
roc_auc_score(y_train,y_pred_train)

0.61065104830069505

In [26]:
roc_auc_score(y_test,y_pred_test)

0.65048326321748651

In [27]:
len(y_pred_test[y_test!=y_pred_test])


138

In [28]:
len(y_pred_train[y_train!=y_pred_train])

398

In [29]:
from sklearn.metrics import confusion_matrix


In [30]:
confusion_matrix(y_train,y_pred_train)

array([[198887,    121],
       [   277,     79]])

In [31]:
confusion_matrix(y_test,y_pred_test)

array([[85264,    43],
       [   95,    41]])

In [32]:
df.head()


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [36]:
from sklearn.preprocessing import StandardScaler
scale=StandardScaler()
scaled_df=scale.fit_transform(df)
y=df.Class

In [37]:
scaled_df=pd.DataFrame(scaled_df)

In [38]:
X_train, X_test, y_train, y_test = train_test_split(scaled_df, y, test_size=0.3, random_state=42)

In [39]:
clf_scaled=IsolationForest(n_estimators=110,bootstrap=False,max_features=1.0,contamination=0.001,max_samples=5500,verbose=3,n_jobs=-1)

In [40]:
clf_scaled.fit(X_train)

Building estimator 1 of 28 for this parallel run (total 110)...
Building estimator 1 of 28 for this parallel run (total 110)...
Building estimator 1 of 27 for this parallel run (total 110)...
Building estimator 2 of 28 for this parallel run (total 110)...
Building estimator 2 of 28 for this parallel run (total 110)...
Building estimator 2 of 27 for this parallel run (total 110)...
Building estimator 1 of 27 for this parallel run (total 110)...
Building estimator 3 of 28 for this parallel run (total 110)...
Building estimator 3 of 28 for this parallel run (total 110)...
Building estimator 3 of 27 for this parallel run (total 110)...
Building estimator 4 of 28 for this parallel run (total 110)...
Building estimator 4 of 28 for this parallel run (total 110)...
Building estimator 2 of 27 for this parallel run (total 110)...
Building estimator 4 of 27 for this parallel run (total 110)...
Building estimator 5 of 28 for this parallel run (total 110)...
Building estimator 3 of 27 for this para

[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    3.1s finished


IsolationForest(bootstrap=False, contamination=0.001, max_features=1.0,
        max_samples=5500, n_estimators=110, n_jobs=-1, random_state=None,
        verbose=3)

In [41]:
y_pred_train=clf_scaled.predict(X_train)
y_pred_test=clf_scaled.predict(X_test)

In [42]:
y_pred_train[y_pred_train==1]=0
y_pred_train[y_pred_train==-1]=1
y_pred_test[y_pred_test==1]=0
y_pred_test[y_pred_test==-1]=1

In [43]:
print(classification_report(y_test,y_pred_test))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00     85307
          1       0.58      0.34      0.43       136

avg / total       1.00      1.00      1.00     85443



In [44]:
print(classification_report(y_train,y_pred_train))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00    199008
          1       0.48      0.27      0.35       356

avg / total       1.00      1.00      1.00    199364



In [45]:
len(y_pred_test[y_test!=y_pred_test])


123

In [46]:
len(y_pred_train[y_train!=y_pred_train])


364

In [47]:
confusion_matrix(y_train,y_pred_train)

array([[198904,    104],
       [   260,     96]])

In [48]:
confusion_matrix(y_test,y_pred_test)

array([[85274,    33],
       [   90,    46]])

In [49]:
roc_auc_score(y_train,y_pred_train)

0.63457016464585692

In [50]:
roc_auc_score(y_test,y_pred_test)

0.66892422799590967