In [10]:
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
#import lightgbm as lgb

from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
from imblearn.ensemble import BalancedBaggingClassifier

from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, recall_score, roc_auc_score

warnings.filterwarnings('ignore')

In [2]:
# Load the transactions table, report its dimensions, and preview the first rows.
df = pd.read_csv(filepath_or_buffer='SampleData.csv')
print(df.shape)
df.head(n=5)

(284807, 31)


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [3]:
# Number of rows per target class (shows how imbalanced the data is).
df["Time"].groupby(df["Class"]).count()

Class
0    284315
1       492
Name: Time, dtype: int64

## Basic Model 

In [4]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
# NOTE(review): the split is not stratified, so the share of class 1 in each
# part is left to chance.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    df.drop('Class', axis=1),
    df['Class'],
    test_size=0.20,
    random_state=27,
)
print("Train Shape", Xtrain.shape)
print("Test Shape", Xtest.shape)


Train Shape (227845, 30)
Test Shape (56962, 30)


In [5]:
# Majority-class baseline: label every test row as class 0 (the most frequent).
pred_0 = [0 for _ in range(len(ytest))]

# Accuracy of the all-zero baseline
baseline_accuracy = accuracy_score(ytest, pred_0)
print('Test score: ', baseline_accuracy)

Test score:  0.9981742214107651


# Logistic Regression

In [6]:
# Fit a default logistic regression on the untouched training split.
lrclf = LogisticRegression()
lrclf.fit(Xtrain, ytrain)
pred_test_lr = lrclf.predict(Xtest)
# Accuracy on the held-out split (shown as the cell output)
accuracy_score(y_true=ytest, y_pred=pred_test_lr)

0.9991046662687406

In [7]:
# How many test rows were predicted as each class.
pd.Series(pred_test_lr, name=0).value_counts()

0    56885
1       77
Name: 0, dtype: int64

### Now we will use different evaluation metrics
- Confusion Matrix (Precision, Recall)
- f1 Score

In [11]:
def eval_metrics(actual, predicted, scores=None):
    """Print F1, the confusion matrix, recall and ROC AUC for a binary classifier.

    Parameters
    ----------
    actual : array-like
        True class labels.
    predicted : array-like
        Predicted class labels.
    scores : array-like, optional
        Continuous scores for the positive class (for example
        ``clf.predict_proba(X)[:, 1]``). When given, the AUC is computed from
        these scores. When omitted, the AUC is computed from ``predicted``,
        exactly as before; with hard 0/1 labels that value describes a single
        operating point rather than the full ROC curve.

    Returns
    -------
    None
        The metrics are printed, not returned.
    """
    print("f1 Score :", f1_score(actual, predicted))
    print("Confusion Matrix")
    # confusion_matrix puts true labels on rows and predicted labels on columns
    print(pd.DataFrame(confusion_matrix(actual, predicted)))
    print("Recall Score :", recall_score(actual, predicted))
    # Prefer real scores for the ranking metric; fall back to labels for old callers.
    auc_input = predicted if scores is None else scores
    print("AUC :",  roc_auc_score(actual, auc_input))

In [12]:
# Metrics for the plain logistic regression.
eval_metrics(actual=ytest, predicted=pred_test_lr)

f1 Score : 0.7182320441988951
Confusion Matrix
       0   1
0  56846  12
1     39  65
Recall Score : 0.625
AUC : 0.812394473952654


## Findings
- Low F1 Score
- Many misclassifications (low recall score)

# Resampling Techniques 

## Undersampling Majority Class 

In [13]:
# Re-attach the target to the training features, then separate the two classes.
X = Xtrain.assign(Class=ytrain)
zero = X.loc[X["Class"] == 0]
one = X.loc[X["Class"] == 1]

In [14]:
# Keep 1.25x as many class-0 rows as there are class-1 rows, drawn without replacement.
n_zero_keep = int(round(len(one) * 1.25, 0))
zero_under = resample(zero, replace=False, n_samples=n_zero_keep, random_state=111)
# Stack the reduced majority class on top of the full minority class.
undersampled = pd.concat([zero_under, one], ignore_index=True)
undersampled.shape

(873, 31)

In [15]:
# Train on the undersampled frame, evaluate on the original test split.
us_lrclf = LogisticRegression()
us_lrclf.fit(undersampled.drop('Class', axis=1), undersampled['Class'])
pred_test_lr_us = us_lrclf.predict(Xtest)
# Accuracy on the held-out split (shown as the cell output)
accuracy_score(y_true=ytest, y_pred=pred_test_lr_us)

0.9804957691092308

In [16]:
# Metrics for the model trained on undersampled data.
eval_metrics(actual=ytest, predicted=pred_test_lr_us)

f1 Score : 0.13809154383242822
Confusion Matrix
       0     1
0  55762  1096
1     15    89
Recall Score : 0.8557692307692307
AUC : 0.9182465697270122


## Oversampling Minority Class 

In [17]:
# Over Sampling Class 1
one_over = resample(one, replace = True, n_samples = int(round(len(zero)*0.9,0)), random_state = 111)
# Oversampled Data
oversampled = pd.concat([zero, one_over]).reset_index(drop = True)
oversampled.shape

(432168, 31)

In [18]:
# Train on the oversampled frame, evaluate on the original test split.
os_lrclf = LogisticRegression()
os_lrclf.fit(oversampled.drop('Class', axis=1), oversampled['Class'])
pred_test_lr_os = os_lrclf.predict(Xtest)
# Accuracy on the held-out split (shown as the cell output)
accuracy_score(y_true=ytest, y_pred=pred_test_lr_os)

0.9854639935395527

In [19]:
# Metrics for the model trained on oversampled data.
eval_metrics(actual=ytest, predicted=pred_test_lr_os)

f1 Score : 0.17857142857142858
Confusion Matrix
       0    1
0  56044  814
1     14   90
Recall Score : 0.8653846153846154
AUC : 0.925534124147336


## Generate Synthetic Samples

In [20]:
# Synthesize minority-class rows with SMOTE until both classes are the same
# size (minority/majority ratio of 1.0). Only the training split is resampled.
# `sampling_strategy` / `fit_resample` are the current imbalanced-learn names
# for the removed `ratio` / `fit_sample`, and match the `sampling_strategy`
# keyword already used for the bagging classifier below.
sm = SMOTE(random_state=27, sampling_strategy=1.0)
Xtrain_sm, ytrain_sm = sm.fit_resample(Xtrain, ytrain)

In [23]:
# Compare feature-matrix dimensions before and after SMOTE.
for label, features in (
    ("Normal Data Shape", Xtrain),
    ("Data After Synthetic Data", Xtrain_sm),
):
    print(label, features.shape)

Normal Data Shape (227845, 30)
Data After Synthetic Data (454914, 30)


In [24]:
# Train on the SMOTE-augmented data, predict on the original test split.
sm_lrclf = LogisticRegression()
sm_lrclf.fit(Xtrain_sm, ytrain_sm)
pred_test_lr_sm = sm_lrclf.predict(Xtest)

In [25]:
# Accuracy first, then the remaining metrics, for the SMOTE-trained model.
sm_accuracy = accuracy_score(ytest, pred_test_lr_sm)
print("Accuracy : ", sm_accuracy)
eval_metrics(actual=ytest, predicted=pred_test_lr_sm)

Accuracy :  0.9834977704434535
f1 Score : 0.1622103386809269
Confusion Matrix
       0    1
0  55931  927
1     13   91
Recall Score : 0.875
AUC : 0.9293481128425198


# Change in algorithms 

## Balanced Bagging Classifier

In [26]:
# Bagging ensemble of decision trees in which every bag is rebalanced by
# sampling without replacement, so no manual resampling of the training data
# is needed.
# The tree is passed positionally because the keyword for the first argument
# was renamed from `base_estimator` to `estimator` across imbalanced-learn
# releases; the positional form works with both.
bbclf = BalancedBaggingClassifier(DecisionTreeClassifier(),
                                sampling_strategy='auto',
                                replacement=False,
                                random_state=0)

# Train on the original (imbalanced) training split and predict the test split.
bbclf.fit(Xtrain, ytrain)
pred_test_bb = bbclf.predict(Xtest)

In [27]:
# Accuracy first, then the remaining metrics, for the balanced bagging model.
bb_accuracy = accuracy_score(ytest, pred_test_bb)
print("Accuracy : ", bb_accuracy)
eval_metrics(actual=ytest, predicted=pred_test_bb)

Accuracy :  0.9803553246023665
f1 Score : 0.13989239046887011
Confusion Matrix
       0     1
0  55752  1106
1     13    91
Recall Score : 0.875
AUC : 0.9277740159696085


## Random Forest 

In [28]:
# Random Forest
rfclf = RandomForestClassifier(n_estimators=10).fit(Xtrain, ytrain)
# predict on test set
pred_test_rf = rfclf.predict(Xtest)

In [29]:
# Accuracy first, then the remaining metrics, for the random forest.
rf_accuracy = accuracy_score(ytest, pred_test_rf)
print("Accuracy : ", rf_accuracy)
eval_metrics(actual=ytest, predicted=pred_test_rf)

Accuracy :  0.9995786664794073
f1 Score : 0.8723404255319148
Confusion Matrix
       0   1
0  56856   2
1     22  82
Recall Score : 0.7884615384615384
AUC : 0.8942131815562114
