In [1]:
import numpy as np
import pandas as pd
import sklearn
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report,accuracy_score

from imblearn.over_sampling import SMOTE

In [12]:
data = pd.read_csv('creditcard.csv',sep=',')

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
data['Amount'] = scaler.fit_transform(data['Amount'].values.reshape(-1, 1))
data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,0.244964,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,-0.342475,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,1.160686,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,0.140534,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,-0.073403,0


In [13]:
fraud = data[data['Class']==1]
normal = data[data['Class']==0]

print(fraud.Amount.describe())
print(normal.Amount.describe())

count    492.000000
mean       0.135382
std        1.026242
min       -0.353229
25%       -0.349231
50%       -0.316247
75%        0.070128
max        8.146182
Name: Amount, dtype: float64
count    284315.000000
mean         -0.000234
std           0.999942
min          -0.353229
25%          -0.330640
50%          -0.265271
75%          -0.045177
max         102.362243
Name: Amount, dtype: float64


In [35]:
data1= data.sample(frac = 1,random_state=1)
data1.shape

fraud = data1[data1['Class']==1]
normal = data1[data1['Class']==0]

print(fraud.Amount.describe())
print(normal.Amount.describe())

count    492.000000
mean       0.135382
std        1.026242
min       -0.353229
25%       -0.349231
50%       -0.316247
75%        0.070128
max        8.146182
Name: Amount, dtype: float64
count    284315.000000
mean         -0.000234
std           0.999942
min          -0.353229
25%          -0.330640
50%          -0.265271
75%          -0.045177
max         102.362243
Name: Amount, dtype: float64


In [36]:
columns = data1.columns.tolist()

columns = [c for c in columns if c not in ["Class", "Time"]]

target = "Class"

state = np.random.RandomState(42)
X = data1[columns]
Y = data1[target]
Y = np.array(Y)

In [38]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30, random_state=42)
X = X_train
Y = y_train


In [39]:
sm = SMOTE(random_state = 2,sampling_strategy=0.5)
X,Y  = sm.fit_resample(X, Y)
print('After OverSampling, the shape of train_X: {}'.format(X.shape))
print('After OverSampling, the shape of train_y: {} \n'.format(Y.shape))
  
print("After OverSampling, counts of label '1': {}".format(sum(Y == 1)))
print("After OverSampling, counts of label '0': {}".format(sum(Y == 0)))

After OverSampling, the shape of train_X: (298506, 29)
After OverSampling, the shape of train_y: (298506,) 

After OverSampling, counts of label '1': 99502
After OverSampling, counts of label '0': 199004


In [40]:
from tensorflow import keras
from keras import Sequential
from tensorflow.keras.layers import Dense, Dropout

model = Sequential([
Dense(input_dim = 29, units = 16, activation = 'relu'),
Dense(units = 24, activation = 'relu'),
Dropout(0.5),
Dense(units = 20, activation = 'relu'),
Dense(units = 24, activation = 'relu'),
Dense(units =1, activation = 'sigmoid'),])

In [41]:
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
model.fit(X, Y, batch_size = 15, epochs = 5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x1ef8026a518>

In [42]:
###################### Evaluation on Train Dataset ############################
y_pred = model.predict_classes(X)

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(Y, y_pred, labels=[1,0])
print("Confusion Matrix:\n",cm)

p = cm[0][0]/(cm[0][0] + cm[0][1])
r = cm[0][0]/(cm[0][0] + cm[1][0])
print("Precision = ",p)
print("Recall = ", r)
print("F1 Score = ", 2*p*r/(p+r))



Confusion Matrix:
 [[ 99305    197]
 [   302 198702]]
Precision =  0.9980201402986875
Recall =  0.9969680845723694
F1 Score =  0.9974938350350814


In [43]:
###################### Evaluation on Whole Dataset ############################
y_pred = model.predict_classes(X_test)

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred, labels=[1,0])
print(cm)

p = cm[0][0]/(cm[0][0] + cm[0][1])
r = cm[0][0]/(cm[0][0] + cm[1][0])
print("Precision = ",p)
print("Recall = ", r)
print("F1 Score = ", 2*p*r/(p+r))



[[  110    22]
 [  138 85173]]
Precision =  0.8333333333333334
Recall =  0.4435483870967742
F1 Score =  0.5789473684210527


In [None]:
###################### Helper #############################

y_pred = model.predict(X)

oulier = [0,0]
normal = [0,0]
for i in range(len(Y)):
    if(Y[i]==0 and y_pred[i]==0):
        normal[0] = normal[0]+1
        normal[1] = normal[1]+1
    elif(Y[i]==0 and y_pred[i]==1):
        normal[1] = normal[1]+1
    if(Y[i]==1 and y_pred[i]==1):
        oulier[0] = oulier[0]+1
        oulier[1] = oulier[1]+1
    elif(Y[i]==1 and y_pred[i]==0):
        oulier[1] = oulier[1]+1
        
print(oulier)
print(normal)