In [1]:
import itertools
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

import keras
from keras import backend as K
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.losses import SparseCategoricalCrossentropy
from keras.optimizers import Adam
from keras.metrics import SparseCategoricalAccuracy


In [2]:
import os

# A value of "-1" for CUDA_VISIBLE_DEVICES exposes no CUDA device to this
# process, which keeps the rest of the notebook on the CPU.
os.environ.update(CUDA_VISIBLE_DEVICES="-1")

In [3]:
import tensorflow as tf

# Report the installed TensorFlow version, then how many GPU devices
# TensorFlow can see from this process.
print(tf.__version__)
visible_gpus = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(visible_gpus))

2.18.0
Num GPUs Available:  0


In [4]:
def _read_split(csv_path):
    """Read one split CSV and return its contents as a NumPy array."""
    return pd.read_csv(csv_path).to_numpy()


# Training split used to fit `undersample_model`.
X_train = _read_split('../data/splits/X_train.csv')
y_train = _read_split('../data/splits/y_train.csv')

# Test split that both models are evaluated on.
original_Xtest = _read_split('../data/splits/Xtest.csv')
original_ytest = _read_split('../data/splits/ytest.csv')

# Training split used to fit `oversample_model`.
Xsm_train = _read_split('../data/splits/Xsm_train.csv')
ysm_train = _read_split('../data/splits/ysm_train.csv')

# Keras + UnderSampling

In [5]:
# Number of feature columns; it sizes both the input and the first hidden layer.
n_inputs = X_train.shape[1]

# Small fully connected classifier: a hidden layer as wide as the input,
# a 32-unit hidden layer, and a 2-way softmax output (one probability per class).
# The input shape is declared with an explicit `keras.Input` layer; passing
# `input_shape=` to the first Dense layer makes Keras 3 emit a UserWarning.
undersample_model = Sequential([
    keras.Input(shape=(n_inputs,)),
    Dense(n_inputs, activation='relu'),
    Dense(32, activation='relu'),
    Dense(2, activation='softmax')
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [6]:
# Print an overview of the layers of the model defined above.
undersample_model.summary()

In [7]:
# Sanity checks on the training arrays before fitting.
# Element dtype of features and labels.
print(*(arr.dtype for arr in (X_train, y_train)))
# Count of NaN entries, then count of infinite entries, for each array.
print(*(np.isnan(arr).sum() for arr in (X_train, y_train)))
print(*(np.isinf(arr).sum() for arr in (X_train, y_train)))
# Container type and shape, one array per line.
print(type(X_train), X_train.shape)
print(type(y_train), y_train.shape)
# Both shapes side by side for a quick row-count comparison.
print(*(arr.shape for arr in (X_train, y_train)))

float64 int64
0 0
0 0
<class 'numpy.ndarray'> (756, 30)
<class 'numpy.ndarray'> (756, 1)
(756, 30) (756, 1)


In [8]:
# Cast the features to float32 and collapse the (n, 1) label column into a
# 1-D vector of shape (n,).
X_train = X_train.astype('float32')
y_train = y_train.flatten()

# Training hyper-parameters for this run.
lr, valid_split, batch_size, epochs = 0.001, 0.2, 8, 1

# Sparse categorical loss/metric: labels are integer class ids and the model
# emits one softmax probability per class.
undersample_model.compile(
    optimizer=Adam(learning_rate=lr),
    loss=SparseCategoricalCrossentropy(),
    metrics=[SparseCategoricalAccuracy()],
)

# A `valid_split` fraction of the training data is held out for validation.
undersample_model.fit(
    x=X_train,
    y=y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=valid_split,
    shuffle=True,
    verbose=2,
)

: 

In [None]:
# Probabilities
# The model ends in a 2-unit softmax, so predict() returns one row per sample
# with two columns: P(class 0) and P(class 1).
undersample_predictions = undersample_model.predict(original_Xtest, batch_size=200, verbose=0)

# Reduce the probabilities to a 1-D vector of hard 0/1 labels by thresholding
# the class-1 column (presumably "Fraud", matching the label order used when
# plotting -- confirm against the data).
# Thresholding the whole 2-column array instead would give an (n, 2) indicator
# matrix, which sklearn's confusion_matrix cannot compare against the
# single-column true labels.
undersample_fraud_predictions = (undersample_predictions[:, 1] > 0.5).astype("int32")

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it as an annotated heat map.

    The heat map is drawn through the pyplot state machine, i.e. on the
    current axes, so callers select the target subplot before calling.

    Parameters
    ----------
    cm : array-like of shape (n_classes, n_classes)
        Confusion matrix. Rows are labelled "True label" and columns
        "Predicted label". Without normalization the entries must be
        integers, because they are rendered with the 'd' format.
    classes : sequence
        Tick labels, one per class, used on both axes.
    normalize : bool, default False
        If True, every row is divided by its row sum before printing and
        plotting. Rows whose sum is zero (a class with no true samples) are
        left as all zeros instead of producing NaN.
    title : str
        Title drawn above the heat map.
    cmap : matplotlib colormap
        Colormap passed to `plt.imshow`.

    Returns
    -------
    None
    """
    cm = np.asarray(cm)
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Divide only where the row total is non-zero; other rows keep the
        # zeros from `out`, so no NaN reaches the colour threshold below.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype='float'),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title, fontsize=14)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    # Cells darker than half of the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Run the layout pass last so that the axis labels are included in it.
    plt.tight_layout()

In [None]:
labels = ['No Fraud', 'Fraud']

# Confusion matrix of the undersample model on the test split, plus a
# reference matrix built by comparing the true labels with themselves.
undersample_cm = confusion_matrix(original_ytest, undersample_fraud_predictions)
actual_cm = confusion_matrix(original_ytest, original_ytest)

fig = plt.figure(figsize=(16, 8))

# Positions 1 and 2 of a 2x2 grid: model result next to the reference.
fig.add_subplot(2, 2, 1)
plot_confusion_matrix(
    undersample_cm,
    labels,
    title="Random UnderSample \n Confusion Matrix",
    cmap=plt.cm.Reds,
)

fig.add_subplot(2, 2, 2)
plot_confusion_matrix(
    actual_cm,
    labels,
    title="Confusion Matrix \n (with 100% accuracy)",
    cmap=plt.cm.Greens,
)

# Keras + OverSampling

In [None]:
# Number of feature columns; it sizes both the input and the first hidden layer.
n_inputs = Xsm_train.shape[1]

# Small fully connected classifier: a hidden layer as wide as the input,
# a 32-unit hidden layer, and a 2-way softmax output (one probability per class).
# The input shape is declared with an explicit `keras.Input` layer; passing
# `input_shape=` to the first Dense layer makes Keras 3 emit a UserWarning.
oversample_model = Sequential([
    keras.Input(shape=(n_inputs,)),
    Dense(n_inputs, activation='relu'),
    Dense(32, activation='relu'),
    Dense(2, activation='softmax')
])

In [None]:
# Training hyper-parameters for the oversampled run.
lr = 0.001
valid_split = 0.2
batch_size = 300
epochs = 20

# Compile once.
# - Adam takes `learning_rate`; the old `lr` keyword is rejected by Keras 3.
# - The model ends in a 2-unit softmax and the labels are integer class ids,
#   so the matching loss/metric pair is the sparse categorical one.
#   `binary_crossentropy` expects targets shaped like the model output.
oversample_model.compile(optimizer=Adam(learning_rate=lr),
                         loss=SparseCategoricalCrossentropy(),
                         metrics=[SparseCategoricalAccuracy()])

# Labels are passed as a 1-D vector of class ids; ravel() changes nothing if
# they are already flat.
oversample_model.fit(Xsm_train,
                     ysm_train.ravel(),
                     validation_split=valid_split,
                     batch_size=batch_size,
                     epochs=epochs,
                     shuffle=True,
                     verbose=2)

In [None]:
# Probabilities
# The model ends in a 2-unit softmax, so predict() returns one row per sample
# with two columns: P(class 0) and P(class 1).
oversample_predictions = oversample_model.predict(original_Xtest, batch_size=200, verbose=0)

# Reduce the probabilities to a 1-D vector of hard 0/1 labels by thresholding
# the class-1 column (presumably "Fraud", matching the label order used when
# plotting -- confirm against the data).
# Thresholding the whole 2-column array instead would give an (n, 2) indicator
# matrix, which sklearn's confusion_matrix cannot compare against the
# single-column true labels.
oversample_fraud_predictions = (oversample_predictions[:, 1] > 0.5).astype("int32")

In [None]:
labels = ['No Fraud', 'Fraud']

# Confusion matrix of the oversample model on the test split, plus a
# reference matrix built by comparing the true labels with themselves.
oversample_smote = confusion_matrix(original_ytest, oversample_fraud_predictions)
actual_cm = confusion_matrix(original_ytest, original_ytest)

fig = plt.figure(figsize=(16, 8))

# Positions 1 and 2 of a 2x2 grid: model result next to the reference.
fig.add_subplot(2, 2, 1)
plot_confusion_matrix(
    oversample_smote,
    labels,
    title="OverSample (SMOTE) \n Confusion Matrix",
    cmap=plt.cm.Oranges,
)

fig.add_subplot(2, 2, 2)
plot_confusion_matrix(
    actual_cm,
    labels,
    title="Confusion Matrix \n (with 100% accuracy)",
    cmap=plt.cm.Greens,
)