In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
import seaborn as sns

from keras import regularizers, layers
from keras.callbacks import ModelCheckpoint, TensorBoard

from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.manifold import TSNE
from sklearn.preprocessing import RobustScaler, StandardScaler

from tensorflow.keras.models import Model

from time import time


In [None]:
# Load the credit-card transactions CSV (parsed with the pyarrow engine).
data = pd.read_csv(
    './dataset/creditCardActivity/creditcard.csv',
    engine='pyarrow',
)
# Plain ndarray view of the whole table; not referenced by the cells visible below.
raw_data = data.values
# Preview the first rows.
data.head()

In [None]:
# Separate normal transactions (Class == 0) from fraudulent ones (Class == 1).
normal_data = data[data['Class'] == 0]
fraud_data = data[data['Class'] == 1]

# Down-sample the normal class. The seed makes the subset identical on every
# Restart & Run All instead of changing from run to run.
normal_data_sample = normal_data.sample(4000, random_state=42)

# Reduced set = sampled normal rows + every fraud row, re-indexed from 0.
reduced_set = pd.concat([normal_data_sample, fraud_data]).reset_index(drop=True)

# Export the reduced feature matrix (label and timestamp removed) as JSON records.
# Done on a temporary frame so X / y below are assigned exactly once.
reduced_set.drop(['Class', 'Time'], axis=1).to_json('test_data.json', orient='records')

# Features and target over the FULL dataset; these are the X and y later cells see.
y = data['Class']
X = data.drop(['Class', 'Time'], axis=1)

# Per-class feature frames over the full dataset (same columns as X).
test_normal = normal_data.drop(['Class', 'Time'], axis=1)
test_fraud = fraud_data.drop(['Class', 'Time'], axis=1)

print(f"Reduced dataset shape : {reduced_set.shape}")
print(f"Shape of Features : {X.shape} and Target: {y.shape}")

In [None]:
# Visualise the data
def dimensionality_plot(X, y):
    """Embed ``X`` in two dimensions with t-SNE and scatter-plot it by class.

    Args:
        X: samples handed to ``TSNE.fit_transform``.
        y: labels aligned with the rows of ``X``; rows where ``y == 0`` are
            drawn in green as "Normal", rows where ``y == 1`` in red as "Fraud".

    The figure is shown with ``plt.show()``; nothing is returned.
    """
    sns.set(style='whitegrid', palette='muted')

    # 2-D t-SNE embedding with a fixed seed.
    embedding = TSNE(n_components=2, random_state=42, init='random', learning_rate=200).fit_transform(X)

    plt.figure(figsize=(12,8))

    # One scatter call per class, in the order they should appear in the legend.
    for class_value, colour, class_label in ((0, 'green', 'Normal'), (1, 'red', 'Fraud')):
        plt.scatter(
            embedding[np.where(y == class_value), 0],
            embedding[np.where(y == class_value), 1],
            marker='o', color=colour, linewidth=1, alpha=0.8, label=class_label,
        )

    plt.legend(loc = 'best')

    plt.show()

In [None]:
# t-SNE view of the raw features, coloured by the true class label.
# NOTE(review): X / y here are whatever the data-preparation cell last assigned
# to those names — confirm the row count is practical for t-SNE before running.
dimensionality_plot(X, y)

In [None]:
# X.shape[1] is the number of feature columns in X (not the full shape tuple).
print(f"Shape of the input data : {X.shape[1]}")

In [None]:
# 'Time' is not used as a model feature, so it is dropped outright. The
# hour-of-day conversion that used to precede this was discarded by the drop.
# errors='ignore' keeps the cell re-runnable once the column is already gone.
data = data.drop(['Time'], axis=1, errors='ignore')

# 80/20 split with a fixed seed.
train_x, test_x = train_test_split(data, test_size=0.2, random_state=736)

# Train only on normal transactions, without the label column.
train_x = train_x[train_x['Class'] == 0].drop(['Class'], axis=1)

# Held-out normal (seeded 5000-row sample) and fraud rows as feature arrays.
test_x_normal = (
    test_x[test_x['Class'] == 0]
    .sample(5000, random_state=736)
    .drop(['Class'], axis=1)
    .values
)
test_x_fraud = test_x[test_x['Class'] == 1].drop(['Class'], axis=1).values

# Feature frames for the earlier normal sample and for all fraud rows.
# Those frames still carry 'Time', so it is dropped together with 'Class';
# otherwise they are one column wider than the data the model is trained on.
test_normal_sample = normal_data_sample.drop(['Class', 'Time'], axis=1)
test_fraud_sample = fraud_data.drop(['Class', 'Time'], axis=1)

test_y = test_x['Class']                    # save the class column for the test set
test_x = test_x.drop(['Class'], axis=1)     # drop the class column

# A single held-out row (seeded) for spot checks.
test = test_x.sample(1, random_state=736)

train_x = train_x.values                    # transform to ndarray
test_x = test_x.values                      # transform to ndarray




In [None]:
# Tunable settings, looked up by key in the training cell.
hyperparams = {
    # Number of passes over the training data.
    "epochs": 150,
    # Samples per gradient step.
    "batch_size": 32,
    # Not read by any cell visible in this notebook.
    "threshold": 0.75,
}

In [None]:
# Build the model :) 
class FraudDetector(Model):
    """Dense autoencoder: 18 -> 10 -> 6 encoder, 10 -> 18 -> 30 decoder.

    The tanh layers carry an L1 activity regularizer (1e-7); the final layer
    has 30 units with a relu activation.
    """

    def __init__(self):
        super().__init__()

        # Compress the input down to a 6-unit code.
        encoder_layers = [
            layers.Dense(18, activation='tanh', activity_regularizer=regularizers.l1(1e-7)),
            layers.Dense(10, activation='relu'),
            layers.Dense(6, activation='tanh', activity_regularizer=regularizers.l1(1e-7)),
        ]
        self.encoder = tf.keras.Sequential(encoder_layers)

        # Expand the code back out to 30 units.
        decoder_layers = [
            layers.Dense(10, activation='relu'),
            layers.Dense(18, activation='tanh', activity_regularizer=regularizers.l1(1e-7)),
            layers.Dense(30, activation='relu'),
        ]
        self.decoder = tf.keras.Sequential(decoder_layers)

    def call(self, x):
        """Return the reconstruction of ``x`` (encoder followed by decoder)."""
        return self.decoder(self.encoder(x))


In [None]:
# Build the model :) 
class FraudDetector_2(Model):
    """Dense autoencoder for 29-feature inputs with a 7-unit bottleneck.

    Encoder: 29 (relu) -> 14 (tanh) -> 7 (tanh).
    Decoder: 14 (tanh) -> 29 (relu).
    The tanh layers carry an L1 activity regularizer (1e-7).
    """

    def __init__(self):
        super().__init__()

        self.encoder = tf.keras.Sequential([
            layers.Dense(29, activation='relu', input_shape=(29, )),
            layers.Dense(14, activation='tanh', activity_regularizer=regularizers.l1(1e-7)),
            layers.Dense( 7, activation='tanh', activity_regularizer=regularizers.l1(1e-7))]
        )

        # NOTE(review): the relu output layer can only emit non-negative values;
        # confirm the input features are scaled accordingly.
        self.decoder = tf.keras.Sequential([
            layers.Dense(14, activation='tanh', activity_regularizer=regularizers.l1(1e-7)),
            layers.Dense(29, activation='relu')]
        )

    def call(self, x):
        """Return the reconstruction of ``x`` (encode, then decode)."""
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

    def encode(self, x):
        """Return the bottleneck representation of ``x``."""
        return self.encoder(x)

    def decode(self, encoded_x):
        """Return the reconstruction for an already-encoded input."""
        # Fixed: this previously referenced the undefined name `excoded_x`,
        # so every call raised NameError.
        return self.decoder(encoded_x)


In [None]:
# Build the model :) 
class FraudDetector_3(Model):
    """All-relu dense autoencoder for 30-feature inputs.

    Encoder: 30 -> 14 -> 7. Decoder: 14 -> 30. No activity regularization.
    """

    def __init__(self):
        super().__init__()

        # Compress 30 inputs to a 7-unit code.
        encoder_layers = [
            layers.Dense(30, activation='relu', input_shape=(30, )),
            layers.Dense(14, activation='relu'),
            layers.Dense(7, activation='relu'),
        ]
        self.encoder = tf.keras.Sequential(encoder_layers)

        # Expand the code back to 30 outputs.
        decoder_layers = [
            layers.Dense(14, activation='relu'),
            layers.Dense(30, activation='relu'),
        ]
        self.decoder = tf.keras.Sequential(decoder_layers)

    def call(self, x):
        """Return the reconstruction of ``x`` (encoder followed by decoder)."""
        return self.decoder(self.encoder(x))

In [None]:
# Instantiate the autoencoder variant used for the rest of the notebook.
autoencoder = FraudDetector_2()

# Compile the model
# NOTE(review): 'accuracy' is tracked alongside an MSE reconstruction loss —
# confirm the metric is actually wanted.
autoencoder.compile(
    loss='mse',
    optimizer='adam',
    metrics = ['accuracy'],
)

# Train the model: the input doubles as the target, and the held-out
# features serve as validation data.
history = autoencoder.fit(
    x=train_x,
    y=train_x,
    epochs=hyperparams['epochs'],
    batch_size=hyperparams['batch_size'],
    shuffle=True,
    validation_data=(test_x, test_x),
)

In [None]:
# Training and validation loss per epoch, drawn on the same axes.
for history_key, curve_label in (("loss", "Training Loss"), ("val_loss", "Validation Loss")):
    plt.plot(history.history[history_key], label=curve_label)
plt.legend()

In [None]:
# Reconstruct every normal and every fraudulent transaction.
# (Earlier experiments used test_normal_sample / test_fraud_sample or
# test_x_normal / test_x_fraud as inputs instead.)
normal_points = autoencoder.predict(test_normal)
fraud_points = autoencoder.predict(test_fraud)

# Labels: 0 for each normal reconstruction, 1 for each fraud reconstruction.
y_normal = np.zeros(normal_points.shape[0])
y_fraud = np.ones(fraud_points.shape[0])

# Stack normal rows first, then fraud rows, keeping features and labels aligned.
encoded_X = np.concatenate((normal_points, fraud_points), axis=0)
encoded_y = np.concatenate((y_normal, y_fraud), axis=0)

In [None]:
# t-SNE view of the autoencoder outputs. Despite the `encoded_` prefix these are
# autoencoder.predict() results, i.e. reconstructions rather than bottleneck codes.
dimensionality_plot(encoded_X, encoded_y)

In [None]:
# Reconstruct the sampled normal rows and all fraud rows.
normal_points = autoencoder.predict(test_normal_sample)
fraud_points = autoencoder.predict(test_fraud_sample)

# Per-row mean squared reconstruction error for the fraud rows.
fraud_residual = test_fraud_sample - fraud_points
mse = np.mean(np.square(fraud_residual), axis = 1)
mse

In [None]:
# Reconstruct every held-out row. This previously predicted on `test`, a single
# sampled row, so the subtraction below broadcast that one reconstruction
# against all of test_x and the errors did not measure per-row quality.
predictions = autoencoder.predict(test_x)

# Per-row mean squared reconstruction error.
mse = np.mean(np.power(test_x - predictions, 2), axis=1)

# Pair each error with its true label; the index is taken from test_y.
error_df = pd.DataFrame({'reconstruction_error': mse,
                         'true_class': test_y})

# Last expression so the summary statistics are displayed.
error_df.describe()

In [None]:
# Reconstruction error without fraud
# Histogram of reconstruction error for normal rows, keeping only errors
# below 10.
fig, ax = plt.subplots()
is_normal = error_df['true_class'] == 0
is_below_cutoff = error_df['reconstruction_error'] < 10
normal_error_df = error_df[is_normal & is_below_cutoff]
_ = ax.hist(normal_error_df['reconstruction_error'].values, bins=10)


In [None]:
# Reconstruction error with fraud
# Histogram of reconstruction error for the fraud rows only.
fig, ax = plt.subplots()
is_fraud = error_df['true_class'] == 1
fraud_error_df = error_df[is_fraud]
_ = ax.hist(fraud_error_df['reconstruction_error'].values, bins=10)


In [None]:
from sklearn.metrics import (confusion_matrix, precision_recall_curve, auc,
                             roc_curve, recall_score, classification_report, f1_score,
                             precision_recall_fscore_support)

# ROC curve using the reconstruction error as the anomaly score.
fpr, tpr, thresholds = roc_curve(
    error_df['true_class'],
    error_df['reconstruction_error'],
)
roc_auc = auc(fpr, tpr)

plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, label='AUC = %0.4f'% roc_auc)
plt.legend(loc='lower right')
# Diagonal reference line.
plt.plot([0,1],[0,1],'r--')
# Slightly padded limits so the curve is not clipped at the corners.
plt.xlim([-0.001, 1])
plt.ylim([0, 1.001])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.show()

In [None]:
# Precision/recall trade-off over all reconstruction-error thresholds.
precision, recall, th = precision_recall_curve(
    error_df['true_class'],
    error_df['reconstruction_error'],
)
plt.plot(recall, precision, 'b', label='Precision-Recall curve')
plt.title('Recall vs Precision')
plt.ylabel('Precision')
plt.xlabel('Recall')
plt.show()

In [None]:
# Prediction analysis: per-row reconstruction error against a fixed cut-off.

# Errors above this value are treated as fraud by the cells that follow.
error_threshold = 2.9

groups = error_df.groupby('true_class')
fig, ax = plt.subplots()

# One marker series per true class: fraud in red, normal in green.
for name, group in groups:
    is_fraud_group = name == 1
    ax.plot(
        group.index,
        group.reconstruction_error,
        marker='o',
        ms=3.5,
        linestyle='',
        color="r" if is_fraud_group else "g",
        label="Fraud" if is_fraud_group else "Normal",
    )

# Horizontal threshold line spanning the current x-range.
x_left, x_right = ax.get_xlim()
ax.hlines(error_threshold, x_left, x_right, colors="b", zorder=100, label='Threshold')
ax.legend()
plt.title("Reconstruction error for different classes")
plt.xlabel("Data point index")
plt.ylabel("Reconstruction error")
plt.show()


In [None]:
# Confusion matrix at the chosen reconstruction-error threshold.

# 1 (fraud) where the error exceeds the threshold, else 0 (normal).
exceeds_threshold = error_df.reconstruction_error.values > error_threshold
y_pred = exceeds_threshold.astype(int).tolist()

conf_matrix = confusion_matrix(error_df.true_class, y_pred)

class_names = ["Normal", "Fraud"]
plt.figure(figsize=(12, 12))
sns.heatmap(conf_matrix, xticklabels=class_names, yticklabels=class_names, annot=True, fmt="d")
plt.title("Confusion matrix")
plt.xlabel('Predicted class')
plt.ylabel('True class')
plt.show()


In [None]:
# Save the trained autoencoder under a directory named after the current
# Unix timestamp (whole seconds), so successive runs do not overwrite each other.
ts = int(time())
filepath = f'../models/fraud_detector/{ts}'
# NOTE(review): whether `save_format='tf'` is accepted depends on the installed
# Keras version — confirm against the environment this notebook runs in.
autoencoder.save(filepath=filepath, save_format='tf')