In [None]:
# Libraries for data manipulation
import pandas as pd 
import numpy as np
# Libraries for test-train split & deploying isolation forest
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report, confusion_matrix
# Libraries for viz
import matplotlib.pyplot as plt
import seaborn as sns
# Libraries for autoencoder
from sklearn.preprocessing import StandardScaler
from keras.models import Model
from keras.layers import Input, Dense
from keras import regularizers
from keras.optimizers import Adam
# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

We will apply several anomaly-detection methods, try to understand how each of them works, and eventually compare the results of the chosen approaches. We will be using GPT-4 for guidance in choosing and applying the different approaches, given our problem statement.
**Our objective will be to maximise the recall value with a high precision value.**

# 1. Data Pre-processing

In [None]:
# Load the credit-card fraud dataset from the Kaggle input directory into a DataFrame
data = pd.read_csv("/kaggle/input/creditcardfraud/creditcard.csv")

In [None]:
# A glance at the data: per-column summary statistics
data.describe()

In [None]:
# A look at the target variable: number of rows per value of "Class"
data["Class"].value_counts()

In [None]:
# Record the row count before de-duplication so the number of removed rows can be
# reported afterwards, then display the shape of the data
original_rows = len(data)
data.shape

## 1.1 Removing Duplicate Values

In [None]:
# Drop fully duplicated rows, keeping the first occurrence and renumbering the index
data = data.drop_duplicates(keep="first", ignore_index=True)

In [None]:
# Record the row count after de-duplication, then display the shape again to compare
# it with the shape shown before duplicates were removed
dedup_rows = len(data)
data.shape

In [None]:
# Total rows removed: difference between the row counts before and after de-duplication
print("Total duplicate rows removed : ", original_rows -dedup_rows)

In [None]:
# A look at the target variable again: number of rows per value of "Class" after de-duplication
data["Class"].value_counts()

## 1.2 Variable Correlation

In [None]:
# Pairwise correlations between all columns, rounded to two decimals for annotation
corr = data.corr().round(2)

# Annotated heatmap of the correlation matrix
plt.figure(figsize=(12, 8))
sns.set(font_scale=0.8)
sns.heatmap(corr, annot=True, cmap='viridis')
plt.show()

In [None]:
# Visually inspecting the impact of Amount over frauds: Amount against Time,
# drawn as one scatter-plot panel per value of Class
# NOTE(review): `palette` is passed without a `hue` variable, so it presumably has no
# effect on the plot — confirm and drop it if so
sns.set_style(style='dark')
sns.FacetGrid(data=data, col='Class').map(sns.scatterplot, 'Time', 'Amount', palette='muted')

## 1.3 Variable Selection

In [None]:
# Time and Amount do not appear to have any impact on frauds, so both columns are
# left out of the modelling dataset
new_data = data.drop(columns=["Time", "Amount"])

In [None]:
# Separate the target from the features, then hold out 20% of the rows for testing.
# Stratifying on the target keeps the class proportions the same in both splits.
y = new_data["Class"]
X = new_data.drop(columns="Class")
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# 2. Isolation Trees

Isolation Trees (iTrees) are a type of algorithm primarily used for anomaly detection. They work by isolating observations, assuming that anomalies are easier to isolate compared to normal points due to their fewer numbers and distinct attribute values. An iTree recursively partitions the data by randomly selecting a feature and then randomly selecting a split value between the maximum and minimum values of the selected feature. This process continues until the points are isolated or a limit in tree depth is reached. Anomalies tend to have shorter paths in the tree, indicating they are easier to isolate.

![](https://www.notion.so/image/https%3A%2F%2Fs3-us-west-2.amazonaws.com%2Fsecure.notion-static.com%2Fcf1c1d99-47bb-4a34-aec5-3431a929335f%2FUntitled.png?table=block&id=79c1ba86-2c87-4f38-8c7f-c496f1763aaf&cache=v2)

This is a highly imbalanced dataset, so it makes sense to apply Isolation Trees because:
1. the algorithm focuses on isolating anomalies
2. they work well with high-dimensional data and don't suffer from the curse of dimensionality
3. they work with randomly selected features, which makes them highly scalable and efficient in terms of required computational power

In [None]:
# Candidate ensemble sizes to compare
tree = [10, 25, 50, 100, 150, 200, 250, 500, 1000]

for value in tree:
    # Fit the forest on the training features only; the labels are not used for fitting
    iso_forest = IsolationForest(n_estimators=value, contamination='auto', random_state=42).fit(X_train)

    # predict() marks outliers as -1; recode so that 1 = flagged as fraud and 0 = normal,
    # matching the encoding of y_test
    test_predictions = iso_forest.predict(X_test)
    test_binary_predictions = (test_predictions == -1).astype(int)

    # Report the confusion matrix and per-class metrics for this ensemble size
    matrix = confusion_matrix(y_test, test_binary_predictions)
    report = classification_report(y_test, test_binary_predictions)
    print("No of trees used for prediction ", value, "\n", matrix, "\n", report)

Upon analyzing the results, we observe that we get the **best results with 50 trees, with a recall of 80% and a precision of around 5%**.

Going forward we would like to improve upon the recall value first of all and then focus on the precision value

# 3. Autoencoders

Autoencoders are trained to compress (encode) the input data into a lower-dimensional representation and then reconstruct (decode) it back to the original input. By training exclusively on normal (non-fraudulent) transactions, the autoencoder learns to capture the typical patterns of normal behavior.

When a new transaction is input into the trained autoencoder, if the transaction is normal, the autoencoder should be able to reconstruct it well, resulting in a low reconstruction error. However, if the transaction is fraudulent (thus differing from the normal pattern it has learned), the reconstruction error will be high, signaling a potential anomaly.

## 3.1 Data Creation for Auto Encoders

In [None]:
# Turn the row index into an explicit 'index' column on both the features and the labels
X_train_reset = X_train.reset_index()
y_train_reset = y_train.reset_index()

# Attach the label to each feature row by joining on that shared 'index' column
merged_df = X_train_reset.merge(y_train_reset, on='index', how='left')

# Keep only the non-fraudulent transactions (Class == 0) and discard the label and
# join-key columns, leaving just the features the autoencoder will be trained on
is_normal = merged_df["Class"] == 0
X_train_normal = merged_df.loc[is_normal].drop(columns=["Class", "index"])

In [None]:
# Summary statistics of the non-fraudulent training features
X_train_normal.describe()

We can observe that the variables take values between -100 and 100, so before making any decision we would like to see the distribution of the variables.

## 3.2 Variable Distribution

In [None]:
# One histogram with a KDE overlay per feature of the non-fraudulent training data
for col in X_train_normal.columns:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(data=X_train_normal[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    plt.show()

We observe that all variables are approximately normally distributed, hence we can go ahead with z-score normalisation (standardisation).

## 3.3 Variable Scaling

In [None]:
# Standardise every feature with StandardScaler, fitting it on the non-fraudulent
# training rows, and keep the result as a DataFrame with the original column names
scaler = StandardScaler()
X_train_normal_scaled = pd.DataFrame(
    scaler.fit_transform(X_train_normal),
    columns=X_train_normal.columns,
)

## 3.4 Algorithm Application

In [None]:
learning_rates = [0.001, 0.0001, 0.00001]  # Adam learning rates to compare
threshold_percentiles = [88, 92, 97]  # Percentiles of the reconstruction error used as anomaly cut-offs

# Scale the test set with the scaler that was already fitted on the non-fraudulent
# training rows. Fitting a fresh scaler on X_test would put the test data on a
# different scale from the data the autoencoder is trained on and would leak
# test-set statistics into the evaluation.
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

# The architecture is identical for every learning rate, so the input width is computed once
input_dim = X_train_normal_scaled.shape[1]

for lr in learning_rates:
    # Define the architecture: input -> 64 -> 32 -> 64 -> input
    input_layer = Input(shape=(input_dim,))
    encoded = Dense(64, activation='relu')(input_layer)
    encoded = Dense(32, activation='relu')(encoded)
    decoded = Dense(64, activation='relu')(encoded)
    # Linear output layer: the targets are z-scored features, which are unbounded and
    # frequently negative, so a sigmoid output (restricted to the 0..1 range) could
    # never reconstruct them
    decoded = Dense(input_dim, activation='linear')(decoded)

    # Compile the autoencoder with the current learning rate
    autoencoder = Model(input_layer, decoded)
    autoencoder.compile(optimizer=Adam(learning_rate=lr), loss='mean_squared_error')

    # Train the autoencoder to reproduce its own input, using non-fraudulent rows only
    autoencoder.fit(X_train_normal_scaled, X_train_normal_scaled,
                    epochs=100,
                    batch_size=256,
                    shuffle=True,
                    validation_split=0.2)

    # Reconstruct the test set
    reconstructed = autoencoder.predict(X_test_scaled)

    # Reconstruction error (mean squared error over the features) for each test row
    mse = np.mean(np.square(X_test_scaled.to_numpy() - reconstructed), axis=1)

    for percentile in threshold_percentiles:
        # Cut-off for anomaly detection, taken as a percentile of the test-set errors
        # NOTE(review): deriving the cut-off from the test set fixes the share of flagged
        # rows in advance; consider deriving it from training reconstruction errors instead
        threshold = np.percentile(mse, percentile)

        # Flag rows whose error exceeds the cut-off (1 = anomaly, 0 = normal),
        # using the same 0/1 encoding as y_test
        anomalies = (mse > threshold).astype(int)

        # Evaluate the model
        print(f"Results for learning rate: {lr} and threshold percentile: {percentile}")
        print(confusion_matrix(y_test, anomalies))
        print(classification_report(y_test, anomalies))

We can see that there is a minor improvement using an autoencoder. This is a WIP: we will try more values for the optimizer settings and explain autoencoders and the individual steps in more detail.

To be continued !