# A Fool Fraud:
## Data Augmentation

# Contents 
## Load and Preprocess Data 
## Data Augmentation
### Split Data for Augmentation
### GANs
### SMOTE and ADASYN
## Build and Evaluate Models

In [None]:
from foolfraud import *

import numpy as np

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE, ADASYN

# Load and Preprocess Data

In [None]:
# Load the raw transactions from the CSV file. load_and_describe is not defined
# in this notebook; presumably it comes from the foolfraud star import and also
# prints a summary of the data -- confirm in foolfraud.
file_name = "creditcard.csv"
data = load_and_describe(file_name)

In [None]:
# Preview the first rows of the data as loaded, before any preprocessing.
data.head()

In [None]:
# Replace the raw data with its preprocessed version. The exact steps live in
# preprocess_data (presumably provided by foolfraud -- confirm there).
data = preprocess_data(data)

In [None]:
# Preview the first rows again to inspect the effect of preprocessing.
data.head()

In [None]:
# Mean of every column computed separately for each value of the "Class" label,
# to compare the two classes feature by feature.
data.groupby("Class").mean()

# Data Augmentation

## Split Data for Augmentation

In [None]:
# Separate the data by class. The helper's return order is taken from the
# unpacking below (fraudulent subset first, legitimate second) -- confirm
# against split_by_class.
fraudulent, legitimate = split_by_class(data)

In [None]:
# Hold out a fixed number of fraudulent transactions for testing, then hold out
# legitimate transactions in the same legitimate-to-fraudulent ratio as the
# full data set, so the test set keeps the original class imbalance.
seed = 0
test_size = 100
train_fraud, test_fraud = train_test_split(
    fraudulent, test_size=test_size, random_state=seed
)
# Scale the hold-out count by the class ratio (truncated to a whole number).
test_size = int(test_size * (len(legitimate) / len(fraudulent)))
train_legit, test_legit = train_test_split(
    legitimate, test_size=test_size, random_state=seed
)
print(
    f"No. training fraudulent transactions: {len(train_fraud)}, testing: {len(test_fraud)}"
)
# Shuffle the combined training rows. The shuffle is seeded like the splits
# above so the row order -- and therefore everything derived from train_data --
# is the same on every run.
train_data = pd.concat([train_fraud, train_legit]).sample(
    frac=1, random_state=seed
)
print(f"Original training data shape: {train_data.shape}")

In [None]:
# Combine the held-out rows of both classes into one shuffled test set. The
# shuffle reuses the notebook-wide seed so the test row order is reproducible.
test_data = pd.concat([test_fraud, test_legit]).sample(frac=1, random_state=seed)
# Features are every column but the last; the last column is used as the label
# (assumes "Class" is the final column -- confirm against preprocess_data).
X_test, y_test = test_data.iloc[:, :-1], test_data.iloc[:, -1]
print(f"Testing data shape: {test_data.shape}")

In [None]:
# Share of the whole data set (both classes together) that was held out for
# testing, expressed as a percentage.
n_test_rows = len(test_fraud) + len(test_legit)
pct_test = 100 * n_test_rows / len(data)
print(f"Pct. data used for testing: {pct_test:.1f}")

## GANs

In [None]:
# Number of synthetic fraudulent rows needed for the training set to contain as
# many fraudulent as legitimate transactions.
n_to_generate = len(train_legit) - len(train_fraud)
print(f"No. fraudulent transactions to generate: {n_to_generate}\n")

# The GAN is trained on the real fraudulent training rows only, without the
# label column.
gan_train_data = train_fraud.drop(columns="Class")
print("GAN training data head:")
gan_train_data.head()

In [None]:
# Convert the training frame into a float tensor of shape
# (rows, 1, features), i.e. with a singleton middle dimension inserted.
# Note that gan_train_data is rebound from a DataFrame to a tensor here.
n_rows, n_features = gan_train_data.shape
shape = (n_rows, 1, n_features)
gan_train_data = torch.tensor(gan_train_data.values).reshape(shape).float()
gan_train_data.shape

In [None]:
# GAN training hyper-parameters
epochs = 1000
batch_size = 28
sample_size = 64  # Number of random values to sample
g_lr = 1.0e-3  # Generator's learning rate
d_lr = 1.0e-4  # Discriminator's learning rate

# Whole batches per epoch; rows left over after the last full batch are not
# used for training.
n_batches = gan_train_data.shape[0] // batch_size
print(f"No. batches: {n_batches}")

In [None]:
# Networks: the generator is built first, then the discriminator.
generator = Generator(sample_size)
discriminator = Discriminator()

# One Adam optimizer per network, each with its own learning rate.
g_optimizer = torch.optim.Adam(generator.parameters(), lr=g_lr)
d_optimizer = torch.optim.Adam(discriminator.parameters(), lr=d_lr)

# Constant per-batch labels: 1 marks a real transaction, 0 a generated one.
real_targets = torch.ones((batch_size, 1))
fake_targets = torch.zeros((batch_size, 1))

In [None]:
# Training loop: each batch first updates the discriminator (real batch scored
# against real_targets plus a generated batch scored against fake_targets) and
# then updates the generator (generated batch scored against real_targets).
# Batches are contiguous slices of gan_train_data taken in the same order every
# epoch; there is no shuffling between epochs.
for epoch in range(epochs):
    # Per-epoch loss history, reset at the start of every epoch
    d_losses = []
    g_losses = []

    for batch in range(n_batches):
        # Discriminator Network Training
        # Loss with fraud transaction inputs and real_targets as labels
        start = int(batch * batch_size)
        stop = start + batch_size
        transactions = gan_train_data[start:stop]

        # Calling the discriminator with (inputs, targets) yields the loss
        # tensor that is back-propagated below.
        discriminator.train()
        d_loss = discriminator(transactions, real_targets)

        # Generate transactions in eval mode; no_grad keeps this forward pass
        # out of the autograd graph, so the discriminator loss does not
        # back-propagate into the generator.
        generator.eval()
        with torch.no_grad():
            generated_transactions = generator(batch_size)

        # Loss with generated fraud transaction inputs and fake_targets as labels
        d_loss += discriminator(generated_transactions, fake_targets)

        # Optimizer updates the discriminator parameters
        d_optimizer.zero_grad()
        d_loss.backward()
        d_optimizer.step()

        # Generator Network Training
        # Generate transactions in train mode (this time with gradients enabled)
        generator.train()
        generated_transactions = generator(batch_size)

        # Loss with generated fraud transaction inputs and real_targets as labels:
        # the loss is low when the discriminator scores generated rows as real.
        g_loss = discriminator(generated_transactions, real_targets)

        # Optimizer updates the generator parameters; only g_optimizer steps
        # here, so only the parameters registered with it are changed.
        g_optimizer.zero_grad()
        g_loss.backward()
        g_optimizer.step()

        # Keep losses for logging
        d_losses.append(d_loss.item())
        g_losses.append(g_loss.item())

    # Print average losses over the epoch, once every 10 epochs
    if epoch % 10 == 0:
        outstr = f"epoch: {epoch} d_loss: {np.mean(d_losses):.3f} g_loss: {np.mean(g_losses):.3f}"
        print(outstr)

In [None]:
# Generate n_to_generate synthetic fraudulent transactions in batches:
# n_full_batches batches of full_batch_size rows plus one remainder batch.
full_batch_size = 100
n_full_batches, last_batch_size = divmod(n_to_generate, full_batch_size)

# The training loop leaves the generator in train mode; switch to eval mode
# for sampling, as is done when sampling for the discriminator during training.
generator.eval()

# Pass the batch size explicitly so the number of generated rows does not
# depend on the default of generate_transactions matching full_batch_size.
gan_generated = [
    generate_transactions(generator, full_batch_size) for _ in range(n_full_batches)
]
# Only request the remainder batch when there is one; a zero-size batch adds
# no rows and may not be supported by the generator.
if last_batch_size:
    gan_generated.append(generate_transactions(generator, last_batch_size))
gan_generated = pd.concat(gan_generated)

# Name the generated columns after the real feature columns (all but the last,
# label, column).
gan_generated.columns = test_fraud.columns[:-1]
print(f"Generated data shape: {gan_generated.shape}\n")
print("Generated data head:")
gan_generated.head()

In [None]:
# Build the GAN-augmented training set: label every generated row as
# fraudulent (Class = 1), append the real training rows and shuffle.
#
# gan_generated already carries the feature column names from the cell that
# created it, so they are not re-assigned here. Re-assigning them once the
# "Class" column exists fails with a length mismatch, which made this cell
# impossible to re-run.
gan_generated["Class"] = 1
# Seeded shuffle, so the row order of the training set is reproducible.
GAN_training_data = pd.concat([gan_generated, train_data]).sample(
    frac=1, random_state=seed
)
# Features are every column but the last; the last column ("Class") is the label.
X_train_GAN, y_train_GAN = GAN_training_data.iloc[:, :-1], GAN_training_data.iloc[:, -1]
# Mean of the 0/1 label = fraction of fraudulent rows in the training set.
print(f"GAN balance: {y_train_GAN.mean():.2f}")

## SMOTE and ADASYN

In [None]:
# Oversample the original training data with SMOTE and with ADASYN.
# Features are every column but the last; the last column is the label.
X_train = train_data.iloc[:, :-1]
y_train = train_data.iloc[:, -1]

# Both resamplers share the notebook-wide seed.
smote = SMOTE(random_state=seed)
X_train_SMOTE, y_train_SMOTE = smote.fit_resample(X_train, y_train)

adasyn = ADASYN(random_state=seed)
X_train_ADASYN, y_train_ADASYN = adasyn.fit_resample(X_train, y_train)

# Mean of the label column = fraction of fraudulent rows after resampling.
print(f"SMOTE balance: {y_train_SMOTE.mean():.2f}")
print(f"ADASYN balance: {y_train_ADASYN.mean():.2f}")

# Build and Evaluate Models

In [None]:
# Train and evaluate on the SMOTE-augmented training set against the shared
# test set. `ext` is presumably a tag used by build_and_evaluate to label its
# outputs -- confirm in foolfraud.
ext = "smote"
smote_results = build_and_evaluate(
    X_train_SMOTE, y_train_SMOTE, X_test, y_test, ext=ext
)

In [None]:
# Train and evaluate on the ADASYN-augmented training set against the shared
# test set. `ext` is presumably a tag used by build_and_evaluate to label its
# outputs -- confirm in foolfraud.
ext = "adasyn"
adasyn_results = build_and_evaluate(
    X_train_ADASYN, y_train_ADASYN, X_test, y_test, ext=ext
)

In [None]:
# Train and evaluate on the GAN-augmented training set against the shared
# test set. `ext` is presumably a tag used by build_and_evaluate to label its
# outputs -- confirm in foolfraud.
ext = "gan"
gan_results = build_and_evaluate(X_train_GAN, y_train_GAN, X_test, y_test, ext=ext)