In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pyro
import pyro.distributions as dist
from pyro.infer import SVI, Trace_ELBO
from pyro.optim import ReduceLROnPlateau
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_recall_curve, recall_score
from torch.utils.data import DataLoader, TensorDataset
import warnings
import random
import optuna

warnings.filterwarnings('ignore')

def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(42)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#device = torch.device("mps") if torch.backends.mps.is_available() else "cpu"
print("Using device:", device)

##############################################################################
# 1) LOAD & PREPARE DATA
##############################################################################
df = pd.read_csv("../data/transaction_data.csv")
if "Unnamed: 0" in df.columns:
    df.drop("Unnamed: 0", axis=1, inplace=True)

X_train_df, X_test_df = train_test_split(df, test_size=0.25, random_state=42)

X_train_vae_df, X_test_vae_df = X_train_df.copy(), X_test_df.copy()

# Only train on legitimate transactions for the VAE
X_train_vae_df = X_train_vae_df[X_train_vae_df['Class'] == 0]
y_test_vae = X_test_vae_df['Class']

X_train_vae_df = X_train_vae_df.drop(['Class'], axis=1)
X_test_vae_df = X_test_vae_df.drop(['Class'], axis=1)

scaler = StandardScaler()
X_train_vae_np = scaler.fit_transform(X_train_vae_df)
X_test_vae_np = scaler.transform(X_test_vae_df)

##############################################################################
# 2) DEFINE VAE CLASSES & FUNCTIONS
##############################################################################
class Encoder(nn.Module):
    def __init__(self, input_dim, hidden_1_dim, hidden_2_dim, z_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_1_dim)
        self.fc2 = nn.Linear(hidden_1_dim, hidden_2_dim)
        self.fc_mu = nn.Linear(hidden_2_dim, z_dim)
        self.fc_logvar = nn.Linear(hidden_2_dim, z_dim)

    def forward(self, x):
        x1 = F.leaky_relu(self.fc1(x), 0.01)
        h = F.leaky_relu(self.fc2(x1), 0.01)
        mu = self.fc_mu(h)
        log_var = self.fc_logvar(h)
        return mu, log_var

class Decoder(nn.Module):
    def __init__(self, z_dim, hidden_1_dim, hidden_2_dim, output_dim):
        super().__init__()
        self.fc1 = nn.Linear(z_dim, hidden_1_dim)
        self.fc2 = nn.Linear(hidden_1_dim, hidden_2_dim)
        self.fc_out = nn.Linear(hidden_2_dim, output_dim)
        self.fc_logvar = nn.Linear(hidden_2_dim, output_dim)

    def forward(self, z):
        x1 = F.leaky_relu(self.fc1(z), 0.01)
        h = F.leaky_relu(self.fc2(x1), 0.01)
        x_mu = self.fc_out(h)
        x_logvar = self.fc_logvar(h)
        return x_mu, x_logvar

class BetaELBO(Trace_ELBO):
    def __init__(self, beta=1.0, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.beta = beta

    def _differentiable_loss_particle(self, model_trace, guide_trace):
        log_pz = 0.0
        log_qz = 0.0
        log_px = 0.0

        for name, site in guide_trace.nodes.items():
            if site["type"] == "sample" and not site["is_observed"]:
                log_qz += site["log_prob"].sum()
                log_pz += model_trace.nodes[name]["log_prob"].sum()

        for name, site in model_trace.nodes.items():
            if site["type"] == "sample" and site["is_observed"]:
                log_px += site["log_prob"].sum()

        elbo_particle = log_px + log_pz - (self.beta * log_qz)
        loss_particle = -elbo_particle
        return loss_particle, loss_particle

def kl_annealing_factor(epoch, start_epoch=0, end_epoch=50, max_beta=1.0):
    if epoch < start_epoch:
        return 0.0
    elif epoch > end_epoch:
        return max_beta
    else:
        progress = float(epoch - start_epoch) / float(end_epoch - start_epoch)
        return progress * max_beta

def vae_model(x, decoder, z_dim):
    pyro.module("decoder", decoder)
    with pyro.plate("data", x.shape[0]):
        z = pyro.sample("z", dist.Normal(0., 1.).expand([x.shape[0], z_dim]).to_event(1))
        x_mu, x_logvar = decoder(z)
        x_scale = torch.exp(0.5 * x_logvar) + 1e-7
        pyro.sample("obs", dist.Normal(x_mu, x_scale).to_event(1), obs=x)

def vae_guide(x, encoder, z_dim):
    pyro.module("encoder", encoder)
    with pyro.plate("data", x.shape[0]):
        mu, log_var = encoder(x)
        scale = torch.exp(0.5 * log_var) + 1e-7
        pyro.sample("z", dist.Normal(mu, scale).to_event(1))

def compute_recon_error(x_batch, encoder, decoder, z_dim, n_samples=1000):
    with torch.no_grad():
        batch_size = x_batch.size(0)
        mu, log_var = encoder(x_batch)
        scale = torch.exp(0.5 * log_var) + 1e-7

        z = dist.Normal(mu.unsqueeze(0), scale.unsqueeze(0)).sample((n_samples,))

        z_flat = z.view(-1, z_dim)
        x_mu_flat, x_logvar_flat = decoder(z_flat)
        x_scale_flat = torch.exp(0.5 * x_logvar_flat) + 1e-7

        x_mu = x_mu_flat.view(n_samples, batch_size, x_batch.size(1))
        x_scale = x_scale_flat.view(n_samples, batch_size, x_batch.size(1))

        x_expand = x_batch.unsqueeze(0).expand(n_samples, -1, -1)
        recon_error = -dist.Normal(x_mu, x_scale).log_prob(x_expand)
        recon_error = recon_error.sum(dim=-1)
        recon_error_mean = recon_error.mean(dim=0)
        return recon_error_mean.cpu().numpy()

##############################################################################
# 3) OPTUNA OBJECTIVE (F-Beta=30 THRESHOLD STRATEGY, RETURN RECALL(1))
##############################################################################
def objective(trial):
    pyro.clear_param_store()

    # Hyperparam search spaces
    z_dim = trial.suggest_categorical("z_dim", [1,2,3])
    hidden_1_dim = trial.suggest_categorical("hidden_1_dim", [64,128,256])
    possible_hidden_2 = [v for v in [32, 64, 128, 256, 512] if v < hidden_1_dim]
    hidden_2_dim = trial.suggest_categorical("hidden_2_dim", possible_hidden_2)
    num_epochs = trial.suggest_categorical("num_epochs", [50])
    start_epoch = trial.suggest_categorical("start_epoch", [0,10,20])
    end_epoch = trial.suggest_categorical("end_epoch", [50,80,100])
    max_beta = trial.suggest_categorical("max_beta", [1.0,1.5,2.0])
    batch_size = trial.suggest_categorical("batch_size", [128])
    test_batch_size = trial.suggest_categorical("test_batch_size", [128])

    # Create data loaders
    X_train_tensor = torch.tensor(X_train_vae_np, dtype=torch.float).to(device)
    train_dataset = TensorDataset(X_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    input_dim_vae = X_train_tensor.shape[1]
    enc = Encoder(input_dim_vae, hidden_1_dim, hidden_2_dim, z_dim).to(device)
    dec = Decoder(z_dim, hidden_2_dim, hidden_1_dim, input_dim_vae).to(device)

    optim_dict = {
        "optimizer": optim.Adam,
        "optim_args": {"lr": 1e-4},
        "factor": 0.2,
        "patience": 3,
        "threshold": 1e-4,
        "threshold_mode": 'rel',
        "verbose": False
    }
    scheduler = ReduceLROnPlateau(optim_dict)
    beta_elbo = BetaELBO(beta=0.0)
    svi = SVI(lambda x: vae_model(x, dec, z_dim),
              lambda x: vae_guide(x, enc, z_dim),
              scheduler,
              loss=beta_elbo)

    # Train loop
    n_samples_train = X_train_tensor.size(0)
    for epoch in range(num_epochs):
        current_beta = kl_annealing_factor(epoch, start_epoch, end_epoch, max_beta)
        beta_elbo.beta = current_beta

        epoch_loss = 0.0
        for batch in train_loader:
            x_batch = batch[0]
            loss = svi.step(x_batch)
            epoch_loss += loss

        avg_loss = epoch_loss / n_samples_train
        scheduler.step(avg_loss)

    # Evaluate on test
    X_test_tensor = torch.tensor(X_test_vae_np, dtype=torch.float).to(device)
    n_test = X_test_tensor.size(0)
    recon_errors = []
    for i in range(0, n_test, test_batch_size):
        batch_x = X_test_tensor[i : i+test_batch_size]
        errs = compute_recon_error(batch_x, enc, dec, z_dim)
        recon_errors.extend(errs)

    # Create error_df
    error_df = pd.DataFrame({
        "recon_error": recon_errors,
        "true_class": y_test_vae.values
    })

    # F-Beta approach with Beta=30
    y_true = error_df["true_class"].values
    y_scores = error_df["recon_error"].values
    precision, recall, thresholds = precision_recall_curve(y_true, y_scores)

    beta = 30.0
    # compute f-beta for each threshold
    # note: length(thresholds) = len(precision)-1 = len(recall)-1
    # so we skip the last precision/recall for f-beta
    precision_ = precision[:-1]
    recall_ = recall[:-1]
    fbeta_scores = (1+beta**2) * (precision_*recall_)/(beta**2*precision_ + recall_ + 1e-8)
    best_idx = np.argmax(fbeta_scores)
    best_threshold = thresholds[best_idx]

    # Once we pick that threshold, we compute recall(1)
    y_pred = (y_scores >= best_threshold).astype(int)
    recall_fraud = recall_score(y_true, y_pred, pos_label=1)

    # Our objective = recall on fraudulent transactions
    return recall_fraud

##############################################################################
# 4) RUN OPTUNA STUDY
##############################################################################
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)

print("\n=== Best Trial ===")
best_trial = study.best_trial
print(f"Best Value (Recall Fraud) = {best_trial.value}")
print("Best Params:")
for k,v in best_trial.params.items():
    print(f"  {k} = {v}")

Using device: cpu


[I 2025-01-29 19:03:28,776] A new study created in memory with name: no-name-ec5d89bd-12cd-4793-80a9-5f9df1f89375
[I 2025-01-29 19:10:31,016] Trial 0 finished with value: 0.9292035398230089 and parameters: {'z_dim': 1, 'hidden_1_dim': 256, 'hidden_2_dim': 64, 'num_epochs': 50, 'start_epoch': 20, 'end_epoch': 50, 'max_beta': 1.0, 'batch_size': 128, 'test_batch_size': 128}. Best is trial 0 with value: 0.9292035398230089.
[W 2025-01-29 19:10:31,018] Trial 1 failed with parameters: {'z_dim': 1, 'hidden_1_dim': 128} because of the following error: ValueError('CategoricalDistribution does not support dynamic value space.').
Traceback (most recent call last):
  File "/Users/abencarrington/Downloads/DS_Projects/Credit-Card-Fraud-Detection/myenv/lib/python3.12/site-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/var/folders/7y/3gfjsqv1527blyqnj1bgm7br0000gn/T/ipykernel_85025/84615325.py", line 167, in obje

ValueError: CategoricalDistribution does not support dynamic value space.