# Model Training and Hyperparameter Tuning

This notebook demonstrates the process of training fraud detection models and tuning their hyperparameters. We'll work with both classification and autoencoder models.

In [None]:
# Import necessary libraries
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
import mlflow
import mlflow.tensorflow

# Add the src directory to the path to import our modules
sys.path.append('..')
from src.models.fraud_model import (
    create_classification_model, 
    train_classification_model,
    create_autoencoder_model,
    train_autoencoder_model,
    compute_anomaly_scores
)
from src.utils.data_utils import handle_class_imbalance

# Set plot style.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8-*'
# and later releases removed the old names, so use whichever name this
# installation provides instead of hard-coding one that may raise.
for _style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
else:
    # Neither style sheet ships with this Matplotlib; let seaborn apply its
    # own whitegrid theme instead.
    sns.set_style('whitegrid')

sns.set_palette('viridis')
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 12

## 1. Load Processed Data

First, we'll load the processed data that was created by our PySpark preprocessing pipeline.

In [None]:
# Path to processed data
processed_data_path = '../data/processed/transactions.parquet'

# Fail fast when the input is missing. Every later cell reads `df`, so merely
# printing a message here would only postpone the failure to a confusing
# NameError in the next cell.
if not os.path.exists(processed_data_path):
    raise FileNotFoundError(
        f"File not found: {processed_data_path}. "
        "Please run the data processing script first."
    )

# Load the data
df = pd.read_parquet(processed_data_path)
print(f"Dataset loaded with {df.shape[0]} rows and {df.shape[1]} columns")

## 2. Data Preparation

Let's prepare the data for modeling by splitting it into training, validation, and test sets, and scaling the features.

In [None]:
# Separate features and target.
# The split, scaling and training cells below all need `X` and `y`, so a
# missing target column is an error rather than something to print and ignore.
if 'is_fraud' not in df.columns:
    raise KeyError("Target column 'is_fraud' not found in the dataset.")

y = df['is_fraud'].values
# Every remaining column is used as a model feature.
X = df.drop(columns=['is_fraud']).values

# Check class distribution
print("Class distribution:")
print(f"Legitimate transactions: {np.sum(y == 0)} ({np.mean(y == 0)*100:.2f}%)")
print(f"Fraudulent transactions: {np.sum(y == 1)} ({np.mean(y == 1)*100:.2f}%)")

In [None]:
# Two-stage stratified split with a fixed seed.
# Stage 1 holds out 20% of the data as the test set; stage 2 takes 25% of the
# remaining 80% as the validation set, i.e. a 60/20/20 partition overall.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    test_size=0.25,
    stratify=y_train_val,
    random_state=42,
)

# Report the size of each partition.
for split_name, split_features in (
    ("Training", X_train),
    ("Validation", X_val),
    ("Test", X_test),
):
    print(f"{split_name} set: {split_features.shape[0]} samples")

In [None]:
# Standardise the features.
# The scaler is fitted on the training split only and the same fitted scaler
# is then applied to all three splits, so no validation/test statistics are
# used when estimating the mean and variance.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)

print("Features scaled to zero mean and unit variance.")

In [None]:
# Persist the scaled test split, with its labels, for later evaluation.
# Column names are taken from the original frame minus the target, in the same
# order as the feature matrix was built.
feature_columns = df.columns.drop('is_fraud')
test_df = pd.DataFrame(X_test_scaled, columns=feature_columns).assign(is_fraud=y_test)
test_df.to_parquet('../data/processed/test_data.parquet')
print("Test data saved for later evaluation.")

## 3. Handle Class Imbalance

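Fraud detection datasets are typically highly imbalanced. Let's rebalance the training set with SMOTE for better model training. Only the training split is resampled; the validation and test sets keep their original class distribution so that evaluation reflects realistic fraud rates.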

In [None]:
def _print_class_counts(labels):
    """Print the count and percentage of samples labelled 0 and 1 in ``labels``."""
    for class_label, class_value in (("Legitimate", 0), ("Fraudulent", 1)):
        is_class = labels == class_value
        print(f"{class_label} transactions: {np.sum(is_class)} ({np.mean(is_class)*100:.2f}%)")


# Class balance of the training split before resampling.
print("Class distribution in training set:")
_print_class_counts(y_train)

# Resample the scaled training split through the project helper, requesting
# its 'smote' method. NOTE(review): the exact meaning of `ratio` is defined by
# handle_class_imbalance in src.utils.data_utils -- confirm there.
X_train_balanced, y_train_balanced = handle_class_imbalance(
    X_train_scaled, y_train, method='smote', ratio=0.5
)

# Class balance after resampling.
print("\nClass distribution after balancing:")
_print_class_counts(y_train_balanced)

## 4. Classification Model Training

Let's train a classification model for fraud detection.

In [None]:
# Select the MLflow experiment used by the runs started further down.
experiment_name = "Fraud_Detection_Experiment"
mlflow.set_experiment(experiment_name)

In [None]:
# Define model parameters
input_dim = X_train_scaled.shape[1]
hidden_layers = [128, 64, 32]
dropout_rate = 0.4
batch_size = 256
epochs = 20

# Create model directory if it doesn't exist
model_dir = '../results/models'
os.makedirs(model_dir, exist_ok=True)

# Train model with MLflow tracking
with mlflow.start_run(run_name='classification_model_notebook'):
    # Log parameters.
    # NOTE(review): hidden_layers and dropout_rate are recorded here but are not
    # passed to train_classification_model below, so the logged values only
    # describe the trained network if they match that function's defaults --
    # confirm against src.models.fraud_model.
    mlflow.log_params({
        'model_type': 'classification',
        'hidden_layers': str(hidden_layers),
        'dropout_rate': dropout_rate,
        'batch_size': batch_size,
        'epochs': epochs,
        'input_dim': input_dim,
        'balancing_method': 'smote',
        'balancing_ratio': 0.5
    })

    # Train on the balanced training data; validate on the untouched
    # (scaled, not resampled) validation split.
    model_path = os.path.join(model_dir, "classification_model.h5")
    model, history = train_classification_model(
        X_train_balanced, y_train_balanced, X_val_scaled, y_val,
        input_dim=input_dim,
        batch_size=batch_size,
        epochs=epochs,
        model_path=model_path
    )

    # Log one MLflow metric series per entry of the training history, using
    # the epoch index as the step. Keras already prefixes validation metrics
    # with "val_", so only the remaining keys get the "train_" prefix;
    # prefixing everything would record validation loss as "train_val_loss".
    for metric_name, values in history.history.items():
        if metric_name.startswith("val_"):
            metric_key = metric_name
        else:
            metric_key = f"train_{metric_name}"
        for step, value in enumerate(values):
            mlflow.log_metric(metric_key, float(value), step=step)

    # Log model
    mlflow.tensorflow.log_model(model, "model")

In [None]:
# Plot the classifier's training curves side by side: loss on the left,
# accuracy on the right, each with its validation counterpart.
fig, (loss_ax, accuracy_ax) = plt.subplots(1, 2, figsize=(12, 5))

loss_ax.plot(history.history['loss'], label='Training Loss')
loss_ax.plot(history.history['val_loss'], label='Validation Loss')
loss_ax.set_title('Loss')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.legend()

accuracy_ax.plot(history.history['accuracy'], label='Training Accuracy')
accuracy_ax.plot(history.history['val_accuracy'], label='Validation Accuracy')
accuracy_ax.set_title('Accuracy')
accuracy_ax.set_xlabel('Epoch')
accuracy_ax.set_ylabel('Accuracy')
accuracy_ax.legend()

fig.tight_layout()
plt.show()

## 5. Autoencoder Model Training

Now, let's train an autoencoder model for anomaly detection.

In [None]:
# Keep only the non-fraud rows (label 0) of the scaled training and validation
# splits; these are the inputs the autoencoder is trained and validated on.
train_is_normal = y_train == 0
val_is_normal = y_val == 0
X_train_normal = X_train_scaled[train_is_normal]
X_val_normal = X_val_scaled[val_is_normal]

print(f"Normal transactions for autoencoder training: {X_train_normal.shape[0]}")
print(f"Normal transactions for validation: {X_val_normal.shape[0]}")

In [None]:
# Define model parameters.
# NOTE(review): this cell rebinds input_dim, hidden_layers, batch_size and
# epochs, which were first set in the classification training cell. The tuning
# and final-training cells further down read input_dim, batch_size and epochs,
# so when the notebook runs top to bottom they see the values assigned here.
input_dim = X_train_normal.shape[1]
hidden_layers = [64, 32]
encoding_dim = 16
batch_size = 256
epochs = 30

# Train autoencoder model with MLflow tracking
with mlflow.start_run(run_name='autoencoder_model_notebook'):
    # Log parameters.
    # NOTE(review): hidden_layers and encoding_dim are recorded here but are
    # not passed to train_autoencoder_model below, so the logged values only
    # describe the trained network if they match that function's defaults --
    # confirm against src.models.fraud_model.
    mlflow.log_params({
        'model_type': 'autoencoder',
        'hidden_layers': str(hidden_layers),
        'encoding_dim': encoding_dim,
        'batch_size': batch_size,
        'epochs': epochs,
        'input_dim': input_dim
    })

    # Train on non-fraud transactions only (no labels are passed).
    model_path = os.path.join(model_dir, "autoencoder_model.h5")
    autoencoder, history = train_autoencoder_model(
        X_train_normal, X_val_normal,
        input_dim=input_dim,
        batch_size=batch_size,
        epochs=epochs,
        model_path=model_path
    )

    # Log one MLflow metric series per entry of the training history, using
    # the epoch index as the step. Keras already prefixes validation metrics
    # with "val_", so only the remaining keys get the "train_" prefix;
    # prefixing everything would record validation loss as "train_val_loss".
    for metric_name, values in history.history.items():
        if metric_name.startswith("val_"):
            metric_key = metric_name
        else:
            metric_key = f"train_{metric_name}"
        for step, value in enumerate(values):
            mlflow.log_metric(metric_key, float(value), step=step)

    # Log model
    mlflow.tensorflow.log_model(autoencoder, "model")

In [None]:
# Plot the autoencoder's training and validation loss per epoch.
fig, loss_ax = plt.subplots(figsize=(10, 5))
loss_ax.plot(history.history['loss'], label='Training Loss')
loss_ax.plot(history.history['val_loss'], label='Validation Loss')
loss_ax.set_title('Autoencoder Training and Validation Loss')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss (MSE)')
loss_ax.legend()
plt.show()

## 6. Hyperparameter Tuning

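Let's perform hyperparameter tuning to find the best configuration for the classification model, using a grid search over the hidden-layer sizes, dropout rate, and learning rate.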

In [None]:
# Grid-search space for the classification model. The grid search tries every
# combination of the three lists below (3 x 3 x 3 = 27 candidates).

# Candidate hidden-layer stacks, one list of layer widths per candidate.
hidden_layers_options = [[64, 32], [128, 64, 32], [256, 128, 64, 32]]

# Candidate dropout rates.
dropout_rates = [0.3, 0.4, 0.5]

# Candidate learning rates.
learning_rates = [1e-3, 5e-4, 1e-4]

In [None]:
# Perform grid search for classification model.
# Each combination of hidden layers, dropout rate and learning rate is trained
# for a reduced number of epochs inside its own MLflow run; the combination
# with the highest validation AUC is kept in `best_params`.
tuning_epochs = 10  # Reduced epochs for faster tuning
# Start below any achievable AUC so that the first candidate is always
# recorded and best_params cannot remain empty after the search.
best_val_auc = float("-inf")
best_params = {}

for hidden_layers in hidden_layers_options:
    for dropout_rate in dropout_rates:
        for lr in learning_rates:
            print(f"\nTraining with: hidden_layers={hidden_layers}, dropout_rate={dropout_rate}, learning_rate={lr}")

            # A new model is built on every iteration; release the Keras
            # global state held for the previous candidate so memory use does
            # not grow across the whole grid.
            tf.keras.backend.clear_session()

            with mlflow.start_run(run_name=f"tuning_classification_hl{len(hidden_layers)}_dr{dropout_rate}_lr{lr}"):
                # Log parameters
                mlflow.log_params({
                    'model_type': 'classification',
                    'hidden_layers': str(hidden_layers),
                    'dropout_rate': dropout_rate,
                    'learning_rate': lr,
                    'batch_size': batch_size,
                    'epochs': tuning_epochs
                })

                # Create model
                model = create_classification_model(input_dim, hidden_layers, dropout_rate)

                # Customize optimizer with learning rate.
                # NOTE(review): this replaces the optimizer attribute on a model
                # that create_classification_model has presumably already
                # compiled. Attribute assignment is not the documented way to
                # change a Keras optimizer -- confirm the learning rate takes
                # effect, or recompile with the factory's loss and metrics.
                model.optimizer = tf.keras.optimizers.Adam(learning_rate=lr)

                # Train model
                history = model.fit(
                    X_train_balanced, y_train_balanced,
                    batch_size=batch_size,
                    epochs=tuning_epochs,
                    validation_data=(X_val_scaled, y_val),
                    verbose=0
                )

                # Evaluate on validation set (silently, matching fit above).
                y_val_pred = model.predict(X_val_scaled, verbose=0)
                # Convert to a plain Python float before logging/comparing.
                val_auc = float(tf.keras.metrics.AUC()(y_val, y_val_pred).numpy())

                # Log metrics
                mlflow.log_metric("val_auc", val_auc)

                print(f"Validation AUC: {val_auc:.4f}")

                # Update best parameters if better
                if val_auc > best_val_auc:
                    best_val_auc = val_auc
                    best_params = {
                        'hidden_layers': hidden_layers,
                        'dropout_rate': dropout_rate,
                        'learning_rate': lr
                    }

print(f"\nBest parameters: {best_params}")
print(f"Best validation AUC: {best_val_auc:.4f}")

## 7. Train Final Models with Best Parameters

Now that we've found the best hyperparameters, let's train the final classification model.

In [None]:
# Train final classification model with best parameters.
# The grid search must have selected a configuration; fail with a clear
# message instead of a KeyError on the lookups below.
if not best_params:
    raise RuntimeError(
        "best_params is empty; run the hyperparameter tuning cell before "
        "training the final model."
    )

# NOTE(review): batch_size and epochs are not set in this cell. When the
# notebook runs top to bottom they hold the values last assigned in the
# autoencoder training cell (256 and 30), not the classification cell's
# epochs of 20 -- confirm that this is intended.
with mlflow.start_run(run_name='final_classification_model'):
    # Log parameters
    mlflow.log_params({
        'model_type': 'classification',
        'hidden_layers': str(best_params['hidden_layers']),
        'dropout_rate': best_params['dropout_rate'],
        'learning_rate': best_params['learning_rate'],
        'batch_size': batch_size,
        'epochs': epochs,
        'final_model': True
    })

    # Create model
    final_model = create_classification_model(
        input_dim,
        best_params['hidden_layers'],
        best_params['dropout_rate']
    )

    # Customize optimizer with best learning rate.
    # NOTE(review): this replaces the optimizer attribute on a model that
    # create_classification_model has presumably already compiled. Attribute
    # assignment is not the documented way to change a Keras optimizer --
    # confirm the learning rate takes effect, or recompile.
    final_model.optimizer = tf.keras.optimizers.Adam(learning_rate=best_params['learning_rate'])

    # Train model
    model_path = os.path.join(model_dir, "final_classification_model.h5")
    history = final_model.fit(
        X_train_balanced, y_train_balanced,
        batch_size=batch_size,
        epochs=epochs,
        validation_data=(X_val_scaled, y_val),
        callbacks=[
            # Stop after 5 epochs without val_loss improvement.
            # restore_best_weights asks Keras to roll the in-memory model back
            # to the best val_loss epoch, so that the model logged to MLflow
            # below is not left at the later, worse weights while the
            # checkpoint file holds the best ones.
            tf.keras.callbacks.EarlyStopping(
                monitor='val_loss', patience=5, restore_best_weights=True
            ),
            # Write the model to model_path whenever val_loss improves.
            tf.keras.callbacks.ModelCheckpoint(model_path, monitor='val_loss', save_best_only=True)
        ]
    )

    # Log one MLflow metric series per entry of the training history, using
    # the epoch index as the step. Keras already prefixes validation metrics
    # with "val_", so only the remaining keys get the "train_" prefix;
    # prefixing everything would record validation loss as "train_val_loss".
    for metric_name, values in history.history.items():
        if metric_name.startswith("val_"):
            metric_key = metric_name
        else:
            metric_key = f"train_{metric_name}"
        for step, value in enumerate(values):
            mlflow.log_metric(metric_key, float(value), step=step)

    # Log model
    mlflow.tensorflow.log_model(final_model, "model")

    print(f"Final classification model saved to {model_path}")

## 8. Summary

In this notebook, we've:
1. Loaded and prepared the processed transaction data
2. Handled class imbalance using SMOTE
3. Trained classification and autoencoder models for fraud detection
4. Performed hyperparameter tuning to find the best model configuration
5. Trained the final classification model with the best parameters

The trained models are saved in the `results/models` directory and can be used for evaluation and deployment.