# Notebook 02: Preprocessing & Baseline Comparison

**Objective**: Establish the preprocessing pipeline and baseline model performance for later comparison with neural networks.

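**Critical for Reproducibility**: This notebook creates the **train/validation/test split indices** and the **fitted StandardScaler** that are reused by all subsequent notebooks. Both artifacts are saved to disk so that every notebook works on identical data partitions and scaling statistics, which prevents data leakage between training and evaluation data.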

**Contents**:
1. Load and split data (70% train / 15% validation / 15% test, stratified)
2. Fit StandardScaler on training data only
3. Train and evaluate baseline models (Logistic Regression, Random Forest)
4. Save splits, scaler, and baseline performance for comparison

In [None]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import joblib
import sys
import os
import time

# Add src to path
sys.path.append(os.path.abspath('../'))
import config
from src.evaluation_metrics import compute_fraud_metrics, print_classification_summary
from src.visualization_utils import plot_confusion_matrix, plot_precision_recall_curve

# Project-wide setup through the helpers exposed by `config`.
# NOTE(review): presumably these seed the RNGs and create the output folders;
# their implementations live outside this notebook -- confirm in config.py.
config.set_random_seeds()
config.ensure_directories()

# Echo the effective configuration so the run log is self-describing.
for status_line in (
    "‚úì Imports complete",
    f"‚úì Random seed set to {config.RANDOM_SEED}",
    f"‚úì Results will be saved to: {config.RESULTS_DIR}",
):
    print(status_line)

## 1. Load Data

In [None]:
# Read the full transaction table from the location configured in `config`.
df = pd.read_csv(config.DATA_PATH)
print(f"‚úì Loaded {df.shape[0]:,} transactions with {df.shape[1]} features")

# Select the model inputs and the label, then expose both as arrays
# (`X` and `y` are the names every later cell relies on).
feature_frame = df[config.FEATURE_COLUMNS]
label_series = df[config.TARGET_COLUMN]
X, y = feature_frame.values, label_series.values

# Report array shapes and the share of positive labels in one call;
# the newline separator reproduces one message per line.
print(
    f"\n‚úì Features shape: {X.shape}",
    f"‚úì Target shape: {y.shape}",
    f"‚úì Fraud prevalence: {y.mean()*100:.4f}%",
    sep="\n",
)

## 2. Create Stratified Train/Validation/Test Splits

**Critical Decision**: We use a 70/15/15 split with stratification so that every set preserves the overall fraud rate (the classes remain imbalanced, but in the same proportion in train, validation, and test). Split indices are **saved** to ensure all subsequent notebooks use identical data partitions (prevents leakage).

In [None]:
# Stratified 70/15/15 split.
# Positional row indices are passed through both splits alongside X and y so
# that each partition can be rebuilt later from the saved index arrays alone.

# First split: 70% train, 30% temp (for val+test)
X_train, X_temp, y_train, y_temp, train_idx, temp_idx = train_test_split(
    X, y, np.arange(len(y)),
    test_size=0.30,
    stratify=y,
    random_state=config.RANDOM_SEED
)

# Second split: 50% of temp (15% of total) for validation, 50% for test
X_val, X_test, y_val, y_test, val_idx_temp, test_idx_temp = train_test_split(
    X_temp, y_temp, np.arange(len(y_temp)),
    test_size=0.50,
    stratify=y_temp,
    random_state=config.RANDOM_SEED
)

# The second split returns positions *within* the temp subset; map them back
# to positions in the original dataset.
val_idx = temp_idx[val_idx_temp]
test_idx = temp_idx[test_idx_temp]

# Sanity checks before persisting: the saved indices are reused by later
# notebooks, so an overlap or a wrong mapping here would silently leak
# evaluation rows into training.
all_split_idx = np.concatenate([train_idx, val_idx, test_idx])
assert len(all_split_idx) == len(y), "split sizes do not add up to the dataset size"
assert len(np.unique(all_split_idx)) == len(y), "a row appears in more than one split"
assert np.array_equal(y[train_idx], y_train), "train indices do not reproduce y_train"
assert np.array_equal(y[val_idx], y_val), "validation indices do not reproduce y_val"
assert np.array_equal(y[test_idx], y_test), "test indices do not reproduce y_test"

# Save indices for reproducibility
np.save(config.TRAIN_INDICES_PATH, train_idx)
np.save(config.VAL_INDICES_PATH, val_idx)
np.save(config.TEST_INDICES_PATH, test_idx)

# Report partition sizes, then the per-partition fraud rate (stratification
# should keep the three rates nearly identical).
print("‚úì Data split complete:")
print(f"  Train: {len(train_idx):,} samples ({len(train_idx)/len(y)*100:.1f}%)")
print(f"  Val:   {len(val_idx):,} samples ({len(val_idx)/len(y)*100:.1f}%)")
print(f"  Test:  {len(test_idx):,} samples ({len(test_idx)/len(y)*100:.1f}%)")
print(f"\n‚úì Class distribution:")
print(f"  Train fraud rate: {y_train.mean()*100:.4f}%")
print(f"  Val fraud rate:   {y_val.mean()*100:.4f}%")
print(f"  Test fraud rate:  {y_test.mean()*100:.4f}%")
print(f"\n‚úì Split indices saved to:")
print(f"  {config.TRAIN_INDICES_PATH}")
print(f"  {config.VAL_INDICES_PATH}")
print(f"  {config.TEST_INDICES_PATH}")

## 3. Fit StandardScaler on Training Data

**Critical for Data Leakage Prevention**: The scaler is fit **only** on training data, then applied to validation and test sets. The fitted scaler is saved for use in all subsequent notebooks.

In [None]:
# Learn the per-feature mean and scale from the training partition ONLY, so no
# statistics from validation/test rows influence the preprocessing.
scaler = StandardScaler().fit(X_train)

# Apply the same fitted transformation to all three partitions.
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(partition) for partition in (X_train, X_val, X_test)
)

# Persist the fitted scaler at the path configured for reuse.
joblib.dump(scaler, config.SCALER_PATH)

# Status report: where the scaler went, the range of the learned statistics,
# and the global mean/std of each scaled partition (train should be ~0 / ~1).
print(
    "‚úì StandardScaler fitted on training data",
    f"‚úì Scaler saved to: {config.SCALER_PATH}",
    f"\n‚úì Feature statistics (from training data):",
    f"  Mean range: [{scaler.mean_.min():.4f}, {scaler.mean_.max():.4f}]",
    f"  Std range:  [{scaler.scale_.min():.4f}, {scaler.scale_.max():.4f}]",
    f"\n‚úì Scaled data statistics:",
    f"  Train - Mean: {X_train_scaled.mean():.6f}, Std: {X_train_scaled.std():.6f}",
    f"  Val   - Mean: {X_val_scaled.mean():.6f}, Std: {X_val_scaled.std():.6f}",
    f"  Test  - Mean: {X_test_scaled.mean():.6f}, Std: {X_test_scaled.std():.6f}",
    sep="\n",
)

## 4. Baseline Model 1: Logistic Regression

We train a simple Logistic Regression with `class_weight='balanced'` to handle class imbalance. This provides a linear baseline for comparison with neural networks.

In [None]:
# Linear baseline: fit Logistic Regression on the scaled training data and
# score it on the validation split.
print("Training Logistic Regression with balanced class weights...")

# class_weight='balanced' weights each class inversely to its frequency.
# n_jobs is intentionally not passed: LogisticRegression only uses it to
# parallelise across classes (one-vs-rest), so on this two-class fraud target
# it would add worker start-up overhead without any speed-up.
lr_model = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    random_state=config.RANDOM_SEED
)

# Time only the fit, using the monotonic high-resolution clock; time.time()
# follows the wall clock and can jump if the system time is adjusted.
start_time = time.perf_counter()
lr_model.fit(X_train_scaled, y_train)
train_time = time.perf_counter() - start_time

# Predictions on validation set: column 1 of predict_proba is the score for
# the positive class; predict() gives the hard labels.
y_val_pred_proba_lr = lr_model.predict_proba(X_val_scaled)[:, 1]
y_val_pred_lr = lr_model.predict(X_val_scaled)

# Compute metrics with the project helper. Later cells read entries such as
# 'pr_auc', 'roc_auc' and 'fraud_f1' from the returned mapping.
lr_metrics = compute_fraud_metrics(y_val, y_val_pred_lr, y_val_pred_proba_lr)

print(f"\n‚úì Training complete in {train_time:.2f} seconds")
print("\n" + "="*60)
print("LOGISTIC REGRESSION - Validation Performance")
print("="*60)
print_classification_summary(lr_metrics)

# Save model
lr_model_path = os.path.join(config.MODELS_DIR, 'logistic_regression_baseline.pkl')
joblib.dump(lr_model, lr_model_path)
print(f"\n‚úì Model saved to: {lr_model_path}")

## 5. Baseline Model 2: Random Forest

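Random Forest provides a non-linear baseline with ensemble learning. We use `class_weight='balanced'` and limit tree depth to reduce overfitting. The model is trained on the same standardized features as Logistic Regression for consistency, although tree-based models do not require feature scaling.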

In [None]:
# Non-linear baseline: fit a Random Forest on the scaled training data and
# score it on the validation split.
print("Training Random Forest with balanced class weights...")

# 100 depth-limited trees; n_jobs=-1 builds the trees in parallel on all cores.
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=15,
    class_weight='balanced',
    random_state=config.RANDOM_SEED,
    n_jobs=-1
)

# Time only the fit, using the monotonic high-resolution clock; time.time()
# follows the wall clock and can jump if the system time is adjusted.
start_time = time.perf_counter()
rf_model.fit(X_train_scaled, y_train)
train_time = time.perf_counter() - start_time

# Predictions on validation set: column 1 of predict_proba is the score for
# the positive class; predict() gives the hard labels.
y_val_pred_proba_rf = rf_model.predict_proba(X_val_scaled)[:, 1]
y_val_pred_rf = rf_model.predict(X_val_scaled)

# Compute metrics with the project helper. Later cells read entries such as
# 'pr_auc', 'roc_auc' and 'fraud_f1' from the returned mapping.
rf_metrics = compute_fraud_metrics(y_val, y_val_pred_rf, y_val_pred_proba_rf)

print(f"\n‚úì Training complete in {train_time:.2f} seconds")
print("\n" + "="*60)
print("RANDOM FOREST - Validation Performance")
print("="*60)
print_classification_summary(rf_metrics)

# Save model
rf_model_path = os.path.join(config.MODELS_DIR, 'random_forest_baseline.pkl')
joblib.dump(rf_model, rf_model_path)
print(f"\n‚úì Model saved to: {rf_model_path}")

## 6. Visualize Baseline Performance

In [None]:
# 2x2 comparison figure: confusion matrices on the top row, precision-recall
# curves on the bottom row; Logistic Regression on the left, Random Forest on
# the right. All panels use validation-set predictions.
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Confusion matrices
plot_confusion_matrix(y_val, y_val_pred_lr, ['Legitimate', 'Fraud'],
                     title='Logistic Regression - Confusion Matrix', ax=axes[0, 0])
plot_confusion_matrix(y_val, y_val_pred_rf, ['Legitimate', 'Fraud'],
                     title='Random Forest - Confusion Matrix', ax=axes[0, 1])

# PR curves
plot_precision_recall_curve(y_val, y_val_pred_proba_lr,
                            title='Logistic Regression - PR Curve', ax=axes[1, 0])
plot_precision_recall_curve(y_val, y_val_pred_proba_rf,
                            title='Random Forest - PR Curve', ax=axes[1, 1])

# Build the output path from config.RESULTS_DIR (as the results table does)
# instead of a path hard-coded relative to the working directory, and make
# sure the target folder exists before writing.
# NOTE(review): assumes config.RESULTS_DIR is the project's `results` folder,
# i.e. the same location as the former '../results' -- confirm in config.py.
figure_path = os.path.join(config.RESULTS_DIR, 'figures', 'baseline_comparison.png')
os.makedirs(os.path.dirname(figure_path), exist_ok=True)

# Save through the figure handle rather than pyplot's implicit "current
# figure", so the file always contains the grid created above.
fig.tight_layout()
fig.savefig(figure_path, dpi=300, bbox_inches='tight')
plt.show()

print("‚úì Visualization saved: baseline_comparison.png")

## 7. Save Baseline Performance Targets

These baseline metrics, computed on the validation set, will be used as the reference when comparing neural network performance in subsequent notebooks.

In [None]:
# Assemble a one-row-per-model summary of the validation metrics.
# Each pair is (output column name, key to read from the metrics mapping),
# listed in the order the columns appear in the saved table.
metric_columns = [
    ('pr_auc', 'pr_auc'),
    ('roc_auc', 'roc_auc'),
    ('f1_fraud', 'fraud_f1'),
    ('precision_fraud', 'fraud_precision'),
    ('recall_fraud', 'fraud_recall'),
    ('accuracy', 'accuracy'),
]
baseline_metrics = (lr_metrics, rf_metrics)  # same order as the 'model' column

table_columns = {'model': ['Logistic Regression', 'Random Forest']}
for column_name, metric_key in metric_columns:
    table_columns[column_name] = [metrics[metric_key] for metrics in baseline_metrics]
baseline_results = pd.DataFrame(table_columns)

# Write the table under the results directory for later notebooks to read.
baseline_path = os.path.join(config.RESULTS_DIR, 'tables', 'baseline_performance_targets.csv')
baseline_results.to_csv(baseline_path, index=False)

# Show the table and where it was written.
print(
    "‚úì Baseline performance summary:",
    baseline_results.to_string(index=False),
    f"\n‚úì Saved to: {baseline_path}",
    sep="\n",
)

## 8. Summary & Next Steps

In [None]:
# Final recap: split sizes, saved preprocessing artifacts, validation metrics
# of both baselines, and the PR-AUC the neural networks must beat.
print("\n" + "="*70)
print(" NOTEBOOK 02 SUMMARY - PREPROCESSING & BASELINES")
print("="*70)
print(f"‚úì Data split: {len(train_idx):,} train / {len(val_idx):,} val / {len(test_idx):,} test")
print(f"‚úì Scaler fitted and saved: {config.SCALER_PATH}")
print(f"‚úì Split indices saved for reproducibility")
print("\nüìä Baseline Performance (Validation Set):")
print(f"  Logistic Regression:")
print(f"    - PR-AUC: {lr_metrics['pr_auc']:.4f}")
print(f"    - ROC-AUC: {lr_metrics['roc_auc']:.4f}")
print(f"    - F1 (Fraud): {lr_metrics['fraud_f1']:.4f}")
print(f"  Random Forest:")
print(f"    - PR-AUC: {rf_metrics['pr_auc']:.4f}")
print(f"    - ROC-AUC: {rf_metrics['roc_auc']:.4f}")
print(f"    - F1 (Fraud): {rf_metrics['fraud_f1']:.4f}")

# The bar to clear is the better of the two baseline PR-AUC values.
print("\nüéØ Performance Targets for Neural Networks:")
print(f"  - Must exceed: PR-AUC > {max(lr_metrics['pr_auc'], rf_metrics['pr_auc']):.4f}")
print(f"  - Goal: PR-AUC > 0.80 (significant improvement over baselines)")

# Everything this notebook wrote to disk. The two baseline model files are
# listed as well, since they are saved when each model is trained.
# NOTE(review): the figure entry is a literal path relative to the notebook
# directory; presumably it equals config.RESULTS_DIR/figures -- confirm.
print("\nüìÅ Artifacts Created:")
artifacts = [
    config.TRAIN_INDICES_PATH,
    config.VAL_INDICES_PATH,
    config.TEST_INDICES_PATH,
    config.SCALER_PATH,
    lr_model_path,
    rf_model_path,
    baseline_path,
    '../results/figures/baseline_comparison.png'
]
for artifact in artifacts:
    print(f"   {artifact}")

print("\n‚úÖ Notebook 02 Complete!")
print("üöÄ Ready for Notebook 03: Neural Network Architecture Exploration")
print("="*70)