In [1]:
# Cell 1: Importing Required Libraries

import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import json
import time
from itertools import product
import warnings
# Suppress warnings for the rest of the kernel session: no category, module or
# message filter is passed, so the 'ignore' action applies to every warning.
warnings.filterwarnings('ignore')

# Adding src directory to path
# The entry is a relative path, so it resolves against the kernel's current
# working directory (the notebooks folder in the recorded run).
# Presumably this is where config, data_utils, model_utils and
# evaluation_utils live — confirm against the repository layout.
sys.path.append('../src')

# Importing custom utilities
# NOTE(review): the wildcard import hides which names come from config. The
# cells visible here rely on SEED_VALUE and METRIC_OUTPUT, neither of which is
# defined in the notebook itself — presumably both come from config; confirm
# before narrowing this import.
from config import *
from data_utils import retrieve_processed_datasets, load_resampled_data
from model_utils import (
    initialize_all_models,
    train_single_model,
    evaluate_model_predictions,
    perform_cross_validation,
    save_trained_model
)
from evaluation_utils import (
    calculate_all_metrics,
    create_confusion_matrix,
    perform_statistical_significance_test,
    print_statistical_test_results,
    create_metrics_summary_table
)

# Plot appearance: apply the matplotlib style sheet first, then the seaborn palette
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("Set2")

# pandas display: no cap on the number of columns shown, 4 digits of precision
for _option, _value in (
    ('display.max_columns', None),
    ('display.precision', 4),
):
    pd.set_option(_option, _value)

# Confirmation output: current working directory and the configured seed
# (SEED_VALUE is not defined in this cell; it must already be in scope).
print("All libraries imported successfully!")
print(f"Working Directory: {Path.cwd()}")
print(f"Random Seed: {SEED_VALUE}")

All libraries imported successfully!
Working Directory: C:\Users\Ashutosh\Documents\Projects\beyond-smote-evaluation\notebooks
Random Seed: 42


In [2]:
# Cell 2: Load Test Data (Unchanged)
# The test split is loaded once here and stays constant across all experiments.
# Reminder: the test set is NEVER resampled.

banner = "=" * 70
print(banner)
print("LOADING TEST DATA")
print(banner)

# Load the original processed datasets (train and test splits) for the 'higgs' prefix
(
    X_train_original,
    X_test,
    y_train_original,
    y_test,
) = retrieve_processed_datasets(file_prefix='higgs')

print("\nTest set loaded (unchanged across all experiments):")
print(f"  Test features shape: {X_test.shape}")
print(f"  Test labels shape: {y_test.shape}")

# Per-class counts of the test labels, ordered by class label
test_dist = y_test.value_counts().sort_index()
n_test_rows = len(y_test)

print(f"\nTest Set Class Distribution:")
for class_label, count in test_dist.items():
    # Share of the test set held by this class, in percent
    percentage = (count / n_test_rows) * 100
    print(f"  Class {int(class_label)}: {count:,} ({percentage:.2f}%)")

print("\nNote: This test set will be used to evaluate ALL 55 experiments")

LOADING TEST DATA
Loading processed data from C:\Users\Ashutosh\Documents\Projects\beyond-smote-evaluation\data\processed...
Datasets loaded successfully
Training shape: (800000, 28)
Testing shape: (200000, 28)

Test set loaded (unchanged across all experiments):
  Test features shape: (200000, 28)
  Test labels shape: (200000,)

Test Set Class Distribution:
  Class 0: 94,065 (47.03%)
  Class 1: 105,935 (52.97%)

Note: This test set will be used to evaluate ALL 55 experiments


In [3]:
# Cell 3: Load Baseline Results
# Purpose: Loading baseline performance for comparison
#
# Reads the baseline metrics JSON, which is consumed below as a mapping of
# model name -> metrics dict containing an 'f1_score' entry. Prints every
# model's F1-score and stores the best-scoring entry in `best_baseline_model`
# as a (model_name, metrics) tuple.
#
# Raises:
#   FileNotFoundError: the metrics file does not exist.
#   ValueError: the metrics file contains no models.

print("="*70)
print("LOADING BASELINE RESULTS")
print("="*70)

# Loading baseline metrics
baseline_metrics_path = METRIC_OUTPUT / 'baseline' / 'baseline_metrics.json'

try:
    with open(baseline_metrics_path, 'r', encoding='utf-8') as f:
        baseline_metrics = json.load(f)
except FileNotFoundError as exc:
    # Same exception type as before, but tell the reader how to recover.
    raise FileNotFoundError(
        f"Baseline metrics file not found: {baseline_metrics_path}. "
        "Run the baseline evaluation step first so that this file is generated."
    ) from exc

# Without this guard an empty mapping only fails later, inside max(), with an
# opaque "empty iterable" ValueError that does not mention the file.
if not baseline_metrics:
    raise ValueError(f"No baseline models found in {baseline_metrics_path}")

print("\nBaseline metrics loaded successfully!")
print(f"Number of baseline models: {len(baseline_metrics)}")

# Displaying baseline F1-scores
print("\nBaseline F1-Scores (No Resampling):")
for model_name, metrics in baseline_metrics.items():
    print(f"  {model_name.replace('_', ' ').title()}: {metrics['f1_score']:.4f}")

# Identifying best baseline: the (name, metrics) pair with the highest F1-score
best_baseline_model = max(
    baseline_metrics.items(),
    key=lambda item: item[1]['f1_score'],
)
best_name, best_metrics = best_baseline_model
print(f"\nBest Baseline: {best_name} (F1={best_metrics['f1_score']:.4f})")

LOADING BASELINE RESULTS

Baseline metrics loaded successfully!
Number of baseline models: 5

Baseline F1-Scores (No Resampling):
  Logistic Regression: 0.6869
  Random Forest: 0.7493
  Xgboost: 0.7468
  Svm: 0.4500
  Mlp: 0.7701

Best Baseline: mlp (F1=0.7701)


In [4]:
# Cell 4: Load Resampling Statistics
# Reads the statistics recorded for each resampling method by the previous
# notebook (03) and lists every method with its sample count and imbalance ratio.

separator = "=" * 70
print(separator)
print("LOADING RESAMPLING STATISTICS")
print(separator)

# Location of the resampling statistics JSON
resampling_stats_path = METRIC_OUTPUT / 'resampling' / 'resampling_statistics.json'

with open(resampling_stats_path, 'r', encoding='utf-8') as stats_file:
    resampling_stats = json.load(stats_file)

print("\nResampling statistics loaded successfully!")
print(f"Number of resampling methods: {len(resampling_stats)}")

print("\nResampling Methods Available:")
# Walk the mapping in file order, numbering the methods from 1
for i, (method_name, method_info) in enumerate(resampling_stats.items(), start=1):
    display_name = method_name.replace('_', ' ').title()
    sample_count = method_info['n_samples']
    ratio = method_info['imbalance_ratio']
    print(f"  {i:2d}. {display_name}: "
          f"{sample_count:,} samples "
          f"(Ratio: {ratio:.3f}:1)")


LOADING RESAMPLING STATISTICS

Resampling statistics loaded successfully!
Number of resampling methods: 11

Resampling Methods Available:
   1. Baseline: 800,000 samples (Ratio: 0.888:1)
   2. Random Oversampling: 847,476 samples (Ratio: 1.000:1)
   3. Smote: 847,476 samples (Ratio: 1.000:1)
   4. Borderline Smote: 847,476 samples (Ratio: 1.000:1)
   5. Adasyn: 847,476 samples (Ratio: 1.000:1)
   6. Random Undersampling: 752,524 samples (Ratio: 1.000:1)
   7. Tomek Links: 751,637 samples (Ratio: 0.998:1)
   8. Nearmiss: 752,524 samples (Ratio: 1.000:1)
   9. Smote Tomek: 771,632 samples (Ratio: 1.000:1)
  10. Smote Enn: 276,095 samples (Ratio: 0.858:1)
  11. Class Weighting: 800,000 samples (Ratio: 0.888:1)
