In [1]:
# Cell 1: Import Required Libraries
# Purpose: Importing packages for resampling methods and visualization
# Dependencies: imblearn, sklearn, resampling_utils from src/
# ============================================================================

import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import time
import warnings
warnings.filterwarnings('ignore')

# Adding src directory to path
sys.path.append('../src')

# Importing custom utilities
from config import *
from data_utils import retrieve_processed_datasets
from resampling_utils import (
    initialize_resampling_methods,
    apply_resampling_technique,
    save_resampled_dataset,
    compute_resampling_statistics
)
from visualization import plot_class_distribution

# Setting visualization style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("Set2")

# Display settings
pd.set_option('display.max_columns', None)
pd.set_option('display.precision', 4)

print("All libraries imported successfully!")
print(f"Working Directory: {Path.cwd()}")
print(f"Random Seed: {SEED_VALUE}")

All libraries imported successfully!
Working Directory: C:\Users\Ashutosh\Documents\Projects\beyond-smote-evaluation\notebooks
Random Seed: 42


In [3]:
# Cell 2: Loading Training Data Only
# Loading only training data for resampling (test data is untouched)
# Remember to never apply resampling to test data

print("="*70)
print("LOADING TRAINING DATA FOR RESAMPLING")
print("="*70)

# Loading processed datasets
X_train, X_test, y_train, y_test = retrieve_processed_datasets(file_prefix='higgs')

print("\nData loaded successfully!")
print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")

# Displaying original class distribution
print("="*70)
print("ORIGINAL CLASS DISTRIBUTION (Training Set)")
print("="*70)

train_dist = y_train.value_counts().sort_index()
total_samples = len(y_train)

for class_label, count in train_dist.items():
    percentage = (count / total_samples) * 100
    print(f"Class {int(class_label)}: {count:,} samples ({percentage:.2f}%)")

# Calculating imbalance metrics
minority_count = train_dist.min()
majority_count = train_dist.max()
imbalance_ratio = minority_count / majority_count

print(f"\nImbalance Ratio: {imbalance_ratio:.3f}:1")
print(f"Minority Class: {minority_count:,} samples")
print(f"Majority Class: {majority_count:,} samples")
print(f"Total Training Samples: {total_samples:,}")

print("\nNote: Test set will remain UNCHANGED throughout all experiments")

LOADING TRAINING DATA FOR RESAMPLING
Loading processed data from C:\Users\Ashutosh\Documents\Projects\beyond-smote-evaluation\data\processed...
Datasets loaded successfully
Training shape: (800000, 28)
Testing shape: (200000, 28)

Data loaded successfully!
Training set shape: (800000, 28)
Test set shape: (200000, 28)
ORIGINAL CLASS DISTRIBUTION (Training Set)
Class 0: 376,262 samples (47.03%)
Class 1: 423,738 samples (52.97%)

Imbalance Ratio: 0.888:1
Minority Class: 376,262 samples
Majority Class: 423,738 samples
Total Training Samples: 800,000

Note: Test set will remain UNCHANGED throughout all experiments


In [4]:
# Cell 3: Initialize All Resampling Methods
# Creating instances of all 11 resampling techniques
# The output will be a dictionary of resampling method objects

print("="*70)
print("INITIALIZING RESAMPLING METHODS")
print("="*70)

# Initializing all resampling methods
resampling_methods = initialize_resampling_methods()

print(f"\nInitialized {len(resampling_methods)} resampling methods:")
for i, method_name in enumerate(resampling_methods.keys(), 1):
    print(f"  {i:2d}. {method_name}")

print("\nResampling Methods Categories:")
print("\nOversampling (Increase Minority Class):")
print("  - Random Oversampling (ROS)")
print("  - SMOTE (Synthetic Minority Over-sampling Technique)")
print("  - Borderline-SMOTE (Focus on borderline samples)")
print("  - ADASYN (Adaptive Synthetic Sampling)")

print("\nUndersampling (Reduce Majority Class):")
print("  - Random Undersampling (RUS)")
print("  - Tomek Links (Remove noisy majority samples)")
print("  - NearMiss (Intelligent majority removal)")

print("\nCombination Methods:")
print("  - SMOTE + Tomek (Oversample then clean)")
print("  - SMOTE + ENN (Oversample then remove noise)")

print("\nAlgorithm-Level (No Data Modification):")
print("  - Class Weighting (used during model training)")

print("\nBaseline:")
print("  - None (original imbalanced data)")

INITIALIZING RESAMPLING METHODS

Initialized 11 resampling methods:
   1. baseline
   2. random_over
   3. smote
   4. borderline_smote
   5. adasyn
   6. random_under
   7. tomek
   8. nearmiss
   9. smote_tomek
  10. smote_enn
  11. class_weight

Resampling Methods Categories:

Oversampling (Increase Minority Class):
  - Random Oversampling (ROS)
  - SMOTE (Synthetic Minority Over-sampling Technique)
  - Borderline-SMOTE (Focus on borderline samples)
  - ADASYN (Adaptive Synthetic Sampling)

Undersampling (Reduce Majority Class):
  - Random Undersampling (RUS)
  - Tomek Links (Remove noisy majority samples)
  - NearMiss (Intelligent majority removal)

Combination Methods:
  - SMOTE + Tomek (Oversample then clean)
  - SMOTE + ENN (Oversample then remove noise)

Algorithm-Level (No Data Modification):
  - Class Weighting (used during model training)

Baseline:
  - None (original imbalanced data)


In [5]:
# Cell 4: Baseline - No Resampling
# Storing original data as baseline for comparison
# The output will be baseline statistics and visualization

print("="*70)
print("METHOD 1/11: BASELINE (No Resampling)")
print("="*70)

# Storing baseline data
X_baseline = X_train.copy()
y_baseline = y_train.copy()

# Computing statistics
baseline_stats = {
    'method': 'baseline',
    'n_samples': len(X_baseline),
    'n_minority': int(y_baseline.sum()),
    'n_majority': int((y_baseline == 0).sum()),
    'imbalance_ratio': imbalance_ratio,
    'resampling_time': 0.0
}

print(f"\nBaseline Statistics:")
print(f"  Total Samples: {baseline_stats['n_samples']:,}")
print(f"  Minority Class: {baseline_stats['n_minority']:,}")
print(f"  Majority Class: {baseline_stats['n_majority']:,}")
print(f"  Imbalance Ratio: {baseline_stats['imbalance_ratio']:.3f}:1")

# Saving baseline (for consistency)
save_resampled_dataset(
    X_resampled=X_baseline,
    y_resampled=y_baseline,
    method_name='baseline',
    output_dir=RESAMPLED_DIR
)

# Initializing results storage
resampling_results = {
    'baseline': {
        'X': X_baseline,
        'y': y_baseline,
        'stats': baseline_stats
    }
}

print("\nBaseline data saved")

METHOD 1/11: BASELINE (No Resampling)

Baseline Statistics:
  Total Samples: 800,000
  Minority Class: 423,738
  Majority Class: 376,262
  Imbalance Ratio: 0.888:1
Saving resampled data for baseline...
Data saved to C:\Users\Ashutosh\Documents\Projects\beyond-smote-evaluation\data\resampled\higgs_baseline_resampled.csv
Observations: 800,000
File size: 248.57 MB

Baseline data saved
