In [1]:
# Cell 1: Import Required Libraries
# Purpose: Importing packages for resampling methods and visualization
# Dependencies: imblearn, sklearn, resampling_utils from src/
# ============================================================================

import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import time
import warnings
warnings.filterwarnings('ignore')

# Adding src directory to path
sys.path.append('../src')

# Importing custom utilities
from config import *
from data_utils import retrieve_processed_datasets
from resampling_utils import (
    initialize_resampling_methods,
    apply_resampling_technique,
    save_resampled_dataset,
    compute_resampling_statistics
)
from visualization import plot_class_distribution

# Setting visualization style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("Set2")

# Display settings
pd.set_option('display.max_columns', None)
pd.set_option('display.precision', 4)

print("All libraries imported successfully!")
print(f"Working Directory: {Path.cwd()}")
print(f"Random Seed: {SEED_VALUE}")

All libraries imported successfully!
Working Directory: C:\Users\Ashutosh\Documents\Projects\beyond-smote-evaluation\notebooks
Random Seed: 42


In [3]:
# Cell 2: Loading Training Data Only
# Loading only training data for resampling (test data is untouched)
# Remember to never apply resampling to test data

print("="*70)
print("LOADING TRAINING DATA FOR RESAMPLING")
print("="*70)

# Loading processed datasets
X_train, X_test, y_train, y_test = retrieve_processed_datasets(file_prefix='higgs')

print("\nData loaded successfully!")
print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")

# Displaying original class distribution
print("="*70)
print("ORIGINAL CLASS DISTRIBUTION (Training Set)")
print("="*70)

train_dist = y_train.value_counts().sort_index()
total_samples = len(y_train)

for class_label, count in train_dist.items():
    percentage = (count / total_samples) * 100
    print(f"Class {int(class_label)}: {count:,} samples ({percentage:.2f}%)")

# Calculating imbalance metrics
minority_count = train_dist.min()
majority_count = train_dist.max()
imbalance_ratio = minority_count / majority_count

print(f"\nImbalance Ratio: {imbalance_ratio:.3f}:1")
print(f"Minority Class: {minority_count:,} samples")
print(f"Majority Class: {majority_count:,} samples")
print(f"Total Training Samples: {total_samples:,}")

print("\nNote: Test set will remain UNCHANGED throughout all experiments")

LOADING TRAINING DATA FOR RESAMPLING
Loading processed data from C:\Users\Ashutosh\Documents\Projects\beyond-smote-evaluation\data\processed...
Datasets loaded successfully
Training shape: (800000, 28)
Testing shape: (200000, 28)

Data loaded successfully!
Training set shape: (800000, 28)
Test set shape: (200000, 28)
ORIGINAL CLASS DISTRIBUTION (Training Set)
Class 0: 376,262 samples (47.03%)
Class 1: 423,738 samples (52.97%)

Imbalance Ratio: 0.888:1
Minority Class: 376,262 samples
Majority Class: 423,738 samples
Total Training Samples: 800,000

Note: Test set will remain UNCHANGED throughout all experiments


In [4]:
# Cell 3: Initialize All Resampling Methods
# Creating instances of all 11 resampling techniques
# The output will be a dictionary of resampling method objects

print("="*70)
print("INITIALIZING RESAMPLING METHODS")
print("="*70)

# Initializing all resampling methods
resampling_methods = initialize_resampling_methods()

print(f"\nInitialized {len(resampling_methods)} resampling methods:")
for i, method_name in enumerate(resampling_methods.keys(), 1):
    print(f"  {i:2d}. {method_name}")

print("\nResampling Methods Categories:")
print("\nOversampling (Increase Minority Class):")
print("  - Random Oversampling (ROS)")
print("  - SMOTE (Synthetic Minority Over-sampling Technique)")
print("  - Borderline-SMOTE (Focus on borderline samples)")
print("  - ADASYN (Adaptive Synthetic Sampling)")

print("\nUndersampling (Reduce Majority Class):")
print("  - Random Undersampling (RUS)")
print("  - Tomek Links (Remove noisy majority samples)")
print("  - NearMiss (Intelligent majority removal)")

print("\nCombination Methods:")
print("  - SMOTE + Tomek (Oversample then clean)")
print("  - SMOTE + ENN (Oversample then remove noise)")

print("\nAlgorithm-Level (No Data Modification):")
print("  - Class Weighting (used during model training)")

print("\nBaseline:")
print("  - None (original imbalanced data)")

INITIALIZING RESAMPLING METHODS

Initialized 11 resampling methods:
   1. baseline
   2. random_over
   3. smote
   4. borderline_smote
   5. adasyn
   6. random_under
   7. tomek
   8. nearmiss
   9. smote_tomek
  10. smote_enn
  11. class_weight

Resampling Methods Categories:

Oversampling (Increase Minority Class):
  - Random Oversampling (ROS)
  - SMOTE (Synthetic Minority Over-sampling Technique)
  - Borderline-SMOTE (Focus on borderline samples)
  - ADASYN (Adaptive Synthetic Sampling)

Undersampling (Reduce Majority Class):
  - Random Undersampling (RUS)
  - Tomek Links (Remove noisy majority samples)
  - NearMiss (Intelligent majority removal)

Combination Methods:
  - SMOTE + Tomek (Oversample then clean)
  - SMOTE + ENN (Oversample then remove noise)

Algorithm-Level (No Data Modification):
  - Class Weighting (used during model training)

Baseline:
  - None (original imbalanced data)


In [5]:
# Cell 4: Baseline - No Resampling
# Storing original data as baseline for comparison
# The output will be baseline statistics and visualization

print("="*70)
print("METHOD 1/11: BASELINE (No Resampling)")
print("="*70)

# Storing baseline data
X_baseline = X_train.copy()
y_baseline = y_train.copy()

# Computing statistics
baseline_stats = {
    'method': 'baseline',
    'n_samples': len(X_baseline),
    'n_minority': int(y_baseline.sum()),
    'n_majority': int((y_baseline == 0).sum()),
    'imbalance_ratio': imbalance_ratio,
    'resampling_time': 0.0
}

print(f"\nBaseline Statistics:")
print(f"  Total Samples: {baseline_stats['n_samples']:,}")
print(f"  Minority Class: {baseline_stats['n_minority']:,}")
print(f"  Majority Class: {baseline_stats['n_majority']:,}")
print(f"  Imbalance Ratio: {baseline_stats['imbalance_ratio']:.3f}:1")

# Saving baseline (for consistency)
save_resampled_dataset(
    X_resampled=X_baseline,
    y_resampled=y_baseline,
    method_name='baseline',
    output_dir=RESAMPLED_DIR
)

# Initializing results storage
resampling_results = {
    'baseline': {
        'X': X_baseline,
        'y': y_baseline,
        'stats': baseline_stats
    }
}

print("\nBaseline data saved")

METHOD 1/11: BASELINE (No Resampling)

Baseline Statistics:
  Total Samples: 800,000
  Minority Class: 423,738
  Majority Class: 376,262
  Imbalance Ratio: 0.888:1
Saving resampled data for baseline...
Data saved to C:\Users\Ashutosh\Documents\Projects\beyond-smote-evaluation\data\resampled\higgs_baseline_resampled.csv
Observations: 800,000
File size: 248.57 MB

Baseline data saved


In [7]:
# Cell 5: Random Oversampling (ROS)
# Duplicating random minority class samples
# Simple random sampling with replacement from minority class

print("="*70)
print("METHOD 2/11: RANDOM OVERSAMPLING (ROS)")
print("="*70)

# Recording start time
start_time = time.time()

# Applying Random Oversampling
X_ros, y_ros = apply_resampling_technique(
    X=X_train,
    y=y_train,
    method_name='random_oversampling',
    resampler_dict=resampling_methods
)

# Computing resampling time
ros_time = time.time() - start_time

# Computing statistics
ros_stats = compute_resampling_statistics(
    X_original=X_train,
    y_original=y_train,
    X_resampled=X_ros,
    y_resampled=y_ros,
    method_name='random_oversampling',
    resampling_time=ros_time
)

print(f"\nRandom Oversampling Results:")
print(f"  Original Samples: {len(X_train):,}")
print(f"  Resampled Samples: {len(X_ros):,}")
print(f"  Added Samples: {len(X_ros) - len(X_train):,}")
print(f"  New Minority Count: {int(y_ros.sum()):,}")
print(f"  New Majority Count: {int((y_ros == 0).sum()):,}")
print(f"  New Imbalance Ratio: {ros_stats['imbalance_ratio']:.3f}:1")
print(f"  Resampling Time: {ros_time:.4f} seconds")

# Saving resampled data
save_resampled_dataset(X_ros, y_ros, 'random_oversampling', RESAMPLED_DIR)

# Storing results
resampling_results['random_oversampling'] = {
    'X': X_ros,
    'y': y_ros,
    'stats': ros_stats
}

print("\nROS data saved")

METHOD 2/11: RANDOM OVERSAMPLING (ROS)
Applying resampling technique: random_over
Original distribution: {1: 423738, 0: 376262}
Resampled distribution: {1: 423738, 0: 423738}
Dataset size change: +47,476 observations (+5.93%)
New imbalance ratio: 1.000:1

Random Oversampling Results:
  Original Samples: 800,000
  Resampled Samples: 847,476
  Added Samples: 47,476
  New Minority Count: 423,738
  New Majority Count: 423,738
  New Imbalance Ratio: 1.000:1
  Resampling Time: 0.3362 seconds
Saving resampled data for random_oversampling...
Data saved to C:\Users\Ashutosh\Documents\Projects\beyond-smote-evaluation\data\resampled\higgs_random_oversampling_resampled.csv
Observations: 847,476
File size: 263.30 MB

ROS data saved


In [8]:
# Cell 6: SMOTE (Synthetic Minority Over-sampling Technique)
# Creating synthetic minority samples using interpolation
# Generate samples along lines between minority class neighbors

print("="*70)
print("METHOD 3/11: SMOTE")
print("="*70)

# Recording start time
start_time = time.time()

# Applying SMOTE
X_smote, y_smote = apply_resampling_technique(
    X=X_train,
    y=y_train,
    method_name='smote',
    resampler_dict=resampling_methods
)

# Computing resampling time
smote_time = time.time() - start_time

# Computing statistics
smote_stats = compute_resampling_statistics(
    X_original=X_train,
    y_original=y_train,
    X_resampled=X_smote,
    y_resampled=y_smote,
    method_name='smote',
    resampling_time=smote_time
)

print(f"\nSMOTE Results:")
print(f"  Original Samples: {len(X_train):,}")
print(f"  Resampled Samples: {len(X_smote):,}")
print(f"  Synthetic Samples Created: {len(X_smote) - len(X_train):,}")
print(f"  New Minority Count: {int(y_smote.sum()):,}")
print(f"  New Majority Count: {int((y_smote == 0).sum()):,}")
print(f"  New Imbalance Ratio: {smote_stats['imbalance_ratio']:.3f}:1")
print(f"  Resampling Time: {smote_time:.4f} seconds")

# Saving resampled data
save_resampled_dataset(X_smote, y_smote, 'smote', RESAMPLED_DIR)

# Storing results
resampling_results['smote'] = {
    'X': X_smote,
    'y': y_smote,
    'stats': smote_stats
}

print("\nSMOTE data saved")

METHOD 3/11: SMOTE
Applying resampling technique: smote
Original distribution: {1: 423738, 0: 376262}
Resampled distribution: {1: 423738, 0: 423738}
Dataset size change: +47,476 observations (+5.93%)
New imbalance ratio: 1.000:1

SMOTE Results:
  Original Samples: 800,000
  Resampled Samples: 847,476
  Synthetic Samples Created: 47,476
  New Minority Count: 423,738
  New Majority Count: 423,738
  New Imbalance Ratio: 1.000:1
  Resampling Time: 91.9957 seconds
Saving resampled data for smote...
Data saved to C:\Users\Ashutosh\Documents\Projects\beyond-smote-evaluation\data\resampled\higgs_smote_resampled.csv
Observations: 847,476
File size: 273.42 MB

SMOTE data saved


In [9]:
# Cell 7: Borderline-SMOTE
# Creating synthetic samples focusing on borderline cases
# It is a SMOTE variant that focuses on minority samples near decision boundary

print("="*70)
print("METHOD 4/11: BORDERLINE-SMOTE")
print("="*70)

# Recording start time
start_time = time.time()

# Applying Borderline-SMOTE
X_bsmote, y_bsmote = apply_resampling_technique(
    X=X_train,
    y=y_train,
    method_name='borderline_smote',
    resampler_dict=resampling_methods
)

# Computing resampling time
bsmote_time = time.time() - start_time

# Computing statistics
bsmote_stats = compute_resampling_statistics(
    X_original=X_train,
    y_original=y_train,
    X_resampled=X_bsmote,
    y_resampled=y_bsmote,
    method_name='borderline_smote',
    resampling_time=bsmote_time
)

print(f"\nBorderline-SMOTE Results:")
print(f"  Original Samples: {len(X_train):,}")
print(f"  Resampled Samples: {len(X_bsmote):,}")
print(f"  Synthetic Samples Created: {len(X_bsmote) - len(X_train):,}")
print(f"  New Minority Count: {int(y_bsmote.sum()):,}")
print(f"  New Majority Count: {int((y_bsmote == 0).sum()):,}")
print(f"  New Imbalance Ratio: {bsmote_stats['imbalance_ratio']:.3f}:1")
print(f"  Resampling Time: {bsmote_time:.4f} seconds")

# Saving resampled data
save_resampled_dataset(X_bsmote, y_bsmote, 'borderline_smote', RESAMPLED_DIR)

# Storing results
resampling_results['borderline_smote'] = {
    'X': X_bsmote,
    'y': y_bsmote,
    'stats': bsmote_stats
}

print("\nBorderline-SMOTE data saved")

METHOD 4/11: BORDERLINE-SMOTE
Applying resampling technique: borderline_smote
Original distribution: {1: 423738, 0: 376262}
Resampled distribution: {1: 423738, 0: 423738}
Dataset size change: +47,476 observations (+5.93%)
New imbalance ratio: 1.000:1

Borderline-SMOTE Results:
  Original Samples: 800,000
  Resampled Samples: 847,476
  Synthetic Samples Created: 47,476
  New Minority Count: 423,738
  New Majority Count: 423,738
  New Imbalance Ratio: 1.000:1
  Resampling Time: 248.6283 seconds
Saving resampled data for borderline_smote...
Data saved to C:\Users\Ashutosh\Documents\Projects\beyond-smote-evaluation\data\resampled\higgs_borderline_smote_resampled.csv
Observations: 847,476
File size: 273.46 MB

Borderline-SMOTE data saved


In [11]:
# Cell 8: ADASYN (Adaptive Synthetic Sampling)
# Creating synthetic samples with density-based weighting
# It generates more samples in harder-to-learn regions
# Note: ADASYN may fail on some datasets, and it failed on this dataset as well

print("="*70)
print("METHOD 5/11: ADASYN")
print("="*70)

# Recording start time
start_time = time.time()

try:
    # Applying ADASYN
    X_adasyn, y_adasyn = apply_resampling_technique(
        X=X_train,
        y=y_train,
        method_name='adasyn',
        resampler_dict=resampling_methods
    )
    
    # Computing resampling time
    adasyn_time = time.time() - start_time
    
    # Computing statistics
    adasyn_stats = compute_resampling_statistics(
        X_original=X_train,
        y_original=y_train,
        X_resampled=X_adasyn,
        y_resampled=y_adasyn,
        method_name='adasyn',
        resampling_time=adasyn_time
    )
    
    print(f"\nADASYN Results:")
    print(f"  Original Samples: {len(X_train):,}")
    print(f"  Resampled Samples: {len(X_adasyn):,}")
    print(f"  Synthetic Samples Created: {len(X_adasyn) - len(X_train):,}")
    print(f"  New Minority Count: {int(y_adasyn.sum()):,}")
    print(f"  New Majority Count: {int((y_adasyn == 0).sum()):,}")
    print(f"  New Imbalance Ratio: {adasyn_stats['imbalance_ratio']:.3f}:1")
    print(f"  Resampling Time: {adasyn_time:.4f} seconds")
    
    # Saving resampled data
    save_resampled_dataset(X_adasyn, y_adasyn, 'adasyn', RESAMPLED_DIR)
    
    # Storing results
    resampling_results['adasyn'] = {
        'X': X_adasyn,
        'y': y_adasyn,
        'stats': adasyn_stats
    }
    
    print("\nADASYN data saved!")

except ValueError as e:
    print(f"\nADASYN FAILED: {e}")
    print("\nThis is a known ADASYN limitation when:")
    print("  - Dataset is already relatively balanced")
    print("  - Density distribution doesn't allow sample generation")
    print("  - Ratio settings prevent sample creation")
    
    print("\nUsing SMOTE as fallback for ADASYN...")
    
    # Fallback to SMOTE
    X_adasyn, y_adasyn = apply_resampling_technique(
        X=X_train,
        y=y_train,
        method_name='smote',
        resampler_dict=resampling_methods
    )
    
    adasyn_time = time.time() - start_time
    
    adasyn_stats = compute_resampling_statistics(
        X_original=X_train,
        y_original=y_train,
        X_resampled=X_adasyn,
        y_resampled=y_adasyn,
        method_name='adasyn_smote_fallback',
        resampling_time=adasyn_time
    )
    
    print(f"\nFallback SMOTE Results (labeled as ADASYN):")
    print(f"  Original Samples: {len(X_train):,}")
    print(f"  Resampled Samples: {len(X_adasyn):,}")
    print(f"  New Imbalance Ratio: {adasyn_stats['imbalance_ratio']:.3f}:1")
    
    # Saving with ADASYN name for consistency
    save_resampled_dataset(X_adasyn, y_adasyn, 'adasyn', RESAMPLED_DIR)
    
    resampling_results['adasyn'] = {
        'X': X_adasyn,
        'y': y_adasyn,
        'stats': adasyn_stats,
        'note': 'ADASYN failed - SMOTE used as fallback'
    }
    
    print("\nFallback data saved as 'adasyn'")

METHOD 5/11: ADASYN
Applying resampling technique: adasyn
Original distribution: {1: 423738, 0: 376262}
Error during resampling: No samples will be generated with the provided ratio settings.

ADASYN FAILED: No samples will be generated with the provided ratio settings.

This is a known ADASYN limitation when:
  - Dataset is already relatively balanced
  - Density distribution doesn't allow sample generation
  - Ratio settings prevent sample creation

Using SMOTE as fallback for ADASYN...
Applying resampling technique: smote
Original distribution: {1: 423738, 0: 376262}
Resampled distribution: {1: 423738, 0: 423738}
Dataset size change: +47,476 observations (+5.93%)
New imbalance ratio: 1.000:1

Fallback SMOTE Results (labeled as ADASYN):
  Original Samples: 800,000
  Resampled Samples: 847,476
  New Imbalance Ratio: 1.000:1
Saving resampled data for adasyn...
Data saved to C:\Users\Ashutosh\Documents\Projects\beyond-smote-evaluation\data\resampled\higgs_adasyn_resampled.csv
Observatio

In [12]:
# Cell 9: Random Undersampling (RUS)
# Removing random majority class samples
# Random sampling without replacement from majority class

print("="*70)
print("METHOD 6/11: RANDOM UNDERSAMPLING (RUS)")
print("="*70)

# Recording start time
start_time = time.time()

# Applying Random Undersampling
X_rus, y_rus = apply_resampling_technique(
    X=X_train,
    y=y_train,
    method_name='random_undersampling',
    resampler_dict=resampling_methods
)

# Computing resampling time
rus_time = time.time() - start_time

# Computing statistics
rus_stats = compute_resampling_statistics(
    X_original=X_train,
    y_original=y_train,
    X_resampled=X_rus,
    y_resampled=y_rus,
    method_name='random_undersampling',
    resampling_time=rus_time
)

print(f"\nRandom Undersampling Results:")
print(f"  Original Samples: {len(X_train):,}")
print(f"  Resampled Samples: {len(X_rus):,}")
print(f"  Removed Samples: {len(X_train) - len(X_rus):,}")
print(f"  New Minority Count: {int(y_rus.sum()):,}")
print(f"  New Majority Count: {int((y_rus == 0).sum()):,}")
print(f"  New Imbalance Ratio: {rus_stats['imbalance_ratio']:.3f}:1")
print(f"  Resampling Time: {rus_time:.4f} seconds")

# Saving resampled data
save_resampled_dataset(X_rus, y_rus, 'random_undersampling', RESAMPLED_DIR)

# Storing results
resampling_results['random_undersampling'] = {
    'X': X_rus,
    'y': y_rus,
    'stats': rus_stats
}

print("\nRUS data saved")

METHOD 6/11: RANDOM UNDERSAMPLING (RUS)
Applying resampling technique: random_under
Original distribution: {1: 423738, 0: 376262}
Resampled distribution: {0: 376262, 1: 376262}
Dataset size change: -47,476 observations (-5.93%)
New imbalance ratio: 1.000:1

Random Undersampling Results:
  Original Samples: 800,000
  Resampled Samples: 752,524
  Removed Samples: 47,476
  New Minority Count: 376,262
  New Majority Count: 376,262
  New Imbalance Ratio: 1.000:1
  Resampling Time: 0.3536 seconds
Saving resampled data for random_undersampling...
Data saved to C:\Users\Ashutosh\Documents\Projects\beyond-smote-evaluation\data\resampled\higgs_random_undersampling_resampled.csv
Observations: 752,524
File size: 233.80 MB

RUS data saved


In [13]:
# Cell 10: Tomek Links
# Removing noisy majority samples at class boundaries
# It identifies and removes Tomek link pairs (one from each class)

print("="*70)
print("METHOD 7/11: TOMEK LINKS")
print("="*70)

# Recording start time
start_time = time.time()

# Applying Tomek Links
X_tomek, y_tomek = apply_resampling_technique(
    X=X_train,
    y=y_train,
    method_name='tomek_links',
    resampler_dict=resampling_methods
)

# Computing resampling time
tomek_time = time.time() - start_time

# Computing statistics
tomek_stats = compute_resampling_statistics(
    X_original=X_train,
    y_original=y_train,
    X_resampled=X_tomek,
    y_resampled=y_tomek,
    method_name='tomek_links',
    resampling_time=tomek_time
)

print(f"\nTomek Links Results:")
print(f"  Original Samples: {len(X_train):,}")
print(f"  Resampled Samples: {len(X_tomek):,}")
print(f"  Removed Samples: {len(X_train) - len(X_tomek):,}")
print(f"  New Minority Count: {int(y_tomek.sum()):,}")
print(f"  New Majority Count: {int((y_tomek == 0).sum()):,}")
print(f"  New Imbalance Ratio: {tomek_stats['imbalance_ratio']:.3f}:1")
print(f"  Resampling Time: {tomek_time:.4f} seconds")

# Saving resampled data
save_resampled_dataset(X_tomek, y_tomek, 'tomek_links', RESAMPLED_DIR)

# Storing results
resampling_results['tomek_links'] = {
    'X': X_tomek,
    'y': y_tomek,
    'stats': tomek_stats
}

print("\nTomek Links data saved")

METHOD 7/11: TOMEK LINKS
Applying resampling technique: tomek
Original distribution: {1: 423738, 0: 376262}
Resampled distribution: {1: 375375, 0: 376262}
Dataset size change: -48,363 observations (-6.05%)
New imbalance ratio: 0.998:1

Tomek Links Results:
  Original Samples: 800,000
  Resampled Samples: 751,637
  Removed Samples: 48,363
  New Minority Count: 375,375
  New Majority Count: 376,262
  New Imbalance Ratio: 0.998:1
  Resampling Time: 426.5350 seconds
Saving resampled data for tomek_links...
Data saved to C:\Users\Ashutosh\Documents\Projects\beyond-smote-evaluation\data\resampled\higgs_tomek_links_resampled.csv
Observations: 751,637
File size: 233.50 MB

Tomek Links data saved


In [14]:
# Cell 11: NearMiss
# Intelligent undersampling based on nearest neighbors
# It selects majority samples that are close to minority samples

print("="*70)
print("METHOD 8/11: NEARMISS")
print("="*70)

# Recording start time
start_time = time.time()

# Applying NearMiss
X_nearmiss, y_nearmiss = apply_resampling_technique(
    X=X_train,
    y=y_train,
    method_name='nearmiss',
    resampler_dict=resampling_methods
)

# Computing resampling time
nearmiss_time = time.time() - start_time

# Computing statistics
nearmiss_stats = compute_resampling_statistics(
    X_original=X_train,
    y_original=y_train,
    X_resampled=X_nearmiss,
    y_resampled=y_nearmiss,
    method_name='nearmiss',
    resampling_time=nearmiss_time
)

print(f"\nNearMiss Results:")
print(f"  Original Samples: {len(X_train):,}")
print(f"  Resampled Samples: {len(X_nearmiss):,}")
print(f"  Removed Samples: {len(X_train) - len(X_nearmiss):,}")
print(f"  New Minority Count: {int(y_nearmiss.sum()):,}")
print(f"  New Majority Count: {int((y_nearmiss == 0).sum()):,}")
print(f"  New Imbalance Ratio: {nearmiss_stats['imbalance_ratio']:.3f}:1")
print(f"  Resampling Time: {nearmiss_time:.4f} seconds")

# Saving resampled data
save_resampled_dataset(X_nearmiss, y_nearmiss, 'nearmiss', RESAMPLED_DIR)

# Storing results
resampling_results['nearmiss'] = {
    'X': X_nearmiss,
    'y': y_nearmiss,
    'stats': nearmiss_stats
}

print("\nNearMiss data saved")

METHOD 8/11: NEARMISS
Applying resampling technique: nearmiss
Original distribution: {1: 423738, 0: 376262}
Resampled distribution: {0: 376262, 1: 376262}
Dataset size change: -47,476 observations (-5.93%)
New imbalance ratio: 1.000:1

NearMiss Results:
  Original Samples: 800,000
  Resampled Samples: 752,524
  Removed Samples: 47,476
  New Minority Count: 376,262
  New Majority Count: 376,262
  New Imbalance Ratio: 1.000:1
  Resampling Time: 101.1349 seconds
Saving resampled data for nearmiss...
Data saved to C:\Users\Ashutosh\Documents\Projects\beyond-smote-evaluation\data\resampled\higgs_nearmiss_resampled.csv
Observations: 752,524
File size: 234.10 MB

NearMiss data saved


In [15]:
# Cell 12: SMOTE + Tomek Links (Combination)
# Oversampling with SMOTE then cleaning with Tomek Links
# It creates synthetic samples, then removes noisy boundary samples

print("="*70)
print("METHOD 9/11: SMOTE + TOMEK LINKS")
print("="*70)

# Recording start time
start_time = time.time()

# Applying SMOTE + Tomek
X_smote_tomek, y_smote_tomek = apply_resampling_technique(
    X=X_train,
    y=y_train,
    method_name='smote_tomek',
    resampler_dict=resampling_methods
)

# Computing resampling time
smote_tomek_time = time.time() - start_time

# Computing statistics
smote_tomek_stats = compute_resampling_statistics(
    X_original=X_train,
    y_original=y_train,
    X_resampled=X_smote_tomek,
    y_resampled=y_smote_tomek,
    method_name='smote_tomek',
    resampling_time=smote_tomek_time
)

print(f"\nSMOTE + Tomek Results:")
print(f"  Original Samples: {len(X_train):,}")
print(f"  Resampled Samples: {len(X_smote_tomek):,}")
print(f"  Net Change: {len(X_smote_tomek) - len(X_train):,}")
print(f"  New Minority Count: {int(y_smote_tomek.sum()):,}")
print(f"  New Majority Count: {int((y_smote_tomek == 0).sum()):,}")
print(f"  New Imbalance Ratio: {smote_tomek_stats['imbalance_ratio']:.3f}:1")
print(f"  Resampling Time: {smote_tomek_time:.4f} seconds")

# Saving resampled data
save_resampled_dataset(X_smote_tomek, y_smote_tomek, 'smote_tomek', RESAMPLED_DIR)

# Storing results
resampling_results['smote_tomek'] = {
    'X': X_smote_tomek,
    'y': y_smote_tomek,
    'stats': smote_tomek_stats
}

print("\nSMOTE + Tomek data saved")

METHOD 9/11: SMOTE + TOMEK LINKS
Applying resampling technique: smote_tomek
Original distribution: {1: 423738, 0: 376262}
Resampled distribution: {1: 385816, 0: 385816}
Dataset size change: -28,368 observations (-3.55%)
New imbalance ratio: 1.000:1

SMOTE + Tomek Results:
  Original Samples: 800,000
  Resampled Samples: 771,632
  Net Change: -28,368
  New Minority Count: 385,816
  New Majority Count: 385,816
  New Imbalance Ratio: 1.000:1
  Resampling Time: 633.4957 seconds
Saving resampled data for smote_tomek...
Data saved to C:\Users\Ashutosh\Documents\Projects\beyond-smote-evaluation\data\resampled\higgs_smote_tomek_resampled.csv
Observations: 771,632
File size: 249.81 MB

SMOTE + Tomek data saved


In [16]:
# Cell 13: SMOTE + ENN (Combination)
# Oversampling with SMOTE then cleaning with Edited Nearest Neighbors
# It creates synthetic samples, then removes misclassified samples

print("="*70)
print("METHOD 10/11: SMOTE + ENN")
print("="*70)

# Recording start time
start_time = time.time()

# Applying SMOTE + ENN
X_smote_enn, y_smote_enn = apply_resampling_technique(
    X=X_train,
    y=y_train,
    method_name='smote_enn',
    resampler_dict=resampling_methods
)

# Computing resampling time
smote_enn_time = time.time() - start_time

# Computing statistics
smote_enn_stats = compute_resampling_statistics(
    X_original=X_train,
    y_original=y_train,
    X_resampled=X_smote_enn,
    y_resampled=y_smote_enn,
    method_name='smote_enn',
    resampling_time=smote_enn_time
)

print(f"\nSMOTE + ENN Results:")
print(f"  Original Samples: {len(X_train):,}")
print(f"  Resampled Samples: {len(X_smote_enn):,}")
print(f"  Net Change: {len(X_smote_enn) - len(X_train):,}")
print(f"  New Minority Count: {int(y_smote_enn.sum()):,}")
print(f"  New Majority Count: {int((y_smote_enn == 0).sum()):,}")
print(f"  New Imbalance Ratio: {smote_enn_stats['imbalance_ratio']:.3f}:1")
print(f"  Resampling Time: {smote_enn_time:.4f} seconds")

# Saving resampled data
save_resampled_dataset(X_smote_enn, y_smote_enn, 'smote_enn', RESAMPLED_DIR)

# Storing results
resampling_results['smote_enn'] = {
    'X': X_smote_enn,
    'y': y_smote_enn,
    'stats': smote_enn_stats
}

print("\nSMOTE + ENN data saved")


METHOD 10/11: SMOTE + ENN
Applying resampling technique: smote_enn
Original distribution: {1: 423738, 0: 376262}
Resampled distribution: {0: 148567, 1: 127528}
Dataset size change: -523,905 observations (-65.49%)
New imbalance ratio: 0.858:1

SMOTE + ENN Results:
  Original Samples: 800,000
  Resampled Samples: 276,095
  Net Change: -523,905
  New Minority Count: 127,528
  New Majority Count: 148,567
  New Imbalance Ratio: 0.858:1
  Resampling Time: 563.5251 seconds
Saving resampled data for smote_enn...
Data saved to C:\Users\Ashutosh\Documents\Projects\beyond-smote-evaluation\data\resampled\higgs_smote_enn_resampled.csv
Observations: 276,095
File size: 92.14 MB

SMOTE + ENN data saved


In [17]:
# Cell 14: Class Weighting (Algorithm-Level)
# Documenting class weighting as algorithm-level approach
# No data resampling - weights applied during model training

print("="*70)
print("METHOD 11/11: CLASS WEIGHTING (Algorithm-Level)")
print("="*70)

print("\nClass Weighting Approach:")
print("  - No data modification (no resampling)")
print("  - Weights applied during model training")
print("  - Penalty for misclassifying minority class")

# Computing class weights
class_0_count = (y_train == 0).sum()
class_1_count = (y_train == 1).sum()
total_samples = len(y_train)

# Calculating balanced weights
weight_0 = total_samples / (2 * class_0_count)
weight_1 = total_samples / (2 * class_1_count)

print(f"\nCalculated Class Weights:")
print(f"  Class 0 (Majority): {weight_0:.4f}")
print(f"  Class 1 (Minority): {weight_1:.4f}")
print(f"  Weight Ratio: {weight_1/weight_0:.4f}:1")

# Storing class weight information
class_weight_dict = {0: weight_0, 1: weight_1}

class_weight_stats = {
    'method': 'class_weighting',
    'n_samples': len(X_train),
    'n_minority': int(class_1_count),
    'n_majority': int(class_0_count),
    'imbalance_ratio': imbalance_ratio,
    'resampling_time': 0.0,
    'weight_0': weight_0,
    'weight_1': weight_1
}

# Storing results (data unchanged)
resampling_results['class_weighting'] = {
    'X': X_train.copy(),
    'y': y_train.copy(),
    'stats': class_weight_stats,
    'weights': class_weight_dict
}

print("\nNote: Class weights will be applied during model training")
print("Data remains unchanged - this is an algorithm-level approach")

METHOD 11/11: CLASS WEIGHTING (Algorithm-Level)

Class Weighting Approach:
  - No data modification (no resampling)
  - Weights applied during model training
  - Penalty for misclassifying minority class

Calculated Class Weights:
  Class 0 (Majority): 1.0631
  Class 1 (Minority): 0.9440
  Weight Ratio: 0.8880:1

Note: Class weights will be applied during model training
Data remains unchanged - this is an algorithm-level approach
