In [1]:
# Cell 1: Importing Required Libraries

import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import json
import warnings
warnings.filterwarnings('ignore')

# Adding src directory to path for importing custom modules.
# The directory is resolved to an absolute path so that a later change of the
# working directory cannot invalidate the entry, and it is appended only when
# missing so that re-running this cell does not pile up duplicate entries.
_SRC_DIR = str(Path('../src').resolve())
if _SRC_DIR not in sys.path:
    sys.path.append(_SRC_DIR)

# Importing custom utility functions
from config import *
from data_utils import retrieve_processed_datasets
from model_utils import (
    initialize_all_models,
    train_single_model,
    evaluate_model_predictions,
    perform_cross_validation,
    extract_feature_importance,
    save_trained_model,
    create_model_comparison_table,
    perform_cv_all_models
)
from evaluation_utils import (
    calculate_all_metrics,
    create_confusion_matrix,
    plot_roc_curve,
    plot_precision_recall_curve,
    compare_multiple_roc_curves,
    generate_classification_report,
    create_metrics_summary_table
)
from visualization import plot_feature_importance_horizontal

# Plot appearance: the matplotlib style sheet is applied first, then the
# seaborn colour palette (kept in this order, as in the original cell).
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("Set2")

# pandas display: no cap on the number of columns shown, 4-digit precision
pd.options.display.max_columns = None
pd.options.display.precision = 4

# Startup banner: confirms the imports ran and records the working
# directory and the seed value picked up from the config module.
print(
    "All libraries imported successfully!",
    f"Working Directory: {Path.cwd()}",
    f"Random Seed: {SEED_VALUE}",
    sep="\n",
)

All libraries imported successfully!
Working Directory: C:\Users\Ashutosh\Documents\Projects\beyond-smote-evaluation\notebooks
Random Seed: 42


In [2]:
# Cell 2: Loading Processed Data from Notebook 01
# Loading the cleaned and scaled training/test data

print("LOADING PROCESSED DATA...")

# Loading processed datasets using utility function
X_train, X_test, y_train, y_test = retrieve_processed_datasets(file_prefix='higgs')

# Sanity checks on the loaded splits: every feature row needs a label, and the
# two splits must expose the same number of feature columns. Failing here is
# cheaper than discovering a misaligned split after models have been trained.
assert len(X_train) == len(y_train), (
    f"Training features/labels length mismatch: {len(X_train)} vs {len(y_train)}"
)
assert len(X_test) == len(y_test), (
    f"Test features/labels length mismatch: {len(X_test)} vs {len(y_test)}"
)
assert X_train.shape[1] == X_test.shape[1], (
    f"Feature count differs between splits: {X_train.shape[1]} vs {X_test.shape[1]}"
)

print("\nData loaded successfully!")
print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")

# Verifying class distribution
def summarize_class_distribution(labels: pd.Series, split_name: str) -> pd.Series:
    """Print the per-class sample counts of one split and return them.

    Parameters
    ----------
    labels : pd.Series
        Target labels of the split; class values must be convertible with
        ``int()`` because they are printed as integers.
    split_name : str
        Name inserted into the printed heading (e.g. ``"Training"``).

    Returns
    -------
    pd.Series
        The result of ``labels.value_counts()`` (class label -> count).
    """
    class_counts = labels.value_counts()
    total_samples = len(labels)

    print(f"\nClass Distribution in {split_name} Set:")
    for class_label, count in class_counts.items():
        percentage = (count / total_samples) * 100
        print(f"  Class {int(class_label)}: {count:,} samples ({percentage:.2f}%)")

    return class_counts


# One shared helper for both splits keeps the two reports formatted identically
train_dist = summarize_class_distribution(y_train, "Training")
test_dist = summarize_class_distribution(y_test, "Test")

# Calculating imbalance ratio
# Note: the ratio is minority / majority on the training split, so it is <= 1
# and a value of 1 means the classes are perfectly balanced.
minority_count = train_dist.min()
majority_count = train_dist.max()
imbalance_ratio = minority_count / majority_count
print(f"\nImbalance Ratio: {imbalance_ratio:.3f}:1")


LOADING PROCESSED DATA...
Loading processed data from C:\Users\Ashutosh\Documents\Projects\beyond-smote-evaluation\data\processed...
Datasets loaded successfully
Training shape: (800000, 28)
Testing shape: (200000, 28)

Data loaded successfully!
Training set shape: (800000, 28)
Test set shape: (200000, 28)

Class Distribution in Training Set:
  Class 1: 423,738 samples (52.97%)
  Class 0: 376,262 samples (47.03%)

Class Distribution in Test Set:
  Class 1: 105,935 samples (52.97%)
  Class 0: 94,065 samples (47.03%)

Imbalance Ratio: 0.888:1
