In [1]:
# Cell 1: Importing Required Libraries

import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import json
import warnings
warnings.filterwarnings('ignore')

# Adding src directory to path for importing custom modules
sys.path.append('../src')

# Importing custom utility functions
from config import *
from data_utils import retrieve_processed_datasets
from model_utils import (
    initialize_all_models,
    train_single_model,
    evaluate_model_predictions,
    perform_cross_validation,
    extract_feature_importance,
    save_trained_model,
    create_model_comparison_table,
    perform_cv_all_models
)
from evaluation_utils import (
    calculate_all_metrics,
    create_confusion_matrix,
    plot_roc_curve,
    plot_precision_recall_curve,
    compare_multiple_roc_curves,
    generate_classification_report,
    create_metrics_summary_table
)
from visualization import plot_feature_importance_horizontal

# Plot appearance: apply the matplotlib style sheet first, then the seaborn palette
# (order kept as-is, since both calls touch the active colour cycle).
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("Set2")

# DataFrame display: no cap on the number of columns shown, 4-digit float precision
pd.options.display.max_columns = None
pd.options.display.precision = 4

# Confirm setup, then echo the current working directory and SEED_VALUE
# (SEED_VALUE is not defined in this cell; presumably it arrives via `from config import *`).
print("All libraries imported successfully!")
print(f"Working Directory: {Path.cwd()}")
print(f"Random Seed: {SEED_VALUE}")

All libraries imported successfully!
Working Directory: C:\Users\Ashutosh\Documents\Projects\beyond-smote-evaluation\notebooks
Random Seed: 42


In [2]:
# Cell 2: Load the processed train/test splits and summarise their class balance
# The splits are fetched through retrieve_processed_datasets with the 'higgs' file prefix.

print("LOADING PROCESSED DATA...")

# The helper's return value is unpacked as (X_train, X_test, y_train, y_test)
X_train, X_test, y_train, y_test = retrieve_processed_datasets(file_prefix='higgs')

print("\nData loaded successfully!")
print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")

# Per-class sample counts for each split (kept as notebook-level variables)
train_dist = y_train.value_counts()
test_dist = y_test.value_counts()

# One reporting loop for both splits: header line, then one line per class
# giving the absolute count and its share of that split.
for dist_header, dist_counts, dist_total in (
    ("\nClass Distribution in Training Set:", train_dist, len(y_train)),
    ("\nClass Distribution in Test Set:", test_dist, len(y_test)),
):
    print(dist_header)
    for class_label, count in dist_counts.items():
        percentage = (count / dist_total) * 100
        print(f"  Class {int(class_label)}: {count:,} samples ({percentage:.2f}%)")

# Ratio of the smallest to the largest training class count,
# so the printed value is expressed as minority:majority.
minority_count, majority_count = train_dist.min(), train_dist.max()
imbalance_ratio = minority_count / majority_count
print(f"\nImbalance Ratio: {imbalance_ratio:.3f}:1")


LOADING PROCESSED DATA...
Loading processed data from C:\Users\Ashutosh\Documents\Projects\beyond-smote-evaluation\data\processed...
Datasets loaded successfully
Training shape: (800000, 28)
Testing shape: (200000, 28)

Data loaded successfully!
Training set shape: (800000, 28)
Test set shape: (200000, 28)

Class Distribution in Training Set:
  Class 1: 423,738 samples (52.97%)
  Class 0: 376,262 samples (47.03%)

Class Distribution in Test Set:
  Class 1: 105,935 samples (52.97%)
  Class 0: 94,065 samples (47.03%)

Imbalance Ratio: 0.888:1


In [3]:
# Cell 3: Build the baseline model set
# initialize_all_models is called with use_class_weights=False, so no class
# weighting is requested for any of the baselines.

print("INITIALIZING BASELINE MODELS...")

baseline_models = initialize_all_models(use_class_weights=False)

# List each model key alongside the class name of the estimator behind it
print(f"\nInitialized {len(baseline_models)} models:")
for i, model_name in enumerate(baseline_models, 1):
    model = baseline_models[model_name]
    print(f"  {i}. {model_name}: {type(model).__name__}")

# Table of (section header, config dict, ((label, config key), ...)) describing
# which two settings are echoed for every model family.
print("\nModel Configurations:")
config_summary_sections = (
    ("\n1. Logistic Regression:", LOGIT_CONFIG,
     (("Max iterations", 'max_iter'), ("Solver", 'solver'))),
    ("\n2. Random Forest:", RF_CONFIG,
     (("N estimators", 'n_estimators'), ("Max depth", 'max_depth'))),
    ("\n3. XGBoost:", XGB_CONFIG,
     (("N estimators", 'n_estimators'), ("Learning rate", 'learning_rate'))),
    ("\n4. Support Vector Machine:", SVM_CONFIG,
     (("Kernel", 'kernel'), ("Probability", 'probability'))),
    ("\n5. Multi-Layer Perceptron:", MLP_CONFIG,
     (("Hidden layers", 'hidden_layer_sizes'), ("Activation", 'activation'))),
)
for section_header, section_config, section_fields in config_summary_sections:
    print(section_header)
    for field_label, field_key in section_fields:
        print(f"   - {field_label}: {section_config[field_key]}")

print("\nAll models ready for training on original imbalanced data!")

INITIALIZING BASELINE MODELS...

Initialized 5 models:
  1. logistic_regression: LogisticRegression
  2. random_forest: RandomForestClassifier
  3. xgboost: XGBClassifier
  4. svm: SVC
  5. mlp: MLPClassifier

Model Configurations:

1. Logistic Regression:
   - Max iterations: 1000
   - Solver: lbfgs

2. Random Forest:
   - N estimators: 100
   - Max depth: 20

3. XGBoost:
   - N estimators: 100
   - Learning rate: 0.1

4. Support Vector Machine:
   - Kernel: rbf
   - Probability: True

5. Multi-Layer Perceptron:
   - Hidden layers: (100, 50)
   - Activation: relu

All models ready for training on original imbalanced data!


In [4]:
# Cell 4: Logistic Regression baseline
# First of the five baselines; this cell also creates the baseline_results
# dictionary that the later model cells add their entries to.

print("MODEL 1/5: LOGISTIC REGRESSION:")

# Fit on the training split; the helper's result is unpacked as (model, training time)
lr_model, lr_train_time = train_single_model(
    model=baseline_models['logistic_regression'],
    X_train=X_train, y_train=y_train,
    model_name='Logistic Regression', verbose=True,
)

# Predict on the held-out split; 'y_pred' and 'y_proba' are read from the result below
lr_results = evaluate_model_predictions(
    model=lr_model,
    X_test=X_test, y_test=y_test,
    model_name='Logistic Regression', verbose=True,
)

# Full metric dictionary computed from the predictions and probabilities
lr_metrics = calculate_all_metrics(
    y_true=y_test, y_pred=lr_results['y_pred'], y_proba=lr_results['y_proba']
)

# Report the eight headline metrics, one per line, to four decimals
print("\nLogistic Regression Performance:")
metric_display_rows = (
    ("Accuracy", 'accuracy'),
    ("Precision", 'precision'),
    ("Recall", 'recall'),
    ("F1-Score", 'f1_score'),
    ("AUC-ROC", 'auc_roc'),
    ("AUC-PR", 'auc_pr'),
    ("G-Mean", 'g_mean'),
    ("MCC", 'mcc'),
)
for metric_label, metric_key in metric_display_rows:
    print(f"  {metric_label}: {lr_metrics[metric_key]:.4f}")

# Start the shared results store with this model's entry
baseline_results = dict(
    logistic_regression=dict(
        model=lr_model,
        metrics=lr_metrics,
        predictions=lr_results['y_pred'],
        probabilities=lr_results['y_proba'],
        training_time=lr_train_time,
    )
)

MODEL 1/5: LOGISTIC REGRESSION:
Training Logistic Regression...
Training completed in 3.34 seconds
Evaluating Logistic Regression...
Prediction completed in 0.0286 seconds

Logistic Regression Performance:
  Accuracy: 0.6414
  Precision: 0.6390
  Recall: 0.7425
  F1-Score: 0.6869
  AUC-ROC: 0.6850
  AUC-PR: 0.6832
  G-Mean: 0.6259
  MCC: 0.2771


In [5]:
# Cell 5: Random Forest baseline
# Second baseline, a tree ensemble.
# NOTE(review): strong results were anticipated here on the premise that tree
# ensembles tolerate class imbalance; this cell does not itself verify that.

print("MODEL 2/5: RANDOM FOREST")

# Fit on the training split; the helper's result is unpacked as (model, training time)
rf_model, rf_train_time = train_single_model(
    model=baseline_models['random_forest'],
    X_train=X_train, y_train=y_train,
    model_name='Random Forest', verbose=True,
)

# Predict on the held-out split; 'y_pred' and 'y_proba' are read from the result below
rf_results = evaluate_model_predictions(
    model=rf_model,
    X_test=X_test, y_test=y_test,
    model_name='Random Forest', verbose=True,
)

# Full metric dictionary computed from the predictions and probabilities
rf_metrics = calculate_all_metrics(
    y_true=y_test, y_pred=rf_results['y_pred'], y_proba=rf_results['y_proba']
)

# Report the eight headline metrics, one per line, to four decimals
print("\nRandom Forest Performance:")
metric_display_rows = (
    ("Accuracy", 'accuracy'),
    ("Precision", 'precision'),
    ("Recall", 'recall'),
    ("F1-Score", 'f1_score'),
    ("AUC-ROC", 'auc_roc'),
    ("AUC-PR", 'auc_pr'),
    ("G-Mean", 'g_mean'),
    ("MCC", 'mcc'),
)
for metric_label, metric_key in metric_display_rows:
    print(f"  {metric_label}: {rf_metrics[metric_key]:.4f}")

# Add this model's entry to the shared results store
baseline_results['random_forest'] = dict(
    model=rf_model,
    metrics=rf_metrics,
    predictions=rf_results['y_pred'],
    probabilities=rf_results['y_proba'],
    training_time=rf_train_time,
)

MODEL 2/5: RANDOM FOREST
Training Random Forest...
Training completed in 72.45 seconds
Evaluating Random Forest...
Prediction completed in 1.8143 seconds

Random Forest Performance:
  Accuracy: 0.7318
  Precision: 0.7420
  Recall: 0.7568
  F1-Score: 0.7493
  AUC-ROC: 0.8130
  AUC-PR: 0.8286
  G-Mean: 0.7297
  MCC: 0.4611


In [6]:
# Cell 6: XGBoost baseline
# Third baseline, a gradient-boosting model. Its training time is captured in
# xgb_train_time so it can be compared against the other models.

print("MODEL 3/5: XGBOOST")

# Fit on the training split; the helper's result is unpacked as (model, training time)
xgb_model, xgb_train_time = train_single_model(
    model=baseline_models['xgboost'],
    X_train=X_train, y_train=y_train,
    model_name='XGBoost', verbose=True,
)

# Predict on the held-out split; 'y_pred' and 'y_proba' are read from the result below
xgb_results = evaluate_model_predictions(
    model=xgb_model,
    X_test=X_test, y_test=y_test,
    model_name='XGBoost', verbose=True,
)

# Full metric dictionary computed from the predictions and probabilities
xgb_metrics = calculate_all_metrics(
    y_true=y_test, y_pred=xgb_results['y_pred'], y_proba=xgb_results['y_proba']
)

# Report the eight headline metrics, one per line, to four decimals
print("\nXGBoost Performance:")
metric_display_rows = (
    ("Accuracy", 'accuracy'),
    ("Precision", 'precision'),
    ("Recall", 'recall'),
    ("F1-Score", 'f1_score'),
    ("AUC-ROC", 'auc_roc'),
    ("AUC-PR", 'auc_pr'),
    ("G-Mean", 'g_mean'),
    ("MCC", 'mcc'),
)
for metric_label, metric_key in metric_display_rows:
    print(f"  {metric_label}: {xgb_metrics[metric_key]:.4f}")

# Add this model's entry to the shared results store
baseline_results['xgboost'] = dict(
    model=xgb_model,
    metrics=xgb_metrics,
    predictions=xgb_results['y_pred'],
    probabilities=xgb_results['y_proba'],
    training_time=xgb_train_time,
)

MODEL 3/5: XGBOOST
Training XGBoost...
Training completed in 4.33 seconds
Evaluating XGBoost...
Prediction completed in 0.0820 seconds

XGBoost Performance:
  Accuracy: 0.7303
  Precision: 0.7428
  Recall: 0.7509
  F1-Score: 0.7468
  AUC-ROC: 0.8109
  AUC-PR: 0.8264
  G-Mean: 0.7287
  MCC: 0.4584


In [7]:
# Cell 7: Support Vector Machine baseline
# Fourth baseline, a kernel-based classifier taken from baseline_models['svm'].

print("MODEL 4/5: SUPPORT VECTOR MACHINE")

# Fit on the training split; the helper's result is unpacked as (model, training time)
svm_model, svm_train_time = train_single_model(
    model=baseline_models['svm'],
    X_train=X_train, y_train=y_train,
    model_name='SVM', verbose=True,
)

# Predict on the held-out split; 'y_pred' and 'y_proba' are read from the result below
svm_results = evaluate_model_predictions(
    model=svm_model,
    X_test=X_test, y_test=y_test,
    model_name='SVM', verbose=True,
)

# Full metric dictionary computed from the predictions and probabilities
svm_metrics = calculate_all_metrics(
    y_true=y_test, y_pred=svm_results['y_pred'], y_proba=svm_results['y_proba']
)

# Report the eight headline metrics, one per line, to four decimals
print("\nSVM Performance:")
metric_display_rows = (
    ("Accuracy", 'accuracy'),
    ("Precision", 'precision'),
    ("Recall", 'recall'),
    ("F1-Score", 'f1_score'),
    ("AUC-ROC", 'auc_roc'),
    ("AUC-PR", 'auc_pr'),
    ("G-Mean", 'g_mean'),
    ("MCC", 'mcc'),
)
for metric_label, metric_key in metric_display_rows:
    print(f"  {metric_label}: {svm_metrics[metric_key]:.4f}")

# Add this model's entry to the shared results store
baseline_results['svm'] = dict(
    model=svm_model,
    metrics=svm_metrics,
    predictions=svm_results['y_pred'],
    probabilities=svm_results['y_proba'],
    training_time=svm_train_time,
)

MODEL 4/5: SUPPORT VECTOR MACHINE
Training SVM...
Training completed in 1254.29 seconds
Evaluating SVM...
Prediction completed in 50.6166 seconds

SVM Performance:
  Accuracy: 0.5343
  Precision: 0.6010
  Recall: 0.3596
  F1-Score: 0.4500
  AUC-ROC: 0.5767
  AUC-PR: 0.5906
  G-Mean: 0.5127
  MCC: 0.0973


In [8]:
# Cell 8: Multi-Layer Perceptron baseline
# Fifth and last baseline, a neural network taken from baseline_models['mlp'];
# the cell closes with a completion banner for the whole baseline run.

print("MODEL 5/5: MULTI-LAYER PERCEPTRON")

# Fit on the training split; the helper's result is unpacked as (model, training time)
mlp_model, mlp_train_time = train_single_model(
    model=baseline_models['mlp'],
    X_train=X_train, y_train=y_train,
    model_name='MLP', verbose=True,
)

# Predict on the held-out split; 'y_pred' and 'y_proba' are read from the result below
mlp_results = evaluate_model_predictions(
    model=mlp_model,
    X_test=X_test, y_test=y_test,
    model_name='MLP', verbose=True,
)

# Full metric dictionary computed from the predictions and probabilities
mlp_metrics = calculate_all_metrics(
    y_true=y_test, y_pred=mlp_results['y_pred'], y_proba=mlp_results['y_proba']
)

# Report the eight headline metrics, one per line, to four decimals
print("\nMLP Performance:")
metric_display_rows = (
    ("Accuracy", 'accuracy'),
    ("Precision", 'precision'),
    ("Recall", 'recall'),
    ("F1-Score", 'f1_score'),
    ("AUC-ROC", 'auc_roc'),
    ("AUC-PR", 'auc_pr'),
    ("G-Mean", 'g_mean'),
    ("MCC", 'mcc'),
)
for metric_label, metric_key in metric_display_rows:
    print(f"  {metric_label}: {mlp_metrics[metric_key]:.4f}")

# Add this model's entry to the shared results store
baseline_results['mlp'] = dict(
    model=mlp_model,
    metrics=mlp_metrics,
    predictions=mlp_results['y_pred'],
    probabilities=mlp_results['y_proba'],
    training_time=mlp_train_time,
)

print("ALL 5 BASELINE MODELS TRAINED SUCCESSFULLY")

MODEL 5/5: MULTI-LAYER PERCEPTRON
Training MLP...
Training completed in 301.27 seconds
Evaluating MLP...
Prediction completed in 0.4166 seconds

MLP Performance:
  Accuracy: 0.7546
  Precision: 0.7644
  Recall: 0.7758
  F1-Score: 0.7701
  AUC-ROC: 0.8378
  AUC-PR: 0.8506
  G-Mean: 0.7529
  MCC: 0.5070
ALL 5 BASELINE MODELS TRAINED SUCCESSFULLY
