In [1]:
# Cell 1: Importing Required Libraries

import sys
import time
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

warnings.filterwarnings('ignore')

# Clustering imports
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    silhouette_score,
    calinski_harabasz_score,
    davies_bouldin_score,
    adjusted_rand_score,
    normalized_mutual_info_score
)

# Visualization imports
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import cdist

# Adding src directory to path
sys.path.append('../src')

# Importing custom utilities
from config import *
from data_utils import retrieve_processed_datasets

# Plot appearance: apply the dark-grid matplotlib style first, then replace
# its colour cycle with seaborn's "Set2" palette (the order matters, because
# the palette call overrides the cycle installed by the style sheet).
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("Set2")

# pandas display: never truncate columns, show floats with 4 decimal places
pd.options.display.max_columns = None
pd.options.display.precision = 4

# Confirmation banner so the reader can see the setup cell ran and from where
print(
    "All libraries imported successfully!",
    f"Working Directory: {Path.cwd()}",
    "Scikit-learn clustering modules loaded",
    sep="\n",
)

All libraries imported successfully!
Working Directory: C:\Users\Ashutosh\Documents\Projects\beyond-smote-evaluation\notebooks
Scikit-learn clustering modules loaded


In [3]:
# Cell 2: Problem Introduction - Clustering Analysis
# Prints the task banner, the objectives of the clustering study, the
# algorithms that will be compared, and the motivation for applying
# unsupervised learning to the HIGGS collision events.
# Each section is emitted by one print call, one line per argument.

print(
    "="*70,
    "TASK D: CLUSTERING ANALYSIS",
    "="*70,
    sep="\n",
)

print(
    "\nClustering Objectives:",
    "  1. Discover natural groupings in particle collision events",
    "  2. Identify event patterns beyond binary classification",
    "  3. Understand physics-based event characteristics",
    "  4. Validate clusters against known signal/background labels",
    sep="\n",
)

print(
    "\nClustering Approaches:",
    "  - K-Means: Partition-based clustering",
    "  - Hierarchical: Agglomerative clustering",
    "  - DBSCAN: Density-based clustering",
    sep="\n",
)

print(
    "\nWhy Clustering for HIGGS Data?",
    "  - Discover sub-categories within signal/background",
    "  - Identify rare event types",
    "  - Understand event topology patterns",
    "  - Validate supervised classification boundaries",
    sep="\n",
)

TASK D: CLUSTERING ANALYSIS

Clustering Objectives:
  1. Discover natural groupings in particle collision events
  2. Identify event patterns beyond binary classification
  3. Understand physics-based event characteristics
  4. Validate clusters against known signal/background labels

Clustering Approaches:
  - K-Means: Partition-based clustering
  - Hierarchical: Agglomerative clustering
  - DBSCAN: Density-based clustering

Why Clustering for HIGGS Data?
  - Discover sub-categories within signal/background
  - Identify rare event types
  - Understand event topology patterns
  - Validate supervised classification boundaries


In [4]:
# Cell 3: Loading and Preparing Data for Clustering
# Loads the processed HIGGS train/test splits and draws a stratified
# subsample of the training data for the clustering experiments.
# Feature scaling is CRITICAL for distance-based clustering, but this cell
# only loads and samples: X_cluster is still UNSCALED when the cell finishes.

print("="*70)
print("DATA LOADING & PREPARATION FOR CLUSTERING")
print("="*70)

# Loading processed data (train/test features and labels) via the project helper
X_full, X_test_full, y_full, y_test_full = retrieve_processed_datasets(
    file_prefix='higgs'
)

print("\nFull dataset loaded:")
print(f"  Training: {X_full.shape}")
print(f"  Test: {X_test_full.shape}")

# Sampling for computational efficiency:
# clustering the full training set is very slow, so a representative
# subsample is used instead.
# NOTE(review): the O(n²) remark printed below holds for pairwise-distance
# methods (e.g. agglomerative clustering); K-Means scales roughly linearly
# in the number of samples - confirm the wording is what you intend.
CLUSTER_SAMPLE_SIZE = 50000  # 50K samples for clustering

print(f"\nSampling {CLUSTER_SAMPLE_SIZE:,} events for clustering analysis")
print("   (Clustering is O(n²) - full dataset would take hours)")

# Stratified sampling to preserve the class distribution of y_full.
# train_test_split is imported in the notebook's import cell; only the
# "train" part of the split is kept, the remainder is discarded.
X_cluster, _, y_cluster, _ = train_test_split(
    X_full, y_full,
    train_size=CLUSTER_SAMPLE_SIZE,
    random_state=SEED_VALUE,
    stratify=y_full
)

# Sanity checks: downstream clustering silently produces garbage if the
# sample has the wrong size or features and labels are misaligned.
assert X_cluster.shape[0] == CLUSTER_SAMPLE_SIZE, "unexpected number of sampled events"
assert len(y_cluster) == X_cluster.shape[0], "features and labels are misaligned"
assert X_cluster.shape[1] == X_full.shape[1], "sampling must not change the feature count"

print("\nClustering Dataset (before scaling):")
print(f"  Shape: {X_cluster.shape}")
print(f"  Features: {X_cluster.shape[1]}")
print(f"  Samples: {X_cluster.shape[0]:,}")

DATA LOADING & PREPARATION FOR CLUSTERING
Loading processed data from C:\Users\Ashutosh\Documents\Projects\beyond-smote-evaluation\data\processed...
Datasets loaded successfully
Training shape: (800000, 28)
Testing shape: (200000, 28)

Full dataset loaded:
  Training: (800000, 28)
  Test: (200000, 28)

Sampling 50,000 events for clustering analysis
   (Clustering is O(n²) - full dataset would take hours)

Clustering Dataset (before scaling):
  Shape: (50000, 28)
  Features: 28
  Samples: 50,000
