# Data Exploration: Higgs Boson Dataset

This notebook explores the Higgs boson dataset and visualizes the features.

In [None]:
import sys
sys.path.append('../src')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from data_loader import load_higgs_data, get_feature_names
from visualization import plot_feature_distributions, plot_feature_correlation

# Set style
# Apply matplotlib's 'seaborn-v0_8-darkgrid' style sheet to every figure drawn below.
# NOTE(review): the 'seaborn-v0_8-*' style names only exist in newer matplotlib
# releases (3.6+, as far as I know) — confirm against the pinned matplotlib version.
plt.style.use('seaborn-v0_8-darkgrid')
# Make seaborn's 'husl' palette the default color cycle.
sns.set_palette('husl')

# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline

## Load Data

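Load the Higgs dataset from `../data/HIGGS.csv`. The loader is asked for a 50,000-sample subset (`n_samples=50000`) with a 20% test split and a fixed random seed of 42. If the dataset file is not available, synthetic data will be generated instead, so check which source was used before interpreting any of the plots below.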

In [None]:
# Loader settings gathered in one place; per the notebook's own description the
# loader falls back to synthetic data when the CSV file is missing.
higgs_load_options = dict(
    data_path='../data/HIGGS.csv',
    n_samples=50000,  # subset keeps the notebook fast
    test_split=0.2,
    random_seed=42,
)
X_train, X_test, y_train, y_test = load_higgs_data(**higgs_load_options)

feature_names = get_feature_names()

# Size summary of the loaded split, one line per quantity
print(
    f"Training samples: {len(X_train)}",
    f"Test samples: {len(X_test)}",
    f"Number of features: {X_train.shape[1]}",
    sep="\n",
)

## Dataset Statistics

In [None]:
# Class balance of the training and test splits.
#
# Counts and percentages are both derived from the same equality masks
# (label == 1 -> signal, label == 0 -> background). Taking the percentage from
# np.mean(y) instead would only match the counts when every label is exactly
# 0 or 1, and would print nan for an empty split.
print("Class Distribution:")
for split_index, (split_name, split_labels) in enumerate(
    (("Training", y_train), ("Test", y_test))
):
    split_labels = np.asarray(split_labels)
    n_total = split_labels.size
    n_signal = int(np.sum(split_labels == 1))
    n_background = int(np.sum(split_labels == 0))

    # Guard the division so an empty split reports 0.00% rather than nan
    pct_signal = 100.0 * n_signal / n_total if n_total else 0.0
    pct_background = 100.0 * n_background / n_total if n_total else 0.0

    # A blank line separates the test block from the training block
    lead = "" if split_index == 0 else "\n"
    print(f"{lead}  {split_name} - Signal: {n_signal} ({pct_signal:.2f}%)")
    print(f"  {split_name} - Background: {n_background} ({pct_background:.2f}%)")

## Feature Distributions

Compare feature distributions between signal and background events in the training split. The two plotting cells below cover the first 18 features, nine per figure.

In [None]:
# Boolean masks picking out each class in the training labels
signal_mask = (y_train == 1)
background_mask = (y_train == 0)

# Training features split by class
X_signal, X_background = X_train[signal_mask], X_train[background_mask]

print(
    f"Signal samples: {len(X_signal)}",
    f"Background samples: {len(X_background)}",
    sep="\n",
)

In [None]:
# Plot feature distributions for first 9 features
# The full signal/background arrays and the full name list are passed here;
# n_features=9 presumably makes the helper plot only the first nine columns —
# confirm in visualization.plot_feature_distributions.
# NOTE(review): save_path points into ../figures; verify that the directory
# exists or that the helper creates it before saving.
plot_feature_distributions(
    X_signal, 
    X_background, 
    feature_names, 
    n_features=9,
    save_path='../figures/feature_distributions_1.png'
)

In [None]:
# Second batch of distributions: columns 9 through 17 (the next nine features).
# One slice object is shared by the data and the names so they stay aligned.
next_nine = slice(9, 18)
plot_feature_distributions(
    X_signal[:, next_nine],
    X_background[:, next_nine],
    feature_names[next_nine],
    n_features=9,
    save_path='../figures/feature_distributions_2.png',
)

## Feature Correlations

In [None]:
# Plot correlation matrix
# Only the first 5000 training rows are handed to the helper, together with the
# full list of feature names; the figure is written to save_path.
# NOTE(review): the leading rows are taken as-is; presumably the loader has
# already shuffled the data so they are representative — confirm in data_loader.
plot_feature_correlation(
    X_train[:5000],  # Use subset for faster computation
    feature_names,
    save_path='../figures/feature_correlation.png'
)

## Feature Statistics

In [None]:
# Per-feature summary statistics over the training set.
# pandas (pd) is imported in the notebook's import cell with the other
# third-party libraries, so a missing dependency fails on the first cell rather
# than after the data load and plots.

# Fail early, with a readable message, if names and columns are out of step
assert len(feature_names) == X_train.shape[1], (
    f"{len(feature_names)} feature names for {X_train.shape[1]} feature columns"
)

# One row per feature. 'Std' uses NumPy's default population standard
# deviation (ddof=0), not the sample estimate pandas would use by default.
df_stats = pd.DataFrame({
    'Feature': feature_names,
    'Mean': X_train.mean(axis=0),
    'Std': X_train.std(axis=0),
    'Min': X_train.min(axis=0),
    'Max': X_train.max(axis=0)
})

print("Feature Statistics:")
# Rich table output; indexing by feature name for display only hides the
# meaningless integer index while leaving df_stats itself unchanged.
display(df_stats.set_index('Feature'))