In [3]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [4]:
df = pd.read_csv("atlas-higgs-challenge-2014-v2.csv")

In [5]:
print("Dataset shape:", df.shape)
df.head()

Dataset shape: (818238, 35)


Unnamed: 0,EventId,DER_mass_MMC,DER_mass_transverse_met_lep,DER_mass_vis,DER_pt_h,DER_deltaeta_jet_jet,DER_mass_jet_jet,DER_prodeta_jet_jet,DER_deltar_tau_lep,DER_pt_tot,...,PRI_jet_leading_eta,PRI_jet_leading_phi,PRI_jet_subleading_pt,PRI_jet_subleading_eta,PRI_jet_subleading_phi,PRI_jet_all_pt,Weight,Label,KaggleSet,KaggleWeight
0,100000,138.47,51.655,97.827,27.98,0.91,124.711,2.666,3.064,41.928,...,2.15,0.444,46.062,1.24,-2.475,113.497,0.000814,s,t,0.002653
1,100001,160.937,68.768,103.235,48.146,-999.0,-999.0,-999.0,3.473,2.078,...,0.725,1.158,-999.0,-999.0,-999.0,46.226,0.681042,b,t,2.233584
2,100002,-999.0,162.172,125.953,35.635,-999.0,-999.0,-999.0,3.148,9.336,...,2.053,-2.028,-999.0,-999.0,-999.0,44.251,0.715742,b,t,2.347389
3,100003,143.905,81.417,80.943,0.414,-999.0,-999.0,-999.0,3.31,0.414,...,-999.0,-999.0,-999.0,-999.0,-999.0,-0.0,1.660654,b,t,5.446378
4,100004,175.864,16.915,134.805,16.405,-999.0,-999.0,-999.0,3.891,16.405,...,-999.0,-999.0,-999.0,-999.0,-999.0,0.0,1.904263,b,t,6.245333


In [6]:
target_col = "Label"
drop_cols = ["Label", "Weight"]

X = df.drop(columns=drop_cols)
y = df[target_col]

In [7]:
y_encoded = y.map({"b": 0, "s": 1})

y_encoded.value_counts()

Label
0    538678
1    279560
Name: count, dtype: int64

In [8]:
numeric_cols = X.select_dtypes(include=[np.number]).columns
X[numeric_cols] = X[numeric_cols].replace(-999, np.nan)
X[numeric_cols] = X[numeric_cols].fillna(X[numeric_cols].median())

In [9]:
X.select_dtypes(exclude=[np.number]).head()

Unnamed: 0,KaggleSet
0,t
1,t
2,t
3,t
4,t


In [10]:
# Keeping only numeric features
X = X.select_dtypes(include=[np.number])

print("Features after dropping non-numeric columns:", X.shape)

Features after dropping non-numeric columns: (818238, 32)


In [11]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded
)
print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)

Train shape: (654590, 32)
Test shape: (163648, 32)


In [12]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Handling Non-Numeric Features

During preprocessing, non-numeric features were identified and excluded from
model training. The selected machine learning algorithms require numerical
inputs, and the retained features represent continuous physics-based
measurements suitable for statistical learning.

In [14]:
from sklearn.utils.class_weight import compute_class_weight

class_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.array([0, 1]),
    y=y_train
)

class_weight_dict = {0: class_weights[0], 1: class_weights[1]}
class_weight_dict

{0: 0.7594873556070191, 1: 1.463438081270568}

In [15]:
np.save("data/X_train_scaled.npy", X_train_scaled)
np.save("data/X_test_scaled.npy", X_test_scaled)
np.save("data/y_train.npy", y_train.values)
np.save("data/y_test.npy", y_test.values)