In [44]:
import openml
import pandas as pd

# ID 23512: 100,000
# ID 42769: 1M
# ID 45570: 11M

dataset = openml.datasets.get_dataset(42769)
X, y, _, _ = dataset.get_data(target=dataset.default_target_attribute)

# Combine into a DataFrame
df = pd.concat([X, y], axis=1)
print(df.shape)

(1000000, 29)


In [46]:
df.isnull().sum()

lepton_pT                   0
lepton_eta                  0
lepton_phi                  0
missing_energy_magnitude    0
missing_energy_phi          0
jet_1_pt                    0
jet_1_eta                   0
jet_1_phi                   0
jet_1_b-tag                 0
jet_2_pt                    0
jet_2_eta                   0
jet_2_phi                   0
jet_2_b-tag                 0
jet_3_pt                    0
jet_3_eta                   0
jet_3_phi                   0
jet_3_b-tag                 0
jet_4_pt                    0
jet_4_eta                   0
jet_4_phi                   0
jet_4_b-tag                 0
m_jj                        0
m_jjj                       0
m_lv                        0
m_jlv                       0
m_bb                        0
m_wbb                       0
m_wwbb                      0
target                      0
dtype: int64

In [48]:
df.dropna(inplace = True)

In [50]:
df.dtypes

lepton_pT                    float64
lepton_eta                   float64
lepton_phi                   float64
missing_energy_magnitude     float64
missing_energy_phi           float64
jet_1_pt                     float64
jet_1_eta                    float64
jet_1_phi                    float64
jet_1_b-tag                  float64
jet_2_pt                     float64
jet_2_eta                    float64
jet_2_phi                    float64
jet_2_b-tag                  float64
jet_3_pt                     float64
jet_3_eta                    float64
jet_3_phi                    float64
jet_3_b-tag                  float64
jet_4_pt                     float64
jet_4_eta                    float64
jet_4_phi                    float64
jet_4_b-tag                  float64
m_jj                         float64
m_jjj                        float64
m_lv                         float64
m_jlv                        float64
m_bb                         float64
m_wbb                        float64
m

# MLP 

In [53]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization, Dropout, Activation
from tensorflow.keras.optimizers import Adam

In [59]:
# Separate features and target
X = df.drop(columns='target').astype(np.float32).values
y = df['target'].astype(np.float32).values 

In [61]:
# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [63]:
# Train-test split
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [64]:
model = Sequential([
        Dense(64, input_dim=28),
        Activation('relu'),
        BatchNormalization(),
        Dropout(0.3),
        Dense(32),
        Activation('relu'),
        Dense(1),
        Activation('sigmoid')
    ])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [67]:
model.summary()

In [69]:
# Compile the model
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

In [79]:
model.fit(
    X_train, y_train,
    epochs=20,
    batch_size=1024,
    validation_data=(X_val, y_val)
)

Epoch 1/20
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.7300 - loss: 0.5310 - val_accuracy: 0.7387 - val_loss: 0.5175
Epoch 2/20
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.7300 - loss: 0.5303 - val_accuracy: 0.7394 - val_loss: 0.5168
Epoch 3/20
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.7303 - loss: 0.5289 - val_accuracy: 0.7398 - val_loss: 0.5162
Epoch 4/20
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.7303 - loss: 0.5286 - val_accuracy: 0.7387 - val_loss: 0.5166
Epoch 5/20
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.7309 - loss: 0.5288 - val_accuracy: 0.7405 - val_loss: 0.5152
Epoch 6/20
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.7307 - loss: 0.5279 - val_accuracy: 0.7393 - val_loss: 0.5156
Epoch 7/20
[1m782/782[0m 

<keras.src.callbacks.history.History at 0x322c49820>

In [80]:
loss, accuracy = model.evaluate(X_val, y_val)
print(f"Validation Accuracy: {accuracy * 100:.2f}%")

[1m6250/6250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 188us/step - accuracy: 0.7424 - loss: 0.5108
Validation Accuracy: 74.13%
