In [1]:
import openml
import pandas as pd

# ID 23512: 100,000
# ID 42769: 1M
# ID 45570: 11M

dataset = openml.datasets.get_dataset(23512)
X, y, _, _ = dataset.get_data(target=dataset.default_target_attribute)

# Combine into a DataFrame
df = pd.concat([X, y], axis=1)
print(df.shape)

(98050, 29)


In [3]:
df.isnull().sum()

lepton_pT                   0
lepton_eta                  0
lepton_phi                  0
missing_energy_magnitude    0
missing_energy_phi          0
jet1pt                      0
jet1eta                     0
jet1phi                     0
jet1b-tag                   0
jet2pt                      0
jet2eta                     0
jet2phi                     0
jet2b-tag                   0
jet3pt                      0
jet3eta                     0
jet3phi                     0
jet3b-tag                   0
jet4pt                      0
jet4eta                     0
jet4phi                     1
jet4b-tag                   1
m_jj                        1
m_jjj                       1
m_lv                        1
m_jlv                       1
m_bb                        1
m_wbb                       1
m_wwbb                      1
class                       0
dtype: int64

In [5]:
df.dropna(inplace = True)

In [7]:
df.dtypes

lepton_pT                    float64
lepton_eta                   float64
lepton_phi                   float64
missing_energy_magnitude     float64
missing_energy_phi           float64
jet1pt                       float64
jet1eta                      float64
jet1phi                      float64
jet1b-tag                    float64
jet2pt                       float64
jet2eta                      float64
jet2phi                      float64
jet2b-tag                    float64
jet3pt                       float64
jet3eta                      float64
jet3phi                      float64
jet3b-tag                    float64
jet4pt                       float64
jet4eta                      float64
jet4phi                      float64
jet4b-tag                    float64
m_jj                         float64
m_jjj                        float64
m_lv                         float64
m_jlv                        float64
m_bb                         float64
m_wbb                        float64
m

# MLP 

In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [11]:
# Separate features and target
X = df.drop(columns='class').astype(np.float32).values
y = df['class'].astype(np.float32).values 

In [12]:
# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [13]:
# Train-test split
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [14]:
model = keras.Sequential([
    layers.Dense(256, activation='relu', input_shape=(28,)),
    layers.Dense(128, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(1, activation='sigmoid')  # Binary classification
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [15]:
model.summary()

In [16]:
# Compile the model
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

In [17]:
model.fit(
    X_train, y_train,
    epochs=5,
    batch_size=1024,
    validation_data=(X_val, y_val)
)

Epoch 1/5
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5913 - loss: 0.6597 - val_accuracy: 0.6632 - val_loss: 0.6140
Epoch 2/5
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6729 - loss: 0.6033 - val_accuracy: 0.6856 - val_loss: 0.5944
Epoch 3/5
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6942 - loss: 0.5814 - val_accuracy: 0.6960 - val_loss: 0.5813
Epoch 4/5
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7050 - loss: 0.5630 - val_accuracy: 0.7009 - val_loss: 0.5739
Epoch 5/5
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7167 - loss: 0.5495 - val_accuracy: 0.7055 - val_loss: 0.5667


<keras.src.callbacks.history.History at 0x323b196a0>

In [18]:
loss, accuracy = model.evaluate(X_val, y_val)
print(f"Validation Accuracy: {accuracy * 100:.2f}%")

[1m613/613[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 239us/step - accuracy: 0.7036 - loss: 0.5671
Validation Accuracy: 70.55%
