In [1]:
import openml
import pandas as pd

# ID 23512: 100,000
# ID 42769: 1M
# ID 45570: 11M

dataset = openml.datasets.get_dataset(42769)
X, y, _, _ = dataset.get_data(target=dataset.default_target_attribute)

# Combine into a DataFrame
df = pd.concat([X, y], axis=1)
print(df.shape)

(1000000, 29)


In [3]:
df.isnull().sum()

lepton_pT                   0
lepton_eta                  0
lepton_phi                  0
missing_energy_magnitude    0
missing_energy_phi          0
jet_1_pt                    0
jet_1_eta                   0
jet_1_phi                   0
jet_1_b-tag                 0
jet_2_pt                    0
jet_2_eta                   0
jet_2_phi                   0
jet_2_b-tag                 0
jet_3_pt                    0
jet_3_eta                   0
jet_3_phi                   0
jet_3_b-tag                 0
jet_4_pt                    0
jet_4_eta                   0
jet_4_phi                   0
jet_4_b-tag                 0
m_jj                        0
m_jjj                       0
m_lv                        0
m_jlv                       0
m_bb                        0
m_wbb                       0
m_wwbb                      0
target                      0
dtype: int64

In [5]:
df.dropna(inplace = True)

In [7]:
df.dtypes

lepton_pT                    float64
lepton_eta                   float64
lepton_phi                   float64
missing_energy_magnitude     float64
missing_energy_phi           float64
jet_1_pt                     float64
jet_1_eta                    float64
jet_1_phi                    float64
jet_1_b-tag                  float64
jet_2_pt                     float64
jet_2_eta                    float64
jet_2_phi                    float64
jet_2_b-tag                  float64
jet_3_pt                     float64
jet_3_eta                    float64
jet_3_phi                    float64
jet_3_b-tag                  float64
jet_4_pt                     float64
jet_4_eta                    float64
jet_4_phi                    float64
jet_4_b-tag                  float64
m_jj                         float64
m_jjj                        float64
m_lv                         float64
m_jlv                        float64
m_bb                         float64
m_wbb                        float64
m

# MLP 

In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization, Dropout, Activation
from tensorflow.keras.optimizers import Adam

In [21]:
# Separate features and target
X = df.drop(columns='target').astype(np.float32).values
y = df['target'].astype(np.float32).values 

In [23]:
# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [25]:
# Train-test split
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_val, y_val, test_size=0.5, random_state=42)

In [27]:
model = Sequential([
        Dense(128, input_dim=28),
        Activation('relu'),
        BatchNormalization(),
        Dropout(0.3),
        Dense(32),
        Activation('relu'),
        Dense(1),
        Activation('sigmoid')
    ])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [29]:
model.summary()

In [31]:
# Compile the model
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

In [33]:
model.fit(
    X_train, y_train,
    epochs=10,
    batch_size=1024,
    validation_data=(X_val, y_val)
)

Epoch 1/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.6227 - loss: 0.6562 - val_accuracy: 0.7026 - val_loss: 0.5695
Epoch 2/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.6999 - loss: 0.5729 - val_accuracy: 0.7177 - val_loss: 0.5469
Epoch 3/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7123 - loss: 0.5553 - val_accuracy: 0.7256 - val_loss: 0.5364
Epoch 4/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7206 - loss: 0.5443 - val_accuracy: 0.7312 - val_loss: 0.5283
Epoch 5/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7256 - loss: 0.5371 - val_accuracy: 0.7326 - val_loss: 0.5245
Epoch 6/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7284 - loss: 0.5327 - val_accuracy: 0.7366 - val_loss: 0.5204
Epoch 7/10
[1m782/782[0m 

<keras.src.callbacks.history.History at 0x165df58e0>

In [34]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 238us/step - accuracy: 0.7414 - loss: 0.5146
Test Accuracy: 74.19%


In [35]:
loss, accuracy = model.evaluate(X_val, y_val)
print(f"Validation Accuracy: {accuracy * 100:.2f}%")

[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 216us/step - accuracy: 0.7399 - loss: 0.5138
Validation Accuracy: 74.15%


In [36]:
loss, accuracy = model.evaluate(X_train, y_train)
print(f"Train Accuracy: {accuracy * 100:.2f}%")

[1m25000/25000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 217us/step - accuracy: 0.7447 - loss: 0.5082
Train Accuracy: 74.44%


In [40]:
model = Sequential([
        Dense(128, input_dim=28),
        Activation('sigmoid'),
        Dense(128),
        Activation('sigmoid'),
        Dense(1),
        Activation('sigmoid')
    ])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [41]:
model.summary()

In [42]:
# Compile the model
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

In [43]:
model.fit(
    X_train, y_train,
    epochs=20,
    batch_size=1024,
    validation_data=(X_val, y_val)
)

Epoch 1/20
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.6083 - loss: 0.6540 - val_accuracy: 0.6476 - val_loss: 0.6305
Epoch 2/20
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.6529 - loss: 0.6258 - val_accuracy: 0.6745 - val_loss: 0.6056
Epoch 3/20
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.6754 - loss: 0.6011 - val_accuracy: 0.6924 - val_loss: 0.5829
Epoch 4/20
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.6953 - loss: 0.5785 - val_accuracy: 0.7013 - val_loss: 0.5696
Epoch 5/20
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7055 - loss: 0.5653 - val_accuracy: 0.7085 - val_loss: 0.5608
Epoch 6/20
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7118 - loss: 0.5564 - val_accuracy: 0.7137 - val_loss: 0.5533
Epoch 7/20
[1m782/782[0m 

<keras.src.callbacks.history.History at 0x3435d58e0>

In [44]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 256us/step - accuracy: 0.7345 - loss: 0.5253
Test Accuracy: 73.45%


In [45]:
loss, accuracy = model.evaluate(X_train, y_train)
print(f"Train Accuracy: {accuracy * 100:.2f}%")

[1m25000/25000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 239us/step - accuracy: 0.7378 - loss: 0.5194
Train Accuracy: 73.73%


In [50]:
loss, accuracy = model.evaluate(X_val, y_val)
print(f"Validation Accuracy: {accuracy * 100:.2f}%")

[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 251us/step - accuracy: 0.7345 - loss: 0.5240
Validation Accuracy: 73.48%


In [55]:
# relu, relu, sigmoid

In [57]:
model = Sequential([
        Dense(128, input_dim=28),
        Activation('relu'),
        Dense(128),
        Activation('relu'),
        Dense(1),
        Activation('sigmoid')
    ])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [59]:
model.summary()

In [61]:
# Compile the model
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

In [63]:
model.fit(
    X_train, y_train,
    epochs=20,
    batch_size=1024,
    validation_data=(X_val, y_val)
)

Epoch 1/20
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.6643 - loss: 0.6090 - val_accuracy: 0.7138 - val_loss: 0.5541
Epoch 2/20
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7211 - loss: 0.5465 - val_accuracy: 0.7243 - val_loss: 0.5380
Epoch 3/20
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.7315 - loss: 0.5306 - val_accuracy: 0.7321 - val_loss: 0.5297
Epoch 4/20
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7369 - loss: 0.5212 - val_accuracy: 0.7382 - val_loss: 0.5196
Epoch 5/20
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7419 - loss: 0.5145 - val_accuracy: 0.7404 - val_loss: 0.5162
Epoch 6/20
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.7447 - loss: 0.5103 - val_accuracy: 0.7420 - val_loss: 0.5134
Epoch 7/20
[1m782/782[0m 

<keras.src.callbacks.history.History at 0x349ad6ba0>

In [64]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 239us/step - accuracy: 0.7481 - loss: 0.5018
Test Accuracy: 74.91%


In [65]:
loss, accuracy = model.evaluate(X_train, y_train)
print(f"Train Accuracy: {accuracy * 100:.2f}%")

[1m25000/25000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 237us/step - accuracy: 0.7600 - loss: 0.4854
Train Accuracy: 76.00%


In [66]:
loss, accuracy = model.evaluate(X_val, y_val)
print(f"Validation Accuracy: {accuracy * 100:.2f}%")

[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 232us/step - accuracy: 0.7496 - loss: 0.5023
Validation Accuracy: 74.99%
