In [1]:
import openml
import pandas as pd

# Higgs Boson dataset
# ID 23512: 100,000
# ID 42769: 1M
# ID 45570: 11M

dataset = openml.datasets.get_dataset(23512)
X, y, _, _ = dataset.get_data(target=dataset.default_target_attribute)

# Combine into a DataFrame
df = pd.concat([X, y], axis=1)
print(df.shape)

(98050, 29)


In [2]:
df.isnull().sum()

lepton_pT                   0
lepton_eta                  0
lepton_phi                  0
missing_energy_magnitude    0
missing_energy_phi          0
jet1pt                      0
jet1eta                     0
jet1phi                     0
jet1b-tag                   0
jet2pt                      0
jet2eta                     0
jet2phi                     0
jet2b-tag                   0
jet3pt                      0
jet3eta                     0
jet3phi                     0
jet3b-tag                   0
jet4pt                      0
jet4eta                     0
jet4phi                     1
jet4b-tag                   1
m_jj                        1
m_jjj                       1
m_lv                        1
m_jlv                       1
m_bb                        1
m_wbb                       1
m_wwbb                      1
class                       0
dtype: int64

In [3]:
df.dropna(inplace = True)

In [4]:
df.dtypes

lepton_pT                    float64
lepton_eta                   float64
lepton_phi                   float64
missing_energy_magnitude     float64
missing_energy_phi           float64
jet1pt                       float64
jet1eta                      float64
jet1phi                      float64
jet1b-tag                    float64
jet2pt                       float64
jet2eta                      float64
jet2phi                      float64
jet2b-tag                    float64
jet3pt                       float64
jet3eta                      float64
jet3phi                      float64
jet3b-tag                    float64
jet4pt                       float64
jet4eta                      float64
jet4phi                      float64
jet4b-tag                    float64
m_jj                         float64
m_jjj                        float64
m_lv                         float64
m_jlv                        float64
m_bb                         float64
m_wbb                        float64
m

# MLP 

In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization, Dropout, Activation
from tensorflow.keras.optimizers import Adam

In [11]:
# Separate features and target
X = df.drop(columns='class').astype(np.float32).values
y = df['class'].astype(np.float32).values 

In [12]:
# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [13]:
# Train-test split
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_val, y_val, test_size=0.5, random_state=42)

In [14]:
# defining model architecture
model = Sequential([
        Dense(128, input_dim=28),
        Activation('relu'),
        BatchNormalization(),
        Dropout(0.3),
        Dense(32),
        Activation('relu'),
        Dense(1),
        Activation('sigmoid')
    ])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [15]:
model.summary()

In [22]:
# Compile the model
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

In [28]:
model.fit(
    X_train, y_train,
    epochs=10,
    batch_size=1024,
    validation_data=(X_val, y_val)
)

Epoch 1/10
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7223 - loss: 0.5416 - val_accuracy: 0.7220 - val_loss: 0.5460
Epoch 2/10
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7218 - loss: 0.5402 - val_accuracy: 0.7186 - val_loss: 0.5473
Epoch 3/10
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7241 - loss: 0.5355 - val_accuracy: 0.7213 - val_loss: 0.5463
Epoch 4/10
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7244 - loss: 0.5373 - val_accuracy: 0.7204 - val_loss: 0.5465
Epoch 5/10
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7243 - loss: 0.5413 - val_accuracy: 0.7217 - val_loss: 0.5458
Epoch 6/10
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7263 - loss: 0.5365 - val_accuracy: 0.7225 - val_loss: 0.5447
Epoch 7/10
[1m77/77[0m [32m━━━━━━━━━━

<keras.src.callbacks.history.History at 0x30400c2f0>

In [30]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

[1m307/307[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 252us/step - accuracy: 0.7161 - loss: 0.5505
Test Accuracy: 72.51%


In [32]:
loss, accuracy = model.evaluate(X_val, y_val)
print(f"Validation Accuracy: {accuracy * 100:.2f}%")

[1m307/307[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 252us/step - accuracy: 0.7234 - loss: 0.5410
Validation Accuracy: 72.13%


In [34]:
loss, accuracy = model.evaluate(X_train, y_train)
print(f"Train Accuracy: {accuracy * 100:.2f}%")

[1m2452/2452[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 197us/step - accuracy: 0.7419 - loss: 0.5140
Train Accuracy: 74.24%


In [40]:
model = Sequential([
        Dense(128, input_dim=28),
        Activation('sigmoid'),
        Dense(128),
        Activation('sigmoid'),
        Dense(1),
        Activation('sigmoid')
    ])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [41]:
model.summary()

In [42]:
# Compile the model
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

In [43]:
model.fit(
    X_train, y_train,
    epochs=20,
    batch_size=1024,
    validation_data=(X_val, y_val)
)

Epoch 1/20
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.6083 - loss: 0.6540 - val_accuracy: 0.6476 - val_loss: 0.6305
Epoch 2/20
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.6529 - loss: 0.6258 - val_accuracy: 0.6745 - val_loss: 0.6056
Epoch 3/20
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.6754 - loss: 0.6011 - val_accuracy: 0.6924 - val_loss: 0.5829
Epoch 4/20
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.6953 - loss: 0.5785 - val_accuracy: 0.7013 - val_loss: 0.5696
Epoch 5/20
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7055 - loss: 0.5653 - val_accuracy: 0.7085 - val_loss: 0.5608
Epoch 6/20
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7118 - loss: 0.5564 - val_accuracy: 0.7137 - val_loss: 0.5533
Epoch 7/20
[1m782/782[0m 

<keras.src.callbacks.history.History at 0x3435d58e0>

In [44]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 256us/step - accuracy: 0.7345 - loss: 0.5253
Test Accuracy: 73.45%


In [45]:
loss, accuracy = model.evaluate(X_train, y_train)
print(f"Train Accuracy: {accuracy * 100:.2f}%")

[1m25000/25000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 239us/step - accuracy: 0.7378 - loss: 0.5194
Train Accuracy: 73.73%


In [50]:
loss, accuracy = model.evaluate(X_val, y_val)
print(f"Validation Accuracy: {accuracy * 100:.2f}%")

[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 251us/step - accuracy: 0.7345 - loss: 0.5240
Validation Accuracy: 73.48%


In [55]:
# relu, relu, sigmoid

In [57]:
model = Sequential([
        Dense(128, input_dim=28),
        Activation('relu'),
        Dense(128),
        Activation('relu'),
        Dense(1),
        Activation('sigmoid')
    ])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [59]:
model.summary()

In [61]:
# Compile the model
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

In [63]:
model.fit(
    X_train, y_train,
    epochs=20,
    batch_size=1024,
    validation_data=(X_val, y_val)
)

Epoch 1/20
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.6643 - loss: 0.6090 - val_accuracy: 0.7138 - val_loss: 0.5541
Epoch 2/20
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7211 - loss: 0.5465 - val_accuracy: 0.7243 - val_loss: 0.5380
Epoch 3/20
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.7315 - loss: 0.5306 - val_accuracy: 0.7321 - val_loss: 0.5297
Epoch 4/20
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7369 - loss: 0.5212 - val_accuracy: 0.7382 - val_loss: 0.5196
Epoch 5/20
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7419 - loss: 0.5145 - val_accuracy: 0.7404 - val_loss: 0.5162
Epoch 6/20
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.7447 - loss: 0.5103 - val_accuracy: 0.7420 - val_loss: 0.5134
Epoch 7/20
[1m782/782[0m 

<keras.src.callbacks.history.History at 0x349ad6ba0>

In [64]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 239us/step - accuracy: 0.7481 - loss: 0.5018
Test Accuracy: 74.91%


In [65]:
loss, accuracy = model.evaluate(X_train, y_train)
print(f"Train Accuracy: {accuracy * 100:.2f}%")

[1m25000/25000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 237us/step - accuracy: 0.7600 - loss: 0.4854
Train Accuracy: 76.00%


In [66]:
loss, accuracy = model.evaluate(X_val, y_val)
print(f"Validation Accuracy: {accuracy * 100:.2f}%")

[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 232us/step - accuracy: 0.7496 - loss: 0.5023
Validation Accuracy: 74.99%


# Forest Cover Dataset

In [39]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from torch.nn.functional import one_hot
import openml
import random
from ucimlrepo import fetch_ucirepo

# function to preprocess dataset from 1 of 3 different sources: openML, UCI, and Kaggle
def load_and_process_dataset(source, target_column=None, dataset_id=None, 
                             uci_id=None, test_size=0.1, random_state=42, n_samples=None, usecols=None):
    if source == "openml":
        if dataset_id is None:
            raise ValueError("You must provide a dataset_id for OpenML datasets.")
        dataset = openml.datasets.get_dataset(dataset_id)
        X, y, _, _ = dataset.get_data(target=dataset.default_target_attribute)
        target_column = dataset.default_target_attribute
        df = pd.concat([X, y], axis=1)

    
    elif source == "uci":
        if uci_id is None:
            raise ValueError("You must provide uci_id when source='uci'.")

        repo     = fetch_ucirepo(id=uci_id)
        data_url = repo.metadata.data_url    
        headers  = repo.data.headers         

        if target_column is None:
            target_column = getattr(repo.metadata, "target_col", headers[-1])
        if isinstance(target_column, (list, tuple)):
            if len(target_column) == 1:
                target_column = target_column[0]
            else:
                raise ValueError(
                    f"Multiple possible targets {target_column}; please specify one."
                )

        read_kwargs = {"header": 0}
        if n_samples is not None:
            read_kwargs["nrows"] = n_samples

        # only restrict columns if user actually passed usecols
        if usecols is not None:
            if any(not isinstance(c, str) for c in usecols):
                raise TypeError("All entries in usecols must be strings")
            cols_to_read = set(usecols) | {target_column}
            read_kwargs["usecols"] = cols_to_read

        df = pd.read_csv(data_url, **read_kwargs)

    elif source == "kaggle":
        if dataset_id is None or target_column is None:
            raise ValueError("For Kaggle datasets, provide file_path and target_column.")
        df = pd.read_csv(dataset_id)  # here, dataset_id acts as the file path

    else:
        raise ValueError("source must be one of: 'openml', 'uci', 'kaggle'")

    df.dropna(inplace=True)

    X = df.drop(columns=[target_column])
    y = df[target_column]

    X = X.apply(lambda col: col.astype(str) if col.dtype == 'object' else col)
    
    X = pd.get_dummies(X)
    
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    le = LabelEncoder()
    y_encoded = le.fit_transform(y)
    y_tensor = one_hot(torch.tensor(y_encoded)).float()

    X_train_np, X_val_np, y_train_tensor, y_val_tensor = train_test_split(
        X_scaled, y_tensor, test_size=test_size, random_state=random_state
    )

    X_train_tensor = torch.tensor(X_train_np).float()
    X_val_tensor = torch.tensor(X_val_np).float()

    return (X_train_tensor, y_train_tensor, X_val_tensor, y_val_tensor)

In [41]:
# import whatever dataset you want to train the model on
dataset = load_and_process_dataset(source="uci", uci_id=31, n_samples = 100000)

In [43]:
X_train, X_val, y_train, y_val = dataset[0], dataset[2], dataset[1], dataset[3]

In [67]:
y_val.shape

torch.Size([10000, 7])

In [69]:
# defining model architecture
model = Sequential([
        Dense(128, input_dim=54),
        Activation('relu'),
        BatchNormalization(),
        Dropout(0.3),
        Dense(32),
        Activation('relu'),
        Dense(7),
        Activation('softmax')
    ])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [71]:
model.summary()

In [73]:
# Compile the model
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

In [77]:
model.fit(
    X_train, y_train,
    epochs=40,
    batch_size=1024,
    validation_data=(X_val, y_val)
)

Epoch 1/40
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8668 - loss: 0.0897 - val_accuracy: 0.8773 - val_loss: 0.0841
Epoch 2/40
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8696 - loss: 0.0890 - val_accuracy: 0.8759 - val_loss: 0.0830
Epoch 3/40
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8661 - loss: 0.0893 - val_accuracy: 0.8760 - val_loss: 0.0824
Epoch 4/40
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8710 - loss: 0.0869 - val_accuracy: 0.8794 - val_loss: 0.0814
Epoch 5/40
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8685 - loss: 0.0878 - val_accuracy: 0.8782 - val_loss: 0.0810
Epoch 6/40
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8732 - loss: 0.0862 - val_accuracy: 0.8800 - val_loss: 0.0807
Epoch 7/40
[1m88/88[0m [32m━━━━━━━━━━

<keras.src.callbacks.history.History at 0x34f1071d0>

In [81]:
loss, accuracy = model.evaluate(X_val, y_val)
print(f"Validation Accuracy: {accuracy * 100:.2f}%")

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 655us/step - accuracy: 0.9033 - loss: 0.0660
Validation Accuracy: 90.24%


In [83]:
loss, accuracy = model.evaluate(X_train, y_train)
print(f"Train Accuracy: {accuracy * 100:.2f}%")

[1m2813/2813[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 206us/step - accuracy: 0.9127 - loss: 0.0610
Train Accuracy: 91.37%


In [89]:
from pytorch_tabnet.tab_model import TabNetClassifier
model = TabNetClassifier()



In [97]:
model.fit(
    X_train, y_train
)

ValueError: Data must be 1-dimensional, got ndarray of shape (90000, 7) instead

In [81]:
loss, accuracy = model.evaluate(X_val, y_val)
print(f"Validation Accuracy: {accuracy * 100:.2f}%")

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 655us/step - accuracy: 0.9033 - loss: 0.0660
Validation Accuracy: 90.24%


In [83]:
loss, accuracy = model.evaluate(X_train, y_train)
print(f"Train Accuracy: {accuracy * 100:.2f}%")

[1m2813/2813[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 206us/step - accuracy: 0.9127 - loss: 0.0610
Train Accuracy: 91.37%
