In [1]:
import sys

sys.path.insert(0, "../utils")

In [30]:
import sklearn.datasets as skds
from sklearn.preprocessing import QuantileTransformer, KBinsDiscretizer, OrdinalEncoder, LabelEncoder
import numpy as np
import pandas as pd
from transformation import BSplineTransformer, spline_transform_dataset
from trainers import FFMTrainer, FMTrainer
import math
import optuna
import optuna.samplers
from typing import Callable
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import TensorDataset
from tqdm import trange

In [3]:
# Use the first CUDA device when torch reports one; otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [4]:
# Seed torch and NumPy's global RNG with one shared value.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

In [5]:
# Column layout passed to read_csv: the class label followed by 28 feature columns.
higgs_columns = ["Label", "pT", "eta", "phi", "missing_energy_magnitude", "missing_energy_phi",
                 "jet1pt", "jet1eta", "jet1phi", "jet1b",
                 "jet2pt", "jet2eta", "jet2phi", "jet2b",
                 "jet3pt", "jet3eta", "jet3phi", "jet3b",
                 "jet4pt", "jet4eta", "jet4phi", "jet4b",
                 "m_jj", "m_jjj", "m_lv", "m_jlv", "m_bb", "m_wbb", "m_wwbb"]
# Key dtypes by column name (the documented form) instead of by integer position.
# The previous dict listed positions 0-29 although only 29 columns exist.
# "Label" is read as int and every other column as float.
higgs_dtypes = {column: (int if column == "Label" else float) for column in higgs_columns}
raw_df = pd.read_csv("../data/higgs.arff",
                     names=higgs_columns,
                     dtype=higgs_dtypes,
                     na_values="?")  # "?" entries are parsed as missing values
# TODO: only 3000 lines are loaded in the data
# NOTE(review): the recorded raw_df.shape output is (98049, 29), so the TODO above
# may be stale -- confirm before removing it.

In [6]:
# Discard every row that has at least one missing value.
raw_df = raw_df.dropna(axis="index", how="any")

In [7]:
# Show 8 randomly chosen rows as a sanity check; no random_state is passed here.
raw_df.sample(8)

Unnamed: 0,Label,pT,eta,phi,missing_energy_magnitude,missing_energy_phi,jet1pt,jet1eta,jet1phi,jet1b,...,jet4eta,jet4phi,jet4b,m_jj,m_jjj,m_lv,m_jlv,m_bb,m_wbb,m_wwbb
77204,1,0.978916,1.844643,-1.422353,0.910926,1.741381,0.83674,1.038726,0.040076,2.173076,...,0.509227,1.487003,0.0,0.815551,1.017818,0.981035,1.173383,0.978406,0.987195,0.84339
38305,0,0.785475,0.947617,1.601191,1.730936,-1.300155,0.898392,0.655507,1.034625,2.173076,...,-0.129548,-0.258853,0.0,1.666249,1.403631,0.986471,1.152768,1.342005,1.13437,1.092319
68013,1,0.572818,-1.231151,-1.497259,0.557409,1.587439,2.817746,0.714921,-0.137981,1.086538,...,-0.510981,1.181246,3.101961,2.894233,1.823246,0.998267,1.867427,2.78522,2.321026,1.750416
31266,0,0.548844,2.153391,-0.562318,1.489274,0.24805,1.791746,-0.108951,1.369468,0.0,...,-0.621747,1.301107,3.101961,0.934149,0.871509,0.985588,0.821194,1.113907,1.328275,1.810306
94357,1,0.8466,-0.158811,-1.235365,0.728787,-0.783034,1.155717,-0.462463,-0.844258,1.086538,...,-1.235538,1.066934,0.0,0.440971,0.965494,0.98553,1.133286,1.747692,0.984962,0.814022
40265,0,0.522125,-1.027591,0.597447,1.804414,-0.801832,0.361024,0.048496,-1.564392,0.0,...,1.336221,1.379905,0.0,0.793973,1.153002,1.280278,0.912474,0.859079,0.891584,0.92811
58128,0,1.776103,0.709969,0.825495,1.859818,0.553562,0.699329,-0.858555,-0.895261,2.173076,...,-0.374398,-0.301581,0.0,1.151344,0.976257,1.001275,1.987672,0.858882,1.263109,1.232239
14660,0,1.368359,0.191818,0.169094,1.411127,0.573316,1.964517,0.41092,-0.842041,2.173076,...,-2.119163,1.737823,0.0,1.077456,1.392801,0.974027,0.940942,3.311254,1.679857,1.371441


In [8]:
# (rows, columns) of the frame after the incomplete rows were dropped.
raw_df.shape

(98049, 29)

In [9]:
# Column names assigned through the `names=` argument of read_csv.
raw_df.columns

Index(['Label', 'pT', 'eta', 'phi', 'missing_energy_magnitude',
       'missing_energy_phi', 'jet1pt', 'jet1eta', 'jet1phi', 'jet1b', 'jet2pt',
       'jet2eta', 'jet2phi', 'jet2b', 'jet3pt', 'jet3eta', 'jet3phi', 'jet3b',
       'jet4pt', 'jet4eta', 'jet4phi', 'jet4b', 'm_jj', 'm_jjj', 'm_lv',
       'm_jlv', 'm_bb', 'm_wbb', 'm_wwbb'],
      dtype='object')

In [10]:
# Hold out 20% of the rows as `test`; random_state is fixed at 42.
train, test = train_test_split(raw_df, test_size=0.2, random_state=42)

In [11]:
# Separate the "Label" column (target) from the remaining columns (features) in each partition.
tr_feats, tr_target = train.drop(columns="Label"), train["Label"]
te_feats, te_target = test.drop(columns="Label"), test["Label"]

In [12]:
# Map every feature to a uniform output distribution via its quantiles.
# `subsample` is set to the number of training rows.
quant_transform = QuantileTransformer(output_distribution='uniform',
                                      n_quantiles=10000,
                                      subsample=len(tr_feats),
                                      random_state=42)
# Fit on the training features only, then apply the same fitted mapping to the test features.
X_train_qs = quant_transform.fit_transform(tr_feats)
X_test_qs = quant_transform.transform(te_feats)

In [13]:
def train_spline_fm(embedding_dim: int, step_size: float, batch_size: int, num_knots: int, num_epochs: int,
                    callback: Callable[[int, float], None]=None):
    """Train an FM on spline-encoded features and return the trainer's result.

    The quantile-transformed train/test matrices (`X_train_qs`, `X_test_qs`) are
    encoded with one shared `BSplineTransformer(num_knots, 3)`, wrapped in
    `TensorDataset`s together with the targets, and handed to `FMTrainer`.

    Args:
        embedding_dim, step_size, batch_size, num_epochs: forwarded unchanged to
            `FMTrainer`.
        num_knots: first argument of `BSplineTransformer`; the second is fixed
            at 3 (presumably the spline degree -- confirm in `transformation`).
        callback: forwarded to `FMTrainer`; the objectives in this notebook pass
            a function taking `(epoch, loss)`.

    Returns:
        Whatever `FMTrainer.train` returns; this notebook treats it as a loss.
    """
    def as_dataset(indices, weights, offsets, fields, target):
        # Tensor order and dtypes are what the trainer receives; keep them fixed.
        return TensorDataset(
            torch.tensor(indices, dtype=torch.int64),
            torch.tensor(weights, dtype=torch.float32),
            torch.tensor(offsets, dtype=torch.int64),
            torch.tensor(fields, dtype=torch.int64),
            torch.tensor(target.values, dtype=torch.float32))

    basis = BSplineTransformer(num_knots, 3)
    train_idx, train_w, train_off, train_fld = spline_transform_dataset(X_train_qs, basis)
    test_idx, test_w, test_off, test_fld = spline_transform_dataset(X_test_qs, basis)

    field_count = X_train_qs.shape[1]
    # The embedding table must cover the largest index seen in either split.
    largest_index = max(np.max(train_idx), np.max(test_idx))
    embedding_count = int(largest_index + 1)

    train_data = as_dataset(train_idx, train_w, train_off, train_fld, tr_target)
    test_data = as_dataset(test_idx, test_w, test_off, test_fld, te_target)

    fm_trainer = FMTrainer(embedding_dim, step_size, batch_size, num_epochs, callback)
    return fm_trainer.train(field_count, embedding_count, train_data, test_data,
                            torch.nn.BCEWithLogitsLoss(), device)

In [14]:
def train_spline_objective(trial: optuna.Trial):
    """Optuna objective for the spline model.

    Draws one hyperparameter set from `trial`, trains via `train_spline_fm`, and
    returns its result as the value to optimise.

    NOTE(review): `train_spline_fm` evaluates on the same held-out split that is
    later printed as "Test loss", so the search appears to be tuned on the test
    data -- confirm whether a separate validation split is intended.

    Raises:
        optuna.TrialPruned: from the per-epoch callback when `trial.should_prune()`
            returns true.
    """
    def report_and_maybe_prune(epoch: int, loss: float):
        # Report the intermediate value, then let the study's pruner decide.
        trial.report(loss, epoch)
        if trial.should_prune():
            raise optuna.TrialPruned()

    # Dict entries are evaluated top to bottom, so the suggestion order is fixed.
    hparams = {
        'embedding_dim': trial.suggest_int('embedding_dim', 1, 10),
        'step_size': trial.suggest_float('step_size', 1e-2, 0.5, log=True),
        'batch_size': trial.suggest_int('batch_size', 2, 32),
        'num_knots': trial.suggest_int('num_knots', 3, 48),
        'num_epochs': trial.suggest_int('num_epochs', 5, 15),
    }
    return train_spline_fm(**hparams, callback=report_and_maybe_prune)

In [15]:
# In-memory study that minimises the value returned by train_spline_objective.
# The TPE sampler is seeded with 42 and the search runs 100 trials.
study = optuna.create_study(study_name='splines',
                            direction='minimize',
                            sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(train_spline_objective, n_trials=100)

[32m[I 2023-05-16 13:19:07,876][0m A new study created in memory with name: splines[0m
[32m[I 2023-05-16 13:20:21,677][0m Trial 0 finished with value: 0.5853611826896667 and parameters: {'embedding_dim': 4, 'step_size': 0.4123206532618726, 'batch_size': 24, 'num_knots': 30, 'num_epochs': 6}. Best is trial 0 with value: 0.5853611826896667.[0m
[32m[I 2023-05-16 13:22:19,259][0m Trial 1 finished with value: 0.5886282324790955 and parameters: {'embedding_dim': 2, 'step_size': 0.012551115172973842, 'batch_size': 28, 'num_knots': 30, 'num_epochs': 12}. Best is trial 0 with value: 0.5853611826896667.[0m
[32m[I 2023-05-16 13:23:30,027][0m Trial 2 finished with value: 0.5681326389312744 and parameters: {'embedding_dim': 1, 'step_size': 0.44447541666908114, 'batch_size': 27, 'num_knots': 12, 'num_epochs': 7}. Best is trial 2 with value: 0.5681326389312744.[0m
[32m[I 2023-05-16 13:25:25,524][0m Trial 3 finished with value: 0.5769233107566833 and parameters: {'embedding_dim': 2, 'ste

In [16]:
# Print the best spline trial's objective value and the hyperparameters behind it.
trial = study.best_trial

print(f'Test loss: {trial.value}')
print(f"Best hyperparameters: {trial.params}")

Test loss: 0.5443105697631836
Best hyperparameters: {'embedding_dim': 10, 'step_size': 0.10968446514533971, 'batch_size': 20, 'num_knots': 5, 'num_epochs': 13}


In [17]:
# Hyperparameters of the best spline trial, as a dict keyed by parameter name.
study.best_params

{'embedding_dim': 10,
 'step_size': 0.10968446514533971,
 'batch_size': 20,
 'num_knots': 5,
 'num_epochs': 13}

In [18]:
# Train once more with the best hyperparameters; no pruning callback is passed.
train_spline_fm(**study.best_params)

0.54607093334198

In [32]:
# Repeat training with the best spline hyperparameters 20 times and keep each returned loss.
# A plain loop keeps the partial results if the long run is interrupted.
spline_losses = []
for _ in trange(20):
    spline_losses.append(train_spline_fm(**study.best_params))

100%|██████████| 20/20 [2:36:52<00:00, 470.63s/it]  


In [33]:
# The 20 losses collected from the repeated spline runs.
spline_losses

[0.5441789627075195,
 0.5446186661720276,
 0.5441141128540039,
 0.5452879667282104,
 0.5450193881988525,
 0.544367790222168,
 0.5449992418289185,
 0.5459874272346497,
 0.5445811152458191,
 0.5439596772193909,
 0.5442858338356018,
 0.5452952980995178,
 0.5440512895584106,
 0.5441167950630188,
 0.5458352565765381,
 0.5453296899795532,
 0.5457161664962769,
 0.5438892245292664,
 0.5454685688018799,
 0.5439222455024719]

In [36]:
# Mean, standard deviation (np.std with its default arguments), and the
# standard deviation as a percentage of the mean for the repeated spline runs.
spline_mean = np.mean(spline_losses)
spline_std = np.std(spline_losses)
spline_mean, spline_std, 100 * spline_std / spline_mean

(0.5447512358427048, 0.0006816852086465472, 0.12513697331810766)

In [19]:
def train_bin_fm(embedding_dim: int, step_size: float, batch_size: int,
                 num_bins: int, bin_strategy: str, num_epochs: int,
                 callback: Callable[[int, float], None]=None):
    """Train an FM on bin-encoded features and return the trainer's result.

    A `KBinsDiscretizer` is fitted on the raw training features (`tr_feats`).
    Each field's ordinal bin id is shifted by `field * num_bins`, so every field
    uses its own slice of one embedding table of `num_fields * num_bins` rows.
    Weights are all ones.

    Args:
        embedding_dim, step_size, batch_size, num_epochs: forwarded unchanged to
            `FMTrainer`.
        num_bins: bins per field, passed to `KBinsDiscretizer`.
        bin_strategy: `strategy` argument of `KBinsDiscretizer`; the objective in
            this notebook chooses between 'uniform' and 'quantile'.
        callback: forwarded to `FMTrainer`.

    Returns:
        Whatever `FMTrainer.train` returns; this notebook treats it as a loss.
    """
    field_count = tr_feats.shape[1]
    embedding_count = field_count * num_bins
    # Field f owns embedding rows f * num_bins .. (f + 1) * num_bins - 1.
    field_shift = np.arange(0, field_count) * num_bins

    binner = KBinsDiscretizer(num_bins, encode='ordinal', strategy=bin_strategy, random_state=42)
    binner.fit(tr_feats)

    def as_dataset(feats, target):
        # Encode one split with the already-fitted binner.
        indices = binner.transform(feats)
        indices += field_shift  # broadcast the per-field shift over all rows
        weights = np.ones_like(indices)
        fields = np.tile(np.arange(0, field_count), (indices.shape[0], 1))
        offsets = fields.copy()  # offsets are a copy of the field ids
        return TensorDataset(
            torch.tensor(indices, dtype=torch.int64),
            torch.tensor(weights, dtype=torch.float32),
            torch.tensor(offsets, dtype=torch.int64),
            torch.tensor(fields, dtype=torch.int64),
            torch.tensor(target.values, dtype=torch.float32))

    train_data = as_dataset(tr_feats, tr_target)
    test_data = as_dataset(te_feats, te_target)

    fm_trainer = FMTrainer(embedding_dim, step_size, batch_size, num_epochs, callback)
    return fm_trainer.train(field_count, embedding_count, train_data, test_data,
                            torch.nn.BCEWithLogitsLoss(), device)

In [20]:
def test_bins_objective(trial: optuna.Trial):
    """Optuna objective for the binned model.

    Draws one hyperparameter set from `trial`, trains via `train_bin_fm`, and
    returns its result as the value to optimise.

    NOTE(review): `train_bin_fm` evaluates on the same held-out split that is
    later printed as "Test loss", so the search appears to be tuned on the test
    data -- confirm whether a separate validation split is intended.

    Raises:
        optuna.TrialPruned: from the per-epoch callback when `trial.should_prune()`
            returns true.
    """
    def report_and_maybe_prune(epoch: int, loss: float):
        # Report the intermediate value, then let the study's pruner decide.
        trial.report(loss, epoch)
        if trial.should_prune():
            raise optuna.TrialPruned()

    # Dict entries are evaluated top to bottom, so the suggestion order is fixed.
    hparams = {
        'embedding_dim': trial.suggest_int('embedding_dim', 1, 10),
        'step_size': trial.suggest_float('step_size', 1e-2, 0.5, log=True),
        'batch_size': trial.suggest_int('batch_size', 2, 32),
        'num_bins': trial.suggest_int('num_bins', 2, 100),
        'bin_strategy': trial.suggest_categorical('bin_strategy', ['uniform', 'quantile']),
        'num_epochs': trial.suggest_int('num_epochs', 5, 15),
    }
    return train_bin_fm(**hparams, callback=report_and_maybe_prune)


In [25]:
# In-memory study that minimises the value returned by test_bins_objective.
# Sampler seed (42) and trial budget (100) are the same as in the spline study.
study_bins = optuna.create_study(study_name='bins',
                                 direction='minimize',
                                 sampler=optuna.samplers.TPESampler(seed=42))
study_bins.optimize(test_bins_objective, n_trials=100)

[32m[I 2023-05-16 17:56:07,154][0m A new study created in memory with name: bins[0m
[32m[I 2023-05-16 17:57:02,886][0m Trial 0 finished with value: 0.619638204574585 and parameters: {'embedding_dim': 4, 'step_size': 0.4123206532618726, 'batch_size': 24, 'num_bins': 61, 'bin_strategy': 'uniform', 'num_epochs': 5}. Best is trial 0 with value: 0.619638204574585.[0m
[32m[I 2023-05-16 17:58:24,214][0m Trial 1 finished with value: 0.6673367023468018 and parameters: {'embedding_dim': 9, 'step_size': 0.10502105436744279, 'batch_size': 23, 'num_bins': 4, 'bin_strategy': 'uniform', 'num_epochs': 7}. Best is trial 0 with value: 0.619638204574585.[0m
[32m[I 2023-05-16 18:02:30,571][0m Trial 2 finished with value: 0.6010141372680664 and parameters: {'embedding_dim': 2, 'step_size': 0.020492680115417352, 'batch_size': 11, 'num_bins': 53, 'bin_strategy': 'uniform', 'num_epochs': 11}. Best is trial 2 with value: 0.6010141372680664.[0m
[32m[I 2023-05-16 18:05:42,727][0m Trial 3 finished w

In [26]:
# Hyperparameters of the best binning trial, as a dict keyed by parameter name.
study_bins.best_params

{'embedding_dim': 9,
 'step_size': 0.13789649551190705,
 'batch_size': 4,
 'num_bins': 8,
 'bin_strategy': 'quantile',
 'num_epochs': 6}

In [27]:
# Print the best binning trial's objective value and the hyperparameters behind it.
trial = study_bins.best_trial

print(f'Test loss: {trial.value}')
print(f"Best hyperparameters: {trial.params}")

Test loss: 0.5633455514907837
Best hyperparameters: {'embedding_dim': 9, 'step_size': 0.13789649551190705, 'batch_size': 4, 'num_bins': 8, 'bin_strategy': 'quantile', 'num_epochs': 6}


In [28]:
# Train once more with the best binning hyperparameters; no pruning callback is passed.
train_bin_fm(**study_bins.best_params)



0.5625268816947937

In [34]:
# Repeat training with the best binning hyperparameters 20 times and keep each returned loss.
# A plain loop keeps the partial results if the long run is interrupted.
bin_losses = []
for _ in trange(20):
    bin_losses.append(train_bin_fm(**study_bins.best_params))

100%|██████████| 20/20 [3:47:29<00:00, 682.50s/it]


In [35]:
# The 20 losses collected from the repeated binning runs.
bin_losses

[0.5628558397293091,
 0.5632562637329102,
 0.5656234622001648,
 0.5647141933441162,
 0.5634582042694092,
 0.5627597570419312,
 0.5638707280158997,
 0.5648466944694519,
 0.5640009045600891,
 0.563601016998291,
 0.5659196972846985,
 0.5635817050933838,
 0.5617431402206421,
 0.5623651146888733,
 0.5654476881027222,
 0.562347412109375,
 0.5612610578536987,
 0.5633291602134705,
 0.5651400685310364,
 0.5640577077865601]

In [37]:
# Mean, standard deviation (np.std with its default arguments), and the
# standard deviation as a percentage of the mean for the repeated binning runs.
bin_mean = np.mean(bin_losses)
bin_std = np.std(bin_losses)
bin_mean, bin_std, 100 * bin_std / bin_mean

(0.5637089908123016, 0.0012599007055282235, 0.22350197106360029)

In [38]:
# Percentage difference of the mean spline loss relative to the mean binning loss;
# a negative value means the spline runs had the lower mean loss.
100 * (np.mean(spline_losses) / np.mean(bin_losses) - 1)

-3.363039312585525