In [1]:
import sys

# Put the sibling "../utils" directory first on the module search path
# (presumably where the `transformation` and `trainers` modules imported below live).
sys.path.insert(0, "../utils")

In [27]:
import sklearn.datasets as skds
from sklearn.preprocessing import QuantileTransformer, KBinsDiscretizer, OrdinalEncoder, LabelEncoder
import numpy as np
import pandas as pd
from transformation import BSplineTransformer, spline_transform_dataset
from trainers import FFMTrainer
import math
import optuna
import optuna.samplers
from typing import Callable
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import TensorDataset
from tqdm import trange

In [3]:
# Use the first CUDA device when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [4]:
# Fix the seeds of the PyTorch and NumPy global random number generators.
torch.manual_seed(42)
np.random.seed(42)

In [5]:
# Load the data file. Column names are supplied explicitly (so the first line of
# the file is read as data) and "?" entries are parsed as missing values.
raw_df = pd.read_csv(
    "../data/adult-all.txt",
    names=[
        "Age", "Workclass", "fnlwgt", "Education", "Education-Num", "Martial Status",
        "Occupation", "Relationship", "Race", "Sex", "Capital Gain", "Capital Loss",
        "Hours per week", "Country", "Target",
    ],
    # Column types keyed by column name: integers for the numeric columns,
    # strings for everything else.
    dtype={
        "Age": int,
        "Workclass": str,
        "fnlwgt": int,
        "Education": str,
        "Education-Num": int,
        "Martial Status": str,
        "Occupation": str,
        "Relationship": str,
        "Race": str,
        "Sex": str,
        "Capital Gain": int,
        "Capital Loss": int,
        "Hours per week": int,
        "Country": str,
        "Target": str,
    },
    na_values="?",
)

In [6]:
# Preview 20 randomly chosen rows of the raw data.
raw_df.sample(20)

Unnamed: 0,Age,Workclass,fnlwgt,Education,Education-Num,Martial Status,Occupation,Relationship,Race,Sex,Capital Gain,Capital Loss,Hours per week,Country,Target
7762,18,Private,423024,HS-grad,9,Never-married,Other-service,Not-in-family,White,Male,0,0,20,United-States,<=50K
23881,17,Private,178953,12th,8,Never-married,Sales,Own-child,White,Female,0,0,20,United-States,<=50K
30507,25,Local-gov,348986,HS-grad,9,Never-married,Handlers-cleaners,Other-relative,Black,Male,0,0,40,United-States,<=50K
28911,20,Private,218215,Some-college,10,Never-married,Sales,Own-child,White,Female,0,0,30,United-States,<=50K
19484,47,Private,244025,HS-grad,9,Never-married,Machine-op-inspct,Unmarried,Amer-Indian-Eskimo,Male,0,0,56,Puerto-Rico,<=50K
43031,33,Private,399531,Bachelors,13,Married-civ-spouse,Craft-repair,Husband,Black,Male,0,0,40,United-States,<=50K
28188,38,Private,200220,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,<=50K
12761,21,Private,329530,11th,7,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,Mexico,<=50K
40834,43,Private,282155,Assoc-acdm,12,Divorced,Prof-specialty,Not-in-family,White,Female,4650,0,40,United-States,<=50K
27875,55,Private,202220,HS-grad,9,Married-civ-spouse,Other-service,Wife,Black,Female,2407,0,35,United-States,<=50K


In [7]:
# Show the column names of the loaded frame.
raw_df.columns

Index(['Age', 'Workclass', 'fnlwgt', 'Education', 'Education-Num',
       'Martial Status', 'Occupation', 'Relationship', 'Race', 'Sex',
       'Capital Gain', 'Capital Loss', 'Hours per week', 'Country', 'Target'],
      dtype='object')

In [8]:
# Partition the feature columns by type; the "Target" column is in neither list.
categorical_columns = ['Workclass', 'Education', 'Martial Status', 'Occupation', 'Relationship', 'Race', 'Sex', 'Country']
numerical_columns = ['Age', 'fnlwgt', 'Education-Num', 'Capital Gain', 'Capital Loss', 'Hours per week']

In [9]:
# Replace missing values in each categorical column with a per-column
# placeholder string ("NA_<column>"), so that a missing entry becomes a category of its own.
na_dict = {col: f'NA_{col}' for col in categorical_columns}
cat_ordinal = raw_df.fillna(na_dict)
# Encode the categorical columns as ordinal codes and the target as integer labels.
# Both encoders are fitted on the full dataset, i.e. before the train/test split.
cat_ordinal[categorical_columns] = OrdinalEncoder().fit_transform(cat_ordinal[categorical_columns])
cat_ordinal["Target"] = LabelEncoder().fit_transform(cat_ordinal["Target"])

In [10]:
# Hold out 20% of the rows as the test partition, using a fixed random_state.
train, test = train_test_split(cat_ordinal, test_size=0.2, random_state=42)

In [11]:
# Split each partition into categorical features, numerical features and the target.
tr_cat = train[categorical_columns]
tr_num = train[numerical_columns]
tr_target = train["Target"]

te_cat = test[categorical_columns]
te_num = test[numerical_columns]
te_target = test["Target"]

In [12]:
# Map every numerical column to a uniform distribution with a quantile transform
# that is fitted on the training rows and then applied to the test rows.
tr_num_qs = []
te_num_qs = []
# Column index -> list of sentinel values; handed to spline_transform_dataset later.
special_values = dict()
for col_idx, col in enumerate(tr_num.columns):
    # astype() yields float32 copies, so the masked writes below do not touch the DataFrames.
    tr_col = tr_num.iloc[:, col_idx].to_numpy().astype(np.float32)
    te_col = te_num.iloc[:, col_idx].to_numpy().astype(np.float32)

    if col in ['Capital Loss', 'Capital Gain']:
        # Only strictly positive entries are quantile-transformed; all other
        # entries are overwritten with the sentinel -1 and recorded as special.
        regular_tr_mask = tr_col > 0
        regular_te_mask = te_col > 0
        tr_col[~regular_tr_mask] = -1.
        te_col[~regular_te_mask] = -1.
        special_values[col_idx] = [-1.]
    else:
        # Every entry of the column takes part in the quantile transform.
        regular_tr_mask = np.ones_like(tr_col, dtype=bool)
        regular_te_mask = np.ones_like(te_col, dtype=bool)

    # subsample is set to the number of rows being fitted
    # (NOTE(review): presumably so that no sub-sampling takes place - confirm).
    transformer = QuantileTransformer(subsample=np.sum(regular_tr_mask), output_distribution='uniform')
    tr_col[regular_tr_mask] = transformer.fit_transform(tr_col[regular_tr_mask].reshape(-1, 1)).reshape(-1)
    te_col[regular_te_mask] = transformer.transform(te_col[regular_te_mask].reshape(-1, 1)).reshape(-1)

    tr_num_qs.append(tr_col)
    te_num_qs.append(te_col)

# Re-assemble the per-column vectors into (rows, numerical columns) arrays.
tr_num_qs = np.stack(tr_num_qs, axis=1)
te_num_qs = np.stack(te_num_qs, axis=1)


In [13]:
# Give every categorical column its own contiguous range of embedding indices:
# cat_offsets[i] is the first index of column i and num_cat_embeddings is the
# total number of categories (counted on the full dataset, cat_ordinal).
num_cat_fields = tr_cat.shape[1]
cat_offsets = np.cumsum([0] + [cat_ordinal[col].nunique() for col in categorical_columns])
num_cat_embeddings = cat_offsets[-1]
cat_offsets = cat_offsets[:-1]

# Shift the per-column ordinal codes into the shared index space; each entry has weight 1.
# The *_offsets and *_fields names refer to the same array, which holds the
# column number 0..num_cat_fields-1 in every row.
tr_cat_indices = tr_cat.values + np.tile(cat_offsets, (len(tr_cat), 1))
tr_cat_weights = np.ones_like(tr_cat_indices, dtype=np.float32)
tr_cat_offsets = np.tile(np.arange(num_cat_fields, dtype=np.int32), (tr_cat.shape[0], 1))
tr_cat_fields = tr_cat_offsets

# Same construction for the test partition.
te_cat_indices = te_cat.values + np.tile(cat_offsets, (len(te_cat), 1))
te_cat_weights = np.ones_like(te_cat_indices, dtype=np.float32)
te_cat_offsets = np.tile(np.arange(num_cat_fields, dtype=np.int32), (te_cat.shape[0], 1))
te_cat_fields = te_cat_offsets


In [14]:
def train_spline_ffm(embedding_dim: int, step_size: float, batch_size: int, num_knots: int, num_epochs: int,
                     callback: Callable[[int, float], None]=None):
    """Train an FFM whose numerical features are encoded through a B-spline basis.

    The quantile-transformed numerical arrays (``tr_num_qs`` / ``te_num_qs``) are
    passed through ``spline_transform_dataset`` and placed after the categorical
    features prepared at notebook level; the combined data is handed to ``FFMTrainer``.

    Args:
        embedding_dim: forwarded to ``FFMTrainer``.
        step_size: forwarded to ``FFMTrainer``.
        batch_size: forwarded to ``FFMTrainer``.
        num_knots: first argument of ``BSplineTransformer`` (the second is fixed at 3).
        num_epochs: forwarded to ``FFMTrainer``.
        callback: optional ``(epoch, loss)`` hook, forwarded to ``FFMTrainer``.

    Returns:
        Whatever ``FFMTrainer.train`` returns (the notebook treats it as a loss value).
    """
    basis = BSplineTransformer(num_knots, 3)
    # Each result is an (indices, weights, offsets, fields) tuple.
    tr_spline = spline_transform_dataset(tr_num_qs, basis, special_values=special_values)
    te_spline = spline_transform_dataset(te_num_qs, basis, special_values=special_values)

    # Size the numerical part of the embedding table from the largest index seen in either partition.
    spline_embedding_count = int(max(np.max(tr_spline[0]), np.max(te_spline[0])) + 1)
    num_fields = tr_num_qs.shape[1] + num_cat_fields
    num_embeddings = spline_embedding_count + num_cat_embeddings

    def assemble(cat_parts, spline_parts, target):
        """Concatenate categorical and spline features column-wise into a TensorDataset."""
        cat_indices, cat_weights, cat_offs, cat_flds = cat_parts
        num_indices, num_weights, num_offs, num_flds = spline_parts
        # Numerical indices/offsets/fields are shifted past the categorical ones.
        columns = (
            (np.concatenate([cat_indices, num_indices + num_cat_embeddings], axis=1), torch.int64),
            (np.concatenate([cat_weights, num_weights], axis=1), torch.float32),
            (np.concatenate([cat_offs, num_offs + num_cat_fields], axis=1), torch.int64),
            (np.concatenate([cat_flds, num_flds + num_cat_fields], axis=1), torch.int64),
            (target.values, torch.float32),
        )
        return TensorDataset(*(torch.tensor(data, dtype=dtype) for data, dtype in columns))

    train_ds = assemble((tr_cat_indices, tr_cat_weights, tr_cat_offsets, tr_cat_fields), tr_spline, tr_target)
    test_ds = assemble((te_cat_indices, te_cat_weights, te_cat_offsets, te_cat_fields), te_spline, te_target)

    trainer = FFMTrainer(embedding_dim, step_size, batch_size, num_epochs, callback)
    return trainer.train(num_fields, num_embeddings, train_ds, test_ds, torch.nn.BCEWithLogitsLoss(), device)

In [15]:
def train_spline_objective(trial: optuna.Trial):
    """Optuna objective: sample hyper-parameters and train one spline FFM.

    The callback handed to ``train_spline_ffm`` forwards every ``(epoch, loss)``
    pair it receives to ``trial.report`` and raises ``optuna.TrialPruned`` as soon
    as ``trial.should_prune()`` is true.

    Returns:
        The value returned by ``train_spline_ffm`` for the sampled configuration.
    """
    def report_and_maybe_prune(epoch: int, loss: float):
        trial.report(loss, epoch)
        if trial.should_prune():
            raise optuna.TrialPruned()

    # The suggest_* calls run in this fixed order (dict literals evaluate top to bottom).
    hyperparams = {
        'embedding_dim': trial.suggest_int('embedding_dim', 1, 10),
        'step_size': trial.suggest_float('step_size', 1e-2, 0.5, log=True),
        'batch_size': trial.suggest_int('batch_size', 2, 32),
        'num_knots': trial.suggest_int('num_knots', 3, 48),
        'num_epochs': trial.suggest_int('num_epochs', 5, 15),
    }
    return train_spline_ffm(callback=report_and_maybe_prune, **hyperparams)

In [16]:
# Hyper-parameter search for the spline model: 100 trials with a seeded TPE
# sampler, minimising the value returned by train_spline_objective.
study = optuna.create_study(study_name='splines',
                            direction='minimize',
                            sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(train_spline_objective, n_trials=100)

[32m[I 2023-05-16 19:04:42,736][0m A new study created in memory with name: splines[0m
[32m[I 2023-05-16 19:06:55,632][0m Trial 0 finished with value: 0.3308318555355072 and parameters: {'embedding_dim': 4, 'step_size': 0.4123206532618726, 'batch_size': 24, 'num_knots': 30, 'num_epochs': 6}. Best is trial 0 with value: 0.3308318555355072.[0m
[32m[I 2023-05-16 19:11:10,824][0m Trial 1 finished with value: 0.2987254559993744 and parameters: {'embedding_dim': 2, 'step_size': 0.012551115172973842, 'batch_size': 28, 'num_knots': 30, 'num_epochs': 12}. Best is trial 1 with value: 0.2987254559993744.[0m
[32m[I 2023-05-16 19:13:41,166][0m Trial 2 finished with value: 0.308167040348053 and parameters: {'embedding_dim': 1, 'step_size': 0.44447541666908114, 'batch_size': 27, 'num_knots': 12, 'num_epochs': 7}. Best is trial 1 with value: 0.2987254559993744.[0m
[32m[I 2023-05-16 19:18:26,030][0m Trial 3 finished with value: 0.2933942675590515 and parameters: {'embedding_dim': 2, 'step

In [17]:
# Report the best trial of the spline study.
# NOTE(review): the objective is given the held-out test split, so the value printed
# as "Test loss" appears to be the same quantity used to pick the hyper-parameters - confirm.
trial = study.best_trial

print('Test loss: {}'.format(trial.value))
print("Best hyperparameters: {}".format(trial.params))

Test loss: 0.2841481864452362
Best hyperparameters: {'embedding_dim': 10, 'step_size': 0.01628886658284197, 'batch_size': 7, 'num_knots': 41, 'num_epochs': 8}


In [18]:
# Best hyper-parameters found by the spline study.
study.best_params

{'embedding_dim': 10,
 'step_size': 0.01628886658284197,
 'batch_size': 7,
 'num_knots': 41,
 'num_epochs': 8}

In [19]:
# Retrain once with the best hyper-parameters (no pruning callback) and display the returned value.
train_spline_ffm(**study.best_params)

0.28711041808128357

In [28]:
# Repeat the training 20 times with the best hyper-parameters and collect the
# returned values. Appending inside the loop keeps partial results if the run is interrupted.
spline_losses = []
for i in trange(20):
    loss = train_spline_ffm(**study.best_params)
    spline_losses.append(loss)

100%|██████████| 20/20 [2:32:39<00:00, 457.99s/it]  


In [29]:
# Values collected from the 20 repeated spline runs.
spline_losses

[0.28441962599754333,
 0.28561073541641235,
 0.28526991605758667,
 0.2852311432361603,
 0.2871018052101135,
 0.28534361720085144,
 0.2867548167705536,
 0.2859247922897339,
 0.28690609335899353,
 0.28653082251548767,
 0.28614985942840576,
 0.2878064811229706,
 0.286538690328598,
 0.2865554690361023,
 0.28602224588394165,
 0.2861751914024353,
 0.2853567898273468,
 0.28668129444122314,
 0.2849292755126953,
 0.2857675850391388]

In [33]:
# Mean and standard deviation over the repeated spline runs
# (np.std uses ddof=0 by default, i.e. the population formula).
np.mean(spline_losses), np.std(spline_losses)

(0.2860538125038147, 0.0008072314680490749)

In [20]:
def train_bin_ffm(embedding_dim: int, step_size: float, batch_size: int,
                  num_bins: int, bin_strategy: str, num_epochs: int,
                  callback: Callable[[int, float], None]=None):
    """Train an FFM whose numerical features are encoded by binning.

    The raw numerical columns (``tr_num`` / ``te_num``) are discretized with a
    ``KBinsDiscretizer`` fitted on the training partition. Each column owns a
    block of ``num_bins`` embedding indices, placed after the categorical
    embeddings prepared at notebook level.

    Args:
        embedding_dim: forwarded to ``FFMTrainer``.
        step_size: forwarded to ``FFMTrainer``.
        batch_size: forwarded to ``FFMTrainer``.
        num_bins: number of bins requested per numerical column.
        bin_strategy: passed as ``strategy`` to ``KBinsDiscretizer``.
        num_epochs: forwarded to ``FFMTrainer``.
        callback: optional ``(epoch, loss)`` hook, forwarded to ``FFMTrainer``.

    Returns:
        Whatever ``FFMTrainer.train`` returns (the notebook treats it as a loss value).
    """
    # Take the field count from the frame that is actually discretized, so this
    # function does not depend on the quantile-transformed arrays it never uses.
    num_numerical_fields = tr_num.shape[1]
    num_numerical_embeddings = num_numerical_fields * num_bins
    # First embedding index of each numerical column's block of bins.
    numerical_offsets = np.arange(0, num_numerical_fields) * num_bins

    discretizer = KBinsDiscretizer(num_bins, encode='ordinal', strategy=bin_strategy, random_state=42)
    discretizer.fit(tr_num)

    def _encode_numerical(num_frame):
        """Return (indices, weights, offsets, fields) for one partition's numerical columns."""
        indices = discretizer.transform(num_frame)
        indices += np.tile(numerical_offsets, (num_frame.shape[0], 1))
        weights = np.ones_like(indices)
        fields = np.tile(np.arange(0, num_numerical_fields), (num_frame.shape[0], 1))
        return indices, weights, fields.copy(), fields

    def _build_dataset(cat_parts, num_parts, target):
        """Concatenate categorical and binned features column-wise into a TensorDataset."""
        cat_indices, cat_weights, cat_offs, cat_flds = cat_parts
        num_indices, num_weights, num_offs, num_flds = num_parts
        # Numerical indices/offsets/fields are shifted past the categorical ones.
        return TensorDataset(
            torch.tensor(np.concatenate([cat_indices, num_indices + num_cat_embeddings], axis=1), dtype=torch.int64),
            torch.tensor(np.concatenate([cat_weights, num_weights], axis=1), dtype=torch.float32),
            torch.tensor(np.concatenate([cat_offs, num_offs + num_cat_fields], axis=1), dtype=torch.int64),
            torch.tensor(np.concatenate([cat_flds, num_flds + num_cat_fields], axis=1), dtype=torch.int64),
            torch.tensor(target.values, dtype=torch.float32))

    num_fields = num_numerical_fields + num_cat_fields
    num_embeddings = num_numerical_embeddings + num_cat_embeddings

    train_ds = _build_dataset(
        (tr_cat_indices, tr_cat_weights, tr_cat_offsets, tr_cat_fields),
        _encode_numerical(tr_num),
        tr_target)
    test_ds = _build_dataset(
        (te_cat_indices, te_cat_weights, te_cat_offsets, te_cat_fields),
        _encode_numerical(te_num),
        te_target)

    trainer = FFMTrainer(embedding_dim, step_size, batch_size, num_epochs, callback)
    return trainer.train(num_fields, num_embeddings, train_ds, test_ds, torch.nn.BCEWithLogitsLoss(), device)

In [21]:
def test_bins_objective(trial: optuna.Trial):
    """Optuna objective: sample hyper-parameters and train one binned FFM.

    The callback handed to ``train_bin_ffm`` forwards every ``(epoch, loss)``
    pair it receives to ``trial.report`` and raises ``optuna.TrialPruned`` as soon
    as ``trial.should_prune()`` is true.

    Returns:
        The value returned by ``train_bin_ffm`` for the sampled configuration.
    """
    def report_and_maybe_prune(epoch: int, loss: float):
        trial.report(loss, epoch)
        if trial.should_prune():
            raise optuna.TrialPruned()

    # The suggest_* calls run in this fixed order (dict literals evaluate top to bottom).
    hyperparams = {
        'embedding_dim': trial.suggest_int('embedding_dim', 1, 10),
        'step_size': trial.suggest_float('step_size', 1e-2, 0.5, log=True),
        'batch_size': trial.suggest_int('batch_size', 2, 32),
        'num_bins': trial.suggest_int('num_bins', 2, 100),
        'bin_strategy': trial.suggest_categorical('bin_strategy', ['uniform', 'quantile']),
        'num_epochs': trial.suggest_int('num_epochs', 5, 15),
    }
    return train_bin_ffm(callback=report_and_maybe_prune, **hyperparams)


In [22]:
# Hyper-parameter search for the binned model: 100 trials with a seeded TPE
# sampler, minimising the value returned by test_bins_objective.
study_bins = optuna.create_study(study_name='bins',
                                 direction='minimize',
                                 sampler=optuna.samplers.TPESampler(seed=42))
study_bins.optimize(test_bins_objective, n_trials=100)

[32m[I 2023-05-17 02:30:20,329][0m A new study created in memory with name: bins[0m
[32m[I 2023-05-17 02:31:48,083][0m Trial 0 finished with value: 0.36048880219459534 and parameters: {'embedding_dim': 4, 'step_size': 0.4123206532618726, 'batch_size': 24, 'num_bins': 61, 'bin_strategy': 'uniform', 'num_epochs': 5}. Best is trial 0 with value: 0.36048880219459534.[0m
[32m[I 2023-05-17 02:33:58,528][0m Trial 1 finished with value: 0.35097000002861023 and parameters: {'embedding_dim': 9, 'step_size': 0.10502105436744279, 'batch_size': 23, 'num_bins': 4, 'bin_strategy': 'uniform', 'num_epochs': 7}. Best is trial 1 with value: 0.35097000002861023.[0m
[32m[I 2023-05-17 02:40:40,921][0m Trial 2 finished with value: 0.3105776607990265 and parameters: {'embedding_dim': 2, 'step_size': 0.020492680115417352, 'batch_size': 11, 'num_bins': 53, 'bin_strategy': 'uniform', 'num_epochs': 11}. Best is trial 2 with value: 0.3105776607990265.[0m
[32m[I 2023-05-17 02:45:58,507][0m Trial 3 fin

In [23]:
# Best hyper-parameters found by the bins study.
study_bins.best_params

{'embedding_dim': 8,
 'step_size': 0.013772781946733741,
 'batch_size': 25,
 'num_bins': 58,
 'bin_strategy': 'uniform',
 'num_epochs': 11}

In [24]:
# Report the best trial of the bins study (this rebinds the notebook-level name `trial`).
# NOTE(review): the objective is given the held-out test split, so the value printed
# as "Test loss" appears to be the same quantity used to pick the hyper-parameters - confirm.
trial = study_bins.best_trial

print('Test loss: {}'.format(trial.value))
print("Best hyperparameters: {}".format(trial.params))

Test loss: 0.29582858085632324
Best hyperparameters: {'embedding_dim': 8, 'step_size': 0.013772781946733741, 'batch_size': 25, 'num_bins': 58, 'bin_strategy': 'uniform', 'num_epochs': 11}


In [25]:
# Retrain once with the best hyper-parameters (no pruning callback) and display the returned value.
train_bin_ffm(**study_bins.best_params)

0.2982105612754822

In [30]:
# Repeat the training 20 times with the best hyper-parameters and collect the
# returned values. Appending inside the loop keeps partial results if the run is interrupted.
bin_losses = []
for i in trange(20):
    loss = train_bin_ffm(**study_bins.best_params)
    bin_losses.append(loss)

100%|██████████| 20/20 [1:02:58<00:00, 188.92s/it]


In [31]:
# Values collected from the 20 repeated binned runs.
bin_losses

[0.2994536757469177,
 0.3009205758571625,
 0.296610563993454,
 0.2973214089870453,
 0.2992994785308838,
 0.2983883023262024,
 0.29931512475013733,
 0.2996693253517151,
 0.2996058762073517,
 0.2982383966445923,
 0.30039355158805847,
 0.3006632924079895,
 0.29591503739356995,
 0.2998608648777008,
 0.2998100817203522,
 0.2978292405605316,
 0.29889991879463196,
 0.2984970808029175,
 0.29809093475341797,
 0.3017430901527405]

In [32]:
# Mean and standard deviation over the repeated binned runs
# (np.std uses ddof=0 by default, i.e. the population formula).
np.mean(bin_losses), np.std(bin_losses)

(0.2990262910723686, 0.001413742829358982)