In [1]:
import sys

# Put the sibling `../utils` directory first on the import path so the
# project-local modules imported below (weighted_fm, trainers, transformation)
# can be resolved. The path is relative, so it depends on the kernel's
# working directory.
sys.path.insert(0, "../utils")

In [24]:
import sklearn.datasets as skds
from sklearn.preprocessing import QuantileTransformer, KBinsDiscretizer, StandardScaler
from sklearn.model_selection import train_test_split
from weighted_fm import WeightedFFM, WeightedFM
from trainers import FFMTrainer
from transformation import BSplineTransformer, spline_transform_dataset
import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader
import math
import optuna
import optuna.samplers
from typing import Callable
from tqdm import tqdm, trange

In [3]:
# Train on the first CUDA device when one is visible, otherwise on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [4]:
# Seed both random number generators used in this notebook (PyTorch and
# NumPy's legacy global generator) with the same fixed value.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

In [5]:
import tarfile
import joblib

In [6]:
# Read the raw California-housing table out of the local archive, reorder its
# columns and write the result as a compressed joblib file in `../data`.
# NOTE(review): presumably this pre-populates the cache file that
# `fetch_california_housing(data_home="../data")` looks for, so no download is
# needed — confirm against the scikit-learn version in use.
with tarfile.open(name="../data/cal_housing.tgz", mode="r:gz") as f:
    raw_table = f.extractfile("CaliforniaHousing/cal_housing.data")
    # Columns are not in the same order compared to the previous
    # URL resource on lib.stat.cmu.edu
    columns_index = [8, 7, 2, 3, 4, 5, 6, 1, 0]
    cal_housing = np.loadtxt(raw_table, delimiter=",")[:, columns_index]

    # Kept inside the `with` block so the cell does not echo the return value.
    joblib.dump(cal_housing, "../data/cal_housing_py3.pkz", compress=6)

In [7]:
# Load the California housing dataset through scikit-learn, using `../data`
# as its data directory. `ds['data']` and `ds['target']` are used below.
ds = skds.fetch_california_housing(data_home="../data")

In [8]:
# Hold out 20% of the rows as the test split; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(ds['data'], ds['target'], test_size=0.2, random_state=42)

In [9]:
# Standardise the regression target. The scaler is fitted on the training
# targets only and then applied to both splits; StandardScaler wants a 2-D
# column, so the targets are reshaped on the way in and flattened on the way out.
# NOTE(review): y_train / y_test are overwritten, so re-running this cell
# without re-running the split rescales already-scaled targets.
target_scaler = StandardScaler().fit(y_train.reshape(-1, 1))
y_train = target_scaler.transform(y_train.reshape(-1, 1)).ravel()
y_test = target_scaler.transform(y_test.reshape(-1, 1)).ravel()

In [10]:
# Quantile-transform every feature to a uniform output distribution. The
# transformer is fitted on the training split only (using all of its rows as
# the subsample) and then applied to both splits.
quant_transform = QuantileTransformer(
    n_quantiles=10000,
    output_distribution='uniform',
    subsample=len(X_train),
    random_state=42,
)
quant_transform.fit(X_train)
X_train_qs = quant_transform.transform(X_train)
X_test_qs = quant_transform.transform(X_test)

In [28]:
def _ffm_tensor_dataset(indices, weights, offsets, fields, targets) -> TensorDataset:
    """Pack the five per-sample arrays into a TensorDataset.

    Indices, offsets and fields are stored as int64 tensors; weights and
    targets as float32 tensors. The tensor order (indices, weights, offsets,
    fields, targets) is the one handed to ``FFMTrainer.train``.
    """
    return TensorDataset(
        torch.tensor(indices, dtype=torch.int64),
        torch.tensor(weights, dtype=torch.float32),
        torch.tensor(offsets, dtype=torch.int64),
        torch.tensor(fields, dtype=torch.int64),
        torch.tensor(targets, dtype=torch.float32))


def train_spline_ffm(embedding_dim: int, step_size: float, batch_size: int, num_knots: int, num_epochs: int,
                     callback: Callable[[int, float], None]=None):
    """Train an FFM on B-spline encoded features and return the trainer's result.

    Reads the notebook globals ``X_train_qs``, ``X_test_qs``, ``y_train``,
    ``y_test`` and ``device``.

    Args:
        embedding_dim, step_size, batch_size, num_epochs: forwarded unchanged
            to ``FFMTrainer``.
        num_knots: first argument of ``BSplineTransformer``.
        callback: optional ``(epoch, loss)`` hook forwarded to ``FFMTrainer``.

    Returns:
        Whatever ``FFMTrainer.train`` returns; callers in this notebook treat
        it as an MSE and take its square root.
    """
    # NOTE(review): the literal 3 is presumably the spline degree (cubic) —
    # confirm against transformation.BSplineTransformer.
    bs = BSplineTransformer(num_knots, 3)
    train_parts = spline_transform_dataset(X_train_qs, bs)
    test_parts = spline_transform_dataset(X_test_qs, bs)

    # Take the field count from the matrix that is actually encoded (as
    # train_bin_ffm does with X_train) instead of the unrelated global `ds`.
    num_fields = X_train_qs.shape[1]
    num_embeddings = bs.basis_size() * num_fields

    # Each *_parts tuple is (indices, weights, offsets, fields).
    train_ds = _ffm_tensor_dataset(*train_parts, y_train)
    test_ds = _ffm_tensor_dataset(*test_parts, y_test)

    criterion = torch.nn.MSELoss()
    trainer = FFMTrainer(embedding_dim, step_size, batch_size, num_epochs, callback)
    return trainer.train(num_fields, num_embeddings, train_ds, test_ds, criterion, device)

In [12]:
def train_spline_objective(trial: optuna.Trial):
    """Optuna objective: sample hyper-parameters, train the spline FFM, return RMSE.

    The per-epoch callback reports the square root of the loss it receives
    and raises ``optuna.TrialPruned`` when the trial asks to be pruned.

    NOTE(review): ``train_spline_ffm`` hands the trainer a dataset built from
    ``X_test_qs`` / ``y_test``; if the returned loss is measured on that
    dataset, hyper-parameters are being tuned on the test split — confirm.
    """
    # The dict literal is evaluated top to bottom, so the suggestions are
    # drawn in this fixed order.
    hyperparams = {
        'embedding_dim': trial.suggest_int('embedding_dim', 1, 10),
        'step_size': trial.suggest_float('step_size', 1e-2, 0.5, log=True),
        'batch_size': trial.suggest_int('batch_size', 2, 32),
        'num_knots': trial.suggest_int('num_knots', 3, 48),
        'num_epochs': trial.suggest_int('num_epochs', 5, 15),
    }

    def callback(epoch: int, loss: float):
        trial.report(math.sqrt(loss), epoch)
        if trial.should_prune():
            raise optuna.TrialPruned()

    final_mse = train_spline_ffm(callback=callback, **hyperparams)
    return math.sqrt(final_mse)

In [13]:
# Hyper-parameter search for the spline model: an in-memory study minimising
# the objective with a seeded TPE sampler, run for 100 trials.
spline_sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(
    study_name='splines',
    sampler=spline_sampler,
    direction='minimize',
)
study.optimize(train_spline_objective, n_trials=100)

[32m[I 2023-05-16 19:06:04,290][0m A new study created in memory with name: splines[0m
[32m[I 2023-05-16 19:07:06,616][0m Trial 0 finished with value: 0.4704895326349986 and parameters: {'embedding_dim': 4, 'step_size': 0.4123206532618726, 'batch_size': 24, 'num_knots': 30, 'num_epochs': 6}. Best is trial 0 with value: 0.4704895326349986.[0m
[32m[I 2023-05-16 19:08:50,430][0m Trial 1 finished with value: 0.5324508054601986 and parameters: {'embedding_dim': 2, 'step_size': 0.012551115172973842, 'batch_size': 28, 'num_knots': 30, 'num_epochs': 12}. Best is trial 0 with value: 0.4704895326349986.[0m
[32m[I 2023-05-16 19:09:57,194][0m Trial 2 finished with value: 0.45800390831558263 and parameters: {'embedding_dim': 1, 'step_size': 0.44447541666908114, 'batch_size': 27, 'num_knots': 12, 'num_epochs': 7}. Best is trial 2 with value: 0.45800390831558263.[0m
[32m[I 2023-05-16 19:11:51,841][0m Trial 3 finished with value: 0.46901124985568104 and parameters: {'embedding_dim': 2, '

In [14]:
# Show the best spline trial. `trial.value` is what train_spline_objective
# returned, i.e. the square root of the loss from train_spline_ffm.
trial = study.best_trial

print('Test loss: {}'.format(trial.value),
      "Best hyperparameters: {}".format(trial.params),
      sep='\n')

Test loss: 0.42568818221361704
Best hyperparameters: {'embedding_dim': 7, 'step_size': 0.052749809167358816, 'batch_size': 14, 'num_knots': 21, 'num_epochs': 12}


In [15]:
# Display the best spline hyper-parameters; the same dict is unpacked into
# train_spline_ffm in the repeated-runs cell.
study.best_params

{'embedding_dim': 7,
 'step_size': 0.052749809167358816,
 'batch_size': 14,
 'num_knots': 21,
 'num_epochs': 12}

In [29]:
# Retrain the spline model 20 times with the best hyper-parameters and record
# the RMSE of every run. No callback is passed, so nothing is pruned.
spline_best_params = study.best_params
spline_losses = []
for i in trange(20):
    mse = train_spline_ffm(**spline_best_params)
    spline_losses += [math.sqrt(mse)]

100%|██████████| 20/20 [1:01:29<00:00, 184.47s/it]


In [34]:
# Per-run RMSE values collected from the 20 spline training runs.
spline_losses

[0.42679040640311267,
 0.42634108249853464,
 0.4261458174783907,
 0.43225683700092105,
 0.431104387357407,
 0.4288870899249733,
 0.4277180881970324,
 0.4311970464012093,
 0.4338799640302308,
 0.4241321194256667,
 0.42784070300256505,
 0.43046144662743624,
 0.4316383465322712,
 0.42870344707227903,
 0.4286306736221006,
 0.4301517752661648,
 0.4326343059382102,
 0.428075541124546,
 0.42944209286761614,
 0.4320622969331268]

In [39]:
# Summary of the spline runs: (mean, 3*std, mean + 3*std, mean - 3*std).
# np.std is used with its default arguments.
spline_mean = np.mean(spline_losses)
spline_3sigma = 3 * np.std(spline_losses)
spline_mean, spline_3sigma, spline_mean + spline_3sigma, spline_mean - spline_3sigma

(0.42940467338518973,
 0.007393634011334159,
 0.43679830739652387,
 0.4220110393738556)

In [30]:
def train_bin_ffm(embedding_dim: int, step_size: float, batch_size: int,
                  num_bins: int, bin_strategy: str, num_epochs: int,
                  callback: Callable[[int, float], None]=None):
    """Train an FFM on bin-encoded features and return the trainer's result.

    Every column of the notebook globals ``X_train`` / ``X_test`` is
    discretised into ``num_bins`` ordinal bins with a ``KBinsDiscretizer``
    fitted on ``X_train``; column ``j`` then has ``j * num_bins`` added to its
    bin ids, and every weight is 1. Targets come from the globals ``y_train``
    / ``y_test``.

    NOTE(review): the third tensor of each dataset is the slot that holds
    offsets in ``train_spline_ffm``; here the field ids are passed in both the
    third and fourth slot. Presumably that is valid because each field
    contributes exactly one index — confirm against ``FFMTrainer``.

    Returns:
        Whatever ``FFMTrainer.train`` returns; callers in this notebook treat
        it as an MSE and take its square root.
    """
    num_fields = X_train.shape[1]
    field_ids = np.arange(0, num_fields)
    offsets = field_ids * num_bins

    discretizer = KBinsDiscretizer(num_bins, encode='ordinal', strategy=bin_strategy, random_state=42)
    discretizer.fit(X_train)

    def encode(features, targets):
        """Bin-encode one split and wrap it, with its targets, in a TensorDataset."""
        indices = discretizer.transform(features)
        indices += offsets  # broadcast the per-column shift over all rows
        weights = np.ones_like(indices)
        fields = np.tile(field_ids, (features.shape[0], 1))
        return TensorDataset(
            torch.tensor(indices, dtype=torch.int64),
            torch.tensor(weights, dtype=torch.float32),
            torch.tensor(fields, dtype=torch.int64),
            torch.tensor(fields, dtype=torch.int64),
            torch.tensor(targets, dtype=torch.float32))

    train_ds = encode(X_train, y_train)
    test_ds = encode(X_test, y_test)
    num_embeddings = num_fields * num_bins

    trainer = FFMTrainer(embedding_dim, step_size, batch_size, num_epochs, callback)
    return trainer.train(num_fields, num_embeddings, train_ds, test_ds, torch.nn.MSELoss(), device)

In [18]:
def test_bins_objective(trial: optuna.Trial):
    """Optuna objective: sample hyper-parameters, train the binned FFM, return RMSE.

    The per-epoch callback reports the square root of the MSE it receives and
    raises ``optuna.TrialPruned`` when the trial asks to be pruned.

    NOTE(review): ``train_bin_ffm`` hands the trainer a dataset built from
    ``X_test`` / ``y_test``; if the returned loss is measured on that dataset,
    hyper-parameters are being tuned on the test split — confirm.
    """
    # The dict literal is evaluated top to bottom, so the suggestions are
    # drawn in this fixed order.
    hyperparams = {
        'embedding_dim': trial.suggest_int('embedding_dim', 1, 10),
        'step_size': trial.suggest_float('step_size', 1e-2, 0.5, log=True),
        'batch_size': trial.suggest_int('batch_size', 2, 32),
        'num_bins': trial.suggest_int('num_bins', 2, 100),
        'bin_strategy': trial.suggest_categorical('bin_strategy', ['uniform', 'quantile']),
        'num_epochs': trial.suggest_int('num_epochs', 5, 15),
    }

    def callback(epoch: int, mse: float):
        trial.report(math.sqrt(mse), epoch)
        if trial.should_prune():
            raise optuna.TrialPruned()

    final_mse = train_bin_ffm(callback=callback, **hyperparams)
    return math.sqrt(final_mse)

In [19]:
# Hyper-parameter search for the binned model: same setup as the spline study
# (seeded TPE sampler, minimise, 100 trials).
bins_sampler = optuna.samplers.TPESampler(seed=42)
study_bins = optuna.create_study(
    study_name='bins',
    sampler=bins_sampler,
    direction='minimize',
)
study_bins.optimize(test_bins_objective, n_trials=100)

[32m[I 2023-05-16 23:36:36,044][0m A new study created in memory with name: bins[0m
[32m[I 2023-05-16 23:37:23,402][0m Trial 0 finished with value: 0.5280771877157359 and parameters: {'embedding_dim': 4, 'step_size': 0.4123206532618726, 'batch_size': 24, 'num_bins': 61, 'bin_strategy': 'uniform', 'num_epochs': 5}. Best is trial 0 with value: 0.5280771877157359.[0m
[32m[I 2023-05-16 23:38:21,498][0m Trial 1 finished with value: 0.7002097411644254 and parameters: {'embedding_dim': 9, 'step_size': 0.10502105436744279, 'batch_size': 23, 'num_bins': 4, 'bin_strategy': 'uniform', 'num_epochs': 7}. Best is trial 0 with value: 0.5280771877157359.[0m
[32m[I 2023-05-16 23:42:02,363][0m Trial 2 finished with value: 0.5551966895575728 and parameters: {'embedding_dim': 2, 'step_size': 0.020492680115417352, 'batch_size': 11, 'num_bins': 53, 'bin_strategy': 'uniform', 'num_epochs': 11}. Best is trial 0 with value: 0.5280771877157359.[0m
[32m[I 2023-05-16 23:44:53,104][0m Trial 3 finishe

In [20]:
# Display the best binning hyper-parameters; the same dict is unpacked into
# train_bin_ffm in the cells below.
study_bins.best_params

{'embedding_dim': 10,
 'step_size': 0.04746441356701626,
 'batch_size': 30,
 'num_bins': 13,
 'bin_strategy': 'quantile',
 'num_epochs': 6}

In [21]:
# Show the best binning trial. `trial.value` is what test_bins_objective
# returned, i.e. the square root of the loss from train_bin_ffm.
# Note: this rebinds the global `trial` used for the spline summary earlier.
trial = study_bins.best_trial

print('Test loss: {}'.format(trial.value),
      "Best hyperparameters: {}".format(trial.params),
      sep='\n')

Test loss: 0.4725330759763331
Best hyperparameters: {'embedding_dim': 10, 'step_size': 0.04746441356701626, 'batch_size': 30, 'num_bins': 13, 'bin_strategy': 'quantile', 'num_epochs': 6}


In [22]:
# One training run with the best binning hyper-parameters. The displayed value
# is the raw return of train_bin_ffm; unlike the losses collected below, no
# square root is taken here, so it is not directly comparable to the RMSE
# figures.
train_bin_ffm(**study_bins.best_params)

0.22700029611587524

In [32]:
# Retrain the binned model 20 times with the best hyper-parameters and record
# the RMSE of every run. No callback is passed, so nothing is pruned.
bins_best_params = study_bins.best_params
bin_losses = []
for i in trange(20):
    mse = train_bin_ffm(**bins_best_params)
    bin_losses += [math.sqrt(mse)]

100%|██████████| 20/20 [15:42<00:00, 47.11s/it]


In [33]:
# Per-run RMSE values collected from the 20 binned training runs.
bin_losses

[0.47253831069784313,
 0.47436226585193625,
 0.4746327178614912,
 0.47469055988463177,
 0.4737954243998599,
 0.4738253487224963,
 0.47137653094042825,
 0.47257028538839857,
 0.47029610602108984,
 0.47444127877551856,
 0.4718063374009628,
 0.4723477105300527,
 0.47160880620469847,
 0.477920785313591,
 0.4717265988354307,
 0.47360833488214976,
 0.4717850814181646,
 0.4717952515621083,
 0.4710633573328178,
 0.47449408806774845]

In [40]:
# Summary of the binned runs: (mean, 3*std, mean + 3*std, mean - 3*std).
# np.std is used with its default arguments.
bin_mean = np.mean(bin_losses)
bin_3sigma = 3 * np.std(bin_losses)
bin_mean, bin_3sigma, bin_mean + bin_3sigma, bin_mean - bin_3sigma

(0.47303425900457097,
 0.0051958734734685146,
 0.47823013247803947,
 0.46783838553110246)