In [1]:
import sys

# Put the `../utils` directory at the front of the module search path so that
# modules stored there can be imported by the cells below.
# NOTE(review): presumably this is where `transformation` and `trainers` live — confirm.
# The entry is a relative path, so it depends on the kernel's working directory.
sys.path.insert(0, "../utils")

In [28]:
import sklearn.datasets as skds
from sklearn.preprocessing import QuantileTransformer, KBinsDiscretizer, OrdinalEncoder, LabelEncoder, StandardScaler
import numpy as np
import pandas as pd
from transformation import BSplineTransformer, spline_transform_dataset
from trainers import FFMTrainer, FMTrainer
import math
import optuna
import optuna.samplers
from typing import Callable
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import TensorDataset
from tqdm import trange

In [3]:
# Use the first CUDA device when PyTorch reports one as available; otherwise
# fall back to the CPU. The chosen device is printed for the record.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [4]:
# Seed PyTorch's and NumPy's global random generators with one shared value.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

In [5]:
# Column layout: the target `Year` first, then TA01..TA12 and TC01..TC78 (90 features).
column_names = (
    ['Year']
    + [f'TA{i:02d}' for i in range(1, 13)]
    + [f'TC{i:02d}' for i in range(1, 79)]
)
# Dtypes keyed by column position: column 0 is an integer, columns 1..90 are floats.
column_dtypes = {0: int, **{position: float for position in range(1, len(column_names))}}

# `skiprows=1` drops the first line of the file; "?" cells are read as missing values.
# NOTE(review): an older TODO on this call said only 3000 lines get loaded, but the
# recorded `raw_df.shape` output in this notebook shows 463715 rows — confirm and
# retire that TODO.
raw_df = pd.read_csv("../data/Ye_millionsongdataset/Training_set_songs.csv",
                     names=column_names,
                     dtype=column_dtypes,
                     na_values="?",
                     skiprows=1)

In [6]:
# Eyeball six randomly drawn rows (no `random_state` is passed to `sample`).
raw_df.sample(6)

Unnamed: 0,Year,TA01,TA02,TA03,TA04,TA05,TA06,TA07,TA08,TA09,...,TC69,TC70,TC71,TC72,TC73,TC74,TC75,TC76,TC77,TC78
332595,2004,51.19589,18.54077,12.552,3.25,-7.79542,-19.63461,0.27662,-1.03994,-1.57231,...,-16.43458,-37.05054,-100.93186,49.26529,-1.47467,-39.25996,-22.44389,-16.37939,-49.67664,3.87491
230573,1989,51.44573,53.42621,35.83483,-16.72867,-48.13185,-13.04248,-45.74081,-6.18791,16.601,...,9.9422,-71.8228,59.4025,45.14201,-2.72343,35.62292,37.99939,7.04862,21.08678,2.07779
364530,1987,44.85215,33.51052,27.44631,24.99783,-15.02508,12.29223,10.57365,7.06412,-1.29649,...,5.05131,56.78609,136.98499,-30.38374,24.51034,62.77834,44.80304,30.34866,105.17991,2.58183
82857,2002,50.25653,59.83236,37.8021,-0.57762,-1.64064,-11.77884,-2.25574,-4.17007,4.05922,...,-4.12555,21.3462,29.72172,56.71419,2.6159,56.47152,-26.05716,-0.77059,29.40943,-0.02311
108108,1971,45.48775,-5.4979,9.56187,9.36977,-11.15726,-3.63341,11.00297,-6.36722,5.37455,...,-29.83405,228.02525,53.6819,-47.38609,14.62809,124.90797,-26.61476,5.08838,295.42035,19.74883
446568,2005,49.59492,35.4011,-8.11273,-13.40502,3.18931,-14.05923,3.04436,-1.43375,3.60354,...,-0.40689,-91.80271,-29.51019,-6.4071,6.99983,81.51023,-83.44656,2.81049,24.80947,-10.32877


In [7]:
# (rows, columns) of the loaded frame.
raw_df.shape

(463715, 91)

In [8]:
# Column labels as assigned through `names=` when the CSV was read.
raw_df.columns

Index(['Year', 'TA01', 'TA02', 'TA03', 'TA04', 'TA05', 'TA06', 'TA07', 'TA08',
       'TA09', 'TA10', 'TA11', 'TA12', 'TC01', 'TC02', 'TC03', 'TC04', 'TC05',
       'TC06', 'TC07', 'TC08', 'TC09', 'TC10', 'TC11', 'TC12', 'TC13', 'TC14',
       'TC15', 'TC16', 'TC17', 'TC18', 'TC19', 'TC20', 'TC21', 'TC22', 'TC23',
       'TC24', 'TC25', 'TC26', 'TC27', 'TC28', 'TC29', 'TC30', 'TC31', 'TC32',
       'TC33', 'TC34', 'TC35', 'TC36', 'TC37', 'TC38', 'TC39', 'TC40', 'TC41',
       'TC42', 'TC43', 'TC44', 'TC45', 'TC46', 'TC47', 'TC48', 'TC49', 'TC50',
       'TC51', 'TC52', 'TC53', 'TC54', 'TC55', 'TC56', 'TC57', 'TC58', 'TC59',
       'TC60', 'TC61', 'TC62', 'TC63', 'TC64', 'TC65', 'TC66', 'TC67', 'TC68',
       'TC69', 'TC70', 'TC71', 'TC72', 'TC73', 'TC74', 'TC75', 'TC76', 'TC77',
       'TC78'],
      dtype='object')

In [9]:
# Hold out 20% of the rows as the test split; the fixed `random_state` keeps the
# split repeatable across runs.
train, test = train_test_split(raw_df, test_size=0.2, random_state=42)

In [10]:
# Separate the regression target (`Year`) from the remaining feature columns for
# both splits; the targets are kept as plain NumPy arrays.
tr_feats, tr_target = train.drop(columns="Year"), train["Year"].values
te_feats, te_target = test.drop(columns="Year"), test["Year"].values

In [11]:
# Standardise the target: the scaler is fitted on the training targets only and
# the same fitted scaler is then applied to the test targets. The scaler is fed
# a single-column 2-D array, and its output is flattened back to 1-D.
target_scaler = StandardScaler()
tr_target = target_scaler.fit_transform(tr_target[:, np.newaxis]).ravel()
te_target = target_scaler.transform(te_target[:, np.newaxis]).ravel()

In [12]:
# Map every feature through a quantile transform with a uniform output
# distribution. The transformer is fitted on the training features only and the
# fitted transformer is reused for the test features.
# `subsample=len(tr_feats)` sets the subsampling cap to the full training-set size.
quant_transform = QuantileTransformer(output_distribution='uniform',
                                      n_quantiles=10000,
                                      subsample=len(tr_feats),
                                      random_state=42)
X_train_qs = quant_transform.fit_transform(tr_feats)
X_test_qs = quant_transform.transform(te_feats)

In [13]:
def train_spline_fm(embedding_dim: int, step_size: float, batch_size: int, num_knots: int, num_epochs: int,
                     callback: Callable[[int, float], None]=None):
    """Train an `FMTrainer` model on spline-encoded features and return its result.

    Both quantile-transformed feature matrices (`X_train_qs`, `X_test_qs`) are
    encoded through `spline_transform_dataset` with one shared
    `BSplineTransformer`, wrapped in `TensorDataset`s together with the scaled
    targets, and handed to `FMTrainer.train` with an MSE criterion.

    Reads the notebook-level names `X_train_qs`, `X_test_qs`, `tr_target`,
    `te_target` and `device`.

    Args:
        embedding_dim: forwarded unchanged to `FMTrainer`.
        step_size: forwarded unchanged to `FMTrainer`.
        batch_size: forwarded unchanged to `FMTrainer`.
        num_knots: first argument of `BSplineTransformer`.
        num_epochs: forwarded unchanged to `FMTrainer`.
        callback: optional `(epoch, loss)` hook forwarded to `FMTrainer`.
            NOTE(review): presumably invoked once per epoch — confirm in `trainers`.

    Returns:
        Whatever `FMTrainer.train` returns. Callers in this notebook take its
        square root, so it is presumably the MSE on the test dataset — confirm.
    """
    # NOTE(review): the literal 3 is presumably the spline degree — confirm
    # against `BSplineTransformer`.
    basis = BSplineTransformer(num_knots, 3)
    train_idx, train_w, train_off, train_fld = spline_transform_dataset(X_train_qs, basis)
    test_idx, test_w, test_off, test_fld = spline_transform_dataset(X_test_qs, basis)

    # The embedding table must cover the largest index seen in either split.
    num_embeddings = int(max(np.max(train_idx), np.max(test_idx)) + 1)
    num_fields = X_train_qs.shape[1]

    def to_dataset(idx, w, off, fld, target):
        # Integer-valued arrays become int64 tensors, real-valued ones float32.
        return TensorDataset(
            torch.tensor(idx, dtype=torch.int64),
            torch.tensor(w, dtype=torch.float32),
            torch.tensor(off, dtype=torch.int64),
            torch.tensor(fld, dtype=torch.int64),
            torch.tensor(target, dtype=torch.float32))

    train_ds = to_dataset(train_idx, train_w, train_off, train_fld, tr_target)
    test_ds = to_dataset(test_idx, test_w, test_off, test_fld, te_target)

    fm_trainer = FMTrainer(embedding_dim, step_size, batch_size, num_epochs, callback)
    criterion = torch.nn.MSELoss()
    return fm_trainer.train(num_fields, num_embeddings, train_ds, test_ds, criterion, device)

In [14]:
def train_spline_objective(trial: optuna.Trial):
    """Optuna objective for the spline model.

    Draws one hyper-parameter configuration from `trial`, trains with
    `train_spline_fm`, and returns the square root of the value it returns.
    During training the per-epoch callback reports the square root of the
    epoch loss to the trial and raises `optuna.TrialPruned` when the trial
    says it should be pruned.
    """
    # Suggestions are drawn in this fixed order; the dict keys match the
    # keyword names of `train_spline_fm`.
    hyperparams = {
        'embedding_dim': trial.suggest_int('embedding_dim', 1, 10),
        'step_size': trial.suggest_float('step_size', 1e-2, 0.5, log=True),
        'batch_size': trial.suggest_int('batch_size', 32, 256),
        'num_knots': trial.suggest_int('num_knots', 3, 48),
        'num_epochs': trial.suggest_int('num_epochs', 5, 15),
    }

    def report_and_maybe_prune(epoch: int, loss: float):
        trial.report(math.sqrt(loss), epoch)
        if trial.should_prune():
            raise optuna.TrialPruned()

    final_loss = train_spline_fm(**hyperparams, callback=report_and_maybe_prune)
    return math.sqrt(final_loss)

In [15]:
# Minimise the spline objective over 100 trials using a seeded TPE sampler.
# NOTE(review): the objective's score appears to come from the held-out `test_ds`
# handed to the trainer, so hyper-parameters are selected on the same split that
# is later printed as "Test loss" — consider a separate validation split.
study = optuna.create_study(study_name='splines',
                            direction='minimize',
                            sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(train_spline_objective, n_trials=100)

[32m[I 2023-05-16 18:53:23,460][0m A new study created in memory with name: splines[0m
[32m[I 2023-05-16 18:56:43,426][0m Trial 0 finished with value: 1.2386136255730438 and parameters: {'embedding_dim': 4, 'step_size': 0.4123206532618726, 'batch_size': 196, 'num_knots': 30, 'num_epochs': 6}. Best is trial 0 with value: 1.2386136255730438.[0m
[32m[I 2023-05-16 19:03:07,293][0m Trial 1 finished with value: 9.578139736045964 and parameters: {'embedding_dim': 2, 'step_size': 0.012551115172973842, 'batch_size': 226, 'num_knots': 30, 'num_epochs': 12}. Best is trial 0 with value: 1.2386136255730438.[0m
[32m[I 2023-05-16 19:06:49,945][0m Trial 2 finished with value: 1.4629122193885566 and parameters: {'embedding_dim': 1, 'step_size': 0.44447541666908114, 'batch_size': 219, 'num_knots': 12, 'num_epochs': 7}. Best is trial 0 with value: 1.2386136255730438.[0m
[32m[I 2023-05-16 19:13:15,304][0m Trial 3 finished with value: 5.51720045029323 and parameters: {'embedding_dim': 2, 'ste

In [16]:
# Summarise the best trial of the spline study. `trial.value` is what the
# objective returned, i.e. the square root of the value returned by
# `train_spline_fm`, computed on the standardised target.
trial = study.best_trial

print('Test loss: {}'.format(trial.value))
print("Best hyperparameters: {}".format(trial.params))

Test loss: 0.8794329950964015
Best hyperparameters: {'embedding_dim': 10, 'step_size': 0.24654623010907453, 'batch_size': 68, 'num_knots': 3, 'num_epochs': 12}


In [17]:
# Best hyper-parameters as a dict; later cells unpack it into `train_spline_fm`.
study.best_params

{'embedding_dim': 10,
 'step_size': 0.24654623010907453,
 'batch_size': 68,
 'num_knots': 3,
 'num_epochs': 12}

In [18]:
# Retrain once with the best hyper-parameters. The value displayed is the raw
# return of `train_spline_fm`; unlike the objective and the repeated-run loop,
# no square root is applied here, so it is not on the same scale as those numbers.
train_spline_fm(**study.best_params)

0.7787345051765442

In [29]:
# Repeat the best configuration 20 times and record the square root of each
# returned loss. Results are appended one by one, so an interrupted run keeps
# the values gathered so far.
spline_losses = []
for _ in trange(20):
    spline_losses.append(math.sqrt(train_spline_fm(**study.best_params)))

100%|██████████| 20/20 [2:39:59<00:00, 479.95s/it]  


In [30]:
# The 20 per-run values collected by the repeated-run loop.
spline_losses

[0.881695640108332,
 0.881986518583829,
 0.8812350129821337,
 0.8835068731661306,
 0.8756028550892196,
 0.8795998443241684,
 0.8812374141146075,
 0.8793456949910461,
 0.879686949816137,
 0.8792196435620432,
 0.8800452079323017,
 0.8800887905106133,
 0.8787611778058435,
 0.8798512104892138,
 0.8808272328401603,
 0.8801772358534317,
 0.8815416286395354,
 0.8799189857558083,
 0.8787488669310817,
 0.8843083494702224]

In [33]:
# Mean, standard deviation (NumPy's default, ddof=0) and the standard deviation
# expressed as a percentage of the mean.
np.mean(spline_losses), np.std(spline_losses), 100 * np.std(spline_losses) / np.mean(spline_losses)

(0.880369256648293, 0.0018009996668723587, 0.2045732121234053)

In [19]:
def train_bin_fm(embedding_dim: int, step_size: float, batch_size: int,
                  num_bins: int, bin_strategy: str, num_epochs: int,
                  callback: Callable[[int, float], None]=None):
    """Train an `FMTrainer` model on binned features and return its result.

    Every feature column is discretised into at most `num_bins` ordinal bins by
    a `KBinsDiscretizer` fitted on the training features only. Bin numbers are
    shifted by a per-field offset so that each field addresses its own block of
    `num_bins` embedding indices. Every active index gets weight 1.

    Reads the notebook-level names `tr_feats`, `te_feats`, `tr_target`,
    `te_target` and `device`.

    Args:
        embedding_dim: forwarded unchanged to `FMTrainer`.
        step_size: forwarded unchanged to `FMTrainer`.
        batch_size: forwarded unchanged to `FMTrainer`.
        num_bins: number of bins requested per feature; also the size of each
            field's block of embedding indices.
        bin_strategy: passed as `strategy` to `KBinsDiscretizer`.
        num_epochs: forwarded unchanged to `FMTrainer`.
        callback: optional `(epoch, loss)` hook forwarded to `FMTrainer`.
            NOTE(review): presumably invoked once per epoch — confirm in `trainers`.

    Returns:
        Whatever `FMTrainer.train` returns. Callers in this notebook take its
        square root, so it is presumably the MSE on the test dataset — confirm.
    """
    num_fields = tr_feats.shape[1]
    num_embeddings = num_fields * num_bins
    field_ids = np.arange(num_fields)
    # Field f owns the index range [f * num_bins, (f + 1) * num_bins).
    index_offsets = field_ids * num_bins

    discretizer = KBinsDiscretizer(num_bins, encode='ordinal', strategy=bin_strategy, random_state=42)
    discretizer.fit(tr_feats)

    def encode(feats, target):
        # One code path for both splits keeps their encodings in lock-step.
        indices = discretizer.transform(feats)
        # Broadcasting adds the per-field offset to every row without first
        # materialising a full (rows, num_fields) offset matrix.
        indices += index_offsets
        fields = np.tile(field_ids, (indices.shape[0], 1))
        return TensorDataset(
            torch.tensor(indices, dtype=torch.int64),
            # Unit weights, created directly in the target dtype.
            torch.ones(indices.shape, dtype=torch.float32),
            # Offsets and fields carry the same values; `torch.tensor` copies
            # its input, so the two tensors do not share storage.
            torch.tensor(fields, dtype=torch.int64),
            torch.tensor(fields, dtype=torch.int64),
            torch.tensor(target, dtype=torch.float32))

    train_ds = encode(tr_feats, tr_target)
    test_ds = encode(te_feats, te_target)

    trainer = FMTrainer(embedding_dim, step_size, batch_size, num_epochs, callback)
    return trainer.train(num_fields, num_embeddings, train_ds, test_ds, torch.nn.MSELoss(), device)

In [20]:
def test_bins_objective(trial: optuna.Trial):
    """Optuna objective for the binned model.

    Draws one hyper-parameter configuration from `trial`, trains with
    `train_bin_fm`, and returns the square root of the value it returns.
    During training the per-epoch callback reports the square root of the
    epoch loss to the trial and raises `optuna.TrialPruned` when the trial
    says it should be pruned.
    """
    # Suggestions are drawn in this fixed order; the dict keys match the
    # keyword names of `train_bin_fm`.
    hyperparams = {
        'embedding_dim': trial.suggest_int('embedding_dim', 1, 10),
        'step_size': trial.suggest_float('step_size', 1e-2, 0.5, log=True),
        'batch_size': trial.suggest_int('batch_size', 32, 256),
        'num_bins': trial.suggest_int('num_bins', 2, 100),
        'bin_strategy': trial.suggest_categorical('bin_strategy', ['uniform', 'quantile']),
        'num_epochs': trial.suggest_int('num_epochs', 5, 15),
    }

    def report_and_maybe_prune(epoch: int, loss: float):
        trial.report(math.sqrt(loss), epoch)
        if trial.should_prune():
            raise optuna.TrialPruned()

    final_loss = train_bin_fm(**hyperparams, callback=report_and_maybe_prune)
    return math.sqrt(final_loss)


In [21]:
# Minimise the binning objective over 100 trials using a seeded TPE sampler.
# NOTE(review): the recorded output of this cell ends in KeyboardInterrupt and the
# cells after it carry no execution count, so the bins results were apparently
# never produced in this saved run.
# NOTE(review): as with the spline study, the score appears to come from the
# held-out `test_ds`, so selection and reporting use the same split.
study_bins = optuna.create_study(study_name='bins',
                                 direction='minimize',
                                 sampler=optuna.samplers.TPESampler(seed=42))
study_bins.optimize(test_bins_objective, n_trials=100)

[32m[I 2023-05-17 08:23:55,739][0m A new study created in memory with name: bins[0m
[32m[I 2023-05-17 08:25:38,113][0m Trial 0 finished with value: 1.5271531398304155 and parameters: {'embedding_dim': 4, 'step_size': 0.4123206532618726, 'batch_size': 196, 'num_bins': 61, 'bin_strategy': 'uniform', 'num_epochs': 5}. Best is trial 0 with value: 1.5271531398304155.[0m
[32m[I 2023-05-17 08:28:04,884][0m Trial 1 finished with value: 0.9448509350530763 and parameters: {'embedding_dim': 9, 'step_size': 0.10502105436744279, 'batch_size': 191, 'num_bins': 4, 'bin_strategy': 'uniform', 'num_epochs': 7}. Best is trial 1 with value: 0.9448509350530763.[0m
[32m[I 2023-05-17 08:34:45,404][0m Trial 2 finished with value: 5.603986016331611 and parameters: {'embedding_dim': 2, 'step_size': 0.020492680115417352, 'batch_size': 100, 'num_bins': 53, 'bin_strategy': 'uniform', 'num_epochs': 11}. Best is trial 1 with value: 0.9448509350530763.[0m
[32m[I 2023-05-17 08:40:16,860][0m Trial 3 finis

KeyboardInterrupt: 

In [None]:
# Best hyper-parameters of the bins study; later cells unpack it into `train_bin_fm`.
study_bins.best_params

In [None]:
# Summarise the best trial of the bins study. This rebinds the notebook-level
# name `trial`, which earlier held the best trial of the spline study.
# `trial.value` is what the objective returned, i.e. the square root of the
# value returned by `train_bin_fm`.
trial = study_bins.best_trial

print('Test loss: {}'.format(trial.value))
print("Best hyperparameters: {}".format(trial.params))

In [None]:
# Retrain once with the best hyper-parameters. The value displayed is the raw
# return of `train_bin_fm`; no square root is applied here, unlike in the
# objective and the repeated-run loop.
train_bin_fm(**study_bins.best_params)

In [None]:
# Repeat the best configuration 20 times and record the square root of each
# returned loss. Results are appended one by one, so an interrupted run keeps
# the values gathered so far.
bin_losses = []
for _ in trange(20):
    bin_losses.append(math.sqrt(train_bin_fm(**study_bins.best_params)))

In [None]:
# The 20 per-run values collected by the repeated-run loop.
bin_losses

In [None]:
# Same three statistics as the summary of the spline runs, so both methods can
# be compared directly: mean, standard deviation (NumPy's default, ddof=0) and
# the standard deviation expressed as a percentage of the mean.
np.mean(bin_losses), np.std(bin_losses), 100 * np.std(bin_losses) / np.mean(bin_losses)