### Load preprocessed data

The cells below fetch the helper scripts, install the required packages, and run the script that downloads and processes the MovieLens data.
Run them once before loading the data.

In [None]:
# Fetch the two helper files this notebook relies on: download.py (run in a
# later cell) and abstract_model.py (imported later for AbstractModel).
!wget https://raw.githubusercontent.com/cemoody/simple_mf/master/src/download.py
!wget https://raw.githubusercontent.com/cemoody/simple_mf/master/notebooks/abstract_model.py

In [None]:
!pip install pytorch_lightning

In [None]:
!pip install wandb

In [None]:
# Log in to Weights & Biases; the WandbLogger instances created later in this
# notebook send their runs there. Presumably interactive -- verify before
# running unattended.
!wandb login

In [None]:
# Run the fetched script that downloads and processes the MovieLens data.
# The next cell reads data/dataset.npz -- presumably written by this script;
# verify against download.py.
!python download.py

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from torch import from_numpy
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
from torch.utils.data import BatchSampler
from torch.utils.data import RandomSampler

# Open the preprocessed archive of arrays
fh = np.load('data/dataset.npz')

# Features and targets are stored as separate arrays in the archive.
# Note pytorch is finicky about needing int64 types for the index columns
train_x = fh['train_x'].astype(np.int64)
train_y = fh['train_y']

# We've already split into train & test
X_test = fh['test_x'].astype(np.int64)
Y_test = fh['test_y']

# Carve a validation set out of the training data. The seed is fixed so the
# split (and therefore every validation metric) is reproducible across runs.
X_train, X_val, Y_train, Y_val = train_test_split(
    train_x, train_y, random_state=42)


n_user = int(fh['n_user'])
n_item = int(fh['n_item'])

# columns are user_id, item_id and other features
# we won't use the 3rd and 4th columns
print(X_train)
print(' ')
print(Y_train)



def dataloader(*arrs, batch_size=32, num_workers=8):
    """Wrap aligned tensors in a DataLoader that yields randomly ordered batches.

    Args:
        *arrs: tensors passed straight to ``TensorDataset`` (e.g. features
            and targets); each batch yields one slice per tensor.
        batch_size: maximum number of rows per batch. The final batch may be
            smaller because ``drop_last`` is False.
        num_workers: number of loader worker processes. Defaults to 8 as
            before; pass 0 to load in the main process.

    Returns:
        A ``DataLoader`` driven by a ``BatchSampler`` over a ``RandomSampler``.
    """
    dataset = TensorDataset(*arrs)
    # Sample indices in random order, then group them into batches
    bs = BatchSampler(RandomSampler(dataset),
                      batch_size=batch_size, drop_last=False)
    return DataLoader(dataset, batch_sampler=bs, num_workers=num_workers)
 
# One randomly-batched loader per split, built in the order train, test, val.
# The numpy arrays are converted to tensors before being wrapped.
train, test, val = [
    dataloader(from_numpy(features), from_numpy(targets))
    for features, targets in (
        (X_train, Y_train),
        (X_test, Y_test),
        (X_val, Y_val),
    )
]

[[ 748 1224   24    0]
 [5255 1064   27   10]
 [5277 1189   78   20]
 ...
 [4379 1246  338    4]
 [3526  468  105    2]
 [5812  348  361    7]]
 
[[5.]
 [5.]
 [4.]
 ...
 [3.]
 [3.]
 [5.]]


In [None]:
from abstract_model import AbstractModel

In [None]:
import torch
from torch import nn
import torch.nn.functional as F
import pytorch_lightning as pl

from pytorch_lightning.loggers import TensorBoardLogger


def l2_regularize(array):
    """Return the sum of the squared entries of ``array`` as a scalar tensor."""
    squared = array.pow(2.0)
    return squared.sum()


class MF(AbstractModel):
    """Matrix factorization with user, item and global bias terms.

    The prediction for user ``u`` and item ``i`` is
    ``dot(p_u, q_i) + b_u + b_i + b``, where ``p_u`` / ``q_i`` are learned
    ``k``-dimensional embeddings, ``b_u`` / ``b_i`` are learned per-user and
    per-item biases, and ``b`` is a single learned global bias.

    The training loop comes from ``AbstractModel`` (not shown in this
    notebook); presumably it calls ``forward``, ``loss`` and ``reg`` --
    confirm in abstract_model.py.

    Args:
        n_user: number of rows in the user embedding tables.
        n_item: number of rows in the item embedding tables.
        k: dimension of the user and item vectors.
        c_vector: weight of the L2 penalty on the user/item vectors.
        c_bias: weight of the L2 penalty on the user/item biases.
        batch_size: stored on the model; not used inside this class.
    """

    def __init__(self, n_user, n_item, k=18, c_vector=1.0, c_bias=1.0, batch_size=128):
        super().__init__()
        # These are simple hyperparameters
        self.k = k
        self.n_user = n_user
        self.n_item = n_item
        self.c_vector = c_vector
        self.c_bias = c_bias
        self.batch_size = batch_size
        self.save_hyperparameters()

        # These are learned and fit by PyTorch
        self.user = nn.Embedding(n_user, k)
        self.item = nn.Embedding(n_item, k)

        # Bias terms: one scalar per user, one per item, and one global offset
        self.bias_user = nn.Embedding(n_user, 1)
        self.bias_item = nn.Embedding(n_item, 1)
        self.bias = nn.Parameter(torch.ones(1))

    def forward(self, inputs):
        """Predict one score per input row.

        Args:
            inputs: 2-D integer tensor; column 0 holds user indices and
                column 1 holds item indices. Further columns are ignored.

        Returns:
            1-D tensor of shape ``(bs,)`` with the predicted scores.
        """
        # This is the most important function in this script
        # These are the user indices, and correspond to "u" variable
        user_id = inputs[:, 0]
        # Item indices, correspond to the "i" variable
        item_id = inputs[:, 1]
        # vector user = p_u
        vector_user = self.user(user_id)
        # vector item = q_i
        vector_item = self.item(item_id)
        # this is a dot product & a user-item interaction: p_u * q_i
        ui_interaction = torch.sum(vector_user * vector_item, dim=1)

        # Pull out biases
        # bias lookup has shape (bs, 1); drop only the trailing axis so a
        # batch of a single row stays (1,) instead of collapsing to a 0-d scalar
        bias_user = self.bias_user(user_id).squeeze(-1)
        bias_item = self.bias_item(item_id).squeeze(-1)
        biases = (self.bias + bias_user + bias_item)

        # Add bias prediction to the interaction prediction
        prediction = ui_interaction + biases
        return prediction

    def loss(self, prediction, target):
        """Return the MSE between prediction and target plus a log dict.

        Args:
            prediction: 1-D tensor of predicted scores, as produced by ``forward``.
            target: tensor of true ratings, e.g. shape ``(bs, 1)`` or ``(bs,)``.

        Returns:
            Tuple ``(loss_mse, {"mse": loss_mse})``.
        """
        # MSE error between target = R_ui and prediction = p_u * q_i
        # Flatten the target to 1-D; unlike a bare squeeze() this keeps the
        # batch axis when the batch contains a single row
        loss_mse = F.mse_loss(prediction, target.reshape(-1))
        return loss_mse, {"mse": loss_mse}

    def reg(self):
        """Return the total weighted L2 penalty and its per-term breakdown.

        Returns:
            Tuple ``(total, log)`` where ``total`` is the sum of the four
            penalties and ``log`` maps each penalty name to its value.
        """
        # Regularization on the bias tables, weighted by c_bias
        reg_bias_user = l2_regularize(self.bias_user.weight) * self.c_bias
        reg_bias_item = l2_regularize(self.bias_item.weight) * self.c_bias

        # Compute L2 regularization over user (P) and item (Q) matrices
        reg_user = l2_regularize(self.user.weight) * self.c_vector
        reg_item = l2_regularize(self.item.weight) * self.c_vector
        # Report every term individually and return their sum
        log = {"reg_user": reg_user, "reg_item": reg_item,
               "reg_bias_user": reg_bias_user, "reg_bias_item": reg_bias_item}
        total = reg_user + reg_item + reg_bias_user + reg_bias_item
        return total, log

#### Optimize hyperparameters

In [None]:
import optuna
from pytorch_lightning.loggers.wandb import WandbLogger

def objective(trial):
    """Optuna objective: train one MF model with sampled hyperparameters.

    Args:
        trial: the Optuna trial used to sample ``n_hid``, ``c_vector`` and
            ``c_bias``.

    Returns:
        The ``'avg_test_loss'`` entry of the metrics returned by
        ``trainer.test`` (presumably logged by AbstractModel -- confirm).
    """
    # Sample parameters -- without declaring them in advance!
    k = trial.suggest_int('n_hid', 1, 20)
    # pretty good params are c_bias = 5e-8, c_vector=1e-5, nhid=5
    c_vector = trial.suggest_loguniform('c_vector', 1e-8, 1e-1)
    c_bias = trial.suggest_loguniform('c_bias', 1e-8, 1e-1)
    model = MF(n_user, n_item, k=k, c_bias=c_bias, c_vector=c_vector,
              batch_size=1024)
    # The held-out split is named X_test / Y_test in this notebook;
    # test_x / test_y are never defined and raised a NameError here.
    model.save_data(train_x, train_y, X_test, Y_test)

    # add a logger
    logger = WandbLogger(name="02_mf", project="simple_mf")

    trainer = pl.Trainer(max_epochs=100, logger=logger,
                         early_stop_callback=True,
                         gpus=1,
                         progress_bar_refresh_rate=1)
    trainer.fit(model)
    results = trainer.test(model)
    # Some Lightning releases return a list with one metrics dict per test
    # dataloader rather than a single dict; accept both shapes.
    if isinstance(results, (list, tuple)):
        results = results[0]
    return results['avg_test_loss']

In [None]:
# Open the study stored in the local SQLite file, creating it only if no
# study with this name exists yet.
study = optuna.create_study(
    study_name='no-name-1f329d04-352a-4e7f-afb3-546b8be0cfab',
    storage='sqlite:///02.db',
    load_if_exists=True,
)
# Display the recorded trials as a table
study.trials_dataframe()

[I 2020-08-24 23:58:34,221] Using an existing study with name 'no-name-1f329d04-352a-4e7f-afb3-546b8be0cfab' instead of creating a new one.


Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_c_bias,params_c_vector,params_n_hid,system_attrs_fail_reason,state
0,0,0.87816,2020-08-02 05:11:40.629335,2020-08-02 05:37:27.379551,00:25:46.750216,1.221189e-05,9.239021e-08,20.0,,COMPLETE
1,1,0.736967,2020-08-02 05:37:27.383097,2020-08-02 06:01:32.865823,00:24:05.482726,5.124483e-08,1.042599e-05,5.0,,COMPLETE
2,2,0.783924,2020-08-02 06:01:32.869508,2020-08-02 06:28:04.169764,00:26:31.300256,0.001058522,3.165495e-05,19.0,,COMPLETE
3,3,0.780473,2020-08-02 06:28:04.174248,2020-08-02 07:01:06.600876,00:33:02.426628,1.526843e-06,4.778658e-07,12.0,,COMPLETE
4,4,0.830474,2020-08-02 07:01:06.604357,2020-08-02 07:11:39.575382,00:10:32.971025,4.593979e-08,0.04186301,2.0,,COMPLETE
5,5,0.784753,2020-08-02 07:11:39.579158,2020-08-02 07:28:37.312584,00:16:57.733426,6.5417e-06,3.501192e-08,1.0,,COMPLETE
6,6,0.829048,2020-08-02 07:28:37.317200,2020-08-02 07:40:16.349813,00:11:39.032613,2.767049e-06,0.005067678,3.0,,COMPLETE
7,7,0.830485,2020-08-02 07:40:16.354294,2020-08-02 07:52:31.056262,00:12:14.701968,3.382782e-07,0.01675947,10.0,,COMPLETE
8,8,0.953741,2020-08-02 07:52:31.063816,2020-08-02 08:02:43.307739,00:10:12.243923,0.0004288864,0.09926742,16.0,,COMPLETE
9,9,0.777146,2020-08-02 08:02:43.311642,2020-08-02 08:27:41.690093,00:24:58.378451,2.949048e-05,1.297505e-07,2.0,,COMPLETE


In [None]:
# Objective value of the best trial so far; named best_mse because the
# objective returns a test loss (presumably an MSE -- confirm in AbstractModel).
best_mse = study.best_trial.value
# Square root converts it to RMSE, which is in the same units as the ratings
best_rmse = np.sqrt(best_mse)
best_rmse

NameError: name 'study' is not defined

In [None]:
# Hyperparameters sampled by the best trial recorded in the study
study.best_params

{'c_bias': 5.124483316506529e-08,
 'c_vector': 1.042598682967818e-05,
 'n_hid': 5}

In [None]:
# Run 100 more trials of `objective` and record them in the study.
# NOTE(review): this cell sits after the cells that read study.best_*; with a
# brand-new study those need at least one finished trial -- confirm the
# intended execution order.
study.optimize(objective, n_trials=100)

#### Train a model with the best hyperparameters

In [None]:
from pytorch_lightning.loggers.wandb import WandbLogger

# Rounded hyperparameters of the best trial reported by the study above
k = 5
c_vector = 1e-5
c_bias = 5e-8
model = MF(n_user, n_item, k=k, c_bias=c_bias, c_vector=c_vector,
          batch_size=1024)

# add a logger
logger = WandbLogger(name="02_mf", project="simple_mf")

# Train on one GPU when CUDA is available (as the hyperparameter search does)
# and fall back to CPU otherwise, instead of always leaving the GPU unused.
trainer = pl.Trainer(max_epochs=100, logger=logger,
                     early_stop_callback=True,
                     gpus=1 if torch.cuda.is_available() else None,
                     progress_bar_refresh_rate=1)

GPU available: True, used: False
TPU available: False, using: 0 TPU cores

GPU available but not used. Set the --gpus flag when calling the script.



In [None]:
# Fit the model on the `train` loader, validating on the `val` loader
# (both built in the data-loading cell).
trainer.fit(model, train, val)




wandb: Waiting for W&B process to finish, PID 14639
wandb: Program ended successfully.
wandb:                                                                                
wandb: Find user logs for this run at: wandb/run-20200925_154151-15e4xlr1/logs/debug.log
wandb: Find internal logs for this run at: wandb/run-20200925_154151-15e4xlr1/logs/debug-internal.log
wandb: Run summary:
wandb:     global_step 611899
wandb:             mse 1.2072919607162476
wandb:        reg_user 0.01724400743842125
wandb:        reg_item 0.015419412404298782
wandb:   reg_bias_user 0.00030013531795702875
wandb:   reg_bias_item 9.267379209632054e-05
wandb:      train_loss 1.240348219871521
wandb:           epoch 28
wandb:           _step 12266
wandb:        _runtime 5635
wandb:      _timestamp 1601054148
wandb:        val_loss 0.7469817399978638
wandb: Run history:
wandb:     global_step ▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
wandb:             mse █▅▄▃▂▁▅▁▂▂▂▁▁▄▃▂▃▃▃▃▂▃▃▂▂▃▁▂▃▃▁▂▃▃▃▂▄▃▂▃
wandb:        reg





  | Name      | Type      | Params
----------------------------------------
0 | user      | Embedding | 30 K  
1 | item      | Embedding | 19 K  
2 | bias_user | Embedding | 6 K   
3 | bias_item | Embedding | 3 K   


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…


Detected KeyboardInterrupt, attempting graceful shutdown...

Saving latest checkpoint..





1

In [None]:
# Write the trainer's current checkpoint to the file "02_best"
trainer.save_checkpoint("02_best")