### Load preprocessed data

Run the script that downloads and processes the MovieLens data

In [3]:
# !python ../src/download.py

n_item 3953
n_user 6041
n_featuers 9994
n_occu 21
n_rows 1000209


In [7]:
import numpy as np
from sklearn.model_selection import train_test_split
fh = np.load('data/dataset.npz')

# We have a bunch of feature columns and last column is the y-target
# Note pytorch is finicky about need int64 types
train_x = fh['train_x'].astype(np.int64)
train_y = fh['train_y']

# We've already split into train & test
X_test = fh['test_x'].astype(np.int64)
Y_test = fh['test_y']

X_train, X_val, Y_train, Y_val = train_test_split(train_x, train_y)


n_user = int(fh['n_user'])
n_item = int(fh['n_item'])

# columns are user_id, item_id and other features 
# we won't use the 3rd and 4th columns
print(X_train)
print(' ')
print(Y_train)

[[1635 3196   31    0]
 [ 352 1041  480    4]
 [5271  318   37    0]
 ...
 [3871 2709   76    4]
 [2934 2450  174   20]
 [3217 1280   37    0]]
 
[[5.]
 [4.]
 [3.]
 ...
 [4.]
 [1.]
 [5.]]


In [15]:
X_train[:, 0].max()

6040

In [18]:
len(np.unique(X_train[:, 0]))

6040

In [12]:
Y_train

array([[5.],
       [4.],
       [3.],
       ...,
       [4.],
       [1.],
       [5.]], dtype=float32)

### Define data loaders

In [2]:
from torch import from_numpy
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
from torch.utils.data import BatchSampler
from torch.utils.data import RandomSampler


def dataloader(*arrs, batch_size=32):
    dataset = TensorDataset(*arrs)
    bs = BatchSampler(RandomSampler(dataset), 
                      batch_size=batch_size, drop_last=False)
    return DataLoader(dataset, batch_sampler=bs, num_workers=8)
 
train = dataloader(from_numpy(X_train), from_numpy(Y_train))
test = dataloader(from_numpy(X_test), from_numpy(Y_test))
val = dataloader(from_numpy(X_val), from_numpy(Y_val))

### Define the MF Model

In [13]:
from abstract_model import AbstractModel

In [20]:
import torch
from torch import nn
import torch.nn.functional as F
import pytorch_lightning as pl

from pytorch_lightning.loggers import TensorBoardLogger


def l2_regularize(array):
    return torch.sum(array ** 2.0)


class MF(AbstractModel):
    def __init__(self, n_user, n_item, k=18, c_vector=1.0, batch_size=128):
        super().__init__()
        # These are simple hyperparameters
        self.k = k
        self.n_user = n_user
        self.n_item = n_item
        self.c_vector = c_vector
        self.batch_size = batch_size
        self.save_hyperparameters()
        
        # These are learned and fit by PyTorch
        self.user = nn.Embedding(n_user, k)
        self.item = nn.Embedding(n_item, k)
    
    def forward(self, inputs):
        # This is the most import function in this script
        # These are the user indices, and correspond to "u" variable
        user_id = inputs[:, 0]
        # Item indices, correspond to the "i" variable
        item_id = inputs[:, 1]
        # vector user = p_u
        vector_user = self.user(user_id)
        # equivalent:
        # self.user.weight[user_id, :]
        # vector item = q_i
        vector_item = self.item(item_id)
        # this is a dot product & a user-item interaction: p_u * q_i
        # shape vector_user is (batch_size, k)
        # vector_user * vector_item is shape (batch_size, k)
        # sum(vector_user * vector_item is shape, dim=1) (batch_size)
        ui_interaction = torch.sum(vector_user * vector_item, dim=1)
        return ui_interaction
    
    def loss(self, prediction, target):
        # MSE error between target = R_ui and prediction = p_u * q_i
        # target is (batchsize, 1)
        # target.squeeze (batchsize, )
        loss_mse = F.mse_loss(prediction, target.squeeze())
        return loss_mse, {"mse": loss_mse}
    
    def reg(self):
        # Compute L2 reularization over user (P) and item (Q) matrices
        reg_user =  l2_regularize(self.user.weight) * self.c_vector
        reg_item = l2_regularize(self.item.weight) * self.c_vector
        # Add up the MSE loss + user & item regularization
        log = {"reg_user": reg_user, "reg_item": reg_item}
        total = reg_user + reg_item
        return total, log

Note, I typically tune the initial batch size to get an efficient number of datapoints observed per second. On a CPU run `htop` or `top` in another window / terminal tab to see how efficiently your CPU is being  used. If it spends a lot of time below 50% utilization, crank up  the batch size. GPUs are more sensitive, and it's difficult to keep them well fed. To measure efficiency, use `nvidia-smi`. That command only  gets you a snapshot though, so to keep it running use `watch -n 0.1 nvidia-smi`.

In [21]:
from pytorch_lightning.loggers.wandb import WandbLogger

batch_size = 1024
k = 5
c_vector = 1e-5
model = MF(n_user, n_item, k=k, c_vector=c_vector,
          batch_size=batch_size)

# add a logger
logger = WandbLogger(name="01_mf", project="simple_mf")

trainer = pl.Trainer(max_epochs=100, logger=logger,
                     early_stop_callback=True,
                     gpus=1,
                     progress_bar_refresh_rate=1) 


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
CUDA_VISIBLE_DEVICES: [0]


In [22]:
trainer.fit(model, train, val)

[34m[1mwandb[0m: Waiting for W&B process to finish, PID 13799
[34m[1mwandb[0m: Program ended successfully.
[34m[1mwandb[0m: - 0.00MB of 0.00MB uploaded (0.00MB deduped)




[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: Find user logs for this run at: wandb/run-20200925_153955-3u1uuj7l/logs/debug.log
[34m[1mwandb[0m: Find internal logs for this run at: wandb/run-20200925_153955-3u1uuj7l/logs/debug-internal.log
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m:   global_step 696299
[34m[1mwandb[0m:           mse 3.83900785446167
[34m[1mwandb[0m:      reg_user 0.14615029096603394
[34m[1mwandb[0m:      reg_item 0.142988920211792
[34m[1mwandb[0m:    train_loss 4.128147125244141
[34m[1mwandb[0m:         epoch 32
[34m[1mwandb[0m:         _step 13958
[34m[1mwandb[0m:      _runtime 5767
[34m[1mwandb[0m:    _timestamp 1601054164
[34m[1mwandb[0m:      val_loss 0.8459229469299316
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:   global_step ▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
[34m[1mwandb[0m:           mse █▇▃▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[3


  | Name | Type      | Params
-----------------------------------
0 | user | Embedding | 30 K  
1 | item | Embedding | 19 K  





HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

Saving latest checkpoint..





1

In [None]:
results = trainer.test(model)
mse = results['avg_test_loss']
rmse = np.sqrt(mse)
rmse

Failed to query for notebook name, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Testing', layout=Layout(flex='2'), max=…

--------------------------------------------------------------------------------
TEST RESULTS
{'avg_test_loss': tensor(15.5915, device='cuda:0'),
 'val_loss': tensor(15.5915, device='cuda:0')}
--------------------------------------------------------------------------------



3.9486024339809935