In [1]:
### Load preprocessed data

In [2]:
# conda  uninstall tensorboard; pip uninstall -y tensorboard; conda install tensorboard; conda install pytorch-lightning -c conda-forge

In [None]:
# Run the project's download script in a subprocess via the IPython shell escape.
# NOTE(review): presumably this fetches/prepares the MovieLens-20M data that the
# next cell loads — confirm against ../src/download_ml20.py.
!python ../src/download_ml20.py

In [1]:
import numpy as np
# Sequences are truncated to this many positions. The training step feeds
# items[:, :-1] to the model and predicts items[:, 1:], hence the extra "+ 1".
max_seq_len = 768 + 1

# Use the archive as a context manager so the underlying file handle is closed
# as soon as the arrays have been materialised (astype() returns in-memory
# copies, so nothing below depends on the archive staying open).
with np.load('data/dataset_ml20_wide.npz') as fh:
    train_items = fh['train_items'].astype(np.int64)[:, :max_seq_len]
    # Note that ratings are modified:
    # Move from integer scale (1, 2, ... 10)
    # back to half-star scale (0.5, 1.0, 1.5, ... 4.5, 5.0)
    # NOTE(review): after "/ 2.0" the rating arrays are floating point, while
    # AR.forward looks ratings up in an nn.Embedding (which needs integer
    # indices) — confirm that AbstractModel converts them appropriately.
    train_ratng = fh['train_ratng'].astype(np.int64)[:, :max_seq_len] / 2.0
    test_items = fh['test_items'].astype(np.int64)[:, :max_seq_len]
    test_ratng = fh['test_ratng'].astype(np.int64)[:, :max_seq_len] / 2.0

# Sizes derived from the training split.
n_user = train_items.shape[0]            # number of rows (one per user stream)
n_rank = train_items.shape[1]            # positions per (padded) stream
n_item = int(train_items.max() + 1)      # largest item id + 1 (id 0 is padding)
n_resp = int(train_ratng.max() + 1)      # largest rating value + 1

FileNotFoundError: [Errno 2] No such file or directory: 'data/dataset_ml20_wide.npz'

#### Data Structure

Notice that the inputs are now 2D. Each row in `train_items` is a 1D stream of items seen by a single user; different rows come from different users' streams. Each stream is padded with zeros so that every row has the same fixed length. `train_ratng` has the same structure, but holds the categorical rating (scaled from [0.0, 0.5, ..., 4.5, 5.0] to [0, 1, 2, ..., 10]) that the user gave to each item.

In [5]:
# Peek at the item-id matrix: one row per user stream, trailing zeros are padding.
train_items

array([[  924,   919,  2683, ...,     0,     0,     0],
       [   62,   469,  1121, ...,     0,     0,     0],
       [  589,  1188,  1721, ...,     0,     0,     0],
       ...,
       [  720,  1373,   362, ...,     0,     0,     0],
       [ 2012,  2115,   908, ...,     0,     0,     0],
       [ 3174,  2872, 48780, ...,     0,     0,     0]])

In [6]:
# Peek at the rating matrix; it is aligned position-by-position with train_items.
train_ratng

array([[ 7,  7,  7, ...,  0,  0,  0],
       [10,  6,  6, ...,  0,  0,  0],
       [ 8,  4,  8, ...,  0,  0,  0],
       ...,
       [ 1,  1,  6, ...,  0,  0,  0],
       [ 2,  8,  8, ...,  0,  0,  0],
       [10,  8, 10, ...,  0,  0,  0]])

In [7]:
# Distinct rating values at non-padding positions (item id > 0).
np.unique(train_ratng[train_items > 0])

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

In [8]:
# Distinct item ids that occur in the training streams (0 is the padding id).
np.unique(train_items)

array([     0,      2,      3, ..., 131170, 131237, 131262])

In [9]:
# (rows, positions) of the train and test item matrices.
train_items.shape, test_items.shape

((124644, 769), (13849, 769))

In [10]:
!pip install -q reformer_pytorch

You should consider upgrading via the '/opt/conda/bin/python -m pip install --upgrade pip' command.[0m


In [11]:
import random
import torch
import numpy as np
import pandas as pd
from torch import nn
from torch import from_numpy
import pytorch_lightning as pl
from torch.nn import functional as F
from reformer_pytorch import Reformer
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger

In [12]:
from abstract_model import AbstractModel

In [13]:
class AR(AbstractModel):
    """Autoregressive rating predictor built on a causal Reformer.

    Every (item, rating) interaction in a user's stream is embedded, the
    embedded stream is passed through a causal Reformer, and the resulting
    per-position vector is scored against the embedding of the *next* item
    to predict the rating the user gives it.

    Args:
        n_item: Size of the item vocabulary (item id 0 is reserved for padding).
        n_dim: Embedding width; also the Reformer model dimension.
        n_resp: Size of the rating vocabulary.
        n_rank: Maximum sequence length passed to the Reformer.
        p: Dropout probability applied to the interaction embeddings.
        heads: Number of Reformer attention heads.
        depth: Number of Reformer layers.
        batch_size: Stored on the instance as ``self.batch_size``.
        weight_decay: Weight decay handed to the Adam optimizer.
    """

    def __init__(self, n_item, n_dim, n_resp, n_rank, p=0.1,
                 heads=2, depth=2, batch_size=32, weight_decay=1e-6):
        super().__init__()
        self.n_dim = n_dim
        self.n_item = n_item
        self.n_resp = n_resp
        self.save_hyperparameters()

        # padding_idx=0 means that item=0 will always yield the zero vector
        self.item = nn.Embedding(n_item, n_dim, padding_idx=0)
        self.resp = nn.Embedding(n_resp, n_dim)
        self.reformer = Reformer(dim=n_dim, depth=depth, heads=heads,
                                 causal=True, max_seq_len=n_rank)
        self.batch_size = batch_size
        self.weight_decay = weight_decay
        self.dropout = nn.Dropout(p=p)

    def forward(self, items, ratng):
        """Encode a batch of interaction streams.

        Args:
            items: Item ids per position; 0 marks padding.
            ratng: Rating ids per position, aligned with ``items``.

        Returns:
            The Reformer output for every position of every stream.
        """
        item_vec = self.item(items)
        resp_vec = self.resp(ratng)
        # An interaction is represented by the element-wise product of the
        # item embedding and the rating embedding.
        intx_vec = self.dropout(item_vec * resp_vec)
        # Padding positions (item id 0) are masked out of the attention input.
        mask = items != 0
        user_vec = self.reformer(intx_vec, input_mask=mask)
        return user_vec

    def loss(self, user_raw, items, ratg):
        """Mean squared error between predicted and observed ratings.

        Args:
            user_raw: Per-position user representation, (batchsize, window, n_dim).
            items: Ids of the items whose ratings are being predicted.
            ratg: Observed ratings for ``items``; 0 marks padding.

        Returns:
            ``(loss, log)`` where ``loss`` is the MSE averaged over the
            non-padding positions and ``log`` is ``{"mse": loss}``.
        """
        item_raw = self.item(items)
        # Channel 0 of each vector acts as a scalar bias; the remaining
        # channels are combined through a dot product.
        user_bas, user_vec = user_raw[:, :, 0], user_raw[:, :, 1:]
        item_bas, item_vec = item_raw[:, :, 0], item_raw[:, :, 1:]
        pred = user_bas + item_bas + (user_vec * item_vec).sum(dim=2)
        # Ignore ratings that are zero -- zero isn't actually possible from the
        # user. Instead zero is empty padding that we should ignore.
        mask = ratg != 0
        # The ratings double as embedding indices in forward(), so they may be
        # an integer tensor; mse_loss needs the target in the prediction dtype.
        target = ratg[mask].to(pred.dtype)
        loss_sum = F.mse_loss(pred[mask], target, reduction='sum')
        # Guard against a batch made up entirely of padding: dividing by zero
        # would turn the loss (and every gradient) into NaN.
        n_rated = mask.sum().clamp(min=1).to(pred.dtype)
        loss_mean = loss_sum / n_rated
        return loss_mean, {"mse": loss_mean}

    def configure_optimizers(self):
        """Return an Adam optimizer (lr=1e-4) using the configured weight decay."""
        return torch.optim.Adam(self.parameters(), lr=1e-4,
                                weight_decay=self.weight_decay)

    def step(self, batch, batch_nb, prefix='train', add_reg=True):
        """Run one next-interaction prediction step on a batch.

        Args:
            batch: Pair ``(items, ratng)`` of aligned 2-D tensors.
            batch_nb: Batch index (not used here).
            prefix: Name prepended to the ``*_loss`` keys in the result.
            add_reg: Not used here; regularisation is done via weight decay.

        Returns:
            Dict with ``'{prefix}_loss'``, ``'loss'`` and ``'log'`` entries.
        """
        items, ratng = batch
        # Pass in leading arrays, missing the last element
        # (hence the [:-1]) for every user that's to be predicted
        user_vec = self.forward(items[:, :-1], ratng[:, :-1])
        # Given previous tokens, predict the next interaction
        # hence the [1:]
        loss, log = self.loss(user_vec, items[:, 1:], ratng[:, 1:])
        log[f'{prefix}_loss'] = loss
        return {f'{prefix}_loss': loss, 'loss': loss, 'log': log}

    def reg(self):
        """Return no explicit regularisation term."""
        # Regularize via weight decay instead of explicitly
        return 0.0, {}

In [14]:
# WandbLogger is imported from pytorch_lightning.loggers — the same module the
# import cell already uses for TensorBoardLogger; pytorch_lightning.logging is
# the old, deprecated location.
from pytorch_lightning.loggers import WandbLogger

# Embedding width / Reformer model dimension.
n_dim = 64
model = AR(n_item, n_dim, n_resp, n_rank,
           heads=8, depth=6, batch_size=64)
# Metrics are sent to Weights & Biases under this run name and project.
logger = WandbLogger(name="09_mf", project="simple_mf")

# Train for at most 100 epochs on a single GPU with early stopping enabled.
# NOTE(review): progress_bar_refresh_rate=1 redraws the progress bar on every
# batch; the saved output below shows "IOPub message rate exceeded", so a
# larger value may be preferable.
trainer = pl.Trainer(max_epochs=100, logger=logger,
                     early_stop_callback=True,
                     gpus=1, progress_bar_refresh_rate=1)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
CUDA_VISIBLE_DEVICES: [0]


In [None]:
# Fit the model with the trainer configured above.
# NOTE(review): `train` is not defined in any visible cell of this notebook —
# it must come from hidden kernel state or a removed cell. Confirm what it
# should be (e.g. a DataLoader) so the notebook survives Restart & Run All.
trainer.fit(model, train)

Failed to query for notebook name, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable

  | Name     | Type      | Params
---------------------------------------
0 | item     | Embedding | 8 M   
1 | resp     | Embedding | 704   
2 | reformer | Reformer  | 274 K 
3 | dropout  | Dropout   | 0     


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [None]:
# Evaluate the trained model with the trainer's test loop.
trainer.test(model)

In [None]:
# Run the test loop and keep what it returns.
# NOTE(review): the preceding cell already calls trainer.test(model); this
# repeats the evaluation only to capture the return value.
results = trainer.test(model)
# Display the entry stored under 'avg_test_loss'.
results['avg_test_loss']