In [1]:
### Load preprocessed data

In [2]:
# conda  uninstall tensorboard; pip uninstall -y tensorboard; conda install tensorboard; conda install pytorch-lightning -c conda-forge

In [1]:
!python ../src/download_ml20.py

n_item 131263
n_user 138494
n_features 269757
n_rows 19950567


In [14]:
import numpy as np
from sklearn.model_selection import train_test_split
fh = np.load('data/dataset_ml20_wide.npz')
# We have a bunch of feature columns and last column is the y-target
max_seq_len = 768 + 1
train_items = fh['train_items'].astype(np.int64)[:, :max_seq_len]
# Note that ratings are modified are on scale (1, 2, ... 10) 
train_ratng = fh['train_ratng'].astype(np.int64)[:, :max_seq_len]
test_items = fh['test_items'].astype(np.int64)[:, :max_seq_len]
test_ratng = fh['test_ratng'].astype(np.int64)[:, :max_seq_len]

n_user = train_items.shape[0]
n_rank = train_items.shape[1]
n_item = int(train_items.max() + 1)
n_resp = int(train_ratng.max() + 1)

train_items, val_items, train_ratng, val_ratng = train_test_split(train_items, train_ratng)

In [15]:
from torch import from_numpy
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
from torch.utils.data import BatchSampler
from torch.utils.data import SequentialSampler

def dataloader(*arrs, batch_size=64):
    dataset = TensorDataset(*arrs)
    arr_size = len(arrs[0])
    bs = BatchSampler(SequentialSampler(range(arr_size)),
                      batch_size=batch_size, drop_last=False)
    return DataLoader(dataset, batch_sampler=bs, shuffle=False)
 
train = dataloader(from_numpy(train_items), from_numpy(train_ratng))
val = dataloader(from_numpy(val_items), from_numpy(val_ratng))
test = dataloader(from_numpy(test_items), from_numpy(test_ratng))

#### Data Structure

Notice that the inputs are now 2D. Each row in `train_items` represents is a 1D stream of items seen by a single user. Different rows will be from different user streams. Note that each stream is padded with zeros so it is a fixed input size. `train_ratng` is a similar structure, but gives the categorical rating output (scaled from [0.0, 0.5, ... 4.5, 5.0] to [0, 1,2, ...10]) that that user gave that item.

In [4]:
train_items

array([[   5, 1777,  158, ...,    0,    0,    0],
       [  25,   95,  141, ...,    0,    0,    0],
       [ 150,  296,  380, ...,    0,    0,    0],
       ...,
       [2021, 2193,   29, ...,    0,    0,    0],
       [1210, 1291, 1342, ...,    0,    0,    0],
       [2761, 2762, 2858, ..., 2474, 2745, 2757]])

In [5]:
train_ratng

array([[ 6,  8,  1, ...,  0,  0,  0],
       [ 2,  8,  8, ...,  0,  0,  0],
       [ 8,  8,  8, ...,  0,  0,  0],
       ...,
       [ 8, 10,  8, ...,  0,  0,  0],
       [ 6, 10,  6, ...,  0,  0,  0],
       [ 8,  4, 10, ...,  6,  8,  8]])

In [6]:
np.unique(train_ratng[train_items > 0])

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

In [7]:
np.unique(train_items)

array([     0,      2,      3, ..., 131158, 131162, 131237])

In [8]:
train_items.shape, test_items.shape

((93483, 769), (13849, 769))

In [9]:
!pip install -q reformer_pytorch



In [10]:
import random
import torch
import numpy as np
import pandas as pd
from torch import nn
from torch import from_numpy
import pytorch_lightning as pl
from torch.nn import functional as F
from reformer_pytorch import Reformer
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger

In [11]:
from abstract_model import AbstractModel

In [12]:
class AR(AbstractModel):
    def __init__(self, n_item, n_dim, n_resp, n_rank, p=0.1,
                 heads=2, depth=2, batch_size=32, weight_decay=1e-6):
        super().__init__()
        self.n_dim = n_dim
        self.n_item = n_item
        self.n_resp = n_resp
        self.save_hyperparameters()
        
        # This means that item=0 will always yield the zero vector
        self.item = nn.Embedding(n_item, n_dim, padding_idx=0)
        self.resp = nn.Embedding(n_resp, n_dim)
        self.reformer = Reformer(dim=n_dim, depth=depth, heads=heads, causal=True, max_seq_len=n_rank)
        self.batch_size = batch_size
        self.weight_decay = weight_decay
        self.dropout = nn.Dropout(p=p)
    
    def forward(self, items, ratng):
        item_vec = self.item(items)
        resp_vec = self.resp(ratng)
        intx_vec = self.dropout(item_vec * resp_vec)
        mask = items != 0
        user_vec = self.reformer(intx_vec, input_mask=mask)
        return user_vec
    
    def loss(self, user_raw, items, ratg):
        # user_vec is (batchsize, window, n_dim)
        batchsize, window, n_dim = user_raw.shape
        item_raw = self.item(items)
        user_bas, user_vec = user_raw[:, :, 0], user_raw[:, :, 1:]
        item_bas, item_vec = item_raw[:, :, 0], item_raw[:, :, 1:]
        pred = user_bas + item_bas + (user_vec * item_vec).sum(dim=2)
        # Ignore ratings that are zero -- zero isn't actually possible from the
        # user. Instead zero is empty padding that we should ignore.
        mask = ratg != 0
        loss_sum = F.mse_loss(pred[mask], ratg[mask] * 1.0, reduction='sum')
        loss_mean = loss_sum / (mask.sum() * 1.0)
        return loss_mean, {"mse": loss_mean}
        
    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=1e-4, 
                                weight_decay=self.weight_decay)

    def step(self, batch, batch_nb, prefix='train', add_reg=True):
        items, ratng = batch
        # Pass in leading arrays, missing the last element
        # (hence the [:-1]) for every user that's to  be predicted
        user_vec = self.forward(items[:,  :-1], ratng[:, :-1])
        # Given previous tokens, predict the next interaction
        # hence the [1:] 
        loss, log = self.loss(user_vec, items[:, 1:],  ratng[:, 1:])
        log[f'{prefix}_loss'] = loss
        return {f'{prefix}_loss': loss, 'loss':loss, 'log': log}
        
    def reg(self):
        # Regularize via weight decay instead of explicitly
        return 0.0, {}

In [16]:
from pytorch_lightning.loggers.wandb import WandbLogger

n_dim = 48
model = AR(n_item, n_dim, n_resp, n_rank, 
           heads=8, depth=6)
logger = WandbLogger(name="09_mf", project="simple_mf")

trainer = pl.Trainer(max_epochs=100, logger=logger,
                     gpus=0, progress_bar_refresh_rate=1) 

ImportError: You want to use `wandb` logger which is not installed yet, install it with `pip install wandb`.

In [None]:
trainer.fit(model, train, val)

In [None]:
trainer.test(model, test)

In [None]:
results = trainer.test(model)
results['avg_test_loss']