### Load preprocessed data

The cell below runs the script that downloads and preprocesses the MovieLens data.
It is commented out by default; uncomment the line to run the download & processing script.

In [1]:
#!python ../src/download.py

In [2]:
import numpy as np
# NOTE(security): allow_pickle=True is required to read the object arrays of
# feature dicts, but unpickling can execute arbitrary code -- only load
# archives that you produced yourself.
# The context manager closes the underlying .npz file once everything has been
# read; the arrays pulled out below stay valid after the file is closed.
with np.load('data/dataset.npz', allow_pickle=True) as fh:
    # Features and targets are stored as separate arrays in the archive.
    # The integer feature columns are cast to int64, which is the index type
    # PyTorch expects (MF.forward reads column 0 as the user index and
    # column 1 as the item index).
    train_x = fh['train_x'].astype(np.int64)
    train_y = fh['train_y']
    train_d = fh['train_dict']

    # The data has already been split into train & test
    test_x = fh['test_x'].astype(np.int64)
    test_y = fh['test_y']
    test_d = fh['test_dict']

    # Sizes of the user and item embedding tables
    n_user = int(fh['n_user'])
    n_item = int(fh['n_item'])


In [3]:
# Peek at the per-row feature dicts (object array of {name: weight} dicts)
train_d

array([{'Drama': 1.0}, {'Drama': 1.0}, {'Drama': 1.0}, ...,
       {'Drama': 1.0}, {'Comedy': 1.0, 'Drama': 1.0, 'Western': 1.0},
       {'Documentary': 1.0}], dtype=object)

In [4]:
from abstract_model import AbstractModel


### Hashed Feature  Class

In [5]:
import torch 
from torch import nn
from sklearn.feature_extraction import FeatureHasher



class HashEmbed(nn.Module):
    """Embed dict-valued features by hashing their keys on the fly.

    Every feature dict (``{name: weight}``) is hashed by a scikit-learn
    ``FeatureHasher`` into ``hash_size`` buckets. The bucket embeddings are
    then combined into a single ``n_dim`` vector per dict as a weighted sum
    (``nn.EmbeddingBag`` with ``mode='sum'`` and per-sample weights).
    """

    def __init__(self, n_dim, hash_size=1024):
        """
        Parameters
        ----------
        n_dim : int
            Size of the embedding vector produced for each feature dict.
        hash_size : int
            Number of hash buckets, i.e. rows of the embedding table.
        """
        super().__init__()
        self.embedding_feats = nn.EmbeddingBag(hash_size, n_dim, mode='sum')
        # alternate_sign=False: hashed values keep their original sign, so
        # they can be used directly as per-sample weights.
        self.hasher = FeatureHasher(n_features=hash_size, alternate_sign=False)
        self.hash_size = hash_size
        torch.nn.init.xavier_uniform_(self.embedding_feats.weight)

    def _move(self, arr, device=None):
        """Convert a numpy array to a tensor on ``device``.

        Integer arrays are cast to int64 and floating-point arrays to
        float32 before the conversion. When ``device`` is None the device of
        the embedding table is used.
        """
        if device is None:
            device = self.embedding_feats.weight.device
        if np.issubdtype(arr.dtype, np.integer):
            arr = arr.astype(np.int64)
        elif np.issubdtype(arr.dtype, np.floating):
            arr = arr.astype(np.float32)
        return torch.from_numpy(arr).to(device)

    def forward(self, features):
        """Hash and embed a batch of feature dicts.

        ``features`` is a sequence whose entries are feature dicts (or equal
        length sequences of feature dicts). Returns a tensor of shape
        ``(len(features), n_cols, n_dim)`` where ``n_cols`` is the number of
        dicts per entry.
        """
        n_rows = len(features)
        flat = np.ravel(features)
        # One CSR row per dict: `indices` are bucket ids, `data` the weights
        # and `indptr[:-1]` the start offset of each row (bag).
        items_csr = self.hasher.transform(flat)
        assert items_csr.shape[0] == flat.shape[0]
        items_off = self._move(items_csr.indptr[:-1])
        items_idx = self._move(items_csr.indices)
        items_dat = self._move(items_csr.data)
        summed = self.embedding_feats(items_idx, offsets=items_off,
                                      per_sample_weights=items_dat)
        # `summed` is (n_rows * n_cols, n_dim); let reshape infer n_cols
        # exactly instead of deriving it through float division.
        return summed.reshape(n_rows, -1, summed.shape[-1])

In [6]:
# Smoke test: each single-key dict is hashed and embedded into one 4-dim
# vector, so the two inputs give a (2, 1, 4) tensor.
HashEmbed(4)([{"dog": 1.0},  {"cat": 1.0}])

tensor([[[-0.0552,  0.0515, -0.0595, -0.0176]],

        [[ 0.0709,  0.0266, -0.0060, -0.0611]]], grad_fn=<ViewBackward>)

In [7]:
import torch
from torch import nn
import torch.nn.functional as F
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger


def l2_regularize(array):
    """Return the sum of the squared entries of ``array`` as a scalar tensor."""
    squared = torch.pow(array, 2.0)
    return squared.sum()


class MF(AbstractModel):
    """Matrix factorization with hashed item side-features.

    For user ``u`` and item ``i`` with feature dict ``d`` the prediction is

        bias + b_u + b_i + b_feat(d) + p_u . (q_i + f(d))

    where ``f`` and ``b_feat`` are ``HashEmbed`` modules that embed the
    feature dict into a k-dim vector and a scalar bias respectively.
    """

    def __init__(self, n_user, n_item,
                 k=18, c_vector=1.0, c_bias=1.0, c_feat=1.0, batch_size=128):
        """
        Parameters
        ----------
        n_user, n_item : int
            Number of rows in the user / item embedding tables.
        k : int
            Dimensionality of the user, item and feature vectors.
        c_vector, c_bias, c_feat : float
            L2 regularization weights for the user/item vectors, the
            user/item biases and the hashed feature vectors.
        batch_size : int
            Stored on the model as ``self.batch_size``.
        """
        super().__init__()
        # NOTE(review): the datasets are read from notebook-level globals
        # rather than passed in, so this class only works after the data
        # loading cell has run. Call save_data again to override them.
        self.save_data(train_x, train_y, test_x, test_y, train_d, test_d)
        # These are simple hyperparameters
        self.k = k
        self.n_user = n_user
        self.n_item = n_item
        self.c_vector = c_vector
        self.c_bias = c_bias
        self.c_feat = c_feat
        self.batch_size = batch_size

        # These are learned and fit by PyTorch
        self.bias_user = nn.Embedding(n_user, 1)
        self.bias_item = nn.Embedding(n_item, 1)
        self.bias = nn.Parameter(torch.ones(1))
        self.user = nn.Embedding(n_user, k)
        self.item = nn.Embedding(n_item, k)
        self.itemf = HashEmbed(k, hash_size=1024)
        self.bias_itemf = HashEmbed(1, hash_size=1024)

    def forward(self, args):
        """Predict one rating per row.

        ``args`` is a pair ``(inputs, feat_dict)``: ``inputs`` is an integer
        tensor whose column 0 holds user indices and column 1 item indices,
        and ``feat_dict`` holds one feature dict per row.
        """
        inputs, feat_dict = args
        # This is the most important function in this script
        # These are the user indices, and correspond to "u" variable
        user_id = inputs[:, 0]
        # Item indices, correspond to the "i" variable
        item_id = inputs[:, 1]
        # vector user = p_u
        vector_user = self.user(user_id)
        # vector item = q_i
        vector_base = self.item(item_id)

        # Hashed side-features shift the item vector and add a bias term.
        # HashEmbed returns (rows, cols, dim); there is one dict per row, so
        # take column 0.
        vector_feat = self.itemf(feat_dict)[:, 0, :]
        vector_item = vector_base + vector_feat
        bias_feat = self.bias_itemf(feat_dict)[:, 0, 0]

        # this is a dot product & a user-item interaction: p_u * q_i
        ui_interaction = torch.sum(vector_user * vector_item, dim=1)

        # Pull out biases. Only the trailing size-1 axis is dropped so that a
        # batch of one row keeps its batch dimension.
        bias_user = self.bias_user(user_id).squeeze(-1)
        bias_item = self.bias_item(item_id).squeeze(-1)
        biases = (self.bias + bias_user + bias_item + bias_feat)

        # Add bias prediction to the interaction prediction
        prediction = ui_interaction + biases
        return prediction

    def likelihood(self, prediction, target):
        """Return ``(mse, log_dict)`` for the MSE between prediction and target."""
        # MSE error between target = R_ui and the prediction. The target is
        # reshaped to the prediction's shape so that a batch of one row does
        # not collapse to a 0-d tensor.
        loss_mse = F.mse_loss(prediction, target.reshape(prediction.shape))
        return loss_mse, {"mse": loss_mse}

    def prior(self):
        """Return ``(total, log_dict)`` for the L2 regularization terms."""
        # Regularize the user and item biases
        prior_bias_user = l2_regularize(self.bias_user.weight) * self.c_bias
        prior_bias_item = l2_regularize(self.bias_item.weight) * self.c_bias
        # Compute L2 regularization over user (P) and item (Q) matrices
        prior_user = l2_regularize(self.user.weight) * self.c_vector
        prior_item = l2_regularize(self.item.weight) * self.c_vector
        # Regularize the hashed feature vectors; this term has to be part of
        # the total for c_feat to have any effect.
        prior_feat = l2_regularize(self.itemf.embedding_feats.weight) * self.c_feat
        log = {"prior_user": prior_user, "prior_item": prior_item,
               "prior_bias_user": prior_bias_user, "prior_bias_item": prior_bias_item,
               "prior_feat": prior_feat}
        # Add up all regularization terms
        total = (prior_user + prior_item + prior_bias_user + prior_bias_item
                 + prior_feat)
        return total, log

In [11]:
# WandbLogger lives in the same `loggers` module as the TensorBoardLogger
# imported earlier in this notebook; `pytorch_lightning.logging` is the old,
# deprecated location.
from pytorch_lightning.loggers import WandbLogger


# Hyperparameters for this run
k = 7
c_bias = 1e-3
c_vector = 1e-5
model = MF(n_user, n_item,
           k=k, c_bias=c_bias, c_vector=c_vector,
           batch_size=1024)
# Register the train and held-out test splits. test_x / test_y must be the
# arrays that belong to test_d, otherwise test rows and their feature dicts
# do not line up.
model.save_data(train_x, train_y, test_x, test_y, train_d, test_d)

# Log metrics to Weights & Biases
logger = WandbLogger(name="06_mf", project="simple_mf")

trainer = pl.Trainer(max_epochs=100, logger=logger,
                     early_stop_callback=True,
                     gpus=1, progress_bar_refresh_rate=1)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
CUDA_VISIBLE_DEVICES: [0]


### Train model

In [12]:
# NOTE(review): this runs the test loop before any trainer.fit call in the
# notebook, and the recorded run failed with
# "TypeError: 'int' object is not callable".
trainer.test(model)

Failed to query for notebook name, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable


TypeError: 'int' object is not callable

In [10]:
# NOTE(review): leftover interactive post-mortem (ipdb) session; its recorded
# execution count (10) is out of order with the surrounding cells. Remove this
# cell before sharing the notebook.
debug

> [0;32m<ipython-input-7-87205455db51>[0m(37)[0;36mforward[0;34m()[0m
[0;32m     35 [0;31m[0;34m[0m[0m
[0m[0;32m     36 [0;31m    [0;32mdef[0m [0mforward[0m[0;34m([0m[0mself[0m[0;34m,[0m [0margs[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 37 [0;31m        [0minputs[0m[0;34m,[0m [0mfeat_dict[0m [0;34m=[0m [0margs[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     38 [0;31m        [0;31m# This is the most import function in this script[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     39 [0;31m        [0;31m# These are the user indices, and correspond to "u" variable[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  p args


tensor([[2537,    5,  272,   11],
        [1059,    1,  208,    0],
        [3081, 1089,  318,    7],
        ...,
        [2663, 3554,   12,    4],
        [2642, 3484,   14,    0],
        [1342,    6,   78,    0]], device='cuda:0')


ipdb>  q


#### Run model

In [None]:
# Train the model with the Trainer configured above
trainer.fit(model)

In [None]:
# Run the Lightning test loop on the fitted model
# (presumably on the test split registered via save_data -- confirm in AbstractModel)
trainer.test(model)