### Install flexi hash embeddings

In [1]:
!pip install torch-scatter==latest+cpu -f https://pytorch-geometric.com/whl/torch-1.5.0.html

Looking in indexes: https://artifactory.vertigo.stitchfix.com/artifactory/api/pypi/stitchpy/simple
Looking in links: https://pytorch-geometric.com/whl/torch-1.5.0.html
Collecting torch-scatter==latest+cpu
  Downloading https://pytorch-geometric.com/whl/torch-1.5.0/torch_scatter-latest%2Bcpu-cp36-cp36m-linux_x86_64.whl (9.8 MB)
[K     |████████████████████████████████| 9.8 MB 76.4 MB/s eta 0:00:01
[?25hInstalling collected packages: torch-scatter
  Attempting uninstall: torch-scatter
    Found existing installation: torch-scatter 2.0.5
    Uninstalling torch-scatter-2.0.5:
      Successfully uninstalled torch-scatter-2.0.5
Successfully installed torch-scatter-2.0.5


In [2]:
!pip install -q flexi-hash-embedding 



### Load preprocessed data

The cell below runs the script that downloads and preprocesses the MovieLens data.
It is commented out by default; uncomment it to run the download and processing step.

In [1]:
#!python ../src/download.py

In [4]:
import numpy as np

# NOTE(review): allow_pickle=True is needed because the '*_dict' entries are
# object arrays of Python dicts, but unpickling can execute arbitrary code --
# only load archives that you generated yourself or otherwise trust.
#
# np.load on an .npz returns a lazy NpzFile that keeps the file open; the
# context manager closes it once every array below has been read into memory.
with np.load('data/dataset.npz', allow_pickle=True) as fh:
    # Feature columns and targets are stored under separate keys.
    # The features are cast to int64 because they are used as embedding
    # indices and PyTorch is strict about index tensors being int64.
    train_x = fh['train_x'].astype(np.int64)
    train_y = fh['train_y']
    train_d = fh['train_dict']

    # The archive already contains the train / test split
    test_x = fh['test_x'].astype(np.int64)
    test_y = fh['test_y']
    test_d = fh['test_dict']

    # Stored as arrays; convert to plain Python ints for nn.Embedding sizes
    n_user = int(fh['n_user'])
    n_item = int(fh['n_item'])


In [5]:
# Peek at the per-example dicts loaded from 'train_dict'
# (NumPy elides the middle of long arrays, so this does not print everything).
train_d

array([{'Drama': 1.0}, {'Drama': 1.0}, {'Drama': 1.0}, ...,
       {'Drama': 1.0}, {'Comedy': 1.0, 'Drama': 1.0, 'Western': 1.0},
       {'Documentary': 1.0}], dtype=object)

In [6]:
import warnings
# Silence FutureWarning messages. The filter is installed before the project
# import below, so it is already active while that module is being loaded.
warnings.simplefilter(action='ignore', category=FutureWarning)
# Project-local base class that the MF model in this notebook subclasses.
from abstract_model import AbstractModel

In [4]:
import torch
from torch import nn
import torch.nn.functional as F
import pytorch_lightning as pl

from pytorch_lightning.loggers import TensorBoardLogger


def l2_regularize(array):
    """Return the sum of the squared entries of ``array`` as a scalar tensor.

    This is the (unscaled) L2 penalty; callers multiply it by their own
    regularization coefficient.
    """
    squared = array.pow(2.0)
    return squared.sum()


class MF(AbstractModel):
    """Matrix-factorization rating model with user, item and global biases.

    The prediction for a (user, item) pair is::

        r_ui = p_u . q_i + b_u + b_i + b

    where ``p_u`` / ``q_i`` are k-dimensional embeddings and ``b_u``, ``b_i``,
    ``b`` are the user, item and global biases.

    The datasets are handed to ``AbstractModel.save_data``; how they are
    batched and how ``likelihood`` / ``prior`` are combined into the training
    loss is presumably implemented in ``AbstractModel`` (not shown here).

    Args:
        n_user: number of rows in the user embedding tables.
        n_item: number of rows in the item embedding tables.
        train_x, train_y, test_x, test_y, train_d, test_d: datasets, passed
            unchanged to ``save_data``.
        k: dimensionality of the user and item vectors.
        c_vector: multiplier on the L2 penalty of the user/item vectors.
        c_bias: multiplier on the L2 penalty of the user/item biases.
        batch_size: stored on the instance; not used in this class
            (presumably read by the base class's dataloaders -- confirm).
    """

    def __init__(self, n_user, n_item, train_x, train_y, test_x, test_y, 
                 train_d, test_d, k=18, c_vector=1.0, c_bias=1.0, batch_size=128):
        super().__init__()
        self.save_data(train_x, train_y, test_x, test_y, train_d, test_d)
        # These are simple hyperparameters
        self.k = k
        self.n_user = n_user
        self.n_item = n_item
        self.c_vector = c_vector
        self.c_bias = c_bias
        self.batch_size = batch_size

        # These are learned and fit by PyTorch
        self.user = nn.Embedding(n_user, k)
        self.item = nn.Embedding(n_item, k)

        # Bias terms: one scalar per user, one per item, plus a single
        # global offset (initialised to 1)
        self.bias_user = nn.Embedding(n_user, 1)
        self.bias_item = nn.Embedding(n_item, 1)
        self.bias = nn.Parameter(torch.ones(1))

    def forward(self, inputs):
        """Predict one score per row of ``inputs``.

        Args:
            inputs: integer tensor indexed as ``inputs[:, 0]`` (user index)
                and ``inputs[:, 1]`` (item index); any further columns are
                ignored here.

        Returns:
            Tensor with one prediction per row.
        """
        # This is the most important function in this script
        # These are the user indices, and correspond to "u" variable
        user_id = inputs[:, 0]
        # Item indices, correspond to the "i" variable
        item_id = inputs[:, 1]
        # vector user = p_u
        vector_user = self.user(user_id)
        # vector item = q_i
        vector_item = self.item(item_id)
        # this is a dot product & a user-item interaction: p_u * q_i
        ui_interaction = torch.sum(vector_user * vector_item, dim=1)

        # Pull out biases. Only the trailing size-1 embedding dimension is
        # dropped; a bare squeeze() would also drop the batch dimension when
        # the batch holds a single row.
        bias_user = self.bias_user(user_id).squeeze(-1)
        bias_item = self.bias_item(item_id).squeeze(-1)
        biases = (self.bias + bias_user + bias_item)

        # Add bias prediction to the interaction prediction
        prediction = ui_interaction + biases
        return prediction

    def likelihood(self, prediction, target):
        """Return the mean squared error between ``prediction`` and ``target``.

        ``target`` is flattened to 1-D so that both a ``(batch,)`` and a
        ``(batch, 1)`` target line up with the 1-D prediction, including the
        single-row batch case where a bare ``squeeze()`` would produce a 0-d
        tensor and make ``mse_loss`` broadcast.
        """
        # MSE error between target = R_ui and the model prediction
        loss_mse = F.mse_loss(prediction, target.reshape(-1))
        return loss_mse

    def prior(self):
        """Return the total L2 regularization penalty (no data term).

        Bias embeddings are weighted by ``c_bias`` and the user/item vectors
        by ``c_vector``. The global bias is not regularized.
        """
        # Regularize the per-user and per-item biases
        prior_bias_user = l2_regularize(self.bias_user.weight) * self.c_bias
        prior_bias_item = l2_regularize(self.bias_item.weight) * self.c_bias
        # Compute L2 regularization over user (P) and item (Q) matrices
        prior_user = l2_regularize(self.user.weight) * self.c_vector
        prior_item = l2_regularize(self.item.weight) * self.c_vector
        # Add up the vector and bias regularization terms
        total = prior_user + prior_item + prior_bias_user + prior_bias_item
        return total
    


# Build the model. train_d / test_d are required positional arguments of
# MF.__init__; leaving them out raises a TypeError on a fresh kernel.
model = MF(n_user, n_item, train_x, train_y, test_x, test_y,
           train_d, test_d)

# add a logger
logger = TensorBoardLogger("tb_logs", name="bottleneck_model")

# We could have turned on multiple GPUs here, for example
# trainer = pl.Trainer(gpus=8, precision=16)
trainer = pl.Trainer(max_epochs=5,
                     reload_dataloaders_every_epoch=True,
                     logger=logger)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores


### Train model

In [5]:
# Evaluate the still-untrained model first to get a baseline test loss
# to compare against the value obtained after fitting.
trainer.test(model)

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Testing', layout=Layout(flex='2'), max=…

--------------------------------------------------------------------------------
TEST RESULTS
{'test_loss': tensor(28.0364)}
--------------------------------------------------------------------------------



{'test_loss': 28.036401748657227}

#### Run model

In [6]:
# Fit the model; the trainer was created with max_epochs=5.
trainer.fit(model)
# Run the test pass again so the result can be compared with the
# pre-training baseline.
trainer.test(model)


  | Name      | Type      | Params
----------------------------------------
0 | user      | Embedding | 108 K 
1 | item      | Embedding | 71 K  
2 | bias_user | Embedding | 6 K   
3 | bias_item | Embedding | 3 K   


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…






HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Testing', layout=Layout(flex='2'), max=…

--------------------------------------------------------------------------------
TEST RESULTS
{'test_loss': tensor(2.4373)}
--------------------------------------------------------------------------------



{'test_loss': 2.4373252391815186}