### Load preprocessed data

Run the script that downloads and processes the MovieLens data.
Uncomment it to run the download & processing script.

In [1]:
!wget https://raw.githubusercontent.com/cemoody/simple_mf/master/src/download.py
!wget https://raw.githubusercontent.com/cemoody/simple_mf/master/notebooks/abstract_model.py
!pip install pytorch_lightning wandb
!python download.py
!wandb login

--2022-04-08 19:39:57--  https://raw.githubusercontent.com/cemoody/simple_mf/master/src/download.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2743 (2.7K) [text/plain]
Saving to: ‘download.py’


2022-04-08 19:39:57 (40.3 MB/s) - ‘download.py’ saved [2743/2743]

--2022-04-08 19:39:57--  https://raw.githubusercontent.com/cemoody/simple_mf/master/notebooks/abstract_model.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1696 (1.7K) [text/plain]
Saving to: ‘abstract_model.py’


2022-04-08 19:39:57 (31.9 MB/s) - ‘abstract_model.p

In [2]:
import numpy as np
from sklearn.model_selection import train_test_split
from torch import from_numpy
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
from torch.utils.data import BatchSampler
from torch.utils.data import SequentialSampler

fh = np.load('data/dataset.npz')

# We have a bunch of feature columns and last column is the y-target
# Note pytorch is finicky about need int64 types
train_x = fh['train_x'].astype(np.int64)
train_y = fh['train_y']

# We've already split into train & test
X_test = fh['test_x'].astype(np.int64)
Y_test = fh['test_y']

X_train, X_val, Y_train, Y_val = train_test_split(train_x, train_y)


n_user = int(fh['n_user'])
n_item = int(fh['n_item'])
n_occu = int(fh['n_occu'])
n_rank = int(fh['n_ranks'])

# columns are user_id, item_id and other features 
# we won't use the 3rd and 4th columns
print(X_train)
print(' ')
print(Y_train)



def dataloader(*arrs, batch_size=1024):
    dataset = TensorDataset(*arrs)
    arr_size = len(arrs[0])
    bs = BatchSampler(SequentialSampler(range(arr_size)),
                      batch_size=batch_size, drop_last=False)
    return DataLoader(dataset, batch_sampler=bs, shuffle=False)
 
train = dataloader(from_numpy(X_train), from_numpy(Y_train))
test = dataloader(from_numpy(X_test), from_numpy(Y_test))
val = dataloader(from_numpy(X_val), from_numpy(Y_val))

[[ 673  455  302   20]
 [3129 2421  303    7]
 [4203  762  117   18]
 ...
 [3539 1614  563    4]
 [1530 1171  209    4]
 [5306 3260  549   17]]
 
[[1.]
 [1.]
 [2.]
 ...
 [5.]
 [4.]
 [4.]]


In [3]:
n_rank

2315

In [4]:
from abstract_model import AbstractModel

Now we add this new `total_variation` regularizer. Instead of regularizing the *norm* of vector, as we frequently do with L2 regularization, we penalize the difference in subsequent elements. This is useful in temporal models: you want day 0 close to day 1, but you don't care if day0 and day 1 deviate away from zero.  

In [5]:
def total_variation(array):
    return torch.sum(torch.abs(array[:-1, :] - array[1:, :]))

In [8]:
import torch
from torch import nn
import torch.nn.functional as F
import pytorch_lightning as pl


def l2_regularize(array):
    return torch.sum(array ** 2.0)


class MF(AbstractModel):
    def __init__(self, n_user, n_item, n_occu, n_rank, 
                 k=18, kt=2, c_vector=1.0, c_bias=1.0,
                 c_ut=1.0, c_temp=1.0, c_ovector=1.0,
                 batch_size=128):
        super().__init__()
        # These are simple hyperparameters
        self.k = k
        self.n_user = n_user
        self.n_item = n_item
        self.c_vector = c_vector
        self.c_ovector = c_ovector
        self.c_bias = c_bias
        self.batch_size = batch_size
        self.save_hyperparameters()
        
        # NEW: regularization hyperparams
        self.c_ut = c_ut
        self.c_temp = c_temp
        
        # These are learned and fit by PyTorch
        self.user = nn.Embedding(n_user, k)
        self.item = nn.Embedding(n_item, k)
        self.bias_user = nn.Embedding(n_user, 1)
        self.bias_item = nn.Embedding(n_item, 1)
        self.bias = nn.Parameter(torch.ones(1))
        self.occu = nn.Embedding(n_occu, k)
        
        # NEW: temporal vectors
        self.user_temp = nn.Embedding(n_user, kt)
        self.temp = nn.Embedding(n_rank, kt)


    def forward(self, inputs):
        # This is the most import function in this script
        # These are the user indices, and correspond to "u" variable
        user_id = inputs[:, 0]
        # Item indices, correspond to the "i" variable
        item_id = inputs[:, 1]
        # vector user = p_u
        vector_user = self.user(user_id)
        # vector item = q_i
        vector_item = self.item(item_id)
        # this is a dot product & a user-item interaction: p_u * q_i
        ui_interaction = torch.sum(vector_user * vector_item, dim=1)
        # Pull out biases
        bias_user = self.bias_user(user_id).squeeze()
        bias_item = self.bias_item(item_id).squeeze()
        biases = (self.bias + bias_user + bias_item)
        # occupation-item interaction
        occu_id = inputs[:, 3]
        vector_occu = self.occu(occu_id)
        oi_interaction = torch.sum(vector_user * vector_occu, dim=1)
        
        # NEW: user-time interaction
        rank = inputs[:, 2]
        vector_user_temp = self.user_temp(user_id)
        vector_temp = self.temp(rank)
        ut_interaction = torch.sum(vector_user_temp * vector_temp, dim=1)
        
        prediction = ui_interaction + oi_interaction + ut_interaction + biases
        return prediction

    def loss(self, prediction, target):
        # MSE error between target = R_ui and prediction = p_u * q_i
        loss_mse = F.mse_loss(prediction, target.squeeze())
        log = {"mse": loss_mse}
        self.log("loss", log)
        return loss_mse, log
    
    def reg(self):
        # Add new regularization to the biases
        reg_bias_user =  l2_regularize(self.bias_user.weight) * self.c_bias
        reg_bias_item = l2_regularize(self.bias_item.weight) * self.c_bias
        reg_user =  l2_regularize(self.user.weight) * self.c_vector
        reg_item = l2_regularize(self.item.weight) * self.c_vector
        reg_occu = l2_regularize(self.occu.weight) * self.c_ovector
        
        # New: total variation regularization
        reg_ut = l2_regularize(self.user_temp.weight) * self.c_ut
        reg_tv = total_variation(self.temp.weight) * self.c_temp
        
        log = {"reg_user": reg_user, "reg_item": reg_item,
               "reg_bias_user": reg_bias_user, "reg_bias_item": reg_bias_item,
               "reg_occu": reg_occu, "reg_ut": reg_ut, "reg_tv": reg_tv
              }

        total = (reg_user + reg_item + reg_bias_user + reg_bias_item + reg_occu +
                 reg_ut + reg_tv)
        return total, log

In [9]:
from pytorch_lightning.loggers.wandb import WandbLogger

k = 6
kt = 2
c_bias = 1e-3
c_vector = 1e-5
c_ovector = 1e-8
c_ut = 1e-5
c_temp = 1e-5
model = MF(n_user, n_item, n_occu, n_rank,
           k=k, kt=kt, c_bias=c_bias, c_vector=c_vector,
           c_ovector=c_ovector, c_ut=c_ut, c_temp=c_temp,
           batch_size=1024)

# add a logger
logger = WandbLogger(name="04_mf", project="simple_mf")

trainer = pl.Trainer(max_epochs=100, logger=logger,
                     progress_bar_refresh_rate=1) 

  "There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse"
  f"Setting `Trainer(progress_bar_refresh_rate={progress_bar_refresh_rate})` is deprecated in v1.5 and"
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [10]:
trainer.fit(model, train, val)


  | Name      | Type      | Params
----------------------------------------
0 | user      | Embedding | 36.2 K
1 | item      | Embedding | 23.7 K
2 | bias_user | Embedding | 6.0 K 
3 | bias_item | Embedding | 4.0 K 
4 | occu      | Embedding | 126   
5 | user_temp | Embedding | 12.1 K
6 | temp      | Embedding | 4.6 K 
----------------------------------------
86.8 K    Trainable params
0         Non-trainable params
86.8 K    Total params
0.347     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")
