### Load preprocessed data

In [1]:
import numpy as np
fh = np.load('../data/dataset.npz')
# We have a bunch of feature columns and last column is the y-target
train_x = fh['train_x'].astype(np.int64)
train_y = fh['train_y']

test_x = fh['test_x'].astype(np.int64)
test_y = fh['test_y']

n_user = int(fh['n_user'])
n_item = int(fh['n_item'])

### Create labels for each movie

In [2]:

import pandas as pd
cols = ['item_id', 'title', 'tags']
df = pd.read_csv("../data/ml-1m/movies.dat", delimiter="::", engine="python", names=cols)

label_item = [str(iid) for iid in range(df.item_id.max() + 1)]
for item, title in zip(df.item_id, df.title):
    label_item[item] = title
    
label_user = [str(uid) for uid in range(n_user)]
label_item

['0',
 'Toy Story (1995)',
 'Jumanji (1995)',
 'Grumpier Old Men (1995)',
 'Waiting to Exhale (1995)',
 'Father of the Bride Part II (1995)',
 'Heat (1995)',
 'Sabrina (1995)',
 'Tom and Huck (1995)',
 'Sudden Death (1995)',
 'GoldenEye (1995)',
 'American President, The (1995)',
 'Dracula: Dead and Loving It (1995)',
 'Balto (1995)',
 'Nixon (1995)',
 'Cutthroat Island (1995)',
 'Casino (1995)',
 'Sense and Sensibility (1995)',
 'Four Rooms (1995)',
 'Ace Ventura: When Nature Calls (1995)',
 'Money Train (1995)',
 'Get Shorty (1995)',
 'Copycat (1995)',
 'Assassins (1995)',
 'Powder (1995)',
 'Leaving Las Vegas (1995)',
 'Othello (1995)',
 'Now and Then (1995)',
 'Persuasion (1995)',
 'City of Lost Children, The (1995)',
 'Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)',
 'Dangerous Minds (1995)',
 'Twelve Monkeys (1995)',
 'Wings of Courage (1995)',
 'Babe (1995)',
 'Carrington (1995)',
 'Dead Man Walking (1995)',
 'Across the Sea of Time (1995)',
 'It Takes Two (1995)',
 'Clue

### Define the MF Model

In [3]:
import torch
from torch import nn
import torch.nn.functional as F

def l2_regularize(array):
    loss = torch.sum(array ** 2.0)
    return loss

In [4]:
class MF(nn.Module):
    itr = 0
    
    def __init__(self, n_user, n_item, k=18, c_vector=1.0, c_bias=1.0, writer=None):
        super(MF, self).__init__()
        self.writer = writer
        self.k = k
        self.n_user = n_user
        self.n_item = n_item
        self.c_bias = c_bias
        self.c_vector = c_vector
        
        self.user = nn.Embedding(n_user, k)
        self.item = nn.Embedding(n_item, k)
        
        # We've added new terms here:
        self.bias_user = nn.Embedding(n_user, 1)
        self.bias_item = nn.Embedding(n_item, 1)
        self.bias = nn.Parameter(torch.ones(1))

    
    def __call__(self, train_x):
        user_id = train_x[:, 0]
        item_id = train_x[:, 1]
        vector_user = self.user(user_id)
        vector_item = self.item(item_id)
        
        # Pull out biases
        bias_user = self.bias_user(user_id).squeeze()
        bias_item = self.bias_item(item_id).squeeze()
        biases = (self.bias + bias_user + bias_item)
        
        ui_interaction = torch.sum(vector_user * vector_item, dim=1)
        
        # Add bias prediction to the interaction prediction
        prediction = ui_interaction + biases
        return prediction
    
    def loss(self, prediction, target):
        loss_mse = F.mse_loss(prediction, target.squeeze())
        
        # Add new regularization to the biases
        prior_bias_user =  l2_regularize(self.bias_user.weight) * self.c_bias
        prior_bias_user = l2_regularize(self.bias_item.weight) * self.c_bias
        
        prior_user =  l2_regularize(self.user.weight) * self.c_vector
        prior_item = l2_regularize(self.item.weight) * self.c_vector
        total = loss_mse + prior_user + prior_item
        for name, var in locals().items():
            if type(var) is torch.Tensor and var.nelement() == 1 and self.writer is not None:
                self.writer.add_scalar(name, var, self.itr)
        return total

### Train model

In [5]:
from ignite.engine import Events, create_supervised_trainer, create_supervised_evaluator
from ignite.metrics import Loss
from tensorboardX import SummaryWriter
from ignite.metrics import MeanSquaredError

from loader import Loader
from datetime import datetime

In [6]:
# Hyperparameters
lr = 1e-2
k = 10
# New parameter for regularizing bias
c_bias = 1e-6
c_vector = 1e-6
batchsize = 1024
log_dir = 'runs/simple_mf_02_bias_' + str(datetime.now()).replace(' ', '_')
print(log_dir)

runs/simple_mf_02_bias_2018-08-22_21:51:38.919038


In [6]:
writer = SummaryWriter(log_dir=log_dir)

model = MF(n_user, n_item, writer=writer, k=k, c_bias=c_bias, c_vector=c_vector)
optimizer = torch.optim.Adam(model.parameters())
trainer = create_supervised_trainer(model, optimizer, model.loss)
metrics = {'accuracy': MeanSquaredError()}
evaluat = create_supervised_evaluator(model, metrics=metrics)
train_loader = Loader(train_x, train_y, batchsize=batchsize)
test_loader = Loader(test_x, test_y, batchsize=batchsize)


def log_training_loss(engine, log_interval=400):
    epoch = engine.state.epoch
    itr = engine.state.iteration
    fmt = "Epoch[{}] Iteration[{}/{}] Loss: {:.2f}"
    msg = fmt.format(epoch, itr, len(train_loader), engine.state.output)
    model.itr = itr
    if itr % log_interval == 0:
        print(msg)

trainer.add_event_handler(event_name=Events.ITERATION_COMPLETED, handler=log_training_loss)

def log_validation_results(engine):
    evaluat.run(test_loader)
    metrics = evaluat.state.metrics
    avg_accuracy = metrics['accuracy']
    print("Epoch[{}] Validation MSE: {:.2f} "
          .format(engine.state.epoch, avg_accuracy))
    writer.add_scalar("validation/avg_accuracy", avg_accuracy, engine.state.epoch)
    

trainer.add_event_handler(event_name=Events.EPOCH_COMPLETED, handler=log_validation_results)


model

MF(
  (user): Embedding(6041, 10)
  (item): Embedding(3953, 10)
  (bias_user): Embedding(6041, 1)
  (bias_item): Embedding(3953, 1)
)

#### Run model

In [7]:
trainer.run(train_loader, max_epochs=150)

Epoch[1] Iteration[400/879] Loss: 14.02
Epoch[1] Iteration[800/879] Loss: 11.28
Epoch[1] Validation MSE: 10.76 
Epoch[2] Iteration[1200/879] Loss: 8.87
Epoch[2] Iteration[1600/879] Loss: 7.11
Epoch[2] Validation MSE: 6.60 
Epoch[3] Iteration[2000/879] Loss: 5.74
Epoch[3] Iteration[2400/879] Loss: 4.46
Epoch[3] Validation MSE: 4.52 
Epoch[4] Iteration[2800/879] Loss: 4.20
Epoch[4] Iteration[3200/879] Loss: 3.89
Epoch[4] Validation MSE: 3.28 
Epoch[5] Iteration[3600/879] Loss: 2.96
Epoch[5] Iteration[4000/879] Loss: 2.72
Epoch[5] Validation MSE: 2.48 
Epoch[6] Iteration[4400/879] Loss: 2.25
Epoch[6] Iteration[4800/879] Loss: 2.19
Epoch[6] Iteration[5200/879] Loss: 1.86
Epoch[6] Validation MSE: 1.95 
Epoch[7] Iteration[5600/879] Loss: 1.89
Epoch[7] Iteration[6000/879] Loss: 1.70
Epoch[7] Validation MSE: 1.61 
Epoch[8] Iteration[6400/879] Loss: 1.38
Epoch[8] Iteration[6800/879] Loss: 1.37
Epoch[8] Validation MSE: 1.38 
Epoch[9] Iteration[7200/879] Loss: 1.23
Epoch[9] Iteration[7600/879] Lo

Epoch[67] Iteration[58800/879] Loss: 0.55
Epoch[67] Validation MSE: 0.78 
Epoch[68] Iteration[59200/879] Loss: 0.61
Epoch[68] Iteration[59600/879] Loss: 0.58
Epoch[68] Validation MSE: 0.77 
Epoch[69] Iteration[60000/879] Loss: 0.56
Epoch[69] Iteration[60400/879] Loss: 0.63
Epoch[69] Validation MSE: 0.77 
Epoch[70] Iteration[60800/879] Loss: 0.61
Epoch[70] Iteration[61200/879] Loss: 0.56
Epoch[70] Validation MSE: 0.77 
Epoch[71] Iteration[61600/879] Loss: 0.54
Epoch[71] Iteration[62000/879] Loss: 0.58
Epoch[71] Iteration[62400/879] Loss: 0.63
Epoch[71] Validation MSE: 0.77 
Epoch[72] Iteration[62800/879] Loss: 0.58
Epoch[72] Iteration[63200/879] Loss: 0.64
Epoch[72] Validation MSE: 0.77 
Epoch[73] Iteration[63600/879] Loss: 0.61
Epoch[73] Iteration[64000/879] Loss: 0.58
Epoch[73] Validation MSE: 0.77 
Epoch[74] Iteration[64400/879] Loss: 0.61
Epoch[74] Iteration[64800/879] Loss: 0.58
Epoch[74] Validation MSE: 0.77 
Epoch[75] Iteration[65200/879] Loss: 0.67
Epoch[75] Iteration[65600/879]

Epoch[132] Iteration[116000/879] Loss: 0.61
Epoch[132] Validation MSE: 0.77 
Epoch[133] Iteration[116400/879] Loss: 0.57
Epoch[133] Iteration[116800/879] Loss: 0.61
Epoch[133] Validation MSE: 0.77 
Epoch[134] Iteration[117200/879] Loss: 0.60
Epoch[134] Iteration[117600/879] Loss: 0.62
Epoch[134] Validation MSE: 0.77 
Epoch[135] Iteration[118000/879] Loss: 0.61
Epoch[135] Iteration[118400/879] Loss: 0.64
Epoch[135] Validation MSE: 0.77 
Epoch[136] Iteration[118800/879] Loss: 0.64
Epoch[136] Iteration[119200/879] Loss: 0.57
Epoch[136] Validation MSE: 0.77 
Epoch[137] Iteration[119600/879] Loss: 0.58
Epoch[137] Iteration[120000/879] Loss: 0.59
Epoch[137] Iteration[120400/879] Loss: 0.61
Epoch[137] Validation MSE: 0.77 
Epoch[138] Iteration[120800/879] Loss: 0.62
Epoch[138] Iteration[121200/879] Loss: 0.61
Epoch[138] Validation MSE: 0.77 
Epoch[139] Iteration[121600/879] Loss: 0.63
Epoch[139] Iteration[122000/879] Loss: 0.63
Epoch[139] Validation MSE: 0.77 
Epoch[140] Iteration[122400/879]

<ignite.engine.engine.State at 0x111822780>

In [8]:
torch.save(model.state_dict(), "../data/simple_mf_02_bias")

In [11]:
import pandas as pd
cols = ['item_id', 'title', 'tags']
df = pd.read_csv("../data/ml-1m/movies.dat", delimiter="::", engine="python", names=cols)

label_item = [str(iid) for iid in range(df.item_id.max() + 1)]
for item, title in zip(df.item_id, df.title):
    label_item[item] = title
    
label_user = [str(uid) for uid in range(n_user)]

In [12]:
writer.add_embedding(model.user.weight, metadata=label_user)
writer.add_embedding(model.item.weight, metadata=label_item)

