### Load preprocessed data

In [22]:
!pip install pytorch-ignite tensorboardX
!wget https://raw.githubusercontent.com/cemoody/simple_mf/master/notebooks/loader.py

--2019-11-18 17:14:01--  https://raw.githubusercontent.com/cemoody/simple_mf/master/notebooks/loader.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1014 [text/plain]
Saving to: ‘loader.py’


2019-11-18 17:14:02 (210 MB/s) - ‘loader.py’ saved [1014/1014]



In [1]:
!wget https://www.dropbox.com/s/nd1zxh538o6psal/skipgram_full.npz

--2019-11-18 16:56:06--  https://www.dropbox.com/s/nd1zxh538o6psal/skipgram_full.npz
Resolving www.dropbox.com (www.dropbox.com)... 162.125.65.1, 2620:100:6021:1::a27d:4101
Connecting to www.dropbox.com (www.dropbox.com)|162.125.65.1|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /s/raw/nd1zxh538o6psal/skipgram_full.npz [following]
--2019-11-18 16:56:07--  https://www.dropbox.com/s/raw/nd1zxh538o6psal/skipgram_full.npz
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uc979bf5be5f3ed518428ba5623d.dl.dropboxusercontent.com/cd/0/inline/AsmCMf93WirnSmlkzuHxQU9OZaut_5lNSJmuaLVWNkpIv7Jvhj0xvCcbpGI0ToG7EY76LroIZpy9Ro62uhgwD7tWglNGA3WSHZjlL_KzGf-FMQ/file# [following]
--2019-11-18 16:56:07--  https://uc979bf5be5f3ed518428ba5623d.dl.dropboxusercontent.com/cd/0/inline/AsmCMf93WirnSmlkzuHxQU9OZaut_5lNSJmuaLVWNkpIv7Jvhj0xvCcbpGI0ToG7EY76LroIZpy9Ro62uhgwD7tWglNGA3WSHZjlL_KzGf-FMQ/file
R

If you'd like to play around with this notebook, start by downloading the skipgram dataset from here:

https://www.dropbox.com/s/nd1zxh538o6psal/skipgram_full.npz

WARNING: it's a 1 GB download, so it may take a while!

In [0]:
import numpy as np

# Open the archive once and close it when done, instead of re-opening the
# file for every array.
# NOTE(security): allow_pickle=True is needed for the 'c2t' and 't2c'
# entries; unpickling can execute arbitrary code, so only load archives
# from a source you trust.
with np.load("skipgram_full.npz", allow_pickle=True) as archive:
    codes = archive['coded']
    code2token = archive['c2t'].tolist()
    token2code = archive['t2c'].tolist()

# Drop rows whose first and second token codes are identical
# (a token paired with itself).
codes = codes[codes[:, 0] != codes[:, 1]]

In [12]:
# Each row of `codes` describes one skipgram (a pair of tokens):
#   column 0 - code of the first token
#   column 1 - code of the second token
#   column 2 - skipgram count
#   column 3 - PMI * 1e6 (scaled so it can be stored as an integer)
codes

array([[  13835,    3257,    4605,  592814],
       [  12071,    3257,      16,  491071],
       [   4136,    3257,       2, -621270],
       ...,
       [  12293,    1390,       1, 1092727],
       [   5103,    1390,       1, 2368132],
       [   6789,    1390,       1,  427689]], dtype=int32)

In [14]:
# Inputs: the two token-code columns, copied and widened to int64.
train_x = codes[:, :2].copy().astype(np.int64)
# Targets: the fourth column stores PMI * 1e6, so divide the float32 cast
# by 1e6 to recover the PMI value.
train_y = np.true_divide(codes[:, 3].astype(np.float32), 1e6)
train_y

array([ 0.592814,  0.491071, -0.62127 , ...,  1.092727,  2.368132,
        0.427689], dtype=float32)

In [15]:
train_y.max()

12.09618

In [16]:
# Row positions of the ten largest targets (argsort is ascending, so they
# sit at the end).
top_codes = np.argsort(train_y)[-10:]
# Translate both token codes of every selected row back into tokens.
[[code2token[first], code2token[second]] for first, second in codes[top_codes, :2]]

[['norris', 'roundhouse'],
 ['palpatine', 'skywalker'],
 ['palpatine', 'sith'],
 ['roundhouse', 'norris'],
 ['lankan', 'sri'],
 ['palpatine', 'anakin'],
 ['skywalker', 'palpatine'],
 ['anakin', 'palpatine'],
 ['blahblah', 'blah'],
 ['blah', 'blahblah']]

In [17]:
# Both embedding tables are sized from the same quantity: the largest token
# code found in either input column, plus one.
n_user = n_item = np.max(train_x[:, :2]) + 1
n_user

14003

### Define the MF Model

In [0]:
import torch
from torch import nn
import torch.nn.functional as F

def l2_regularize(array):
    """Return the sum of the squared entries of ``array`` as a scalar tensor."""
    return (array ** 2.0).sum()


class MF(nn.Module):
    """Matrix-factorisation model with user, item and global bias terms.

    For every (user, item) index pair the prediction is
    ``dot(user_vector, item_vector) + bias_user + bias_item + bias``.

    Args:
        n_user: number of rows in the user embedding tables.
        n_item: number of rows in the item embedding tables.
        k: dimensionality of the user and item vectors.
        c_vector: weight applied to the L2 penalty of the vector embeddings.
        c_bias: weight applied to the L2 penalty of the bias embeddings.
        writer: optional object exposing ``add_scalar(name, value, step)``;
            when given, the loss components are logged on each ``loss`` call.
    """

    # Step index passed to ``writer.add_scalar``. It is never advanced by the
    # class itself; callers are expected to assign ``model.itr``.
    itr = 0

    def __init__(self, n_user, n_item, k=18, c_vector=1.0, c_bias=1.0, writer=None):
        super(MF, self).__init__()
        self.writer = writer
        self.k = k
        self.n_user = n_user
        self.n_item = n_item
        self.c_bias = c_bias
        self.c_vector = c_vector
        self.user = nn.Embedding(n_user, k)
        self.item = nn.Embedding(n_item, k)
        # Re-initialise the vectors with a standard deviation that shrinks
        # with the size of the table.
        self.user.weight.data.normal_(0, 1.0 / n_user)
        self.item.weight.data.normal_(0, 1.0 / n_item)

        # Bias terms: one scalar per user, one per item, plus a global offset.
        self.bias_user = nn.Embedding(n_user, 1)
        self.bias_item = nn.Embedding(n_item, 1)
        self.bias = nn.Parameter(torch.ones(1))

    def forward(self, train_x):
        """Predict one value per row of ``train_x``.

        Implemented as ``forward`` (rather than overriding ``__call__``) so
        that ``model(x)`` goes through ``nn.Module.__call__`` and registered
        hooks are honoured.

        Args:
            train_x: integer tensor whose column 0 holds user indices and
                column 1 holds item indices.

        Returns:
            1-D tensor with one prediction per input row.
        """
        user_id = train_x[:, 0]
        item_id = train_x[:, 1]
        vector_user = self.user(user_id)
        vector_item = self.item(item_id)
        # Squeeze only the trailing size-1 embedding axis; a bare squeeze()
        # would also drop the batch axis when the batch holds a single row.
        bias_user = self.bias_user(user_id).squeeze(-1)
        bias_item = self.bias_item(item_id).squeeze(-1)
        biases = self.bias + bias_user + bias_item
        ui_interaction = torch.sum(vector_user * vector_item, dim=1)
        return ui_interaction + biases

    def loss(self, prediction, target):
        """Return the mean-squared error between ``prediction`` and ``target``.

        The weighted L2 penalties are computed so they can be logged, but
        they are NOT added to the returned value: the objective is the plain
        MSE, exactly as in the original code where the penalty terms were
        commented out.

        Args:
            prediction: output of ``forward``.
            target: tensor with the same number of elements as ``prediction``.

        Returns:
            Scalar tensor holding the MSE.
        """
        # Align the target with the prediction's shape explicitly, so that a
        # (batch, 1) target works and a batch of one is not collapsed to a
        # 0-d tensor that would rely on broadcasting inside mse_loss.
        loss_mse = F.mse_loss(prediction, target.reshape(prediction.shape))
        prior_bias_user = l2_regularize(self.bias_user.weight) * self.c_bias
        prior_bias_item = l2_regularize(self.bias_item.weight) * self.c_bias
        prior_user = l2_regularize(self.user.weight) * self.c_vector
        prior_item = l2_regularize(self.item.weight) * self.c_vector
        total = loss_mse  # + prior_user + prior_item
        if self.writer is not None:
            # Name the logged scalars explicitly instead of scanning locals(),
            # which would also pick up `prediction`/`target` whenever they
            # happen to contain a single element.
            scalars = {
                'loss_mse': loss_mse,
                'prior_bias_user': prior_bias_user,
                'prior_bias_item': prior_bias_item,
                'prior_user': prior_user,
                'prior_item': prior_item,
                'total': total,
            }
            for name, value in scalars.items():
                self.writer.add_scalar(name, value, self.itr)
        return total

### Train model

In [0]:
from ignite.engine import Events, create_supervised_trainer, create_supervised_evaluator
from ignite.metrics import Loss
from tensorboardX import SummaryWriter
from ignite.metrics import MeanSquaredError

from loader import Loader
from datetime import datetime

#### Hyperparameters

In [24]:
# Optimiser step size and embedding dimensionality.
lr = 1e-3
k = 128
# Both L2 penalty weights use the same value.
c_bias = c_vector = 1e-9
# One log directory per run, suffixed with the current timestamp
# (date and time joined by '_' instead of a space).
log_dir = 'runs/simple_mf_05_word2vec_' + datetime.now().isoformat(sep='_')
print(log_dir)

runs/simple_mf_05_word2vec_2019-11-18_17:14:14.092277


In [25]:
# TensorBoard writer; handed to the model so its loss method can log scalars.
writer = SummaryWriter(log_dir=log_dir)
model = MF(
    n_user,
    n_item,
    k=k,
    c_bias=c_bias,
    c_vector=c_vector,
    writer=writer,
)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
# The model supplies its own loss function to the ignite trainer.
trainer = create_supervised_trainer(model=model, optimizer=optimizer, loss_fn=model.loss)
# NOTE(review): the key says 'accuracy' but the metric is a mean squared
# error, and `metrics` is not referenced anywhere else in the visible notebook.
metrics = {'accuracy': MeanSquaredError()}
train_loader = Loader(train_x, train_y, batchsize=1024)


def log_training_loss(engine, log_interval=400):
    epoch = engine.state.epoch
    itr = engine.state.iteration
    fmt = "Epoch[{}] Iteration[{}/{}] Loss: {:.2f}"
    msg = fmt.format(epoch, itr, len(train_loader), engine.state.output)
    model.itr = itr
    if itr % log_interval == 0:
        print(msg)

# Call log_training_loss after every completed training iteration.
trainer.add_event_handler(Events.ITERATION_COMPLETED, log_training_loss)

# Display the module structure as the cell output.
model

MF(
  (user): Embedding(14003, 128)
  (item): Embedding(14003, 128)
  (bias_user): Embedding(14003, 1)
  (bias_item): Embedding(14003, 1)
)

In [0]:
# model.load_state_dict(torch.load("model_05_word2vec"))

#### Run model

In [0]:
# Train for 25 passes over the loader; progress lines come from the
# iteration handler registered on the trainer.
trainer.run(data=train_loader, max_epochs=25)

Epoch[1] Iteration[400/60208] Loss: 2.97
Epoch[1] Iteration[800/60208] Loss: 1.66
Epoch[1] Iteration[1200/60208] Loss: 1.00
Epoch[1] Iteration[1600/60208] Loss: 0.81
Epoch[1] Iteration[2000/60208] Loss: 0.64
Epoch[1] Iteration[2400/60208] Loss: 0.63
Epoch[1] Iteration[2800/60208] Loss: 0.58
Epoch[1] Iteration[3200/60208] Loss: 0.54
Epoch[1] Iteration[3600/60208] Loss: 0.56
Epoch[1] Iteration[4000/60208] Loss: 0.49
Epoch[1] Iteration[4400/60208] Loss: 0.51
Epoch[1] Iteration[4800/60208] Loss: 0.56
Epoch[1] Iteration[5200/60208] Loss: 0.55
Epoch[1] Iteration[5600/60208] Loss: 0.53
Epoch[1] Iteration[6000/60208] Loss: 0.52
Epoch[1] Iteration[6400/60208] Loss: 0.47
Epoch[1] Iteration[6800/60208] Loss: 0.53
Epoch[1] Iteration[7200/60208] Loss: 0.48
Epoch[1] Iteration[7600/60208] Loss: 0.53
Epoch[1] Iteration[8000/60208] Loss: 0.53


In [0]:
# Persist only the parameters (state dict), not the whole module object.
torch.save(obj=model.state_dict(), f="model_05_word2vec")

#### Save the embeddings

In [0]:
# One label per token code, each prefixed with '|'.
label_token = ['|' + code2token[code] for code in range(n_user)]
# Export the user-side embedding matrix; note that the labels built above
# are not attached by this call.
writer.add_embedding(mat=model.user.weight)
# writer.add_embedding(model.item.weight, metadata=label_token)

### Introspect the model

Let's see which words Urban Dictionary thinks are similar.

In [0]:
# detach() drops autograd tracking (the supported replacement for `.data`)
# and cpu() makes the NumPy conversion work even if the model was moved to
# a GPU; on a CPU model this returns the same values as before.
vectors_raw = model.user.weight.detach().cpu().numpy()
# Divide every row by its L2 norm so that dot products between rows of
# `vectors` are cosine similarities.
vectors = vectors_raw / np.sqrt((vectors_raw ** 2.0).sum(axis=1, keepdims=True))

In [0]:
(vectors[0]**2.0).sum()

In [0]:
def find_closest(token, n=10):
    """Print the ``n`` tokens whose vectors score highest against ``token``.

    The score is the dot product between the token's row of the global
    ``vectors`` array and every other row (a cosine similarity when the rows
    are unit-normalised). Each printed line holds a token and its score,
    best match first.

    Args:
        token: key looked up in the global ``token2code`` mapping.
        n: number of neighbours to print.

    Raises:
        KeyError: if ``token`` is not present in ``token2code``.
    """
    query_code = token2code[token]
    similarity = vectors @ vectors[query_code]
    ranked = np.argsort(similarity)[::-1]
    # Remove the query by its code rather than assuming it is ranked first,
    # then keep exactly n results (the previous slice [1:n] printed n - 1).
    neighbours = ranked[ranked != query_code][:n]
    for code in neighbours:
        print(code2token[code], similarity[code])

In [0]:
find_closest('dude')

In [0]:
find_closest('netflix')

In [0]:
find_closest('lol')

In [0]:
find_closest('hipster')

In [0]:
find_closest('crunk')

In [0]:
find_closest('bromance')

In [0]:
find_closest('barbie')

In [0]:
find_closest('relationship')

In [0]:
find_closest('pope')

In [0]:
find_closest('trump')

In [0]:
find_closest('selfie')

### Subtract and add word vectors

In [0]:
def add_subtract(center, minus, plus, n=10):
    """Print the ``n`` tokens best matching ``center - minus + plus``.

    The three tokens are looked up in the global ``token2code`` mapping,
    their rows of the global ``vectors`` array are combined, and every row
    of ``vectors`` is scored by its dot product with the combined vector.
    Tokens are printed best match first.

    Args:
        center: token whose vector is the starting point.
        minus: token whose vector is subtracted.
        plus: token whose vector is added.
        n: number of tokens to print.

    Raises:
        KeyError: if any of the three tokens is not present in ``token2code``.
    """
    input_codes = [token2code[center], token2code[minus], token2code[plus]]
    center_code, minus_code, plus_code = input_codes
    vector = vectors[center_code] - vectors[minus_code] + vectors[plus_code]
    similarity = vectors @ vector
    ranked = np.argsort(similarity)[::-1]
    # Exclude the three input tokens explicitly instead of blindly skipping
    # the two top-ranked rows, then keep exactly n results (the previous
    # slice [2:n] printed n - 2 and could still list an input token).
    answers = ranked[~np.isin(ranked, input_codes)][:n]
    for code in answers:
        print(code2token[code])

In [0]:
add_subtract('burrito', 'mexican', 'italian')

In [0]:
add_subtract('drunk', 'beer', 'weed')