### Load preprocessed data

In [80]:
import numpy as np

# Open the archive once and close it deterministically instead of re-opening
# the same file for every array.
with np.load("../data/skipgram_full.npz") as archive:
    # Skip-gram table; displayed in the next cell.
    codes = archive['coded']
    # Lookup tables between integer codes and tokens, converted from numpy
    # arrays to plain Python objects.
    code2token = archive['c2t'].tolist()
    token2code = archive['t2c'].tolist()

In [52]:
# Each row of `codes` describes one skip-gram pair:
#   column 0 - code of the first token
#   column 1 - code of the second token
#   column 2 - skip-gram count for that pair
codes

array([[  16,  570,   19],
       [4299,  570,   26],
       [8099, 6605,    2],
       ...,
       [6645, 3386,    1],
       [8619, 7315,    1],
       [3250, 5845,    1]], dtype=int32)

In [53]:
# Model inputs: the two token-code columns, as a fresh int64 array.
train_x = np.array(codes[:, :2], dtype=np.int64)
# Regression target: natural log of the skip-gram count, stored as float32.
train_y = np.asarray(np.log(codes[:, 2]), dtype=np.float32)

In [56]:
# Both embedding tables are sized from the largest code found in either input
# column, plus one so that the largest code is a valid index.
n_user = n_item = train_x[:, :2].max() + 1
n_user

10000

### Define the MF Model

In [69]:
import torch
from torch import nn
import torch.nn.functional as F

def l2_regularize(array):
    """Return the sum of the squared entries of ``array`` as a 0-d tensor."""
    squared = array ** 2.0
    return torch.sum(squared)


class MF(nn.Module):
    """Matrix factorization with a global bias, per-row biases and L2 priors.

    The prediction for an input row ``(u, i)`` is::

        bias + bias_user[u] + bias_item[i] + dot(user[u], item[i])

    Args:
        n_user: number of rows in the "user" embedding tables.
        n_item: number of rows in the "item" embedding tables.
        k: dimensionality of the latent vectors.
        c_vector: weight of the L2 penalty on the latent vectors.
        c_bias: weight of the L2 penalty on the per-user / per-item biases.
        writer: optional object exposing ``add_scalar(name, value, step)``;
            when given, every term of the loss is logged on each call to
            ``loss``.
    """

    # Step index used when logging scalars. It is set from outside the class
    # (the training-loop handler assigns ``model.itr``).
    itr = 0

    def __init__(self, n_user, n_item, k=18, c_vector=1.0, c_bias=1.0, writer=None):
        super(MF, self).__init__()
        self.writer = writer
        self.k = k
        self.n_user = n_user
        self.n_item = n_item
        self.c_bias = c_bias
        self.c_vector = c_vector
        self.user = nn.Embedding(n_user, k)
        self.item = nn.Embedding(n_item, k)
        # Initialise the weights of *this* instance; the constructor must not
        # depend on (or modify) any module-level `model` variable.
        self.user.weight.data.normal_(0, 1.0 / n_user)
        self.item.weight.data.normal_(0, 1.0 / n_item)

        # Bias terms: one scalar per user, one per item, and a global offset.
        self.bias_user = nn.Embedding(n_user, 1)
        self.bias_item = nn.Embedding(n_item, 1)
        self.bias = nn.Parameter(torch.ones(1))

    def forward(self, train_x):
        """Predict one value per row of ``train_x``.

        Args:
            train_x: 2-D integer tensor; column 0 indexes the user tables and
                column 1 indexes the item tables.

        Returns:
            1-D tensor with one prediction per input row.
        """
        user_id = train_x[:, 0]
        item_id = train_x[:, 1]
        vector_user = self.user(user_id)
        vector_item = self.item(item_id)
        # Drop only the trailing size-1 embedding axis so that a batch with a
        # single row keeps its batch dimension.
        bias_user = self.bias_user(user_id).squeeze(-1)
        bias_item = self.bias_item(item_id).squeeze(-1)
        biases = self.bias + bias_user + bias_item
        ui_interaction = torch.sum(vector_user * vector_item, dim=1)
        return ui_interaction + biases

    def loss(self, prediction, target):
        """Return the MSE between prediction and target plus all L2 priors.

        Args:
            prediction: output of the model for a batch.
            target: tensor with one value per prediction; it is reshaped to
                the shape of ``prediction`` before the MSE is computed.

        Returns:
            0-d tensor: ``mse + c_vector * (|user|^2 + |item|^2)
            + c_bias * (|bias_user|^2 + |bias_item|^2)``.
        """
        loss_mse = F.mse_loss(prediction, target.reshape(prediction.shape))
        prior_bias_user = l2_regularize(self.bias_user.weight) * self.c_bias
        prior_bias_item = l2_regularize(self.bias_item.weight) * self.c_bias
        prior_user = l2_regularize(self.user.weight) * self.c_vector
        prior_item = l2_regularize(self.item.weight) * self.c_vector
        total = (loss_mse + prior_user + prior_item
                 + prior_bias_user + prior_bias_item)
        if self.writer is not None:
            # Log an explicit set of named terms rather than scanning locals().
            scalars = {
                'loss_mse': loss_mse,
                'prior_bias_user': prior_bias_user,
                'prior_bias_item': prior_bias_item,
                'prior_user': prior_user,
                'prior_item': prior_item,
                'total': total,
            }
            for name, value in scalars.items():
                self.writer.add_scalar(name, value, self.itr)
        return total

### Train model

In [70]:
from ignite.engine import Events, create_supervised_trainer, create_supervised_evaluator
from ignite.metrics import Loss
from tensorboardX import SummaryWriter
from ignite.metrics import MeanSquaredError

from loader import Loader
from datetime import datetime

#### Hyperparameters

In [88]:
# Optimiser step size and latent dimensionality.
lr = 1e-2
k = 128
# Weights of the L2 penalties on the latent vectors and on the bias terms.
c_vector = 1e-6
c_bias = 1e-6
# Timestamped log directory so that every run gets its own location.
log_dir = 'runs/simple_mf_05_word2vec_{}'.format(datetime.now().isoformat(sep='_'))
print(log_dir)

runs/simple_mf_05_word2vec_2018-08-21_14:48:27.068048


In [89]:
# Scalars logged by the model's loss are written under `log_dir`.
writer = SummaryWriter(log_dir=log_dir)
# Only pass keywords that MF.__init__ accepts (k, c_vector, c_bias, writer).
model = MF(n_user, n_item, k=k, c_bias=c_bias,
           c_vector=c_vector, writer=writer)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
# The model's own `loss` method (MSE + L2 priors) is used as the training loss.
trainer = create_supervised_trainer(model, optimizer, model.loss)
# NOTE(review): this dict labels MeanSquaredError as 'accuracy' and is not
# passed to anything in this cell.
metrics = {'accuracy': MeanSquaredError()}
train_loader = Loader(train_x, train_y, batchsize=1024)


def log_training_loss(engine, log_interval=400):
    """Sync the model's step counter and periodically print the batch loss.

    Reads the epoch, iteration and output from ``engine.state``, stores the
    iteration on the global ``model`` as ``model.itr``, and prints a progress
    line whenever the iteration is a multiple of ``log_interval``.
    """
    state = engine.state
    step = state.iteration
    report = "Epoch[{}] Iteration[{}/{}] Loss: {:.2f}".format(
        state.epoch, step, len(train_loader), state.output)
    # The model uses `itr` as the step index for the scalars it logs.
    model.itr = step
    if step % log_interval != 0:
        return
    print(report)

trainer.add_event_handler(event_name=Events.ITERATION_COMPLETED, handler=log_training_loss)

model

MF(
  (user): Embedding(10000, 128)
  (item): Embedding(10000, 128)
  (bias_user): Embedding(10000, 1)
  (bias_item): Embedding(10000, 1)
)

In [90]:
# Resume from the checkpoint written by the `torch.save` cell, when one exists.
try:
    model.load_state_dict(torch.load("model_05_word2vec"))
except FileNotFoundError:
    # First run: no checkpoint yet, so keep the freshly initialised weights.
    print("No checkpoint 'model_05_word2vec' found; training from scratch.")

#### Run model

In [None]:
# Train for a single pass over `train_loader`; the ITERATION_COMPLETED handler
# registered above prints the loss every 400 iterations by default.
trainer.run(train_loader, max_epochs=1)

Epoch[1] Iteration[400/22215] Loss: 0.46
Epoch[1] Iteration[800/22215] Loss: 0.46
Epoch[1] Iteration[1200/22215] Loss: 0.46
Epoch[1] Iteration[1600/22215] Loss: 0.44
Epoch[1] Iteration[2000/22215] Loss: 0.48
Epoch[1] Iteration[2400/22215] Loss: 0.50
Epoch[1] Iteration[2800/22215] Loss: 0.48
Epoch[1] Iteration[3200/22215] Loss: 0.48
Epoch[1] Iteration[3600/22215] Loss: 0.49
Epoch[1] Iteration[4000/22215] Loss: 0.46
Epoch[1] Iteration[4400/22215] Loss: 0.47
Epoch[1] Iteration[4800/22215] Loss: 0.50
Epoch[1] Iteration[5200/22215] Loss: 0.50
Epoch[1] Iteration[5600/22215] Loss: 0.52
Epoch[1] Iteration[6000/22215] Loss: 0.50
Epoch[1] Iteration[6400/22215] Loss: 0.52
Epoch[1] Iteration[6800/22215] Loss: 0.46
Epoch[1] Iteration[7200/22215] Loss: 0.47
Epoch[1] Iteration[7600/22215] Loss: 0.55
Epoch[1] Iteration[8000/22215] Loss: 0.47
Epoch[1] Iteration[8400/22215] Loss: 0.53
Epoch[1] Iteration[8800/22215] Loss: 0.53
Epoch[1] Iteration[9200/22215] Loss: 0.47
Epoch[1] Iteration[9600/22215] Loss:

In [86]:
# Write the model parameters to the same path that the checkpoint-loading cell
# reads from.
torch.save(model.state_dict(), "model_05_word2vec")

#### Save the embeddings

In [94]:
# One metadata label per embedding row: the token for each code, prefixed
# with '|'.
label_token = ['|' + code2token[c] for c in range(n_user)]
# Use a distinct tag per table; with the default tag both calls would be
# written under the same name and collide in the projector.
writer.add_embedding(model.user.weight, metadata=label_token, tag='user')
writer.add_embedding(model.item.weight, metadata=label_token, tag='item')



### Introspect the model

Evaluate which words Urban Dictionary thinks are similar.

In [None]:
# Copy the learned item embedding matrix out of the model as a numpy array
# (`nn.Embedding` exposes its table as the `weight` tensor).
vectors_raw = model.item.weight.detach().cpu().numpy()
# Scale every row to unit L2 length so that a dot product between two rows is
# their cosine similarity.
vectors = vectors_raw / np.linalg.norm(vectors_raw, axis=1, keepdims=True)

In [3]:
def find_closest(token, n=10):
    """Print the ``n`` tokens whose vectors score highest against ``token``.

    The score is the dot product between the token's row in the global
    ``vectors`` matrix and every row of that matrix. Results are printed one
    per line, best match first; the query token itself is included.

    Args:
        token: key looked up in the global ``token2code`` mapping.
        n: maximum number of tokens to print.
    """
    code = token2code[token]
    vector = vectors[code]
    # Score the query against *every* row, not against itself.
    similarity = np.dot(vectors, vector)
    # argsort is ascending, so negate to get the highest scores first.
    closest = np.argsort(-similarity)[:n]
    for code in closest:
        print(code2token[code])

In [None]:
find_closest('yolo')

In [None]:
find_closest('barbie')

In [None]:
find_closest('relationship')

In [None]:
find_closest('pope')

In [None]:
find_closest('trump')

In [None]:
find_closest('blinding')

In [None]:
find_closest('conk')

In [None]:
find_closest('doofer')

In [None]:
find_closest('earwig')

In [None]:
find_closest('fuzz')

In [None]:
find_closest('honk')

In [None]:
find_closest('ivories')

In [None]:
find_closest('paddy')

In [None]:
find_closest('tosh')

### Subtract and add word vectors

In [4]:
def add_subtract(center, minus, plus, n=10):
    """Print the ``n`` tokens scoring highest against ``center - minus + plus``.

    The three tokens are looked up in the global ``token2code`` mapping, their
    rows of the global ``vectors`` matrix are combined, and the result is
    scored by dot product against every row of ``vectors``. Results are
    printed one per line, best match first.

    Args:
        center: token whose vector is the starting point.
        minus: token whose vector is subtracted.
        plus: token whose vector is added.
        n: maximum number of tokens to print.
    """
    vector = (vectors[token2code[center]]
              - vectors[token2code[minus]]
              + vectors[token2code[plus]])
    # Score the combined vector against *every* row, not against itself.
    similarity = np.dot(vectors, vector)
    # argsort is ascending, so negate to get the highest scores first.
    closest = np.argsort(-similarity)[:n]
    for code in closest:
        print(code2token[code])

In [None]:
add_subtract('cop', 'fuzz', 'cookie')

In [None]:
add_subtract('cop', 'fuzz', 'crib')