### Load preprocessed data

If you'd like to play around with this notebook, start by downloading the skipgram dataset from here:

https://www.dropbox.com/s/nd1zxh538o6psal/skipgram_full.npz

WARNING: it's a 1 GB download, so it may take a while!

In [1]:
# !wget  -q https://www.dropbox.com/s/nd1zxh538o6psal/skipgram_full.npz

In [1]:
import numpy as np

# Open the archive once and read all three entries from it, closing the file
# handle when done (the archive is large, so avoid reopening it per entry).
# NOTE(security): allow_pickle=True lets NumPy unpickle object arrays, which
# can execute arbitrary code -- only load this file from a source you trust.
with np.load("skipgram_full.npz", allow_pickle=True) as archive:
    codes = archive['coded']
    code2token = archive['c2t'].tolist()
    token2code = archive['t2c'].tolist()

# Remove self-pairs, i.e. rows whose first and second token codes are equal
codes = codes[codes[:, 0] != codes[:, 1]]

# First column is the first token code
# second column is the 2nd token code
# third column is the skip gram count
# fourth is PMI * 1e6
codes

array([[  13835,    3257,    4605,  592814],
       [  12071,    3257,      16,  491071],
       [   4136,    3257,       2, -621270],
       ...,
       [  12293,    1390,       1, 1092727],
       [   5103,    1390,       1, 2368132],
       [   6789,    1390,       1,  427689]], dtype=int32)

In [2]:
import numpy as np
from sklearn.model_selection import train_test_split
from torch import from_numpy
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
from torch.utils.data import BatchSampler
from torch.utils.data import SequentialSampler

# Inputs: the two token-code columns, as int64 (astype already returns a new
# array, so no separate copy is needed).
train_x = codes[:, :2].astype(np.int64)
# Targets: the fourth column stores PMI * 1e6 as an integer; undo the scaling.
train_y = codes[:, 3].astype(np.float32) / 1e6

# Fix the seed so the train/validation split is the same on every
# Restart & Run All.
X_train, X_val, Y_train, Y_val = train_test_split(train_x, train_y,
                                                  random_state=42)

def dataloader(*arrs, batch_size=8192):
    """Wrap same-length tensors in a DataLoader that yields in-order batches.

    Args:
        *arrs: tensors sharing their first dimension; each yielded batch
            holds the matching slice of every tensor.
        batch_size: maximum number of rows per batch. The last batch is kept
            even when it is smaller (``drop_last=False``).

    Returns:
        A ``DataLoader`` over a ``TensorDataset`` of ``arrs``; rows are
        visited sequentially, without shuffling.
    """
    tensor_ds = TensorDataset(*arrs)
    n_rows = len(arrs[0])
    # Sequential indices 0..n_rows-1, grouped into consecutive chunks.
    in_order = SequentialSampler(range(n_rows))
    chunks = BatchSampler(in_order, batch_size=batch_size, drop_last=False)
    return DataLoader(tensor_ds, batch_sampler=chunks, shuffle=False)
 
# Convert each (inputs, targets) pair of NumPy arrays to tensors and wrap it
# in a sequential batch loader.
train = dataloader(*map(from_numpy, (X_train, Y_train)))
val = dataloader(*map(from_numpy, (X_val, Y_val)))

In [8]:
# argsort is ascending, so the last ten indices are the rows with the
# largest target values.
top_codes = np.argsort(train_y)[-10:]
# Translate both token codes of each of those rows back to their tokens.
[[code2token[first], code2token[second]] for first, second in codes[top_codes, :2]]

[['norris', 'roundhouse'],
 ['palpatine', 'skywalker'],
 ['palpatine', 'sith'],
 ['roundhouse', 'norris'],
 ['lankan', 'sri'],
 ['palpatine', 'anakin'],
 ['skywalker', 'palpatine'],
 ['anakin', 'palpatine'],
 ['blahblah', 'blah'],
 ['blah', 'blahblah']]

In [9]:
# Both sizes are derived from the same quantity: the largest token code in
# either input column, plus one (codes are used as zero-based row indices).
n_user = train_x[:, :2].max() + 1
n_item = n_user
n_user

14003

In [10]:
from abstract_model import AbstractModel

In [11]:
import torch
from torch import nn
import torch.nn.functional as F
import pytorch_lightning as pl

from pytorch_lightning.loggers import TensorBoardLogger


def l2_regularize(array):
    """Return the sum of the squared entries of ``array`` as a 0-d tensor."""
    return array.pow(2.0).sum()


class MF(AbstractModel):
    """Matrix factorization with a global bias and per-row bias terms.

    For an input row ``(u, i)`` the prediction is::

        dot(user[u], item[i]) + bias + bias_user[u] + bias_item[i]

    Args:
        n_user: number of rows in the ``user`` embedding tables.
        n_item: number of rows in the ``item`` embedding tables.
        k: dimensionality of the embedding vectors.
        c_vector: multiplier applied to the L2 penalty on the embedding
            vectors in ``reg``.
        c_bias: multiplier applied to the L2 penalty on the per-row biases
            in ``reg``.
        batch_size: stored on the instance; not read anywhere in this class.

    NOTE(review): how ``loss`` and ``reg`` are combined during training is
    decided by ``AbstractModel``, which is not shown here.
    """

    def __init__(self, n_user, n_item, k=18, c_vector=1.0, c_bias=1.0, batch_size=128):
        super().__init__()
        # These are simple hyperparameters
        self.k = k
        self.n_user = n_user
        self.n_item = n_item
        self.c_vector = c_vector
        self.c_bias = c_bias
        self.batch_size = batch_size
        # presumably inherited from the Lightning base class via AbstractModel
        self.save_hyperparameters()

        # These are learned and fit by PyTorch
        self.user = nn.Embedding(n_user, k)
        self.item = nn.Embedding(n_item, k)

        # Bias terms: one scalar per user row, one per item row, one global
        self.bias_user = nn.Embedding(n_user, 1)
        self.bias_item = nn.Embedding(n_item, 1)
        self.bias = nn.Parameter(torch.ones(1))

    def forward(self, inputs):
        """Predict one value per row of ``inputs``.

        Args:
            inputs: 2-D integer tensor; column 0 indexes the user tables and
                column 1 indexes the item tables.

        Returns:
            1-D tensor with one prediction per input row.
        """
        # This is the most important function in this script
        # These are the user indices, and correspond to "u" variable
        user_id = inputs[:, 0]
        # Item indices, correspond to the "i" variable
        item_id = inputs[:, 1]
        # vector user = p_u
        vector_user = self.user(user_id)
        # vector item = q_i
        vector_item = self.item(item_id)
        # this is a dot product & a user-item interaction: p_u * q_i
        ui_interaction = torch.sum(vector_user * vector_item, dim=1)

        # Pull out biases. Squeeze only the trailing size-1 embedding axis so
        # the batch axis survives even when the batch holds a single row.
        bias_user = self.bias_user(user_id).squeeze(-1)
        bias_item = self.bias_item(item_id).squeeze(-1)
        biases = (self.bias + bias_user + bias_item)

        # Add bias prediction to the interaction prediction
        prediction = ui_interaction + biases
        return prediction

    def loss(self, prediction, target):
        """Mean squared error between ``prediction`` and ``target``.

        Returns:
            Tuple ``(loss, log)`` where ``log`` is ``{"mse": loss}``.
        """
        # Flatten the target instead of a bare squeeze(): squeeze() would turn
        # a one-row batch into a 0-d tensor that no longer matches the 1-D
        # prediction.
        loss_mse = F.mse_loss(prediction, target.reshape(-1))
        return loss_mse, {"mse": loss_mse}

    def reg(self):
        """L2 penalties on the embedding vectors and per-row biases.

        Returns:
            Tuple ``(total, log)``: the sum of the four weighted penalties and
            a dict holding each of them by name. The global ``bias`` is not
            penalised.
        """
        # L2 regularization over the per-row biases
        reg_bias_user = l2_regularize(self.bias_user.weight) * self.c_bias
        reg_bias_item = l2_regularize(self.bias_item.weight) * self.c_bias
        # L2 regularization over the user (P) and item (Q) matrices
        reg_user = l2_regularize(self.user.weight) * self.c_vector
        reg_item = l2_regularize(self.item.weight) * self.c_vector
        # Add up the four regularization terms (the MSE loss is not included here)
        log = {"reg_user": reg_user, "reg_item": reg_item,
               "reg_bias_user": reg_bias_user, "reg_bias_item": reg_bias_item}
        total = reg_user + reg_item + reg_bias_user + reg_bias_item
        return total, log

In [12]:
from pytorch_lightning.loggers.wandb import WandbLogger

# Hyperparameters: embedding width, and the L2 weights for the biases and
# for the embedding vectors.
k, c_bias, c_vector = 128, 1e-3, 1e-5

model = MF(
    n_user,
    n_item,
    k=k,
    c_vector=c_vector,
    c_bias=c_bias,
    batch_size=1024,
)

# Log this run to Weights & Biases
logger = WandbLogger(project="simple_mf", name="05_mf")

trainer = pl.Trainer(
    logger=logger,
    max_epochs=100,
    progress_bar_refresh_rate=1,
    early_stop_callback=True,
)

GPU available: True, used: False
TPU available: False, using: 0 TPU cores


#### Run model

In [None]:
# Train `model` with the `train` loader; `val` is passed alongside it
# (presumably used as the validation loader -- confirm against the Trainer API
# of the installed pytorch_lightning version).
trainer.fit(model, train, val)

Failed to query for notebook name, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable
[34m[1mwandb[0m: Currently logged in as: [33msf-moody[0m (use `wandb login --relogin` to force relogin)



  | Name      | Type      | Params
----------------------------------------
0 | user      | Embedding | 1 M   
1 | item      | Embedding | 1 M   
2 | bias_user | Embedding | 14 K  
3 | bias_item | Embedding | 14 K  


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…





HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

In [None]:
# Persist the model's parameters (its state_dict) to the file
# "model_05_word2vec" in the current working directory.
torch.save(model.state_dict(), "model_05_word2vec")

### Introspect the model

In [None]:
# Pull the learned `user` embedding table out as a NumPy array. detach() drops
# autograd tracking and cpu() moves the data to host memory first, so this
# also works when the model was trained on a GPU.
vectors_raw = model.user.weight.detach().cpu().numpy()
# Scale every row to unit L2 norm, so a dot product between two rows is their
# cosine similarity.
vectors = vectors_raw / np.sqrt((vectors_raw**2.0).sum(axis=1)[:, None])

In [None]:
# Sanity check: squared L2 norm of the first normalised row.
np.sum(vectors[0] ** 2.0)

In [None]:
def find_closest(token, n=10):
    """Print the ``n`` tokens whose vectors are most similar to ``token``'s.

    Similarity is the dot product with each row of the module-level
    ``vectors`` array. The query token itself is excluded from the output.

    Args:
        token: key to look up in ``token2code``; a missing key raises
            whatever that lookup raises.
        n: number of neighbours to print.
    """
    query_code = token2code[token]
    similarity = vectors @ vectors[query_code]
    ranked = np.argsort(similarity)[::-1]
    # Drop the query explicitly rather than assuming it ranks first, then keep
    # exactly n neighbours (slicing [1:n] printed only n - 1).
    neighbours = ranked[ranked != query_code][:n]
    for neighbour in neighbours:
        print(code2token[neighbour], similarity[neighbour])

In [None]:
# Nearest neighbours of 'dude'. The commented lines below appear to be output
# kept from an earlier run; a freshly trained model may print different values.
find_closest('dude')
# bro 0.6443894
# chick 0.6427469
# guy 0.6156572
# cool 0.5742106
# chill 0.5504999
# wanna 0.5483899
# hey 0.53496593
# mad 0.5258949

In [None]:
# Nearest neighbours of 'lol'. The commented lines below appear to be output
# kept from an earlier run; a freshly trained model may print different values.
find_closest('lol')
# wtf 0.6517912
# lmao 0.5773032
# omg 0.55501175
# haha 0.5454682
# abbreviation 0.5257285

In [None]:
# Nearest neighbours of 'hipster'. The commented lines below appear to be
# output kept from an earlier run; a freshly trained model may differ.
find_closest('hipster')
# hipsters 0.8625888
# indie 0.67652184
# ironic 0.63480437
# vintage 0.63287544
# trend 0.58198345
# thrift 0.58075386
# pretentious 0.5771992
# conformist 0.56134546
# subculture 0.5545582

In [None]:
# Nearest neighbours of 'pope'. The commented lines below appear to be output
# kept from an earlier run; a freshly trained model may print different values.
find_closest('pope')
# orthodox 0.65916073
# protestant 0.6568552
# salvation 0.6357822
# christianity 0.62910753
# scripture 0.6278157
# bible 0.6104638
# catholic 0.60814005
# messiah 0.5917543
# christ 0.5841886

In [None]:
# Nearest neighbours of 'selfie'. The commented lines below appear to be
# output kept from an earlier run; a freshly trained model may differ.
find_closest('selfie')
# selfies 0.6768813
# instagram 0.58078086
# photo 0.5547765
# pic 0.5447346
# snapchat 0.54272944
# upload 0.52603865
# photographer 0.5154379
# caption 0.49573278
# tweet 0.47855204

### Subtract and add word vectors

In [None]:
def add_subtract(center, minus, plus, n=10):
    """Print the tokens closest to ``center - minus + plus``.

    The query vector is built from rows of the module-level ``vectors`` array
    and compared to every row by dot product. The three input tokens are
    never printed.

    Args:
        center: token whose vector is the starting point.
        minus: token whose vector is subtracted.
        plus: token whose vector is added.
        n: maximum number of tokens to print.
    """
    query = (vectors[token2code[center]]
             - vectors[token2code[minus]]
             + vectors[token2code[plus]])
    similarity = vectors @ query
    ranked = np.argsort(similarity)[::-1]
    # Filter the input tokens by name instead of blindly skipping the top two
    # results, which could discard genuine answers and print fewer than n.
    excluded = {center, minus, plus}
    printed = 0
    for code in ranked:
        if printed >= n:
            break
        tok = code2token[code]
        if tok in excluded:
            continue
        print(tok)
        printed += 1

In [None]:
# Query: 'burrito' - 'mexican' + 'italian'. The commented lines below appear
# to be output kept from an earlier run; a freshly trained model may differ.
add_subtract('burrito', 'mexican', 'italian')
# hamburger
# spaghetti
# cheeseburger
# steak
# patty