In [8]:
import time

import numpy as np
import matplotlib.pyplot as plt

from abc import abstractmethod, ABC
from typing import Optional, Literal

from numpy.typing import NDArray

np.random.seed(1234)

%matplotlib inline
%config InlineBackend.figure_format = 'retina'
plt.rcParams["figure.figsize"] = [16, 9]

### Handy utility functions

In [9]:
def zeros(*dims: int) -> NDArray[np.float32]:
    return np.zeros(shape=tuple(dims), dtype=np.float32)


def ones(*dims: int) -> NDArray[np.float32]:
    return np.ones(shape=tuple(dims), dtype=np.float32)


def rand(*dims: int) -> NDArray[np.float32]:
    return np.random.rand(*dims).astype(np.float32)


def randn(*dims: int) -> NDArray[np.float32]:
    return np.random.randn(*dims).astype(np.float32)


def sigmoid(x: NDArray) -> NDArray:
    return 1.0 / (1.0 + np.exp(-x))

# Word2Vec

In [10]:
import pickle
import gzip

with gzip.open("text8.dat.gz", "rb") as f:
    train_dict, train_set, train_tokens = pickle.load(f)

train_set = np.random.permutation(train_set)

In [11]:
from collections import namedtuple

Config = namedtuple(
    "Config",
    (
        "dict_size",
        "vect_size",
        "neg_samples",
        "updates",
        "learning_rate",
        "learning_rate_decay",
        "decay_period",
        "log_period",
    ),
)

config = Config(
    dict_size=len(train_dict),
    vect_size=100,
    neg_samples=10,
    updates=5_000_000,
    learning_rate=0.1,
    learning_rate_decay=0.995,
    decay_period=10_000,
    log_period=10_000,
)

## Negative Sampling

In order to train distributed word representations, first:
 - calculate gradient of the cost function with respect to the `word_vector` and store in `word_grad`.
 - calculate gradient of the cost function with respect to the `context_vect` and store in `context_grad`.
 - calculate gradient of the cost function with respect to the sampled `negative_vects` and store in neg_context_grad.

In [12]:
def neg_sample(
    config: Config,
    train_set: NDArray[np.int64],
    train_tokens: NDArray[np.int64],
) -> tuple[NDArray, NDArray, float]:
    lr = config.learning_rate
    loss = 0.0
    V_p = randn(config.dict_size, config.vect_size)
    V_o = randn(config.dict_size, config.vect_size)

    for i in range(config.updates):
        w_idx: int = train_set[i % len(train_set), 0]
        c_idx: int = train_set[i % len(train_set), 1]
        n_idx: NDArray[np.int64] = np.random.randint(0, len(train_tokens), config.neg_samples)
        n_idx: NDArray[np.int64] = train_tokens[n_idx]

        w = V_p[w_idx, :]  # word vector, shape `(dim,)`
        c = V_o[c_idx, :]  # context vector, shape `(dim,)`
        n = V_o[n_idx, :]  # sampled noise vectors, shape `(k, dim)`

        # Cost and gradient calculation
        # -----------------------------
        σ_p = sigmoid(+w @ c.T)  # shape `(1,)`
        σ_n = sigmoid(-w @ n.T)  # shape `(k,)`
        loss -= np.log(σ_p) + np.sum(np.log(σ_n))

        if (i + 1) % config.log_period == 0:
            print(f"Update {i+1}\tLoss: {loss / config.log_period:>2.2f}")
            final_loss = loss / config.log_period
            loss = 0.0

        grad_w = (σ_p - 1.0) * c + (1.0 - σ_n) @ n
        grad_c = (σ_p - 1.0) * w
        grad_n = (1.0 - σ_n).reshape(-1, 1) * w

        V_p[w_idx, :] -= lr * grad_w
        V_o[c_idx, :] -= lr * grad_c
        V_o[n_idx, :] -= lr * grad_n

        if i % config.decay_period == 0:
            lr = lr * config.learning_rate_decay

    return V_p, V_o, final_loss

In [13]:
V_p, V_o, loss = neg_sample(config, train_set, train_tokens)

Update 10000	Loss: 36.18
Update 20000	Loss: 28.55
Update 30000	Loss: 23.31
Update 40000	Loss: 19.56
Update 50000	Loss: 17.02
Update 60000	Loss: 15.54
Update 70000	Loss: 14.13
Update 80000	Loss: 13.11
Update 90000	Loss: 12.12
Update 100000	Loss: 11.62
Update 110000	Loss: 10.89
Update 120000	Loss: 10.51
Update 130000	Loss: 10.12
Update 140000	Loss: 9.83
Update 150000	Loss: 9.45
Update 160000	Loss: 9.25
Update 170000	Loss: 8.86
Update 180000	Loss: 8.51
Update 190000	Loss: 8.34
Update 200000	Loss: 8.18
Update 210000	Loss: 7.97
Update 220000	Loss: 7.83
Update 230000	Loss: 7.63
Update 240000	Loss: 7.61
Update 250000	Loss: 7.48
Update 260000	Loss: 7.24
Update 270000	Loss: 7.16
Update 280000	Loss: 7.00
Update 290000	Loss: 7.03
Update 300000	Loss: 6.94
Update 310000	Loss: 6.82
Update 320000	Loss: 6.76
Update 330000	Loss: 6.63
Update 340000	Loss: 6.68
Update 350000	Loss: 6.47
Update 360000	Loss: 6.40
Update 370000	Loss: 6.33
Update 380000	Loss: 6.36
Update 390000	Loss: 6.27
Update 400000	Loss: 6

### Word similarity

In [16]:
def lookup_word_idx(word, word_dict):
    try:
        return np.argwhere(np.array(word_dict) == word)[0][0]
    except:
        raise Exception("No such word in dict: {}".format(word))


def similar_words(embeddings, word, word_dict, hits):
    word_idx = lookup_word_idx(word, word_dict)
    similarity_scores = embeddings @ embeddings[word_idx]
    similar_word_idxs = np.argsort(-similarity_scores)
    return [word_dict[i] for i in similar_word_idxs[:hits]]

In [17]:
print("\n\nTraining cost: {0:>2.2f}\n\n".format(loss))

Vp_norm = V_p / np.linalg.norm(V_p, axis=1).reshape(-1, 1)
for w in ["zero", "computer", "cars", "home", "album"]:
    similar = similar_words(Vp_norm, w, train_dict, 5)
    print("Words similar to {}: {}".format(w, ", ".join(similar)))



Training cost: 3.43


Words similar to zero: zero, four, three, five, seven
Words similar to computer: computer, software, applications, video, systems
Words similar to cars: cars, vehicles, built, players, electric
Words similar to home: home, city, park, building, players
Words similar to album: album, released, band, appeared, song
