<a href="https://colab.research.google.com/github/alyson-mei/ml_stuff/blob/beta/word2vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup


In [1]:
!pip install --upgrade nltk bokeh umap-learn

Collecting umap-learn
  Downloading umap_learn-0.5.7-py3-none-any.whl.metadata (21 kB)
Collecting pynndescent>=0.5 (from umap-learn)
  Downloading pynndescent-0.5.13-py3-none-any.whl.metadata (6.8 kB)
Downloading umap_learn-0.5.7-py3-none-any.whl (88 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.8/88.8 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pynndescent-0.5.13-py3-none-any.whl (56 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.9/56.9 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pynndescent, umap-learn
Successfully installed pynndescent-0.5.13 umap-learn-0.5.7


In [2]:
import itertools
import random
import string
from collections import Counter
from itertools import chain

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import umap
from IPython.display import clear_output
from matplotlib import pyplot as plt
from nltk.tokenize import WordPunctTokenizer
from torch.optim.lr_scheduler import ReduceLROnPlateau, StepLR
from tqdm.auto import tqdm as tqdma



In [28]:
MIN_COUNT = 5
WINDOW_RADIUS = 5

BATCH_SIZE = 2048
SIZE = 10
NUM_NEGATIVES = 1
N_EPOCHS = 50

DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [4]:
def softmax(x, T = 0.5):
    exp = np.exp((x - max(x))/T)
    return x/np.sum(x)

In [5]:
# download the data:
!wget https://www.dropbox.com/s/obaitrix9jyu84r/quora.txt?dl=1 -O ./quora.txt -nc
# alternative download link: https://yadi.sk/i/BPQrUu1NaTduEw

--2025-01-04 10:03:45--  https://www.dropbox.com/s/obaitrix9jyu84r/quora.txt?dl=1
Resolving www.dropbox.com (www.dropbox.com)... 162.125.2.18, 2620:100:6017:18::a27d:212
Connecting to www.dropbox.com (www.dropbox.com)|162.125.2.18|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://www.dropbox.com/scl/fi/p0t2dw6oqs6oxpd6zz534/quora.txt?rlkey=bjupppwua4zmd4elz8octecy9&dl=1 [following]
--2025-01-04 10:03:45--  https://www.dropbox.com/scl/fi/p0t2dw6oqs6oxpd6zz534/quora.txt?rlkey=bjupppwua4zmd4elz8octecy9&dl=1
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uc7efdf7583a08418b62e30727c8.dl.dropboxusercontent.com/cd/0/inline/ChiF_Y3l5pMOM5HyfMf2x4v-MTLIzi7df0BKv7sdjgtl_d_fVN6i1-27VrB0L_v3bGaT-VaDKXjRaFzBRK58UWh4O1VhR3_GCrv3ouvz-emqV6tZJPl8gtKNh0oDNnIxj8U/file?dl=1# [following]
--2025-01-04 10:03:46--  https://uc7efdf7583a08418b62e30727c8.dl.dropboxusercontent.com/cd/0/inline/ChiF_Y3l5pM

In [6]:
data = list(open("./quora.txt", encoding="utf-8"))
data[50]

"What TV shows or books help you read people's body language?\n"

## Tokenization

In [7]:
tokenizer = WordPunctTokenizer()

print(tokenizer.tokenize(data[50]))

['What', 'TV', 'shows', 'or', 'books', 'help', 'you', 'read', 'people', "'", 's', 'body', 'language', '?']


In [10]:
data_tok = [
    tokenizer.tokenize(
        line.translate(str.maketrans("", "", string.punctuation)).lower()
    )
    for line in data
]
data_tok = [x for x in data_tok if len(x) >= 3]

In [11]:
vocabulary_with_counter = Counter(chain.from_iterable(data_tok))

word_count_dict = dict()
for word, counter in vocabulary_with_counter.items():
    if counter >= MIN_COUNT:
        word_count_dict[word] = counter

vocabulary = set(word_count_dict.keys())
del vocabulary_with_counter

NUM_WORDS = len(vocabulary)

In [12]:
word_to_index = {word: index for index, word in enumerate(vocabulary)}
index_to_word = {index: word for word, index in word_to_index.items()}

In [13]:
data = {i: {} for i in range(NUM_WORDS)}

for text in tqdma(data_tok):
    for i, central_word in enumerate(text):
        context_indices = range(max(0, i -  WINDOW_RADIUS), min(i + WINDOW_RADIUS, len(text)))
        for j in context_indices:
            if j == i:
                continue
            context_word = text[j]
            if central_word in vocabulary and context_word in vocabulary and central_word != context_word:
                central_idx = word_to_index[central_word]
                context_idx = word_to_index[context_word]
                if context_idx not in data[central_idx]:
                    data[central_idx][context_idx] = 1
                else:
                    data[central_idx][context_idx] += 1


  0%|          | 0/537174 [00:00<?, ?it/s]

## Subsampling

Probability of dropping a word based on its frequency
 $f(w)$:

$$
P_\text{drop}(w)= 1 - \textrm{min}\left(1, \sqrt{\frac{t}{f(w)}}\right)
$$

In [14]:
def subsample_frequent_words(word_count_dict, threshold=1e-5):
    total_words = sum(word_count_dict.values())

    total_words = sum(word_count_dict.values())
    keep_prob_dict = {
        word: np.sqrt(min(1., threshold / ((word_count_dict[word] / total_words)))) for word in word_count_dict.keys()
    }

    return keep_prob_dict

In [15]:
keep_prob_dict = subsample_frequent_words(word_count_dict)
assert keep_prob_dict.keys() == word_count_dict.keys()

In [16]:
keep_prob_array = np.array(
    [keep_prob_dict[index_to_word[idx]] for idx in range(len(word_to_index))]
)

keep_prob_array /= np.sum(keep_prob_array)

data_val = []
data_id = []

for i in tqdma(range(NUM_WORDS)):
    keys = np.array(list(data[i].keys()))
    values = np.array(list(data[i].values())) * keep_prob_array[keys]
    argsort = np.argsort(np.array(values))[-SIZE:]
    if len(values) >= SIZE:
        new_values = softmax(np.array(values)[argsort])
        data_val.append(new_values)
        data_id.append(np.array(keys)[argsort])
    else:
        new_values = np.concatenate((softmax(np.array(values)[argsort]), np.zeros(SIZE - len(values))))
        new_keys = np.concatenate((keys, np.random.randint(low = 0, high = NUM_WORDS, size = SIZE - len(values))))
        data_val.append(new_values)
        data_id.append(new_keys)

data_val = torch.tensor(data_val).type(torch.float32)
data_id = torch.tensor(data_id)
keep_prob_array = torch.tensor(keep_prob_array)

  0%|          | 0/28546 [00:00<?, ?it/s]

  data_val = torch.tensor(data_val).type(torch.float32)


## Negative sampling
Probability of using a word as a negative sample:

$$
P_n(w) = \frac{f(w)^{3/4}}{Z}
$$


In [17]:
def get_negative_sampling_prob(word_count_dict):
    total_words = sum(word_count_dict.values())

    negative_sampling_prob_dict = {
        word: (word_count_dict[word]/total_words)**(3/4) for word in word_count_dict.keys()
    }
    Z = sum(negative_sampling_prob_dict.values())
    negative_sampling_prob_dict = {
        word: negative_sampling_prob_dict[word]/Z for word in negative_sampling_prob_dict.keys()
    }

    return negative_sampling_prob_dict

In [18]:
negative_sampling_prob_dict = get_negative_sampling_prob(word_count_dict)
assert negative_sampling_prob_dict.keys() == negative_sampling_prob_dict.keys()
assert np.allclose(sum(negative_sampling_prob_dict.values()), 1)

negative_sampling_prob_array = np.array(
    [
        negative_sampling_prob_dict[index_to_word[idx]]
        for idx in range(len(word_to_index))
    ]
)

## Some tests

In [19]:
word_idx = word_to_index['python']

for idx in data_id[word_idx]:
    print(index_to_word[idx.item()])

programming
php
scraping
r
learning
script
django
java
ruby
learn


In [20]:
center_id = torch.arange(4)
context_id = data_id[center_id]

center_embeddings = nn.Embedding(NUM_WORDS, 8)
context_embeddings = nn.Embedding(NUM_WORDS, 8)

center_embeds = center_embeddings(center_id)
context_embeds = context_embeddings(context_id)
weights = data_val[center_id].type(torch.float32)

cos_sim = F.cosine_similarity(center_embeds.unsqueeze(1).expand(4, SIZE, 8), context_embeds, dim = 2)
score = torch.bmm(cos_sim.unsqueeze(1), weights.unsqueeze(2)).squeeze()
score

tensor([ 0.2547,  0.0056,  0.0667, -0.1229], grad_fn=<SqueezeBackward0>)

## SkipGramModel

### Model and train

In [21]:
class SkipGramModel(nn.Module):
    def __init__(self, vocab_size = NUM_WORDS, embedding_dim = 64):
        super().__init__()
        self.center_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.context_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.cos_sim = None
        self.weights = None

    def forward(self, center_id, context_id): # center words from batch choice, context words from data_id
        center_embeds = self.center_embeddings(center_id)
        context_embeds = self.context_embeddings(context_id)
        self.weights = data_val[center_id].type(torch.float32)

        self.cos_sim = F.cosine_similarity(center_embeds.unsqueeze(1).expand(context_embeds.shape), context_embeds, dim = 2)
        score = torch.bmm(self.cos_sim.unsqueeze(1), self.weights.unsqueeze(2)).squeeze()

        return score

In [22]:
batch_center = torch.arange(BATCH_SIZE)
batch_context = data_id[batch_center]
model = SkipGramModel(NUM_WORDS, 16)
model(batch_center, batch_context)

tensor([-0.0666,  0.0075,  0.1176,  ...,  0.1354, -0.0638, -0.1076],
       grad_fn=<SqueezeBackward0>)

In [23]:
def get_word_vector(word, embedding_matrix, word_to_index=word_to_index):
    return embedding_matrix[word_to_index[word]]

In [24]:
def find_nearest(word, embedding_matrix, word_to_index=word_to_index, k=10):
    word_vector = get_word_vector(word, embedding_matrix)[None, :]
    dists = F.cosine_similarity(embedding_matrix, word_vector)
    index_sorted = torch.argsort(dists)
    top_k = index_sorted[-k:]
    return [(index_to_word[x], dists[x].item()) for x in top_k.numpy()]

In [45]:
def train_skipgram(
    model,
    data_id,
    data_val,
    word_to_index,
    optimizer,
    lr_scheduler,
    n_epochs = N_EPOCHS,
    device = DEVICE,
    batch_size = BATCH_SIZE,
    num_negatives = NUM_NEGATIVES
):

    pos_labels = torch.ones(batch_size).to(device)
    loss_history = []

    for epoch in tqdma(range(n_epochs)):
        for i in range(0, NUM_WORDS - batch_size, batch_size):
            center_id = torch.multinomial(keep_prob_array, batch_size)
            context_id = data_id[center_id].to(device)

            optimizer.zero_grad()
            scores = model(center_id, context_id)

            loss = criterion(scores, pos_labels)

            loss.backward()
            optimizer.step()

            loss_history.append(loss.item())
            lr_scheduler.step(loss_history[-1])

        print(
            f"Epoch {epoch}, Loss: {np.mean(loss_history[- NUM_WORDS // batch_size:])}, learning rate: {lr_scheduler._last_lr}"
        )
        model.eval()

        with torch.no_grad():
            embedding_matrix_context = model.context_embeddings.weight
            print(f'Nearest to "python": ')
            print(find_nearest("python", embedding_matrix_context, k = 10))

            similarity_1 = F.cosine_similarity(
                get_word_vector("iphone", embedding_matrix_context)[None, :],
                get_word_vector("apple", embedding_matrix_context)[None, :],
            )
            similarity_2 = F.cosine_similarity(
                get_word_vector("iphone", embedding_matrix_context)[None, :],
                get_word_vector("dell", embedding_matrix_context)[None, :],
            )
            print(f'"iphone" is closer to "apple" than to "dell": {(similarity_1 > similarity_2).item()}')
            similarity_1 = F.cosine_similarity(
                get_word_vector("windows", embedding_matrix_context)[None, :],
                get_word_vector("laptop", embedding_matrix_context)[None, :],
            )
            similarity_2 = F.cosine_similarity(
                get_word_vector("windows", embedding_matrix_context)[None, :],
                get_word_vector("macbook", embedding_matrix_context)[None, :],
            )
            print(f'"windows" is closer to "laptop" than to "macbook": {(similarity_1 > similarity_2).item()}')

            center_id = torch.tensor(word_to_index['python']).reshape(1).to(device)
            context_id = data_id[center_id].to(device)

            print(f'Score for "python" {model(center_id, context_id)}')
            print('Cosine similarities for "python" with top-SIZE words: ')
            print(f'{model.cos_sim} \n')

        model.train()

In [53]:
vocab_size = NUM_WORDS
embedding_dim = 64

model = SkipGramModel(vocab_size, embedding_dim).to(DEVICE)
optimizer = optim.Adam(model.parameters(), lr=5e-2)
lr_scheduler = ReduceLROnPlateau(optimizer, factor=0.9)
criterion = nn.BCEWithLogitsLoss()

In [54]:
batch_size = BATCH_SIZE
train_skipgram(
    model,
    data_id,
    data_val,
    word_to_index,
    optimizer,
    lr_scheduler,
    batch_size = batch_size
)

  0%|          | 0/50 [00:00<?, ?it/s]

Epoch 0, Loss: 0.6784409605539762, learning rate: [0.05]
Nearest to "python": 
[('luffy', 0.43478330969810486), ('normative', 0.4398637115955353), ('confuse', 0.44137993454933167), ('physiologically', 0.4416586458683014), ('hdmi', 0.4420633018016815), ('burmese', 0.44318726658821106), ('stalling', 0.4594016671180725), ('tom', 0.4599189758300781), ('ic', 0.4747101366519928), ('python', 1.0)]
"iphone" is closer to "apple" than to "dell": False
"windows" is closer to "laptop" than to "macbook": False
Score for "python" 0.07131633907556534
Cosine similarities for "python" with top-SIZE words: 
tensor([[ 0.0847,  0.0909,  0.1772,  0.0247,  0.1615,  0.1081,  0.1297,  0.0667,
          0.0090, -0.0535]]) 

Epoch 1, Loss: 0.6420469624655587, learning rate: [0.05]
Nearest to "python": 
[('describing', 0.42232441902160645), ('hdmi', 0.4239504337310791), ('3k', 0.4270451068878174), ('confuse', 0.4344664514064789), ('physiologically', 0.4360431134700775), ('tom', 0.4394197165966034), ('normative',

### Eval

In [73]:
embedding_matrix_context = model.context_embeddings.weight

In [74]:
similarity_1 = F.cosine_similarity(
    get_word_vector("iphone", embedding_matrix_context)[None, :],
    get_word_vector("apple", embedding_matrix_context)[None, :],
)
similarity_2 = F.cosine_similarity(
    get_word_vector("iphone", embedding_matrix_context)[None, :],
    get_word_vector("dell", embedding_matrix_context)[None, :],
)
assert similarity_1 > similarity_2

In [75]:
similarity_1 = F.cosine_similarity(
    get_word_vector("windows", embedding_matrix_context)[None, :],
    get_word_vector("laptop", embedding_matrix_context)[None, :],
)
similarity_2 = F.cosine_similarity(
    get_word_vector("windows", embedding_matrix_context)[None, :],
    get_word_vector("macbook", embedding_matrix_context)[None, :],
)
assert similarity_1 > similarity_2

In [76]:
def find_nearest(word, embedding_matrix, word_to_index=word_to_index, k=10):
    word_vector = get_word_vector(word, embedding_matrix)[None, :]
    dists = F.cosine_similarity(embedding_matrix, word_vector)
    index_sorted = torch.argsort(dists)
    top_k = index_sorted[-k:]
    return [(index_to_word[x], dists[x].item()) for x in top_k.numpy()]

In [85]:
find_nearest("python", embedding_matrix_context, k = 10)

[('database', 0.9997155666351318),
 ('developers', 0.9997214078903198),
 ('eclipse', 0.9997521638870239),
 ('queries', 0.9997551441192627),
 ('frameworks', 0.9997574090957642),
 ('developer', 0.9997648000717163),
 ('server', 0.9997810125350952),
 ('aws', 0.9997812509536743),
 ('java', 0.9998776316642761),
 ('python', 1.0000001192092896)]

In [86]:
find_nearest("iphone", embedding_matrix_context, k = 10)

[('android', 0.9993677139282227),
 ('ios', 0.9993709325790405),
 ('upgrade', 0.9993985891342163),
 ('facetime', 0.9994360208511353),
 ('update', 0.9994602799415588),
 ('unlocked', 0.9994757175445557),
 ('plus', 0.9994993209838867),
 ('rooting', 0.9995481371879578),
 ('6s', 0.9997088313102722),
 ('iphone', 1.0)]

In [87]:
find_nearest("learn", embedding_matrix_context, k = 10)

[('editing', 0.999640941619873),
 ('scratch', 0.9996446371078491),
 ('data', 0.999656617641449),
 ('tips', 0.9996582269668579),
 ('amazon', 0.9996594190597534),
 ('unix', 0.9996654987335205),
 ('languages', 0.9996708035469055),
 ('programmers', 0.9996929168701172),
 ('programming', 0.9996968507766724),
 ('learn', 1.0000001192092896)]

In [92]:
top_k = 5000
_top_words = sorted([x for x in word_count_dict.items()], key=lambda x: x[1])[
    -top_k - 100 : -100
]  # ignoring 100 most frequent words
top_words = [x[0] for x in _top_words]
del _top_words

word_embeddings = torch.cat(
    [embedding_matrix_context[word_to_index[x]][None, :] for x in top_words], dim=0
).detach().numpy()

In [93]:
import bokeh.models as bm
import bokeh.plotting as pl
from bokeh.io import output_notebook

output_notebook()


def draw_vectors(
    x,
    y,
    radius=10,
    alpha=0.25,
    color="blue",
    width=600,
    height=400,
    show=True,
    **kwargs,
):
    """draws an interactive plot for data points with auxilirary info on hover"""
    if isinstance(color, str):
        color = [color] * len(x)
    data_source = bm.ColumnDataSource({"x": x, "y": y, "color": color, **kwargs})

    fig = pl.figure(active_scroll="wheel_zoom", width=width, height=height)
    fig.scatter("x", "y", size=radius, color="color", alpha=alpha, source=data_source)

    fig.add_tools(bm.HoverTool(tooltips=[(key, "@" + key) for key in kwargs.keys()]))
    if show:
        pl.show(fig)
    return fig

In [94]:
embedding = umap.UMAP(n_neighbors=5).fit_transform(word_embeddings)



In [95]:
draw_vectors(embedding[:, 0], embedding[:, 1], token=top_words)