# Word2vec: training word embeddings on the text8 corpus


In [None]:
# Fetch the zipped text8 corpus into the current working directory
# (-O keeps the remote file name, i.e. text8.zip).
!curl -O http://mattmahoney.net/dc/text8.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 29.8M  100 29.8M    0     0  2016k      0  0:00:15  0:00:15 --:--:-- 2060k


In [None]:
# -n: never overwrite an existing file, so re-running this cell does not
# stop at unzip's interactive "replace text8?" prompt.
!unzip -n text8.zip

Archive:  text8.zip
  inflating: text8                   


In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install -q catalyst==20.10.1

[K     |████████████████████████████████| 475 kB 5.3 MB/s 
[K     |████████████████████████████████| 181 kB 52.8 MB/s 
[K     |████████████████████████████████| 125 kB 46.8 MB/s 
[K     |████████████████████████████████| 63 kB 1.5 MB/s 
[?25h

In [None]:
import re
from collections import Counter
from tqdm.notebook import tqdm
import numpy as np

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

import plotly.express as px
from sklearn.manifold import TSNE

In [None]:
class Corpus:
    """Whitespace-tokenised text corpus with a frequency-filtered vocabulary.

    The file at ``path`` is read line by line; every line becomes one
    "message" (a list of tokens).  The vocabulary keeps at most
    ``voc_max_size`` of the most frequent tokens whose count is strictly
    greater than ``min_word_freq``, plus the special tokens ``"<PAD>"`` and
    ``"<UNK>"``.

    Attributes:
        corpus: list of token lists, one per input line.  Removed (``del``)
            once ``make_positive_dataset`` has run, to release memory.
        vocabulary: set of known words, including ``"<PAD>"``/``"<UNK>"``.
        word_freq: string array of shape ``(n_words, 2)`` holding
            ``(word, count)`` rows for the kept words (special tokens are
            not included).
        idx_to_word / word_to_idx: mutually inverse index mappings.
        W, P: cached centre-word indices and context-index rows.
        positive_pairs: cached ``[word_idx, context_idx]`` pairs.

    NOTE: indices follow the iteration order of ``vocabulary`` (a ``set`` of
    strings), so the word <-> index assignment can differ between Python
    processes unless the hash seed is fixed.  Persist ``word_to_idx``
    together with any trained weights that are meant to be reused later.
    """

    def __init__(
        self, path, voc_max_size: int = 40000, min_word_freq: int = 20, max_corp_size=5e6
    ):
        """Read ``path`` and build the vocabulary.

        Args:
            path: text file to read (decoded as UTF-8).
            voc_max_size: upper bound on the number of most frequent words
                considered for the vocabulary.
            min_word_freq: a word is kept only if it occurs strictly more
                often than this.
            max_corp_size: accepted for backward compatibility; it is not
                used by this class.
        """
        corpus = []
        counts = Counter()
        with open(path, "r", encoding="utf-8") as inp:
            for line in inp:
                tokens = line.split()
                corpus.append(tokens)
                # Count while reading instead of joining every line into one
                # huge string and splitting it a second time.
                counts.update(tokens)
        # Kept as a plain list of lists: lines generally differ in length and
        # np.array() refuses ragged input in current NumPy releases.
        self.corpus = corpus

        kept = [
            (word, freq)
            for word, freq in counts.most_common(voc_max_size)
            if freq > min_word_freq
        ]

        print('Vocabulary size is:' + str(len(kept)))
        self.vocabulary = {word for word, _ in kept}
        self.vocabulary.update(["<PAD>", "<UNK>"])
        # Same (n, 2) string-array layout as before; the explicit empty shape
        # keeps column indexing valid when no word passes the filter.
        self.word_freq = np.array(kept) if kept else np.empty((0, 2), dtype=str)

        # Take one snapshot of the set so both mappings are guaranteed to be
        # exact inverses of each other.
        ordered_words = list(self.vocabulary)
        self.idx_to_word = dict(enumerate(ordered_words))
        self.word_to_idx = {word: idx for idx, word in enumerate(ordered_words)}
        self.W = None
        self.P = None
        self.positive_pairs = None
        # Window size the cached W/P were built with (None until built).
        self._window_size = None

    def make_positive_dataset(self, window_size=2):
        """Build (and cache) centre words and their context windows.

        For every token position the up-to-``window_size`` neighbours on each
        side (within the same line) are collected.  Out-of-vocabulary tokens
        are mapped to ``"<UNK>"`` and rows are right-padded with ``"<PAD>"``
        to a fixed length of ``2 * window_size``.

        The result is computed once; ``self.corpus`` is deleted afterwards,
        so later calls always return the cached data.  Asking for a different
        ``window_size`` than the cached one emits a ``RuntimeWarning``
        because the request cannot be honoured.

        Returns:
            ``(W, P)`` where ``W[i]`` is the index of the i-th centre word and
            ``P[i]`` is the list of its ``2 * window_size`` context indices.
        """
        if self.W is not None:
            cached_window = getattr(self, "_window_size", None)
            if cached_window is not None and cached_window != window_size:
                import warnings

                warnings.warn(
                    "make_positive_dataset was already built with "
                    f"window_size={cached_window}; returning that cached "
                    f"result instead of window_size={window_size}.",
                    RuntimeWarning,
                    stacklevel=2,
                )
            return self.W, self.P

        pad_idx = self.word_to_idx["<PAD>"]
        unk_idx = self.word_to_idx["<UNK>"]
        context_len = 2 * window_size
        W = []
        P = []
        pbar = tqdm(self.corpus)
        pbar.set_description('Creating context dataset')
        # A one-line corpus (such as text8) would make the outer bar useless,
        # so progress is shown over the tokens of that single line instead.
        single_message = len(self.corpus) == 1
        for message in pbar:
            if single_message:
                iter_ = tqdm(enumerate(message), total=len(message))
            else:
                iter_ = enumerate(message)

            for idx, word in iter_:
                start_idx = max(0, idx - window_size)
                end_idx = min(len(message), idx + window_size + 1)
                # Neighbours on both sides, skipping the centre token itself.
                neighbours = [*message[start_idx:idx], *message[idx + 1:end_idx]]
                # word_to_idx has exactly the vocabulary as keys, so a missing
                # key means "out of vocabulary".
                context = [self.word_to_idx.get(w, unk_idx) for w in neighbours]
                context.extend([pad_idx] * (context_len - len(context)))
                W.append(self.word_to_idx.get(word, unk_idx))
                P.append(context)
        self.W = W
        self.P = P
        self._window_size = window_size
        # The raw tokens are no longer needed; free the memory.
        del self.corpus
        return W, P

    def make_positive_pairs(self):
        """Flatten the context dataset into ``[word_idx, context_idx]`` pairs.

        Builds the context dataset with its default window size if that has
        not happened yet.  Padding entries are skipped.  The list is cached
        and returned as-is on subsequent calls.
        """
        if self.positive_pairs is not None:
            return self.positive_pairs
        if self.W is None:
            self.make_positive_dataset()
        pad_idx = self.word_to_idx["<PAD>"]
        pairs = []
        pbar = tqdm(zip(self.W, self.P), total=len(self.W))
        pbar.set_description('Creating positive pairs')
        for w, p in pbar:
            pairs.extend([w, cur_p] for cur_p in p if cur_p != pad_idx)
        self.positive_pairs = pairs
        return pairs


In [None]:
# Read the unzipped text8 file and build the vocabulary with the default
# size/frequency limits (prints the resulting vocabulary size).
corp = Corpus("text8")

Vocabulary size is:30964


In [None]:
# All [word_idx, context_idx] training pairs; the context windows are built
# on first use with the default window_size of make_positive_dataset.
pairs = corp.make_positive_pairs()

In [None]:
# The base class is referenced by its full path: this class reuses the name
# `Dataset`, so after the first run `Dataset` no longer means the torch base
# class and re-executing the cell would subclass the previous user class.
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over ``[word_idx, context_idx]`` pairs.

    Each item is a dict with two scalar tensors: ``"word"`` (first element
    of the pair) and ``"context"`` (second element).
    """

    def __init__(self, pairs):
        """Store ``pairs``, an indexable sequence of two-element items."""
        self.pairs = pairs

    def __getitem__(self, idx):
        pair = self.pairs[idx]
        return {
            "word": torch.tensor(pair[0]),
            "context": torch.tensor(pair[1]),
        }

    def __len__(self):
        return len(self.pairs)

In [None]:
train_ds = Dataset(pairs)
# `pairs` is ordered by corpus position, so without shuffling every batch
# would hold consecutive positions of the text; shuffle for training.
train_dl = DataLoader(train_ds, batch_size=2048, shuffle=True)
loaders = {"train": train_dl}

In [None]:
class word2vec(nn.Module):
    """Embedding lookup followed by a bias-free linear map to vocabulary scores.

    ``encoder`` is an ``nn.Embedding(voc_size, emb_dim)`` table and
    ``decoder`` an ``nn.Linear(emb_dim, voc_size, bias=False)`` layer, so
    ``forward`` yields one score per vocabulary entry for every input index.
    """

    def __init__(self, voc_size, emb_dim):
        super().__init__()
        self.voc_size = voc_size
        self.emb_dim = emb_dim
        self.encoder = nn.Embedding(voc_size, emb_dim)
        self.decoder = nn.Linear(emb_dim, voc_size, bias=False)
        self.init_emb()

    def forward(self, word):
        """Return the decoder output for the embedding(s) of ``word``."""
        embedded = self.encoder(word)
        return self.decoder(embedded)

    def init_emb(self):
        """Re-initialise both weight matrices in place.

        Encoder weights are drawn uniformly from ``[-0.5/emb_dim, 0.5/emb_dim]``;
        decoder weights are drawn from the degenerate range ``[0, 0]``, i.e.
        they all start at zero.
        """
        bound = 0.5 / self.emb_dim
        nn.init.uniform_(self.encoder.weight, -bound, bound)
        nn.init.uniform_(self.decoder.weight, 0, 0)

    def get_embedding(self):
        """Return the encoder module (the ``nn.Embedding``), not its weight tensor."""
        return self.encoder

In [None]:
from catalyst import dl


# One output score per vocabulary entry, 300-dimensional embeddings.
model = word2vec(len(corp.vocabulary), 300)
# The key names match the dict returned by Dataset.__getitem__; presumably
# "word" is fed to the model and "context" is used as the target — confirm
# against the catalyst 20.10 SupervisedRunner documentation.
runner = dl.SupervisedRunner(
    input_key=["word"], input_target_key=["context"]
)

In [None]:
# Train for a single epoch with Adam (default hyper-parameters) and
# cross-entropy between the model's vocabulary scores and the "context" index.
# Logs and checkpoints are written below the relative directory "simple_w2v_1".
runner.train(
    model=model,
    optimizer=torch.optim.Adam(model.parameters()),
    loaders=loaders,
    criterion=nn.CrossEntropyLoss(),
    callbacks = [dl.CriterionCallback(input_key="context")],
    num_epochs=1,
    logdir="simple_w2v_1",
    verbose=True
)


Attention, there is only one dataloader - train



1/1 * Epoch (train): 100% 33214/33214 [1:12:44<00:00,  7.61it/s, loss=6.228]
[2022-04-29 15:27:12,281] 
1/1 * Epoch 1 (_base): lr=0.0010 | momentum=0.9000
1/1 * Epoch 1 (train): loss=6.7656
Top best models:
simple_w2v_1/checkpoints/train.1.pth	6.7656


In [None]:
from catalyst import utils

# Fresh network with the same shape as the trained one; its weights are
# replaced by the checkpoint below. The embedding rows are only meaningful
# together with the word <-> index mapping of the `corp` used for training.
model = word2vec(len(corp.vocabulary), 300)
# Relative path, matching the relative `logdir` passed to runner.train, so the
# cell does not depend on the working directory being /content.
checkpoint = utils.load_checkpoint('simple_w2v_1/checkpoints/best_full.pth')
utils.unpack_checkpoint(
    checkpoint=checkpoint,
    model=model,
)

In [None]:
# get_embedding() returns the nn.Embedding module; its .weight parameter has
# one row per vocabulary index.
embeds = model.get_embedding()
embed_tensor = embeds.weight

In [None]:
# Quick look at the embedding matrix (torch abbreviates the printout).
embed_tensor

Parameter containing:
tensor([[-0.0607, -0.0840, -0.0520,  ..., -0.1182, -0.0951, -0.1704],
        [-0.0161, -0.0255, -0.0198,  ..., -0.1047, -0.0698, -0.0588],
        [-0.0199, -0.0887, -0.0395,  ..., -0.1287, -0.0131, -0.0778],
        ...,
        [ 0.1828, -0.0311,  0.0550,  ..., -0.0963,  0.0135,  0.0738],
        [ 0.0836, -0.1168,  0.2295,  ..., -0.1063, -0.0057,  0.1488],
        [-0.0035, -0.0186,  0.0998,  ..., -0.0816,  0.0021,  0.2033]],
       requires_grad=True)

In [None]:
# Show a small alphabetical sample instead of dumping the whole vocabulary set.
sorted(corp.vocabulary)[:20]

In [None]:
# .cpu() makes the conversion work wherever the parameter lives; detach()
# drops the autograd graph so NumPy can view the data.
numpy_v_emb = embed_tensor.detach().cpu().numpy()

# Project every embedding row to 2-D. A fixed random_state makes the layout
# reproducible across runs (init='random' is otherwise different every time).
resized_v_emb = TSNE(n_components=2, learning_rate='auto',
                     init='random', random_state=42).fit_transform(numpy_v_emb)

In [None]:
# Split the (n_words, 2) t-SNE result into its two coordinate columns.
x_embed = resized_v_emb[:, 0].tolist()
y_embed = resized_v_emb[:, 1].tolist()

In [None]:
from IPython.display import Javascript, display

# Colab-specific: let the output cell grow so the interactive plot fits.
display(Javascript('''google.colab.output.setIframeHeight(0, true, {maxHeight: 15000})'''))
# Row i of the embedding matrix belongs to corp.idx_to_word[i]; take the hover
# labels from that mapping rather than relying on set iteration order.
hover_words = [corp.idx_to_word[i] for i in range(len(x_embed))]
fig = px.scatter(x=x_embed, y=y_embed, hover_name=hover_words)
fig.show()

<IPython.core.display.Javascript object>

In [None]:
import gensim.downloader as api

# Load the 'word2vec-google-news-300' vectors through gensim's downloader
# (presumably fetched on first use — large download; confirm size beforehand).
# NOTE(review): this rebinds `model`, which until here referred to the
# word2vec network trained earlier in this notebook.
model = api.load('word2vec-google-news-300')

