In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Word2Vec using CBOW

## Importing Libraries

In [2]:
import pandas as pd
import torchtext
from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator, Vocab
import numpy as np
from torchtext.data import to_map_style_dataset
from torch.utils.data import DataLoader
from functools import partial
from torch import optim
import torch
import torch.nn as nn 
from torch.optim.lr_scheduler import LambdaLR
import html

## Reading given Corpus file.

Here we are taking just a subset of data in order to save some compute.

In [79]:
filename = 'reviews_Movies_and_TV.json'
lim = 100000

In [None]:
with open(filename, "r") as f:
    with open("newfile.json", "w") as fp:
        for i in range(lim):
            fp.write(f.readline())

FileNotFoundError: ignored

In [3]:
df = pd.read_json("./drive/MyDrive/data/newfile.json", lines=True)

In [4]:
corpus = df["reviewText"].to_numpy()

In [5]:
corpus[:10]

array(["This has some great tips as always and is helping me to complete my Good Eats collection.  I haven't tried any of the recipes yet, but I will soon.  Sometimes it's just lovely to let Alton entertain us.",
       "This is a great pastry guide.  I love how Alton's collections can break it down so baking isn't so mystical and scary.  I might even try some of these recipes some day.",
       "I have to admit that I am a fan of Giada's cooking and I had great expectations when I ordered this set. They were however, crushed. While I still love Giada's cooking, this set is just a way for Food Network to make money. They really cheated with these DVD's. All they have are the video from the show, no text recipes, no link to the on line shows and no computer support. They play in Windows media player but the set does not contain the recipes. You can get more by taping the shows and then going to the web to download recipes. Another disappointment is the so so transfer quality to DVD. Per

In [6]:
print(torchtext.__version__)

0.14.1


## Preprocessing the text

Here I am converting some basic text in order for model to avoid unnecessary tokens.

_for eg: don't --> do not_

In [7]:
tokenizer = get_tokenizer("basic_english")

In [8]:
def clean_text(text):
    import re

    text = re.sub(r"([a-zA-Z]+)n[\'’]t", r"\1 not", text)
    text = re.sub(r"([iI])[\'’]m", r"\1 am", text)
    text = re.sub(r"([iI])[\'’]ll", r"\1 will", text)
    text = re.sub(r"[^a-zA-Z0-9\:\$\-\,\%\.\?\!]+", " ", text)
    text = html.unescape(text)
    # text = re.sub(r"([a-zA-Z]+)[\'’]s", r"\1 is", text)

    text = re.sub(r"_(.*?)_", r"\1", text)
    return text


In [9]:
tokens = []

for sent in corpus:
   tokens.append(tokenizer(clean_text(sent)))

In [10]:
# tokens = np.array(tokens)
tokens[:2]

[['this',
  'has',
  'some',
  'great',
  'tips',
  'as',
  'always',
  'and',
  'is',
  'helping',
  'me',
  'to',
  'complete',
  'my',
  'good',
  'eats',
  'collection',
  '.',
  'i',
  'have',
  'not',
  'tried',
  'any',
  'of',
  'the',
  'recipes',
  'yet',
  ',',
  'but',
  'i',
  'will',
  'soon',
  '.',
  'sometimes',
  'it',
  's',
  'just',
  'lovely',
  'to',
  'let',
  'alton',
  'entertain',
  'us',
  '.'],
 ['this',
  'is',
  'a',
  'great',
  'pastry',
  'guide',
  '.',
  'i',
  'love',
  'how',
  'alton',
  's',
  'collections',
  'can',
  'break',
  'it',
  'down',
  'so',
  'baking',
  'is',
  'not',
  'so',
  'mystical',
  'and',
  'scary',
  '.',
  'i',
  'might',
  'even',
  'try',
  'some',
  'of',
  'these',
  'recipes',
  'some',
  'day',
  '.']]

## Vocabulary Preparation

Here I am only considering words which are appeared more than 9 times. Otherwise they are considered to be $<unk>$

In [11]:
MIN_WORD_FREQUENCY = 10

vocab = build_vocab_from_iterator(tokens, min_freq=MIN_WORD_FREQUENCY, specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

In [13]:
vocab.get_stoi()

{'zuzu': 27369,
 'zoomed': 27367,
 'yucky': 27361,
 'yearbook': 27359,
 'yamaha': 27357,
 'wozniac': 27351,
 'worshipping': 27350,
 'word-of-mouth': 27349,
 'woken': 27346,
 'wiseguy': 27344,
 'winslett': 27343,
 'winningham': 27342,
 'windy': 27341,
 'wimps': 27340,
 'wildness': 27339,
 'whooping': 27336,
 'whirling': 27334,
 'wendell': 27330,
 'well-choreographed': 27326,
 'warping': 27323,
 'wannabes': 27321,
 'waned': 27319,
 'wallpaper': 27318,
 'walk-on': 27316,
 'walberg': 27315,
 'wah': 27314,
 'wackiness': 27313,
 'vulnerabilities': 27311,
 'voyages': 27310,
 'volunteered': 27309,
 'violinist': 27308,
 'violates': 27306,
 'viii': 27305,
 'vigilantes': 27304,
 'vacations': 27296,
 'usurped': 27294,
 'upstage': 27293,
 'unwillingness': 27291,
 'unwed': 27289,
 'uninvolving': 27285,
 'unheralded': 27284,
 'ungodly': 27283,
 'unemotional': 27282,
 'unearthed': 27281,
 'undersea': 27278,
 'unconvincingly': 27276,
 'unbreakable': 27273,
 'unbound': 27272,
 'unapologetically': 27271,

In [12]:
print(f"Total words in text: {len(tokens)}")
print(f"Unique words: {len(vocab)}")

Total words in text: 100000
Unique words: 27035


## Model

Here I am declaring simple model with just **2 layers**

1. Embedding Layer with 300 dimensions as mentioned in the paper that it worked well after trying a lot of dimensions.
2. Linear layer that will give back the output as the whole vocabulary.

_Here we are only interested in the embedding layer as those are the **featurized representation** of the words._

In [105]:
# class CBOW(nn.Module):
#     def __init__(self, vocab, embedding_dim, max_norm):
#         super(CBOW, self).__init__()
#         self.embeddings = nn.Embedding(vocab, embedding_dim, max_norm)
#         self.linear = nn.Linear(embedding_dim, vocab)

#     def forward(self, input):
#         x = self.embeddings(input)
#         x = x.mean(axis=1)
#         x = self.linear(x)
#         return x


class CBOW2(nn.Module):
    def __init__(self, vocab_size, embedding_dim, max_norm=5.0):
        super(CBOW2, self).__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.in_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.out_embeddings = nn.Embedding(vocab_size, embedding_dim)
        # self.embeddings = nn.Embedding(vocab_size, embedding_dim, max_norm=max_norm)
        # self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, input_word, output_word, neg_words):
        input_emb = self.in_embeddings(input_word)
        output_emb = self.out_embeddings(output_word)
        neg_emb = self.out_embeddings(neg_words)

        print(output_emb.unsqueeze(1).shape, input_emb.shape)
        output_loss = torch.bmm(output_emb.unsqueeze(1), input_emb).squeeze().sigmoid().log()
        neg_loss = torch.bmm(neg_emb.neg(), input_emb).squeeze().sigmoid().log().sum(1)

        return -(output_loss.sum() + neg_loss.sum())


In [13]:
class CBOW_NS_Module(nn.Module):
    def __init__(self, emb_size, emb_dims, device):
        super(CBOW_NS_Module, self).__init__()
        self.device = device
        self.emb_size = emb_size
        self.emb_dims = emb_dims
        self.u_embeddings = nn.Embedding(emb_size, emb_dims)
        self.v_embeddings = nn.Embedding(emb_size, emb_dims)

    def forward(self, src_words, trg_words, wmasks, labels):
        p_src_emb = []
        for src_word in src_words:
            p_src_emb.append(self.u_embeddings(torch.tensor(src_word, dtype=torch.long).to(self.device)).sum(dim=0))
        src_emb = torch.stack(p_src_emb)

        trg_emb = self.v_embeddings(torch.tensor(trg_words, dtype=torch.long).to(self.device))

        wmasks = torch.tensor(wmasks, dtype=torch.float).to(self.device)
        labels = torch.tensor(labels, dtype=torch.float).to(self.device)

        pred = torch.bmm(src_emb.unsqueeze(1), trg_emb.permute(0, 2, 1)).squeeze(1)

        loss = nn.functional.binary_cross_entropy_with_logits(pred.float(), labels, reduction="none", weight=wmasks)
        loss = (loss.mean(dim=1) * wmasks.shape[1] / wmasks.sum(dim=1)).mean()

        return loss

    def get_embeddings(self):
        return self.u_embeddings.weight.data.cpu().numpy()

## Preparing Dataset

- Here I am taking the _window = 4_ i.e. 4 words before and 4 words after to grab the context, as authors mention 3-5 word window works best for the large dataset.

- I am also truncating the sequence to maximum of 256 length and creating input-output tensors for them.

In [14]:
import random

CBOW_WINDOW = 4
SEQ_LEN = 256
NEG_SAMPLE_SIZE = 4

def collate_cbow(batch, text_pipeline, vocab, word_freq):
    batch_input, batch_output, batch_neg = [], [], []
    batch_src_words, batch_trg_words, wmasks, labels = [], [], [], []
    for text in batch:
        text_tokens_ids = text_pipeline(text)
        if len(text_tokens_ids) < CBOW_WINDOW * 2 + 1:
            continue

        if SEQ_LEN:
            text_tokens_ids = text_tokens_ids[:SEQ_LEN]

        # for idx in range(len(text_tokens_ids) - CBOW_WINDOW * 2):
        #     token_id_sequence = text_tokens_ids[idx : (idx + CBOW_WINDOW * 2 + 1)]
        #     output = token_id_sequence.pop(CBOW_WINDOW)
        #     input_ = token_id_sequence
        #     batch_input.append(input_)
        #     batch_output.append(output)

        for idx in range(len(text_tokens_ids) - CBOW_WINDOW * 2):
            token_id_sequence = text_tokens_ids[idx : (idx + CBOW_WINDOW * 2 + 1)]

            # Taking out the focused target word
            output = token_id_sequence.pop(CBOW_WINDOW)

            # Rest of the context
            input_ = token_id_sequence

            neg_samples = []
            for j in range(NEG_SAMPLE_SIZE):
                rnd_word = random.randint(0, len(vocab) - 1)
                while rnd_word in input_:
                    rnd_word = random.randint(0, len(vocab) - 1)
                neg_samples.append(rnd_word)

            batch_src_words += [input_]
            batch_trg_words += [[output] + neg_samples]
            labels += [[1] + [0] * len(neg_samples)]
            wmasks += [[1] * (len(neg_samples)+1)]

            batch_input.append(input_)
            batch_output.append(output)
            batch_neg.append(neg_samples)
        
    batch_input = torch.tensor(batch_input, dtype=torch.long)
    batch_output = torch.tensor(batch_output, dtype=torch.long)
    batch_neg = torch.tensor(batch_neg, dtype=torch.long)

    batch_src_words = torch.tensor(batch_src_words, dtype=torch.long)
    batch_trg_words = torch.tensor(batch_trg_words, dtype=torch.long)
    labels = torch.tensor(labels, dtype=torch.long)
    wmasks = torch.tensor(wmasks, dtype=torch.long)


    return batch_src_words, batch_trg_words, wmasks, labels

In [15]:
word_freq = torch.zeros(len(vocab))
for word in vocab.get_itos():
    word_freq[vocab[word]] = word_freq[vocab[word]] + 1

# Normalize word frequencies to create a probability distribution
word_freq = word_freq / word_freq.sum()

In [16]:
len(word_freq)

27035

In [38]:
matched_style_corpus = to_map_style_dataset(corpus)
text_pipeline = lambda x: vocab(tokenizer(x))
train_dataloader = DataLoader(
        matched_style_corpus,
        batch_size=512,
        shuffle=True,
        collate_fn=partial(collate_cbow, text_pipeline=text_pipeline, vocab=vocab, word_freq=word_freq),
    )

In [39]:
len(train_dataloader)

196

In [40]:
for i in train_dataloader:
  print(i[0].shape, i[1].shape, i[2].shape)
  print(i)
  break

torch.Size([52230, 8]) torch.Size([52230, 5]) torch.Size([52230, 5])
(tensor([[   10,   811,     5,  ..., 24205,   112,    39],
        [  811,     5,   353,  ...,   112,    39,   349],
        [    5,   353,     6,  ...,    39,   349,     0],
        ...,
        [    7,  6727,     2,  ...,    27,  2844,     1],
        [ 6727,     2,  5404,  ...,  2844,     1,   320],
        [    2,  5404,    77,  ...,     1,   320,    25]]), tensor([[    6,  9811,   860, 19143,  7329],
        [  128, 25369, 18515, 24717, 10186],
        [24205, 12849,   140,   904, 14450],
        ...,
        [   77, 22523,  6310, 25915,  4545],
        [  737,  7269,  3947, 10458, 18817],
        [   27, 18072, 24406, 13207, 25223]]), tensor([[1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        ...,
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1]]), tensor([[1, 0, 0, 0, 0],
        [1, 0, 0, 0, 0],
        [1, 0, 0, 0, 0],
        ...,
        [1, 0, 0, 0, 0],
    

## Initialising Model

- Here I am initialising the model, loss function, optimizer, and scheduler.

In [41]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [42]:
EMBED_DIMENSION = 300 
EMBED_MAX_NORM = 1 

model = CBOW_NS_Module(len(vocab), EMBED_DIMENSION, device)
model.to(device)

CBOW_NS_Module(
  (u_embeddings): Embedding(27035, 300)
  (v_embeddings): Embedding(27035, 300)
)

In [43]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.025)

In [44]:
lr_lambda = lambda epoch: (5 - epoch) / 5
lr_scheduler = LambdaLR(optimizer, lr_lambda=lr_lambda, verbose=True)

Adjusting learning rate of group 0 to 2.5000e-02.


In [None]:
import tqdm
patience = 5
no_progress_num = 0
best_epoch_loss = float('inf')
for epoch in range(5):
    model.train()

    epoch_loss = 0
    for mini_batch in tqdm.tqdm(train_dataloader):
        batch_loss = model(*mini_batch)
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()
        epoch_loss += batch_loss
    print(f"Epoch {epoch+1}: Loss {epoch_loss}")

  p_src_emb.append(self.u_embeddings(torch.tensor(src_word, dtype=torch.long).to(self.device)).sum(dim=0))
  trg_emb = self.v_embeddings(torch.tensor(trg_words, dtype=torch.long).to(self.device))
  wmasks = torch.tensor(wmasks, dtype=torch.float).to(self.device)
  labels = torch.tensor(labels, dtype=torch.float).to(self.device)
 13%|█▎        | 26/196 [15:02<1:36:33, 34.08s/it]

## Training Section

In [74]:
EPOCHS = 5

In [36]:
loss_list = []
val_list = []
model.to(device)

CBOW_NS_Module(
  (u_embeddings): Embedding(27035, 300)
  (v_embeddings): Embedding(27035, 300)
)

In [76]:
for epoch in range(EPOCHS):
    model.train()
    running_loss = []

    for i, batch_data in enumerate(train_dataloader):

        inputs = batch_data[0].to(device)
        labels = batch_data[1].to(device)
        neg_samples = batch_data[2].to(device)

        optimizer.zero_grad()

        loss = model(inputs, labels, neg_samples)
        loss.backward()
        optimizer.step()
        
        running_loss.append(loss.item())
        break
    
    running_loss = np.array(running_loss)
    epoch_loss = np.mean(running_loss)
    # print(running_loss.shape, epoch_loss)
    loss_list.append(epoch_loss)
    currloss = loss_list[-1]
    print(f"Epoch: {epoch + 1}/{EPOCHS}, Train Loss={currloss:.5f}, Val Loss={0:.5f}")

    lr_scheduler.step()

torch.Size([5681, 1, 300]) torch.Size([5681, 8, 300])


RuntimeError: ignored

## Saving the model and Vocabulary

In [None]:
torch.save(model, "saved_modelv2.pt")
torch.save(vocab, "vocabv2.pt")

NameError: ignored

## Fetching the embeddings

As discussed above, the embedding layer itself is the ***featurized representation*** of the words.

In [None]:
# embedding from first model layer
embeddings = list(model.parameters())[0]
embeddings = embeddings.cpu().detach().numpy()

# normalization
embed_norms = (embeddings ** 2).sum(axis=1) ** (1 / 2)
embed_norms = np.reshape(embed_norms, (len(embed_norms), 1))
embeddings = embeddings / embed_norms
embeddings.shape

NameError: ignored

In [None]:
tokens = vocab.get_itos()
len(tokens)

27370

In [None]:
def get_top_similar(word, top = 10):
    word_id = vocab[word]
    if word_id == 0:
        print("Out of vocabulary word")
        return

    word_vec = embeddings[word_id]
    word_vec = np.reshape(word_vec, (len(word_vec), 1))
    dists = np.matmul(embeddings, word_vec).flatten()
    topN_ids = np.argsort(-dists)[1 : top + 1]

    topN_dict = {}
    for sim_word_id in topN_ids:
        sim_word = vocab.lookup_token(sim_word_id)
        dist = dists[sim_word_id]
        topN_dict[sim_word] = np.round(dist, 3)
    return topN_dict

In [None]:
get_top_similar("titanic")

{'ship': 0.316,
 'holocaust': 0.289,
 'moon': 0.279,
 'tyburn': 0.271,
 '1996': 0.265,
 'germans': 0.258,
 '1912': 0.258,
 'submarines': 0.256,
 'passion': 0.25,
 'kiss': 0.248}