<a href="https://colab.research.google.com/github/axel-sirota/implement-nlp-word-embedding/blob/main/module3/Module3_Demo3_Build_CBOW.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [65]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import torch
from torch import nn
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.datasets import AG_NEWS
import warnings
import os
from textblob import TextBlob, Word
import nltk
nltk.download('punkt')
warnings.filterwarnings('ignore')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [66]:
%%writefile get_data.sh
if [ ! -f yelp.csv ]; then
  wget https://raw.githubusercontent.com/axel-sirota/implement-nlp-word-embedding/main/module3/data/yelp.csv
fi

Overwriting get_data.sh


In [67]:
!bash get_data.sh


In [68]:
path = './yelp.csv'
yelp = pd.read_csv(path)
text_df = yelp.text

In [69]:
EMBEDDING_DIM = 300
EPOCHS = 10
BATCH_SIZE = 150
CORPUS_SIZE = 1000
train_size = 25000

In [70]:
def build_vocab(data_iter, tokenizer):
    """Builds vocabulary from iterator"""
    vocab = build_vocab_from_iterator(
        yield_tokens(data_iter, tokenizer),
        specials=["<unk>"],
        min_freq=10,
    )
    vocab.set_default_index(vocab["<unk>"])
    return vocab

def yield_tokens(data_iter, tokenizer):
    for text in data_iter:
        yield tokenizer(text)


In [71]:
text_sampled = text_df.sample(CORPUS_SIZE).values

In [72]:
tokenizer = lambda x: TextBlob(x).words
vocab = build_vocab(text_sampled, tokenizer)
print(f'Vocab size is {len(vocab)}')

Vocab size is 1396


In [73]:
vocab

Vocab()

In [74]:
vocab(tokenizer("This is a fantastic ice cream"))

[73, 8, 4, 403, 451, 356]

In [75]:
next(iter(text_sampled))

"Absolutely unbelievable!!! This is so worth the drive from Scottsdale to Tempe! But get there early as they close early and run out of food! It's so delicious my mouth is watering just thinking about it. I love French pastries, crepes and desserts! The macaroons melt in your mouth. I will be returning soon!"

In [76]:
vocab_size = len(vocab)
word_to_ix = {}
for sentence in text_sampled:
  for word in tokenizer(sentence):
    word_to_ix[word] = vocab([word])[0]

In [77]:
ix_to_word = {ix:word for word, ix in word_to_ix.items()}

In [78]:
data = []
for sentence in text_sampled:
  tokenized_sentence = tokenizer(sentence)
  for i in range(2, len(tokenized_sentence) - 2):
    context = [tokenized_sentence[i - 2], tokenized_sentence[i - 1],
               tokenized_sentence[i + 1], tokenized_sentence[i + 2]]
    target = tokenized_sentence[i]
    data.append((context, target))

In [79]:
print(f'Lenght of input (sampled) text set is {len(data)}, reducing it to {train_size}')

Lenght of input (sampled) text set is 135377, reducing it to 25000


In [80]:
data = data[:train_size]

In [81]:
def make_context_vector(context, word_to_ix):
    idxs = [word_to_ix[w] for w in context]
    return torch.tensor(idxs, dtype=torch.long).to(device)

In [82]:
class CBOW(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(CBOW, self).__init__()

        #out: 1 x emdedding_dim
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(embedding_dim, vocab_size)
        self.activation_function2 = nn.LogSoftmax(dim = -1)
        

    def forward(self, inputs):
        embeds = sum(self.embeddings(inputs)).view(1,-1)
        out = self.linear1(embeds)
        out = self.activation_function2(out)
        return out

    def get_word_emdedding(self, word):
        word = torch.tensor([word_to_ix[word]])
        return self.embeddings(word).view(1,-1)

In [83]:
model = CBOW(vocab_size, EMBEDDING_DIM).to(device)

In [84]:
def loss_function(y_pred, y):
  return nn.functional.nll_loss(y_pred, y)

optimizer = torch.optim.AdamW(model.parameters())

In [85]:
data_pd = pd.DataFrame(data)

In [86]:
data_pd

Unnamed: 0,0,1
0,"[Absolutely, unbelievable, is, so]",This
1,"[unbelievable, This, so, worth]",is
2,"[This, is, worth, the]",so
3,"[is, so, the, drive]",worth
4,"[so, worth, drive, from]",the
...,...,...
24995,"[my, fix, fry, bread]",of
24996,"[fix, of, bread, There]",fry
24997,"[of, fry, There, needs]",bread
24998,"[fry, bread, needs, to]",There


In [87]:
for epoch in range(EPOCHS):
  total_loss = 0
  ix = 0
  for context, target in data:
      context_vector = make_context_vector(context, word_to_ix)
      log_probs = model(context_vector)
      total_loss += loss_function(log_probs, torch.tensor([word_to_ix[target]]).to(device))
      if ix > BATCH_SIZE:
        print(f"-"*59)
        print(f"Epoch: {epoch}, Batch: {ix+1}, Loss: {total_loss}")
        total_loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        total_loss = 0

In [88]:
context = ['People','create','to', 'direct']
context_vector = make_context_vector(context, word_to_ix)
a = model(context_vector)

#Print result
print(f'Context: {context}\n')
print(f'Prediction: {ix_to_word[torch.argmax(a[0]).item()]}')

Context: ['People', 'create', 'to', 'direct']

Prediction: delicious


In [89]:
# embedding from first model layer
embeddings = list(model.parameters())[0]
embeddings = embeddings.cpu().detach().numpy()

# normalization
norms = (embeddings ** 2).sum(axis=1) ** (1 / 2)
norms = np.reshape(norms, (len(norms), 1))
embeddings_norm = embeddings / norms
embeddings_norm.shape

(1396, 300)

In [90]:
def get_top_similar(word, topN=10):
    word_vec = model.to("cpu").get_word_emdedding(word).detach().numpy()[0]
    word_vec = np.reshape(word_vec, (len(word_vec), 1))
    dists = np.matmul(embeddings_norm, word_vec).flatten()
    topN_ids = np.argsort(-dists)[1 : topN + 1]
    topN_dict = {}
    for sim_word_id in topN_ids:
        sim_word = ix_to_word[sim_word_id]
        topN_dict[sim_word] = dists[sim_word_id]
    return topN_dict

model.eval()
for word, sim in get_top_similar("excellent").items():
    print("{}: {:.3f}".format(word, sim))



stores: 3.091
children: 2.749
love: 2.705
hear: 2.616
story: 2.612
hummus: 2.586
terrible: 2.489
due: 2.488
served: 2.473
same: 2.471
