<a href="https://colab.research.google.com/github/axel-sirota/implement-nlp-word-embedding/blob/main/module3/Module3_Demo3_Build_CBOW.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [27]:
!nvidia-smi

Sat May 28 17:32:24 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P0    38W / 300W |   1445MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [28]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import torch
from torch import nn
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.datasets import AG_NEWS
import warnings
import os
from textblob import TextBlob, Word
import nltk
nltk.download('punkt')
warnings.filterwarnings('ignore')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [29]:
%%writefile get_data.sh
if [ ! -f yelp.csv ]; then
  wget https://raw.githubusercontent.com/axel-sirota/implement-nlp-word-embedding/main/module3/data/yelp.csv
fi

Overwriting get_data.sh


In [30]:
!bash get_data.sh


In [31]:
path = './yelp.csv'
yelp = pd.read_csv(path)
text_df = yelp.text

In [32]:
EMBEDDING_DIM = 50
EPOCHS = 10
BATCH_SIZE = 2500
CORPUS_SIZE = 2000
train_size = 50000

In [33]:
def build_vocab(data_iter, tokenizer):
    """Builds vocabulary from iterator"""
    vocab = build_vocab_from_iterator(
        yield_tokens(data_iter, tokenizer),
        specials=["<unk>"],
        min_freq=10,
    )
    vocab.set_default_index(vocab["<unk>"])
    return vocab

def yield_tokens(data_iter, tokenizer):
    for text in data_iter:
        yield tokenizer(text)


In [34]:
text_sampled = text_df.sample(CORPUS_SIZE).values

In [35]:
tokenizer = lambda x: TextBlob(x).words
vocab = build_vocab(text_sampled, tokenizer)
print(f'Vocab size is {len(vocab)}')

Vocab size is 2295


In [36]:
vocab

Vocab()

In [37]:
vocab(tokenizer("This is a fantastic ice cream"))

[79, 8, 4, 384, 339, 264]

In [38]:
next(iter(text_sampled))

"Zipps has good bar food, but the service is usually horrible. The waitresses rarely come to your table. BUT, what really rubs me the wrong way is this....\nOn multiple occasions, when I pay the bill, the waiter doesn't bring me back the correct change. Even if my change is 1 cent, I'm entitled to receive my money back. \nI'm sorry, but not giving me back my  change because you deem it insignificant doesn't float well with me. They do have great food though."

In [39]:
vocab_size = len(vocab)
word_to_ix = {}
for sentence in text_sampled:
  for word in tokenizer(sentence):
    word_to_ix[word] = vocab([word])[0]

In [40]:
ix_to_word = {ix:word for word, ix in word_to_ix.items()}

In [41]:
data = []
for sentence in text_sampled:
  tokenized_sentence = tokenizer(sentence)
  for i in range(2, len(tokenized_sentence) - 2):
    context = [tokenized_sentence[i - 2], tokenized_sentence[i - 1],
               tokenized_sentence[i + 1], tokenized_sentence[i + 2]]
    target = tokenized_sentence[i]
    data.append((context, target))

In [42]:
print(f'Lenght of input (sampled) text set is {len(data)}, reducing it to {train_size}')

Lenght of input (sampled) text set is 261562, reducing it to 50000


In [43]:
data = data[:train_size]

In [44]:
def make_context_vector(context, word_to_ix):
    idxs = [word_to_ix[w] for w in context]
    return torch.tensor(idxs, dtype=torch.long).to(device)

In [45]:
class CBOW(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(CBOW, self).__init__()

        #out: 1 x emdedding_dim
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(embedding_dim, vocab_size)
        self.activation_function2 = nn.LogSoftmax(dim = -1)
        

    def forward(self, inputs):
        embeds = sum(self.embeddings(inputs)).view(1,-1)
        out = self.linear1(embeds)
        out = self.activation_function2(out)
        return out

    def get_word_emdedding(self, word):
        word = torch.tensor([word_to_ix[word]])
        return self.embeddings(word).view(1,-1)

In [46]:
model = CBOW(vocab_size, EMBEDDING_DIM).to(device)

In [47]:
def loss_function(y_pred, y):
  return nn.functional.nll_loss(y_pred, y)

optimizer = torch.optim.AdamW(model.parameters())

In [48]:
for epoch in range(EPOCHS):
  total_loss = 0
  n_rows = 1
  batches = 1
  for context, target in data:
      context_vector = make_context_vector(context, word_to_ix)
      log_probs = model(context_vector)
      total_loss += loss_function(log_probs, torch.tensor([word_to_ix[target]]).to(device))
      if n_rows > BATCH_SIZE:
        print(f"-"*59)
        print(f"Epoch: {epoch}, Batch: {batches}, Loss: {total_loss}")
        batches += 1
        total_loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        total_loss = 0
        n_rows = 0
      n_rows += 1

-----------------------------------------------------------
Epoch: 0, Batch: 1, Loss: 21423.90625
-----------------------------------------------------------
Epoch: 0, Batch: 2, Loss: 21304.876953125
-----------------------------------------------------------
Epoch: 0, Batch: 3, Loss: 21323.130859375
-----------------------------------------------------------
Epoch: 0, Batch: 4, Loss: 21183.296875
-----------------------------------------------------------
Epoch: 0, Batch: 5, Loss: 21090.353515625
-----------------------------------------------------------
Epoch: 0, Batch: 6, Loss: 21037.068359375
-----------------------------------------------------------
Epoch: 0, Batch: 7, Loss: 20893.548828125
-----------------------------------------------------------
Epoch: 0, Batch: 8, Loss: 20765.15625
-----------------------------------------------------------
Epoch: 0, Batch: 9, Loss: 20837.5234375
-----------------------------------------------------------
Epoch: 0, Batch: 10, Loss: 20748.64

In [49]:
context = ['People','create','to', 'direct']
context_vector = make_context_vector(context, word_to_ix)
a = model(context_vector)

#Print result
print(f'Context: {context}\n')
print(f'Prediction: {ix_to_word[torch.argmax(a[0]).item()]}')

Context: ['People', 'create', 'to', 'direct']

Prediction: the


In [50]:
# embedding from first model layer
embeddings = list(model.parameters())[0]
embeddings = embeddings.cpu().detach().numpy()

# normalization
norms = (embeddings ** 2).sum(axis=1) ** (1 / 2)
norms = np.reshape(norms, (len(norms), 1))
embeddings_norm = embeddings / norms
embeddings_norm.shape

(2295, 50)

In [51]:
def get_top_similar(word, topN=10):
    word_vec = model.to("cpu").get_word_emdedding(word).detach().numpy()[0]
    word_vec = np.reshape(word_vec, (len(word_vec), 1))
    dists = np.matmul(embeddings_norm, word_vec).flatten()
    topN_ids = np.argsort(-dists)[1 : topN + 1]
    topN_dict = {}
    for sim_word_id in topN_ids:
        sim_word = ix_to_word[sim_word_id]
        topN_dict[sim_word] = dists[sim_word_id]
    return topN_dict

model.eval()
for word, sim in get_top_similar("excellent").items():
    print("{}: {:.3f}".format(word, sim))



Japanese: 3.070
season: 3.068
anywhere: 3.041
dishes: 2.797
nachos: 2.727
basis: 2.715
supposed: 2.639
restaurant: 2.577
affordable: 2.532
Yelp: 2.500
