<a href="https://colab.research.google.com/github/axel-sirota/implement-nlp-word-embedding/blob/main/module3/Module3_Demo3_Build_CBOW.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install torchdata

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import torch
from torch import nn
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.datasets import AG_NEWS
import warnings
import os
from textblob import TextBlob, Word
import nltk
nltk.download('punkt')
warnings.filterwarnings('ignore')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
%%writefile get_data.sh
if [ ! -f yelp.csv ]; then
  wget https://raw.githubusercontent.com/axel-sirota/implement-nlp-word-embedding/main/module3/data/yelp.csv
fi

Writing get_data.sh


In [4]:
!bash get_data.sh


--2022-05-25 17:27:18--  https://raw.githubusercontent.com/axel-sirota/implement-nlp-word-embedding/main/module3/data/yelp.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8091185 (7.7M) [text/plain]
Saving to: ‘yelp.csv’


2022-05-25 17:27:19 (136 MB/s) - ‘yelp.csv’ saved [8091185/8091185]



In [5]:
path = './yelp.csv'
yelp = pd.read_csv(path)
text_df = yelp.text

In [6]:
EMBEDDING_DIM = 300
CORPUS_SIZE = 10000
train_size = 100000

In [7]:
def build_vocab(data_iter, tokenizer):
    """Builds vocabulary from iterator"""
    vocab = build_vocab_from_iterator(
        yield_tokens(data_iter, tokenizer),
        specials=["<unk>"],
        min_freq=10,
    )
    vocab.set_default_index(vocab["<unk>"])
    return vocab

def yield_tokens(data_iter, tokenizer):
    for text in data_iter:
        yield tokenizer(text)


In [8]:
text_sampled = text_df.sample(CORPUS_SIZE).values

In [9]:
tokenizer = lambda x: TextBlob(x).words
vocab = build_vocab(text_sampled, tokenizer)
print(f'Vocab size is {len(vocab)}')

Vocab size is 6854


In [10]:
vocab

Vocab()

In [11]:
vocab(tokenizer("This is a fantastic ice cream"))

[78, 8, 4, 387, 385, 309]

In [14]:
next(iter(text_sampled))

'One hour. That is how long we waited after ordering before we decided we would rather eat anywhere that wasn\'t here. A crushing, impressively underwhelming experience in lack of service.\n\nWe arrived, found we had free valet parking? With a teensy parking lot, this works, thank you. A good selection of indoor and outdoor seating? Also a plus. Setting expectations and/or taking care of waiting customers? FAIL.\n\nOn a beautiful AZ autumn morning we came here (tho we entered Rockerij). We asked to be re-seated outside because inside of Rockerij is dark as a tomb and had loud, conversation intruding, music. We were reseated by the fireplace, and a waiter came over shortly to take our order. Bam. So far so good...or so we thought. 30 Mins after placing our order, I ask another waiter to check on the status of our food and he cleverly asks about what we ordered (exactly)...so we think it is being taken care of. 40 MINS after placing our order, I have still only had my coffee refilled onc

In [15]:
vocab_size = len(vocab)
word_to_ix = {}
for sentence in text_sampled:
  for word in tokenizer(sentence):
    word_to_ix[word] = vocab([word])[0]

In [16]:
ix_to_word = {ix:word for word, ix in word_to_ix.items()}

In [17]:
data = []
for sentence in text_sampled:
  tokenized_sentence = tokenizer(sentence)
  for i in range(2, len(tokenized_sentence) - 2):
    context = [tokenized_sentence[i - 2], tokenized_sentence[i - 1],
               tokenized_sentence[i + 1], tokenized_sentence[i + 2]]
    target = tokenized_sentence[i]
    data.append((context, target))

In [18]:
print(f'Lenght of input (sampled) text set is {len(data)}, reducing it to {train_size}')

Lenght of input (sampled) text set is 1294617, reducing it to 100000


In [19]:
data = data[:train_size]

In [20]:
def make_context_vector(context, word_to_ix):
    idxs = [word_to_ix[w] for w in context]
    return torch.tensor(idxs, dtype=torch.long).to(device)

In [21]:
class CBOW(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(CBOW, self).__init__()

        #out: 1 x emdedding_dim
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(embedding_dim, vocab_size)
        self.activation_function2 = nn.LogSoftmax(dim = -1)
        

    def forward(self, inputs):
        embeds = sum(self.embeddings(inputs)).view(1,-1)
        out = self.linear1(embeds)
        out = self.activation_function2(out)
        return out

    def get_word_emdedding(self, word):
        word = torch.tensor([word_to_ix[word]])
        return self.embeddings(word).view(1,-1)

In [22]:
model = CBOW(vocab_size, EMBEDDING_DIM).to(device)

In [23]:
def loss_function(y_pred, y):
  return nn.functional.nll_loss(y_pred, y)

optimizer = torch.optim.AdamW(model.parameters())

In [24]:
data_pd = pd.DataFrame(data)

In [25]:
data_pd

Unnamed: 0,0,1
0,"[One, hour, is, how]",That
1,"[hour, That, how, long]",is
2,"[That, is, long, we]",how
3,"[is, how, we, waited]",long
4,"[how, long, waited, after]",we
...,...,...
99995,"[ago, and, was, some]",it
99996,"[and, it, some, of]",was
99997,"[it, was, of, the]",some
99998,"[was, some, the, best]",of


In [27]:
epochs = 5
for epoch in range(epochs):
  total_loss = 0
  ix = 0
  for context, target in data:
      context_vector = make_context_vector(context, word_to_ix)
      log_probs = model(context_vector)
      total_loss += loss_function(log_probs, torch.tensor([word_to_ix[target]]).to(device))
  print(f"-"*59)
  print(f"Epoch: {epoch} Loss: {total_loss}")
  total_loss.backward()
  optimizer.step()
  optimizer.zero_grad()

-----------------------------------------------------------
Epoch: 0 Loss: 951782.1875
-----------------------------------------------------------
Epoch: 1 Loss: 930395.25
-----------------------------------------------------------
Epoch: 2 Loss: 909755.4375
-----------------------------------------------------------
Epoch: 3 Loss: 889897.75
-----------------------------------------------------------
Epoch: 4 Loss: 870840.25


In [28]:
context = ['People','create','to', 'direct']
context_vector = make_context_vector(context, word_to_ix)
a = model(context_vector)

#Print result
print(f'Context: {context}\n')
print(f'Prediction: {ix_to_word[torch.argmax(a[0]).item()]}')

Context: ['People', 'create', 'to', 'direct']

Prediction: forward


In [29]:
# embedding from first model layer
embeddings = list(model.parameters())[0]
embeddings = embeddings.cpu().detach().numpy()

# normalization
norms = (embeddings ** 2).sum(axis=1) ** (1 / 2)
norms = np.reshape(norms, (len(norms), 1))
embeddings_norm = embeddings / norms
embeddings_norm.shape

(6854, 300)

In [45]:
def get_top_similar(word, topN=10):
    word_vec = model.to("cpu").get_word_emdedding(word).detach().numpy()[0]
    word_vec = np.reshape(word_vec, (len(word_vec), 1))
    dists = np.matmul(embeddings_norm, word_vec).flatten()
    topN_ids = np.argsort(-dists)[1 : topN + 1]
    topN_dict = {}
    for sim_word_id in topN_ids:
        sim_word = ix_to_word[sim_word_id]
        topN_dict[sim_word] = dists[sim_word_id]
    return topN_dict

model.eval()
for word, sim in get_top_similar("excellent").items():
    print("{}: {:.3f}".format(word, sim))



onions: 4.011
code: 3.965
Crust: 3.657
closest: 3.307
drag: 3.303
Salon: 3.157
wings: 3.147
Rama: 3.135
advice: 3.056
Culinary: 3.018
