# CBOW model trained on "20000 lieues sous les mers"
## Needed libraries

In [1]:
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim

import spacy
from spacy.lang.fr import French

In [2]:
# Load the spaCy pipeline named "fr_core_news_sm".
# The model has to be installed beforehand with:
#     python -m spacy download fr_core_news_sm
spacy_fr = spacy.load("fr_core_news_sm")

## Tokenizing the corpus

In [36]:
# Create a tokenizer for the French language.
# NOTE(review): `Defaults.create_tokenizer()` looks like a spaCy 2.x API;
# on spaCy 3 `French().tokenizer` is presumably the equivalent -- confirm
# against the installed version before changing it.
tokenizer = French().Defaults.create_tokenizer()

# Read the whole novel and tokenize it in one pass.
with open("data/20_000_lieues_sous_les_mers.txt", "r", encoding="utf-8") as f:
    document = tokenizer(f.read())

# Keep only purely alphabetic tokens; this drops blanks, punctuation,
# numbers, etc.
tokens = [tok.text for tok in document if tok.is_alpha]

# Build the vocabulary in order of first appearance:
#   idx2tok: list mapping an index to its token
#   tok2idx: dict mapping a token to its index in `idx2tok`
# Membership is tested against the dict (O(1)) rather than the list (O(n)),
# so the whole pass is linear in the number of tokens.
idx2tok = []
tok2idx = {}
for tok in tokens:
    if tok not in tok2idx:
        tok2idx[tok] = len(idx2tok)
        idx2tok.append(tok)

## The continuous bag of words model

In [49]:
class CBOW(nn.Module):
    """Continuous bag-of-words model.

    The context word embeddings are averaged, projected onto the vocabulary
    with a bias-free linear layer and turned into log-probabilities.
    """

    def __init__(self, vocab_size, embedding_size):
        """Create the embedding table and the output projection.

        Args:
            vocab_size: number of distinct tokens.
            embedding_size: dimension of each word embedding.
        """
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size

        # One `embedding_size`-dimensional vector per vocabulary entry.
        self.embeddings = nn.Embedding(vocab_size, embedding_size)

        # Projection from embedding space back to vocabulary scores; the
        # model is defined without a bias term.
        self.U_transpose = nn.Linear(embedding_size, vocab_size, bias=False)

    def forward(self, context):
        """Return log-probabilities of the center word.

        Args:
            context: index tensor of size `batch_size` * NGRAMS.

        Returns:
            Tensor of size `batch_size` * `vocab_size` holding the
            log-softmax over the vocabulary.
        """
        # `batch_size` * NGRAMS * `embedding_size`
        embedded = self.embeddings(context)

        # Average over the context words: `batch_size` * `embedding_size`
        averaged = embedded.mean(dim=1)

        # Vocabulary scores: `batch_size` * `vocab_size`
        scores = self.U_transpose(averaged)

        return F.log_softmax(scores, dim=1)



# Hyper-parameters: embedding dimension and vocabulary size (one entry per
# unique token).
EMBEDDING_SIZE = 64
VOCAB_SIZE = len(idx2tok)

# Instantiate the continuous bag-of-words model
cbow = CBOW(vocab_size=VOCAB_SIZE, embedding_size=EMBEDDING_SIZE)

## Preparing the data

In [50]:
def ngrams_iterator(token_list, ngrams):
    """Generate successive N-grams from a list of tokens.

    Tokens are converted to indexes through the module-level `tok2idx`
    mapping.

    Args:
        token_list: list of tokens (must support slicing).
        ngrams: window size N; the element at position ``N // 2`` of each
            window is the center word, the remaining ones are the context.

    Yields:
        ``(center, context)`` where `center` is the index of the center
        word and `context` is a 1-D ``int64`` Numpy array holding the
        indexes of the ``N - 1`` context words.

    Raises:
        KeyError: if a token is missing from `tok2idx`.
    """
    center_pos = ngrams // 2

    # `ngrams` copies of the list, each shifted one more step to the left;
    # zipping them yields every window of `ngrams` consecutive tokens.
    token_list_shifts = [token_list[shift:] for shift in range(ngrams)]
    for ngram in zip(*token_list_shifts):
        # Get indexes of tokens
        idxs = [tok2idx[tok] for tok in ngram]

        # Remove the center element from `idxs`; what is left is the context
        center = idxs.pop(center_pos)

        # The dtype is forced to int64 so that Pytorch always converts the
        # array to a LongTensor: Numpy's default integer type is
        # platform-dependent and an empty list would otherwise become a
        # float64 array.
        yield center, np.array(idxs, dtype=np.int64)


# Materialize every (center, context) pair for windows of NGRAMS tokens
NGRAMS = 5
ngrams = list(ngrams_iterator(tokens, NGRAMS))

# Serve the pairs as shuffled mini-batches of BATCH_SIZE examples
BATCH_SIZE = 256
data = torch.utils.data.DataLoader(
    dataset=ngrams,
    batch_size=BATCH_SIZE,
    shuffle=True,
)

## Learn CBOW model

In [51]:
# Negative log-likelihood loss from the `nn` submodule
nll_loss = nn.NLLLoss()

# Adam optimizer over all the parameters of `cbow`, with a learning rate
# of 0.01
optimizer = optim.Adam(params=cbow.parameters(), lr=0.01)

In [52]:
# Train `cbow` for EPOCHS passes over `data`, printing the running average
# loss every 20 batches and the epoch average at the end of each epoch.
# Training can be interrupted at any time with Ctrl-C.
EPOCHS = 10
try:
    for epoch in range(EPOCHS):
        # Sum of the mini-batch losses seen so far in this epoch, kept as a
        # plain Python float.
        total_loss = 0.0
        for i, (center, context) in enumerate(data):
            # Reset the gradients of the computational graph
            cbow.zero_grad()

            # Forward pass; the module is called (rather than `.forward`)
            # so that registered hooks are run.
            # `nll_w_hat` holds log-probabilities and is of size
            # `batch_size` * `vocab_size`
            nll_w_hat = cbow(context)

            # Compute negative log-likelihood loss averaged over the
            # mini-batch
            loss = nll_loss(nll_w_hat, center)

            # Backward pass to compute gradients of each parameter
            loss.backward()

            # Gradient descent step according to the chosen optimizer
            optimizer.step()

            # `.item()` extracts the scalar value of the loss
            total_loss += loss.item()

            if i % 20 == 0:
                loss_avg = total_loss / (i + 1)
                print(
                    f"Epoch ({epoch}/{EPOCHS}), batch: ({i}/{len(data)}), loss: {loss_avg}"
                )

        # Print average loss after each epoch
        loss_avg = total_loss / len(data)
        print("{}/{} loss {:.2f}".format(epoch, EPOCHS, loss_avg))

        # Predict if `predict_center_word` is implemented.
        # Only the expected failures are ignored: the helper not being
        # defined yet (NameError) or a word missing from the vocabulary
        # (KeyError). A bare `except` would also swallow KeyboardInterrupt
        # and prevent the outer handler from stopping the training.
        try:
            left_words = ["le", "capitaine"]
            right_words = ["me", "dit"]
            word = predict_center_word(cbow, *left_words, *right_words)[0]
            print(" ".join(left_words + [word] + right_words))
        except (NameError, KeyError):
            pass

except KeyboardInterrupt:
    print("Stopped!")

Epoch (0/10), batch: (0/541), loss: 9.669973373413086
Epoch (0/10), batch: (20/541), loss: 9.35292911529541
Epoch (0/10), batch: (40/541), loss: 8.901827812194824
Epoch (0/10), batch: (60/541), loss: 8.625412940979004
Epoch (0/10), batch: (80/541), loss: 8.406564712524414
Epoch (0/10), batch: (100/541), loss: 8.243134498596191
Epoch (0/10), batch: (120/541), loss: 8.11301040649414
Epoch (0/10), batch: (140/541), loss: 7.999739646911621
Epoch (0/10), batch: (160/541), loss: 7.9211344718933105
Epoch (0/10), batch: (180/541), loss: 7.847839832305908
Epoch (0/10), batch: (200/541), loss: 7.782588481903076
Epoch (0/10), batch: (220/541), loss: 7.7256011962890625
Epoch (0/10), batch: (240/541), loss: 7.674098968505859
Epoch (0/10), batch: (260/541), loss: 7.628327369689941
Epoch (0/10), batch: (280/541), loss: 7.588433742523193
Epoch (0/10), batch: (300/541), loss: 7.546175003051758
Epoch (0/10), batch: (320/541), loss: 7.510591506958008
Epoch (0/10), batch: (340/541), loss: 7.47175788879394

In [54]:
# Reference value: negative log-likelihood of a uniform guess over the
# vocabulary, i.e. -log(1 / V) with V = len(idx2tok).
-np.log(1/len(idx2tok))

9.629248055780836

In [75]:
def predict_center_word_idx(cbow, *context_words_idx, k=10):
    """Return k-best center words given indexes of context words.

    Args:
        cbow: callable model mapping a ``1 * n_context`` index tensor to a
            ``1 * vocab_size`` tensor of scores.
        *context_words_idx: integer indexes of the context words.
        k: maximum number of candidates to return; it is capped at the
            number of scores produced by the model.

    Returns:
        List of tokens (looked up in the module-level `idx2tok`), ordered
        from the highest to the lowest score.
    """
    # Create a fake minibatch containing just one example
    fake_minibatch = torch.tensor(context_words_idx, dtype=torch.long).unsqueeze(0)

    # Forward propagate through the CBOW model; no gradient is needed for
    # inference, so no computational graph is built.
    with torch.no_grad():
        dist_center = cbow(fake_minibatch).squeeze(0)

    # `torch.topk` raises if `k` exceeds the number of elements, so cap it
    k = min(k, dist_center.size(0))

    # Retrieve top k-best indexes using `torch.topk`
    _, best_idxs = torch.topk(dist_center, k=k)

    # Return actual tokens using `idx2tok`
    return [idx2tok[idx] for idx in best_idxs.tolist()]


def predict_center_word(cbow, *context_words, k=10):
    """Return k-best center words given context words.

    Each context word is translated to its index through `tok2idx` and the
    lookup is delegated to `predict_center_word_idx`.
    """
    context_idxs = map(tok2idx.__getitem__, context_words)
    return predict_center_word_idx(cbow, *context_idxs, k=k)

In [76]:
# Print the best center-word candidates for a few example contexts
example_contexts = (
    ("vingt", "mille", "sous", "les"),
    ("mille", "lieues", "les", "mers"),
    ("le", "commandant", "fut", "le"),
)
for context_words in example_contexts:
    print(predict_center_word(cbow, *context_words))

['pieds', 'milles', 'mètres', 'trous', 'six', 'roches', 'lieues', 'livres', 'Pendant', 'de']
['par', 'du', 'de', 'deux', 'et', 'Dans', 'sept', 'dans', 'soit', 'commun']
['Farragut', 'Nautilus', 'tuerait', 'cria', 'ci', 'sur', 'frictionnions', 'habituel', 'second', 'attendre']


# Testing the embedding

Tokens by decreasing frequency

In [78]:
# Count the occurrences of each vocabulary entry in the corpus
token_ids = np.asarray([tok2idx[tok] for tok in tokens], dtype=np.intp)
freq = np.zeros(len(idx2tok), dtype=int)
np.add.at(freq, token_ids, 1)  # unbuffered add: repeated indexes accumulate

# Vocabulary indexes ordered from the most to the least frequent token
idxs = np.argsort(freq)[::-1]

# (token, count) pairs by decreasing frequency
vocab_array = np.array(idx2tok)
words_decreasing_freq = list(zip(vocab_array[idxs], freq[idxs]))

In [80]:
from gensim.models.keyedvectors import WordEmbeddingsKeyedVectors

In [84]:
# Copy the learned embedding matrix out of the model (detached from the
# computational graph) as a Numpy array
embedding_matrix = cbow.embeddings.weight.detach().numpy()

# Register one vector per vocabulary token in a gensim keyed-vectors object
m = WordEmbeddingsKeyedVectors(vector_size=EMBEDDING_SIZE)
m.add(idx2tok, embedding_matrix)

In [85]:
# Show the vocabulary entries whose learned embeddings are the most similar
# to the one of "bassins"
m.most_similar("bassins")

[('comatules', 0.5313119292259216),
 ('embranchements', 0.529474139213562),
 ('Porcelaines', 0.5240614414215088),
 ('tatous', 0.5176289081573486),
 ('corsetés', 0.5093542337417603),
 ('côtes', 0.5086833238601685),
 ('entrailles', 0.4992835521697998),
 ('immodérément', 0.497053325176239),
 ('perspective', 0.4962666928768158),
 ('regardant', 0.492682546377182)]