# CBOW model trained on "20000 lieues sous les mers"
## Needed libraries

In [None]:
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim

import spacy
from spacy.lang.fr import French

In [None]:
# Load spaCy's "fr_core_news_sm" pipeline. It has to be installed beforehand:
#   python -m spacy download fr_core_news_sm
# NOTE(review): `spacy_fr` is not referenced anywhere else in the visible
# source — confirm whether this load is still needed.
spacy_fr = spacy.load("fr_core_news_sm")

## Tokenizing the corpus

In [None]:
# Build a tokenizer for the French language. `French().tokenizer` is the
# tokenizer of a blank French pipeline; unlike `Defaults.create_tokenizer()`
# (spaCy 2.x only) it is available in both spaCy 2.x and 3.x.
tokenizer = French().tokenizer

with open("data/20_000_lieues_sous_les_mers.txt", "r", encoding="utf-8") as f:
    document = tokenizer(f.read())

# Keep only the purely alphabetic tokens of `document` and lowercase them so
# that case variants of a word share a single vocabulary entry.
tokens = [tok.text.lower() for tok in document if tok.is_alpha]

# `idx2tok` lists every distinct token once, in order of first appearance
# (`dict.fromkeys` de-duplicates while preserving insertion order).
# `tok2idx` is the inverse mapping: token -> its index in `idx2tok`.
idx2tok = list(dict.fromkeys(tokens))
tok2idx = {tok: idx for idx, tok in enumerate(idx2tok)}




## The continuous bag of words model

In [None]:
class CBOW(nn.Module):
    """Continuous bag-of-words model.

    The embeddings of the context words are averaged, projected onto the
    vocabulary by a bias-free linear layer and normalised with a
    log-softmax, giving log-probabilities for the center word.
    """

    def __init__(self, vocab_size, embedding_size):
        """Create the embedding table and the output projection.

        Args:
            vocab_size: number of distinct tokens in the vocabulary.
            embedding_size: dimension of each token embedding.
        """
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size

        # One `embedding_size`-dimensional vector per vocabulary entry
        self.embeddings = nn.Embedding(vocab_size, embedding_size)
        # Linear transform without bias: one score per vocabulary entry
        self.U_transpose = nn.Linear(embedding_size, vocab_size, bias=False)

    def forward(self, context):
        """Return center-word log-probabilities for each context.

        Args:
            context: integer tensor of token indexes, of size
                `batch_size` * `n_context` (one row of context words per
                example).

        Returns:
            Tensor of size `batch_size` * `vocab_size`; each row holds
            log-probabilities over the vocabulary.
        """
        # `batch_size` * `n_context` * `embedding_size`
        context_vectors = self.embeddings(context)

        # Average over the context axis: `batch_size` * `embedding_size`
        mean_vector = context_vectors.mean(dim=1)

        # Scores over the vocabulary: `batch_size` * `vocab_size`
        scores = self.U_transpose(mean_vector)

        return F.log_softmax(scores, dim=1)


# Embedding dimension, and vocabulary size taken from the token list
EMBEDDING_SIZE = 64
VOCAB_SIZE = len(idx2tok)

# Instantiate the continuous bag-of-words model
cbow = CBOW(vocab_size=VOCAB_SIZE, embedding_size=EMBEDDING_SIZE)

## Preparing the data

In [None]:
def ngrams_iterator(token_list, ngrams):
    """Generate successive N-grams from a list of tokens.

    Every window of `ngrams` consecutive tokens is converted to vocabulary
    indexes through `tok2idx` and split into its center word and the
    remaining context words.

    Args:
        token_list: sequence of tokens, all present in `tok2idx`.
        ngrams: width of the sliding window, center word included.

    Yields:
        `(center, context)` pairs: the index of the center word and a Numpy
        array with the indexes of the other words of the window (Pytorch
        converts the array to a Tensor automatically).
    """
    # One copy of the token list per window position, each starting one
    # token further; zipping them walks over all complete windows.
    shifted_views = [token_list[offset:] for offset in range(ngrams)]
    middle = ngrams // 2

    for window in zip(*shifted_views):
        indexes = [tok2idx[word] for word in window]

        # Split the window around its center element
        target = indexes[middle]
        neighbours = indexes[:middle] + indexes[middle + 1:]

        yield target, np.array(neighbours)


# Width of the sliding window (center word included) and mini-batch size
NGRAMS = 5
BATCH_SIZE = 256

# Materialise every (center, context) pair of the corpus
ngrams = list(ngrams_iterator(tokens, NGRAMS))

# Mini-batches of (center, context) pairs, reshuffled at every pass
data = torch.utils.data.DataLoader(
    dataset=ngrams, batch_size=BATCH_SIZE, shuffle=True
)

## Learn CBOW model

In [None]:
# Negative log-likelihood loss from the `nn` submodule
nll_loss = nn.NLLLoss()

# Adam algorithm on the parameters of `cbow`, with a learning rate of 0.01
optimizer = optim.Adam(params=cbow.parameters(), lr=0.01)

In [None]:
# Train `cbow` for EPOCHS passes over `data`, reporting the running average
# loss every 20 mini-batches and the average loss at the end of each epoch.
# Interrupting the kernel (KeyboardInterrupt) stops training cleanly.
EPOCHS = 10
try:
    for epoch in range(EPOCHS):
        # Running sum of the mini-batch losses of the current epoch
        total_loss = 0.0
        for i, (center, context) in enumerate(data):
            # Reset the gradients of the computational graph
            cbow.zero_grad()

            # Forward pass: log-probabilities over the vocabulary
            nll_w_hat = cbow(context)

            # Compute negative log-likelihood loss averaged over the
            # mini-batch
            loss = nll_loss(nll_w_hat, center)

            # Backward pass to compute gradients of each parameter
            loss.backward()

            # Gradient descent step according to the chosen optimizer
            optimizer.step()

            # `.item()` extracts a plain Python float, so the running sum
            # does not hold on to tensors
            total_loss += loss.item()

            if i % 20 == 0:
                loss_avg = total_loss / (i + 1)
                # Counters are shown one-based so the last epoch/batch reads
                # N/N
                print(
                    f"Epoch ({epoch + 1}/{EPOCHS}), batch: ({i + 1}/{len(data)}), loss: {loss_avg}"
                )

        # Print average loss after each epoch
        loss_avg = total_loss / len(data)
        print("{}/{} loss {:.2f}".format(epoch + 1, EPOCHS, loss_avg))

        # Preview a prediction if `predict_center_word` is implemented
        left_words = ["le", "capitaine"]
        right_words = ["me", "dit"]
        try:
            # Inference only: no gradient tracking needed
            with torch.no_grad():
                word = predict_center_word(cbow, *left_words, *right_words)[0]
        except NameError:
            # `predict_center_word` is not defined yet: skip the preview
            pass
        except Exception as err:
            # The preview is best-effort: report the problem instead of
            # hiding it, but never abort training because of it.
            # (KeyboardInterrupt is deliberately not caught here so that it
            # reaches the outer handler.)
            print(f"Prediction skipped: {err!r}")
        else:
            print(" ".join(left_words + [word] + right_words))

except KeyboardInterrupt:
    print("Stopped!")

## Prediction functions

Now that the model has been trained, we can give it a context it has never
seen and look at which center words it predicts.

In [None]:
def predict_center_word_idx(cbow, *context_words_idx, k=10):
    """Return k-best center words given indexes of context words.

    Args:
        cbow: callable mapping a `1` * `n_context` tensor of token indexes
            to a `1` * `vocab_size` tensor of scores (higher means more
            likely), such as a trained `CBOW` model.
        *context_words_idx: vocabulary indexes of the context words.
        k: maximum number of candidates to return. It is capped at the
            number of scores produced by `cbow`, so a vocabulary smaller
            than `k` no longer makes `torch.topk` fail.

    Returns:
        List of tokens looked up in `idx2tok`, most likely first.
    """
    # Create a fake minibatch containing just one example
    fake_minibatch = torch.tensor(
        list(context_words_idx), dtype=torch.long
    ).unsqueeze(0)

    # Forward propagate through the model; this is inference only, so
    # gradient tracking is disabled
    with torch.no_grad():
        # `squeeze(0)` removes the batch axis only, so a one-word
        # vocabulary still yields a 1-D tensor
        dist_center = cbow(fake_minibatch).squeeze(0)

    # `torch.topk` raises when k exceeds the number of scores: cap it
    k = min(k, dist_center.numel())
    _, best_idxs = torch.topk(dist_center, k=k)

    # Convert to plain ints before indexing `idx2tok`
    return [idx2tok[idx] for idx in best_idxs.tolist()]


def predict_center_word(cbow, *context_words, k=10):
    """Return k-best center words given context words.

    Each context word is translated to its index through `tok2idx` (a
    `KeyError` is raised for a word that is not in it) and the indexes are
    handed over to `predict_center_word_idx`.

    Args:
        cbow: model forwarded to `predict_center_word_idx`.
        *context_words: context tokens, in order.
        k: number of candidates, forwarded to `predict_center_word_idx`.
    """
    context_idxs = tuple(tok2idx[word] for word in context_words)
    return predict_center_word_idx(cbow, *context_idxs, k=k)

In [None]:
# Ask the model for center-word candidates for three four-word contexts.
# NOTE(review): when these run together as one notebook cell, presumably only
# the last call's result is displayed — confirm whether the first two should
# be printed or moved to their own cells.
predict_center_word(cbow, "vingt", "mille", "sous", "les")
predict_center_word(cbow, "mille", "lieues", "les", "mers")
predict_center_word(cbow, "le", "commandant", "fut", "le")

## Testing the embedding

Tokens by decreasing frequency

In [None]:
# Number of occurrences of each vocabulary entry, aligned with `idx2tok`
freq = np.zeros(len(idx2tok), dtype=int)
for tok in tokens:
    freq[tok2idx[tok]] += 1

# Vocabulary indexes ordered from the most to the least frequent token
idxs = np.argsort(freq)[::-1]

# (token, count) pairs in that order
words_decreasing_freq = [
    (word, count) for word, count in zip(np.array(idx2tok)[idxs], freq[idxs])
]

We use the library `gensim` to easily compute most similar words for
the embedding we just learned.

In [None]:
# NOTE(review): `WordEmbeddingsKeyedVectors` and `.add` look like the
# gensim 3.x API; gensim 4 presumably exposes `KeyedVectors` and
# `add_vectors` instead — confirm against the installed gensim version.
from gensim.models.keyedvectors import WordEmbeddingsKeyedVectors

# Keyed-vector container for `EMBEDDING_SIZE`-dimensional vectors
m = WordEmbeddingsKeyedVectors(vector_size=EMBEDDING_SIZE)
# Register the tokens of `idx2tok` together with the learned embedding
# matrix (detached from autograd and converted to a NumPy array)
m.add(idx2tok, cbow.embeddings.weight.detach().numpy())

You can now look up the most similar words for tokens such as "lieues",
"mers" or "professeur". Browse `words_decreasing_freq` to pick the most
frequent tokens to test.

In [None]:
# Query the nearest neighbours of a few tokens in the learned embedding.
# NOTE(review): when these run together as one notebook cell, presumably only
# the last call's result is displayed — confirm whether each query should be
# printed or moved to its own cell.
m.most_similar("lieues")
m.most_similar("professeur")
m.most_similar("mers")
m.most_similar("a")
m.most_similar("été")
m.most_similar("ma")