# PyTorch - Continuous Bag of Words

Predict word relationships within a corpus.

### Setup notebook

In [None]:
import numpy as np
import torch
import torch.autograd as autograd
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Seed PyTorch's random number generator so randomly initialised parameters
# (embeddings, linear layers) are repeatable between runs
torch.manual_seed(1)

### Word embedding example

In [None]:
word_to_ix = {"hello": 0, "world": 1}

# Embedding table: 2 words in the vocabulary, 5-dimensional vectors
embeds = nn.Embedding(2, 5)

# Lookup input: a 1-D int64 tensor holding the vocabulary index of "hello".
# torch.tensor replaces the legacy torch.LongTensor constructor.
lookup_tensor = torch.tensor([word_to_ix["hello"]], dtype=torch.long)

# Output: the embedding row for "hello", shape (1, 5).
# Tensors can be passed to modules directly; the autograd.Variable wrapper
# has been a deprecated no-op since PyTorch 0.4.
hello_embed = embeds(lookup_tensor)

print(word_to_ix["hello"])
print(lookup_tensor)
print(hello_embed)

### Data

In [None]:
# Toy corpus, split on whitespace into a flat list of tokens.
# Punctuation stays attached to the neighbouring word (e.g. "process.").
raw_text = """We are about to study the idea of a computational process.
Computational processes are abstract beings that inhabit computers.
As they evolve, processes manipulate other abstract things called data.
The evolution of a process is directed by a pattern of rules
called a program. People create programs to direct processes. In effect,
we conjure the spirits of the computer with our spells.""".split()

### N-gram

In [None]:
# Context window: CONTEXT_SIZE words on each side of the target word
# (2 words to the left, 2 to the right)
CONTEXT_SIZE = 2
# Dimensionality of each word embedding vector
EMBEDDING_DIM = 10

In [None]:
def get_ngrams(wordlist, n):
    """Pair every word with the n words on each side of it.

    Args:
        wordlist: sliceable sequence of tokens.
        n: number of context words taken on each side of the target.

    Returns:
        A list of ``(context, target)`` tuples, where ``context`` is a tuple
        of the ``2 * n`` surrounding tokens in reading order and ``target``
        is the token in the middle of the window. Only complete windows are
        produced, so a sequence shorter than ``2 * n + 1`` gives ``[]``.
    """
    window = 2 * n + 1

    # One shifted copy of the sequence per context position; the middle
    # offset is skipped because that position holds the target itself.
    shifted = []
    for offset in range(window):
        if offset == n:
            continue
        shifted.append(wordlist[offset:])

    # zip stops at the shortest shifted copy, dropping incomplete windows.
    contexts = [ctx for ctx in zip(*shifted)]
    targets = wordlist[n:]
    return [(ctx, tgt) for ctx, tgt in zip(contexts, targets)]

In [None]:
# Build the (context, target) pairs for the corpus and preview the first five
ngrams = get_ngrams(raw_text, CONTEXT_SIZE)
print(ngrams[:5])

#### Create vocabulary

In [None]:
# Unique tokens of the corpus, sorted so the vocabulary order (and every
# index derived from it) is the same on every run. Iterating a bare set of
# strings follows hash order, which changes between interpreter sessions and
# would defeat the fixed torch seed.
vocab = sorted(set(raw_text))
VOCAB_SIZE = len(vocab)

#### Create dictionary

In [None]:
# Map each vocabulary word to an integer index: its position when iterating
# over vocab. These indices are the row numbers used by the embedding layer.
word_to_idx = {word: i for i, word in enumerate(vocab)}
print(word_to_idx)

In [None]:
# Convert words to a LongTensor of vocabulary indices
def idx_Variable(context, word_to_idx):
    """Convert a word or a sequence of words to a tensor of vocabulary indices.

    Args:
        context: a single word (``str``) or an iterable of words.
        word_to_idx: mapping from word to integer index.

    Returns:
        A 1-D ``torch.int64`` tensor with one index per word, in input order.
        A single word gives a tensor of length 1.

    Raises:
        KeyError: if a word is missing from ``word_to_idx``.
    """
    # A bare string is one word, not an iterable of characters.
    words = [context] if isinstance(context, str) else context
    # Return a plain tensor: the autograd.Variable wrapper has been a
    # deprecated no-op since PyTorch 0.4, and torch.tensor replaces the
    # legacy torch.LongTensor constructor.
    return torch.tensor([word_to_idx[w] for w in words], dtype=torch.long)

### Bag of Words

Use the `log_softmax` function to compute a log probability for every word in the vocabulary; the word with the highest log probability is the one the model considers most related to the given context.

In [None]:
class CBOW(nn.Module):
    """Continuous Bag of Words model.

    The embeddings of the context words are summed, passed through one hidden
    ReLU layer and projected to a log-probability for every vocabulary word.

    Args:
        vocab_size: number of distinct words (rows of the embedding table and
            size of the output layer).
        embedding_dim: length of each word embedding vector.
        context_size: accepted for interface compatibility but not used; the
            embeddings are summed, so any number of context words works.
        hidden_dim: width of the hidden layer (defaults to 128, the value
            that was previously hard-coded).
    """

    def __init__(self, vocab_size, embedding_dim, context_size, hidden_dim=128):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(embedding_dim, hidden_dim)  # input: summed embeddings
        self.linear2 = nn.Linear(hidden_dim, vocab_size)

    def forward(self, inputs):
        """Compute log-probabilities over the vocabulary.

        Args:
            inputs: integer tensor of word indices, either one context of
                shape ``(n_words,)`` or a batch of shape ``(batch, n_words)``.

        Returns:
            Tensor of shape ``(1, vocab_size)`` for a single context, or
            ``(batch, vocab_size)`` for a batch; each row is a log-softmax
            distribution over the vocabulary.
        """
        embeds = self.embeddings(inputs)         # (..., n_words, embedding_dim)
        # Sum over the word axis. Addressing axes from the end keeps the
        # single-context result unchanged while also supporting a batch axis.
        out = embeds.sum(dim=-2)                 # (..., embedding_dim)
        out = F.relu(self.linear1(out))          # (..., hidden_dim)
        out = self.linear2(out)                  # (..., vocab_size)
        log_probs = F.log_softmax(out, dim=-1)   # normalise over the vocabulary
        # Always return 2-D (rows, vocab_size), the layout nn.NLLLoss expects.
        return log_probs.view(-1, log_probs.size(-1))
    

### Create model

Ideas to try:
 - dynamic loss functions
 - dynamic layers, activations

In [None]:
# Instantiate the CBOW network sized for this corpus and print its layers
model = CBOW(VOCAB_SIZE, EMBEDDING_DIM, CONTEXT_SIZE)
print(model)

### Define loss and optimizer

In [None]:
# Negative log-likelihood loss: it takes log-probabilities as input, which is
# what CBOW.forward returns (log_softmax output)
loss_function = nn.NLLLoss()
# Plain stochastic gradient descent over all model parameters
optimizer = optim.SGD(model.parameters(), lr=0.001)

### Train model

In [None]:
# Convert every (context, target) pair to index tensors once; the pairs are
# identical for every epoch, so rebuilding them inside the loop is wasted work.
training_pairs = [
    (idx_Variable(context, word_to_idx), idx_Variable(target, word_to_idx))
    for context, target in get_ngrams(raw_text, CONTEXT_SIZE)
]

losses = []  # summed training loss, one entry per epoch

for epoch in range(100):
    total_loss = 0.0

    for context_var, target_var in training_pairs:

        # Step 1. Zero out the gradients (reset) to avoid accumulation
        model.zero_grad()

        # Step 2. Forward pass: log-probabilities over the vocabulary
        log_probs = model(context_var)

        # Step 3. Compute loss against the index of the true centre word
        loss = loss_function(log_probs, target_var)

        # Step 4. Do the backward pass and update the parameters
        loss.backward()
        optimizer.step()

        # .item() extracts a plain Python float, so the running total holds
        # no reference to the autograd graph (replaces the deprecated .data).
        total_loss += loss.item()

    losses.append(total_loss)

print(losses[-5:])  # the summed loss is expected to trend downward across epochs

### Test

In [None]:
# Rank the vocabulary by predicted log-probability for a few single-word contexts
n = 10

# Reverse lookup (index -> word). It does not depend on the target word, so
# build it once instead of on every iteration.
idx_to_word = {v: k for k, v in word_to_idx.items()}

# Inference only: disable autograd so no graph is built and the outputs can
# be converted to numpy directly.
with torch.no_grad():
    for target in list(vocab)[:n]:
        test_var = idx_Variable(target, word_to_idx)
        log_probs = model(test_var)

        # Probabilities as percentages; `perc` from the last target is
        # reused by the following cell.
        perc = (torch.exp(log_probs) * 100).squeeze().numpy()
        log_probs = log_probs.numpy()[0]

        # Sort words from highest to lowest log-probability
        related = [idx_to_word[i] for i in np.argsort(log_probs)[::-1]]

        # min and max related words
        print("Target: {}\n-most related: {}\n-least related: {}\n".format(target, related[0], related[-1]))

In [None]:
# Percentage probabilities: how strongly each vocabulary word relates to the
# last target word evaluated above. `perc` is index-aligned with word_to_idx,
# whose keys were inserted in index order.
print([
    '{} is {:.0f}% related'.format(word, pct)
    for pct, word in zip(perc, word_to_idx)
])