# Understanding Word2Vec
The word-to-vector technique is popularly known as Word2Vec. Word2Vec takes the concept of co-occurrence-based information to the next level by applying a neural network with a single hidden layer to it.  Word2Vec was originally proposed by Google researcher Tomas Mikolov in 2013. Word2Vec belongs to the category of Vector Space Models (VSM).  These models represent a word as a multi-dimensional vector; the vectors are learned so that similar or frequently co-occurring words are placed close together in the vector space.

In [None]:
## Author: Sunil Patel
## Copyright: Copyright 2018-2019, Packt Publishing Limited
## Version: 0.0.1
## Maintainer: Sunil Patel
## Email: snlpatel01213@hotmail.com
## Linkedin: https://www.linkedin.com/in/linus1/
## Contributor : {if you debug, append your name here}
## Contributor Email : {if you debug, append your email here}
## Status: active

# Importing Requirements 

In [None]:
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from tqdm import tqdm
from tensorboardX import SummaryWriter
import numpy as np
import matplotlib.pyplot as plt 
import nltk
# Seed PyTorch's random number generator with a fixed value.
torch.manual_seed(1)

from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
# TensorboardX writer, logging under 'runs/'; the test_* functions below use it
# to export embedding projections.
writer = SummaryWriter(log_dir='runs/')
# Download NLTK's 'popular' data bundle — presumably this supplies the
# stop-word list and tokenizer data used by remove_stop_words; confirm.
nltk.download('popular')



# Selecting Device

In [None]:
# Prefer the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print("Using Device : ", device)

# Some Preprocessing

In [None]:
def remove_stop_words(text):
    """Strip English stop words from every string in ``text``.

    Each item is tokenized with NLTK's ``word_tokenize``; tokens that appear
    in NLTK's English stop-word list (exact, case-sensitive match) are
    dropped and the remaining tokens are joined with single spaces.

    Args:
        text: Iterable of strings to clean.

    Returns:
        A list holding one cleaned string per input item, in input order.
        An item whose tokens are all stop words yields an empty string.
    """
    # Build the lookup set once so each membership test is O(1).
    english_stops = set(stopwords.words('english'))

    def _clean(sentence):
        kept = [tok for tok in word_tokenize(sentence)
                if tok not in english_stops]
        return ' '.join(kept)

    return [_clean(sentence) for sentence in text]

# Reading data and Partitioning

In [None]:
CONTEXT_SIZE = 2  # 2 words to the left, 2 to the right

# Read the corpus as whitespace-separated words; the context manager closes
# the file handle as soon as the text has been loaded.
with open("data/testdata_en.txt") as corpus_file:
    text = corpus_file.read().split()

# Only the first 10000 words are used. remove_stop_words returns '' for an
# item that is itself a stop word, so those empty placeholders are dropped
# here instead of being kept as a (very frequent) bogus vocabulary entry.
text = [token for token in remove_stop_words(text[:10000]) if token]
split_ind = int(len(text) * 0.8)  # index located at 80% of the token list
vocab = set(text)
vocab_size = len(vocab)
print('vocab_size:', vocab_size)
# Both lookups enumerate the same unmodified set, so their indices agree.
w2i = {w: i for i, w in enumerate(vocab)}
i2w = {i: w for i, w in enumerate(vocab)}

# CBOW
Word2Vec is a computationally reliable and mathematically stable approach that learns vector representations by using either the CBOW (Continuous Bag of Words) or the Skip-Gram technique. These two techniques are different ways in which a Word2Vec model can be trained. Before we go into the technical details of how these two techniques work, let's see how a Word2Vec model is trained in general. A Word2Vec model is a neural network with a single hidden layer and a linear activation (i.e. no non-linearity). It has three layers: an input layer, a hidden layer and an output layer, as shown in the figure below: 
![](figures/CBOW.png)

## Create CBOW dataset

In [None]:
def create_cbow_dataset(text, context_size=2):
    """Build (context, target) training pairs for CBOW.

    Args:
        text: Sequence of tokens.
        context_size: Number of tokens taken on each side of the target.
            Defaults to 2, the window this function originally hard-coded.

    Returns:
        A list of ``(context, target)`` tuples. ``context`` is a list of
        ``2 * context_size`` tokens: the left neighbours followed by the
        right neighbours, in text order. Positions that lack a full window
        on both sides are skipped, so a too-short ``text`` gives ``[]``.

    Raises:
        ValueError: If ``context_size`` is negative.
    """
    if context_size < 0:
        raise ValueError("context_size must be non-negative")

    data = []
    for i in range(context_size, len(text) - context_size):
        left = list(text[i - context_size:i])
        right = list(text[i + 1:i + context_size + 1])
        data.append((left + right, text[i]))
    return data

In [None]:
# Build the (context, target) pairs used to train CBOW and show the first one.
cbow_train = create_cbow_dataset(text)
print('cbow sample', cbow_train[0])

## Creating CBOW Model

Mathematically, it can be written as follows:
$$ X_{(1,500)} \times W_{i\,(500,300)} \rightarrow H_{(1,300)} \times W_{o\,(300,500)} \rightarrow \hat{Y}_{(1,500)} \rightarrow \text{Softmax} \rightarrow \text{Argmax} \rightarrow \text{Error} $$

In the above equation the vocabulary size is 500, so each token $ X $ can be represented as a one-hot vector of size $ (1,500) $. We want the embedding $ H $ to have dimension 300, so we multiply the input $ X $ by the weight matrix $ W_i $ of dimension $ (500,300) $. This embedding vector is then multiplied by another weight matrix $ W_o $ of size $ (300,500) $ to convert it into an output vector $ \hat Y $ of floating-point scores, one per vocabulary token, indicating how likely each token is. The softmax operation is applied to this output to obtain a probability distribution. The predicted token is the one with the highest probability in this distribution. If the predicted token is the same as the actual target token, the error is zero; otherwise the error is back-propagated and the weights are adjusted accordingly.



In [None]:
class CBOW(nn.Module):
    """Continuous-bag-of-words network: embeddings followed by two linear layers.

    Args:
        vocab_size: Number of distinct tokens; sizes the embedding table and
            the output layer.
        embd_size: Dimensionality of each token embedding.
        context_size: Number of context tokens on each side of the target;
            the first linear layer expects ``2 * context_size`` embeddings.
        hidden_size: Width of the hidden layer between the two linear maps.
    """

    def __init__(self, vocab_size, embd_size, context_size, hidden_size):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embd_size)
        self.linear1 = nn.Linear(2*context_size*embd_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, vocab_size)

    def forward(self, inputs):
        """Score every vocabulary token as the target for one context window.

        Args:
            inputs: Integer tensor holding the ``2 * context_size`` context
                token indices of a single example.

        Returns:
            A ``(log_probs, hid)`` tuple: ``log_probs`` has shape
            ``(1, vocab_size)`` and holds log-probabilities over the
            vocabulary; ``hid`` has shape ``(1, hidden_size)`` and is the
            output of the first linear layer.
        """
        # Concatenate all context embeddings into a single row vector.
        embedded = self.embeddings(inputs).view((1, -1))
        hid = self.linear1(embedded)
        out = self.linear2(hid)
        # Normalise over the vocabulary axis; leaving ``dim`` implicit is
        # deprecated and emits a warning.
        log_probs = F.log_softmax(out, dim=1)
        return log_probs, hid

## Training CBOW

In [None]:
embd_size = 100  # dimensionality of each word-embedding vector
learning_rate = 0.01  # learning rate handed to the SGD optimizers
n_epoch = 10  # number of passes over the training pairs

def train_cbow():
    """Train a CBOW model on the module-level ``cbow_train`` pairs.

    Uses plain SGD with ``learning_rate`` for ``n_epoch`` epochs, updating
    the weights after every single (context, target) pair and minimising
    the negative log-likelihood of the target token.

    Returns:
        A ``(model, losses)`` tuple: the trained ``CBOW`` module (on
        ``device``) and a list holding the summed loss of each epoch.
    """
    hidden_size = 64
    losses = []
    loss_fn = nn.NLLLoss()
    model = CBOW(vocab_size, embd_size, CONTEXT_SIZE, hidden_size).to(device)
    print(model)
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)

    for _ in tqdm(range(n_epoch)):
        total_loss = 0.0
        for context, target in cbow_train:
            # Plain tensors suffice: the Variable wrapper is deprecated and
            # simply returns the tensor it is given.
            ctx_idxs = torch.tensor([w2i[w] for w in context],
                                    dtype=torch.long, device=device)
            target_idx = torch.tensor([w2i[target]],
                                      dtype=torch.long, device=device)

            optimizer.zero_grad()
            log_probs, _ = model(ctx_idxs)

            loss = loss_fn(log_probs, target_idx)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
        losses.append(total_loss)
    return model, losses

In [None]:
# Train CBOW and keep the per-epoch loss totals for plotting.
cbow_model, cbow_losses = train_cbow()

## Plotting Losses

In [None]:
# One point per epoch: the loss summed over all training pairs in that epoch.
plt.plot(cbow_losses)

## Examining quality on test
Finally, if we plot a 3D projection of the resulting vectors using TensorboardX, it looks like the figure below.
![](figures/w2v_tensorboard.png)

In [None]:
# You have to use other dataset for test, but in this case I use training data because this dataset is too small
def test_cbow(test_data, model):
    """Evaluate a CBOW model and export hidden vectors to TensorboardX.

    For every (context, target) pair the most likely token is predicted.
    The hidden-layer vector of the first occurrence of each distinct
    predicted word is collected and written through the module-level
    ``writer`` for the embedding projector. Evaluation stops early once
    10000 correct predictions have been counted.

    Side effects: prints a header and the accuracy, writes the embedding,
    exports scalars to ``all_scalars.json`` and closes ``writer``.

    Args:
        test_data: Iterable of ``(context_tokens, target_token)`` pairs whose
            tokens are all present in ``w2i``.
        model: Trained ``CBOW`` module located on ``device``.
    """
    print('====Test CBOW===')
    vector_array = []
    predicted_word_array = []
    seen_words = set()  # O(1) duplicate check instead of scanning the list
    correct_ct = 0
    total_ct = 0

    # Pure inference: no gradients are needed, so skip graph construction.
    with torch.no_grad():
        for ctx, target in test_data:
            ctx_var = torch.tensor([w2i[w] for w in ctx],
                                   dtype=torch.long, device=device)
            log_probs, hidden = model(ctx_var)

            _, predicted = torch.max(log_probs, 1)
            predicted_word = i2w[int(predicted[0])]
            total_ct += 1

            if predicted_word not in seen_words:
                seen_words.add(predicted_word)
                vector_array.append(hidden[0].cpu())
                predicted_word_array.append(str(predicted_word))

            if predicted_word == target:
                correct_ct += 1
                if correct_ct == 10000:
                    break

    if total_ct:
        print('Accuracy: {:.2f}% ({}/{})'.format(
            100.0 * correct_ct / total_ct, correct_ct, total_ct))

    # for visualization using tensorboardX
    if vector_array:  # add_embedding needs a non-empty 2-D matrix
        writer.add_embedding(torch.stack(vector_array),
                             metadata=predicted_word_array, global_step=2)
    writer.export_scalars_to_json("all_scalars.json")
    writer.close()


In [None]:
# Evaluated on the training pairs themselves; no separate test split is built here.
test_cbow(cbow_train, cbow_model)

# To Do 
- See how data is prepared for Skip Gram
- See how model is prepared for Skip Gram 
- Train skipgram model
- Insert TensorboardX related code to  `test_skipgram` function and visualize quality of your embeddings
- Tune parameters and repeat


In [None]:
def create_skipgram_dataset(text):
    """Build labelled (focus, context, label) triples for skip-gram training.

    For every position that has two neighbours on each side, four positive
    triples (label 1) pair the focus token with the tokens at offsets
    -2, -1, +1 and +2. Four negative triples (label 0) then pair the focus
    token with tokens drawn at random from positions *outside* that window.

    Args:
        text: Sequence of tokens.

    Returns:
        A list of ``(focus, other, label)`` tuples. Positions for which no
        token exists outside the window on either side contribute only
        their positive triples.
    """
    import random

    window = 2
    negatives_per_word = 4
    last = len(text) - 1
    data = []
    for i in range(window, len(text) - window):
        focus = text[i]
        for offset in (-2, -1, 1, 2):
            data.append((focus, text[i + offset], 1))

        # negative sampling
        # Candidate positions must lie strictly outside [i - window, i + window];
        # sampling inside it would label a true context word as negative.
        left_hi = i - window - 1    # last usable index on the left
        right_lo = i + window + 1   # first usable index on the right
        has_left = left_hi >= 0
        has_right = right_lo <= last
        if not (has_left or has_right):
            continue
        for _ in range(negatives_per_word):
            if has_left and (not has_right or random.random() < 0.5):
                rand_id = random.randint(0, left_hi)
            else:
                rand_id = random.randint(right_lo, last)
            data.append((focus, text[rand_id], 0))
    return data

In [None]:
# Build the (focus, context, label) triples for skip-gram and show the first one.
skipgram_train = create_skipgram_dataset(text)
print('skipgram sample', skipgram_train[0])

In [None]:
class SkipGram(nn.Module):
    """Skip-gram scorer that shares one embedding table for both words.

    Args:
        vocab_size: Number of rows in the embedding table.
        embd_size: Dimensionality of each embedding vector.
    """

    def __init__(self, vocab_size, embd_size):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embd_size)

    def forward(self, focus, context):
        """Score how well ``context`` fits ``focus``.

        Args:
            focus: Index tensor for the focus word.
            context: Index tensor for the candidate context word.

        Returns:
            A ``(log_probs, hidden)`` tuple: ``log_probs`` is the
            log-sigmoid of the dot product of the two flattened embeddings,
            shaped ``(1, 1)``; ``hidden`` is the flattened context
            embedding, shaped ``(1, -1)``.
        """
        focus_vec = self.embeddings(focus).view(1, -1)
        context_vec = self.embeddings(context).view(1, -1)
        # Row vector times column vector gives the dot product as a 1x1 matrix.
        similarity = focus_vec.mm(context_vec.t())
        return F.logsigmoid(similarity), context_vec

In [None]:
def train_skipgram():
    """Train a SkipGram model on the module-level ``skipgram_train`` triples.

    Uses plain SGD with ``learning_rate`` for ``n_epoch`` epochs, updating
    the weights after every single (focus, context, label) triple. The
    model and all tensors stay on the CPU.

    Returns:
        A ``(model, losses)`` tuple: the trained ``SkipGram`` module and a
        list holding the summed loss of each epoch.
    """
    losses = []
    # The model emits log(sigmoid(score)), which is never positive. A squared
    # error against a 0/1 label would therefore push *every* pair, positive
    # or negative, towards a high score. Binary cross-entropy on the
    # probability rewards high scores for label 1 and low scores for label 0.
    loss_fn = nn.BCELoss()
    model = SkipGram(vocab_size, embd_size)
    print(model)
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)

    for _ in range(n_epoch):
        total_loss = 0.0
        for in_w, out_w, target in skipgram_train:
            # Plain tensors suffice: the Variable wrapper is deprecated.
            in_w_idx = torch.LongTensor([w2i[in_w]])
            out_w_idx = torch.LongTensor([w2i[out_w]])
            label = torch.Tensor([target])

            optimizer.zero_grad()
            log_probs, _ = model(in_w_idx, out_w_idx)
            # exp() turns the log-sigmoid back into a probability in [0, 1].
            prob = log_probs[0].exp()
            loss = loss_fn(prob, label)

            loss.backward()
            optimizer.step()

            total_loss += loss.item()
        losses.append(total_loss)
    return model, losses
    
# Train skip-gram and keep the per-epoch loss totals.
sg_model, sg_losses = train_skipgram()

In [None]:
# You have to use other dataset for test, but in this case I use training data because this dataset is too small
def test_skipgram(test_data, model):
    """Evaluate a SkipGram model and export word vectors to TensorboardX.

    For every (focus, context, label) triple the pair is classified as a
    true context pair when the model's probability exceeds 0.5, and that
    decision is compared with the label. The embedding the model returns
    for the context word is collected once per distinct context word and
    written through the module-level ``writer`` for the embedding
    projector. Evaluation stops early once 10000 correct predictions have
    been counted.

    Side effects: prints the accuracy, writes the embedding, exports scalars
    to ``all_scalars.json`` and closes ``writer``.

    Args:
        test_data: Iterable of ``(focus_token, context_token, label)``
            triples whose tokens are all present in ``w2i``.
        model: Trained ``SkipGram`` module; inputs are built as CPU tensors.
    """
    vector_array = []
    word_array = []
    seen_words = set()  # O(1) duplicate check
    correct_ct = 0
    total_ct = 0

    # Pure inference: no gradients are needed, so skip graph construction.
    with torch.no_grad():
        for in_w, out_w, target in test_data:
            in_w_var = torch.LongTensor([w2i[in_w]])
            out_w_var = torch.LongTensor([w2i[out_w]])

            log_probs, hidden = model(in_w_var, out_w_var)

            # The model returns log(sigmoid(score)); threshold the probability.
            predicted = 1 if log_probs.exp().item() > 0.5 else 0
            total_ct += 1

            # ``hidden`` is the embedding of the context word, so label it
            # with that word.
            if out_w not in seen_words:
                seen_words.add(out_w)
                vector_array.append(hidden[0].cpu())
                word_array.append(str(out_w))

            if predicted == target:
                correct_ct += 1
                if correct_ct == 10000:
                    break

    if total_ct:
        print('Accuracy: {:.2f}% ({}/{})'.format(
            100.0 * correct_ct / total_ct, correct_ct, total_ct))

    # for visualization using tensorboardX
    if vector_array:  # add_embedding needs a non-empty 2-D matrix
        writer.add_embedding(torch.stack(vector_array),
                             metadata=word_array, global_step=2)
    writer.export_scalars_to_json("all_scalars.json")
    writer.close()

In [None]:
# 