**Results of this task**:
 * trained word vectors (mention somewhere, how long it took to train)
 * plotted loss (so we can see that it has converged)
 * function to map token to corresponding word vector
 * beautiful visualizations (PCE, T-SNE), you can use TensorBoard and play with your vectors in 3D (don't forget to add screenshots to the task)
 * qualitative evaluations of word vectors: nearest neighbors, word analogies

**Extra:**
 * quantitative evaluation:
   * for intrinsic evaluation you can find datasets [here](https://aclweb.org/aclwiki/Analogy_(State_of_the_art))
   * for extrincis evaluation you can use [these](https://medium.com/@dataturks/rare-text-classification-open-datasets-9d340c8c508e)

Also, you can find any other datasets for quantitative evaluation. If you chose to do this, please use the same datasets across tasks 3, 4, 5 and 6.

In [522]:
%reset -f
import gc
gc.collect()

0

In [523]:
import numpy as np
from itertools import islice

In [524]:
# with open('./dataset/corpus', 'rt') as fp:
#     corpus = fp.read()

# corpus = corpus.strip()
# corpus = corpus.split()[:5000]

In [525]:
corpus = [
    'he is a king',
    'she is a queen',
    'he is a man',
    'she is a woman',
    'warsaw is poland capital',
    'berlin is germany capital',
    'paris is france capital',
]
corpus = [w for doc in corpus for w in doc.split()]

In [526]:
class SkipGramBatcher:
    def __init__(self, corpus, window_size=4, batch_size=32):
        '''corpus - list of words'''
        self.corpus = corpus
        self.window_size = window_size
        self.batch_size = batch_size
        self.make_vocab()
        return
    
    def make_vocab(self):
        self.vocab = sorted(set(corpus))
        self.V = len(self.vocab)
        self.word2index = {w: idx for idx, w in enumerate(self.vocab)}
        self.index2word = {idx: w for idx, w in enumerate(self.vocab)}
        return
    
    def batch_gen(self):
        '''c - corpus, v - vocab ; i - central, j - side'''
        x_batch = np.empty(self.batch_size, dtype=np.int)
        y_batch = np.empty(self.batch_size, dtype=np.int)
        curr_idx = 0
        for c_i, w in enumerate(self.corpus):
            v_i = self.word2index[w]
            window_left_idx = c_i - self.window_size if c_i - self.window_size >= 0 else 0
            for side_w in self.corpus[window_left_idx: c_i] \
                          + self.corpus[c_i + 1 : c_i + self.window_size + 1]:
                v_j = self.word2index[side_w]
                x_batch[curr_idx] = v_i
                y_batch[curr_idx] = v_j
                curr_idx += 1
                if curr_idx == self.batch_size:
                    curr_idx = 0
                    yield (x_batch, y_batch)
        if curr_idx != 0:
            yield (x_batch, y_batch)

In [527]:
#max_iter = 3
#for x_batch, y_batch in islice(batcher.batch_gen(), max_iter):
    #print('x_batch.shape: {}, y_batch.shape: {}'.format(x_batch.shape, y_batch.shape))
    #print(x_batch)

In [528]:
import torch
import matplotlib.pyplot as plt

In [529]:
USE_GPU = True

dtype = torch.float32

if USE_GPU and torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Constant to control how frequently we print train loss
print_every = 1

print('using device:', device)

using device: cuda


In [530]:
def plot_metric(param_name, param_values, train_values, val_values):
    plt.figure(figsize=(16, 8))
    plt.plot(param_values, train_values, 'o-', label='train')
    plt.plot(param_values, val_values, 'o-', label='val')

    plt.xlabel(param_name)
    plt.ylabel('metric')
    plt.legend(loc='best')
    plt.title(param_name)
    plt.show()

In [531]:
batcher = SkipGramBatcher(corpus)

In [532]:
V = batcher.V
N = 30
batch_size = batcher.batch_size
learning_rate = 1e-2

In [533]:
model = torch.nn.Sequential( torch.nn.Linear(V, N, bias=False),
                             torch.nn.Linear(N, V, bias=False),
                             torch.nn.LogSoftmax())

In [534]:
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

In [535]:
def train(model, optimizer, epochs=1):
    """    
    Inputs:
    - model: A PyTorch Module giving the model to train.
    - optimizer: An Optimizer object we will use to train the model
    - epochs: (Optional) A Python integer giving the number of epochs to train for
    
    Returns: Nothing, but prints model accuracies during training.
    """
    input_buff = np.empty((batch_size, V), dtype=np.float32)
    #model = model.to(device=device)  # move the model parameters to CPU/GPU
    for e in range(epochs):
        loss_values = []
        for t, (x_ids, y_ids) in enumerate(loader_train):
            # print generated batches
            #for elem in list(zip([batcher.index2word[elem] for elem in x_ids], [batcher.index2word[elem] for elem in y_ids])):
                #print(elem[0], elem[1])

            model.train()  # put model to training mode
            #x = x.to(device=device, dtype=dtype)  # move to device, e.g. GPU
            #y = y.to(device=device, dtype=torch.long)
            
            input_buff.fill(0)
            input_buff[np.arange(batch_size), x_ids] = 1

            x = torch.from_numpy(input_buff)
            y = torch.from_numpy(y_ids)
            
            scores = model(x)
            #print(scores.shape)
            #print('scores: {}\ny: {}'.format(scores, y))
            loss = torch.nn.functional.nll_loss(scores, y)
            #loss_values.append(loss)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if t % print_every == 0:
                print('Iteration %d, loss = %.4f' % (t, loss.item()))
        #plt.plot(loss_values)
        #plt.show()

In [536]:
max_iter = 5000
loader_train = islice(batcher.batch_gen(), max_iter)

In [537]:
train(model, optimizer, epochs=15)

Iteration 0, loss = 2.7333
Iteration 1, loss = 2.7209
Iteration 2, loss = 2.7268
Iteration 3, loss = 2.7136
Iteration 4, loss = 2.7112
Iteration 5, loss = 2.7452
Iteration 6, loss = 2.7558


  input = module(input)


In [None]:
CHECK SOFTMAX DIMENSIONS!