- https://towardsdatascience.com/implementing-word2vec-in-pytorch-skip-gram-model-e6bae040d2fb

In [81]:
import torch
from torch.autograd import Variable
import numpy as np
import torch.functional as F
import torch.nn.functional as F

# Prerequisites

## Corpus

In [15]:
# Toy training corpus: one short sentence per string. The text is used as-is
# (no lowercasing, no punctuation stripping), so splitting the last sentence on
# whitespace yields tokens such as 'I' and 'Jakarta,' (comma included).
corpus = [
    'he is a king',
    'she is a queen',
    'he is a man',
    'she is a woman',
    'warsaw is poland capital',
    'berlin is germany capital',
    'paris is france capital',
    'I live in Jakarta, Indonesia',
]

## Creating vocabulary

In [16]:
def tokenize_corpus(corpus):
    """Split every sentence in ``corpus`` on whitespace.

    Args:
        corpus: Iterable of sentence strings.

    Returns:
        A list holding one list of string tokens per sentence, in input order.
    """
    tokenized = []
    for sentence in corpus:
        tokenized.append(sentence.split())
    return tokenized

# Tokenize the toy corpus and print the result (one token list per sentence).
tokenized_corpus = tokenize_corpus(corpus)
print(tokenized_corpus)

[['he', 'is', 'a', 'king'], ['she', 'is', 'a', 'queen'], ['he', 'is', 'a', 'man'], ['she', 'is', 'a', 'woman'], ['warsaw', 'is', 'poland', 'capital'], ['berlin', 'is', 'germany', 'capital'], ['paris', 'is', 'france', 'capital'], ['I', 'live', 'in', 'Jakarta,', 'Indonesia']]


In [17]:
# Unique tokens in order of first appearance: dict keys keep insertion order
# and silently drop repeats.
vocabulary = list(dict.fromkeys(
    token for sentence in tokenized_corpus for token in sentence
))

# Lookup tables in both directions between a token and its integer id.
idx2word = dict(enumerate(vocabulary))
word2idx = {word: idx for idx, word in idx2word.items()}

vocabulary_size = len(vocabulary)

In [18]:
# Show the vocabulary, both lookup tables and the vocabulary size, each
# separated from the next by one blank line.
print(vocabulary, word2idx, idx2word, vocabulary_size, sep='\n\n')

['he', 'is', 'a', 'king', 'she', 'queen', 'man', 'woman', 'warsaw', 'poland', 'capital', 'berlin', 'germany', 'paris', 'france', 'I', 'live', 'in', 'Jakarta,', 'Indonesia']

{'he': 0, 'is': 1, 'a': 2, 'king': 3, 'she': 4, 'queen': 5, 'man': 6, 'woman': 7, 'warsaw': 8, 'poland': 9, 'capital': 10, 'berlin': 11, 'germany': 12, 'paris': 13, 'france': 14, 'I': 15, 'live': 16, 'in': 17, 'Jakarta,': 18, 'Indonesia': 19}

{0: 'he', 1: 'is', 2: 'a', 3: 'king', 4: 'she', 5: 'queen', 6: 'man', 7: 'woman', 8: 'warsaw', 9: 'poland', 10: 'capital', 11: 'berlin', 12: 'germany', 13: 'paris', 14: 'france', 15: 'I', 16: 'live', 17: 'in', 18: 'Jakarta,', 19: 'Indonesia'}

20


In [42]:
# Ids assigned to the tokens of the last corpus sentence.
tuple(word2idx[token] for token in ('I', 'live', 'in', 'Jakarta,', 'Indonesia'))

(15, 16, 17, 18, 19)

In [88]:
window_size = 2
idx_pairs = []
# Build (center word id, context word id) training pairs, tracing every step.
for sentence in tokenized_corpus:
    print('sentence:',sentence)
    indices = [word2idx[word] for word in sentence]
    print('indices:',indices)
    # every position in the sentence takes a turn as the center word
    sentence_positions = range(len(indices))
    print(sentence_positions)
    print()
    for center_word_pos in sentence_positions:
        print('center_word_pos:',center_word_pos)
        # candidate offsets relative to the center word
        offsets = range(-window_size, window_size + 1)
        print(offsets)
        print()
        for w in offsets:
            context_word_pos = center_word_pos + w
            print('context_word_pos:',context_word_pos,'w:',w)
            # keep only positions inside the sentence that are not the center itself
            inside_sentence = 0 <= context_word_pos < len(indices)
            if inside_sentence and context_word_pos != center_word_pos:
                print('in')
                context_word_idx = indices[context_word_pos]
                print('context_word_idx:',context_word_idx,indices[context_word_pos])
                pair = (indices[center_word_pos], context_word_idx)
                print('xx:',pair)
                idx_pairs.append(pair)
        print()

# a numpy array is more convenient to iterate over and inspect later on
idx_pairs = np.array(idx_pairs)

sentence: ['he', 'is', 'a', 'king']
indices: [0, 1, 2, 3]
range(0, 4)

center_word_pos: 0
range(-2, 3)

context_word_pos: -2 w: -2
context_word_pos: -1 w: -1
context_word_pos: 0 w: 0
context_word_pos: 1 w: 1
in
context_word_idx: 1 1
xx: (0, 1)
context_word_pos: 2 w: 2
in
context_word_idx: 2 2
xx: (0, 2)

center_word_pos: 1
range(-2, 3)

context_word_pos: -1 w: -2
context_word_pos: 0 w: -1
in
context_word_idx: 0 0
xx: (1, 0)
context_word_pos: 1 w: 0
context_word_pos: 2 w: 1
in
context_word_idx: 2 2
xx: (1, 2)
context_word_pos: 3 w: 2
in
context_word_idx: 3 3
xx: (1, 3)

center_word_pos: 2
range(-2, 3)

context_word_pos: 0 w: -2
in
context_word_idx: 0 0
xx: (2, 0)
context_word_pos: 1 w: -1
in
context_word_idx: 1 1
xx: (2, 1)
context_word_pos: 2 w: 0
context_word_pos: 3 w: 1
in
context_word_idx: 3 3
xx: (2, 3)
context_word_pos: 4 w: 2

center_word_pos: 3
range(-2, 3)

context_word_pos: 1 w: -2
in
context_word_idx: 1 1
xx: (3, 1)
context_word_pos: 2 w: -1
in
context_word_idx: 2 2
xx: (3, 2

In [89]:
# Inspect the generated (center word id, context word id) pairs.
idx_pairs

array([[ 0,  1],
       [ 0,  2],
       [ 1,  0],
       [ 1,  2],
       [ 1,  3],
       [ 2,  0],
       [ 2,  1],
       [ 2,  3],
       [ 3,  1],
       [ 3,  2],
       [ 4,  1],
       [ 4,  2],
       [ 1,  4],
       [ 1,  2],
       [ 1,  5],
       [ 2,  4],
       [ 2,  1],
       [ 2,  5],
       [ 5,  1],
       [ 5,  2],
       [ 0,  1],
       [ 0,  2],
       [ 1,  0],
       [ 1,  2],
       [ 1,  6],
       [ 2,  0],
       [ 2,  1],
       [ 2,  6],
       [ 6,  1],
       [ 6,  2],
       [ 4,  1],
       [ 4,  2],
       [ 1,  4],
       [ 1,  2],
       [ 1,  7],
       [ 2,  4],
       [ 2,  1],
       [ 2,  7],
       [ 7,  1],
       [ 7,  2],
       [ 8,  1],
       [ 8,  9],
       [ 1,  8],
       [ 1,  9],
       [ 1, 10],
       [ 9,  8],
       [ 9,  1],
       [ 9, 10],
       [10,  1],
       [10,  9],
       [11,  1],
       [11, 12],
       [ 1, 11],
       [ 1, 12],
       [ 1, 10],
       [12, 11],
       [12,  1],
       [12, 10],
       [10,  1

In [90]:
def get_input_layer(word_idx, size=None):
    """Return a one-hot float vector for ``word_idx``.

    Args:
        word_idx: Position of the single element that is set to 1.0.
        size: Length of the vector. When omitted, the notebook-level
            ``vocabulary_size`` is looked up at call time, which is the
            original one-argument behaviour.

    Returns:
        A 1-D ``torch.float32`` tensor of length ``size`` that is zero
        everywhere except at ``word_idx``.
    """
    if size is None:
        size = vocabulary_size
    x = torch.zeros(size, dtype=torch.float32)
    x[word_idx] = 1.0
    return x

In [91]:
embedding_dims = 5
num_epochs = 100
learning_rate = 0.001

# Seed the generator so the random initialisation, and therefore the loss
# curve, is repeatable from a fresh kernel.
torch.manual_seed(0)

# W1 maps a one-hot word vector to an embedding; W2 maps that embedding to one
# score per vocabulary word. Both are plain leaf tensors tracked by autograd
# (this replaces the deprecated Variable wrapper).
W1 = torch.randn(embedding_dims, vocabulary_size, dtype=torch.float32, requires_grad=True)
W2 = torch.randn(vocabulary_size, embedding_dims, dtype=torch.float32, requires_grad=True)

for epo in range(num_epochs):
    loss_val = 0.0
    for data, target in idx_pairs:
        x = get_input_layer(data)
        y_true = torch.tensor([int(target)], dtype=torch.long)

        z1 = torch.matmul(W1, x)
        z2 = torch.matmul(W2, z1)

        log_softmax = F.log_softmax(z2, dim=0)

        # view(1, -1) turns the score vector into a single-row batch to match
        # the one-element target tensor
        loss = F.nll_loss(log_softmax.view(1, -1), y_true)
        # accumulate a Python float rather than a tensor
        loss_val += loss.item()
        loss.backward()

        # Plain SGD step; no_grad keeps the in-place update out of the graph.
        with torch.no_grad():
            W1 -= learning_rate * W1.grad
            W2 -= learning_rate * W2.grad

        # gradients accumulate across backward() calls, so reset them each step
        W1.grad.zero_()
        W2.grad.zero_()
    if epo % 10 == 0:
        print(f'Loss at epo {epo}: {loss_val/len(idx_pairs)}')

Loss at epo 0: 5.2130126953125
Loss at epo 10: 4.713010311126709
Loss at epo 20: 4.363424777984619
Loss at epo 30: 4.097565650939941
Loss at epo 40: 3.8842580318450928
Loss at epo 50: 3.706723213195801
Loss at epo 60: 3.5549564361572266
Loss at epo 70: 3.4225199222564697
Loss at epo 80: 3.3050670623779297
Loss at epo 90: 3.1995489597320557


In [92]:
from collections import Counter
import torch.nn as nn

# A tiny example text. The steps below assume lowercased text without
# punctuation, so the string is lowercased before it is split into words.
sentences = "i am new to PyTorch i am having fun"

words = sentences.lower().split(' ')

# Vocabulary ordered by descending frequency; the sort is stable, so words with
# equal counts keep their first-seen order.
word_counts = Counter(words)
vocab = sorted(word_counts, key=word_counts.get, reverse=True)
vocab_size = len(vocab)

# map words to unique indices
# NOTE: this rebinds `word2idx`, which the skip-gram section built for a
# different vocabulary; re-run that section's vocabulary cell before re-running
# its pair-generation cell.
word2idx = {word: ind for ind, word in enumerate(vocab)}

# word2idx == {'i': 0, 'am': 1, 'new': 2, 'to': 3, 'pytorch': 4, 'having': 5, 'fun': 6}

encoded_sentences = [word2idx[word] for word in words]

# encoded_sentences == [0, 1, 2, 3, 4, 0, 1, 5, 6]

# embedding dimension used for the nn.Embedding example
emb_dim = 3

In [93]:
# Lookup table with one emb_dim-sized row per vocabulary word.
emb_layer = nn.Embedding(num_embeddings=vocab_size, embedding_dim=emb_dim)
# Fetch the embedding row of every word id in the encoded text.
sentence_ids = torch.tensor(encoded_sentences, dtype=torch.long)
word_vectors = emb_layer(sentence_ids)

In [94]:
# Check whether autograd tracks the embedding weights (i.e. they are trainable).
emb_layer.weight.requires_grad

True