In [1]:
import torch

from collections import defaultdict

In [2]:
corpus = [
    'he is a king',
    'she is a queen',
    'he is a man',
    'she is a woman',
    'warsaw is poland capital',
    'berlin is germany capital',
    'paris is france capital',
]

In [3]:
tokenized_corpus = [sent.split() for sent in corpus]

# word2idx, idx2word 사전 만들기

In [4]:
word2idx = defaultdict(lambda : len(word2idx))

# word2idx dict
for sent in tokenized_corpus:
    for token in sent:
        word2idx[token]
        
# idx2word
idx2word = {}
for word, idx in word2idx.items():
    idx2word[idx] = word

In [63]:
window_size = 2
n_words = len(word2idx)

# Skip gram의 context 형태 만들기

In [286]:
import numpy as np

In [287]:
idx_pair_corpus = []

for token_ls in tokenized_corpus:
    idx_ls = [word2idx[word] for word in token_ls]

    temp_tokenized_idx_ls = []
    for i, idx in enumerate(idx_ls):
        for w in range(-window_size, window_size+1):
            if i+w < 0 or i+w >= len(idx_ls) or w == 0: continue
            else :
                temp_tokenized_idx_ls.append([idx_ls[i], idx_ls[i+w]])
    
    idx_pair_corpus.append(temp_tokenized_idx_ls)

idx_pair_corpus = np.array(idx_pair_corpus)

In [288]:
idx_pair_corpus[0]

array([[0, 1],
       [0, 2],
       [1, 0],
       [1, 2],
       [1, 3],
       [2, 0],
       [2, 1],
       [2, 3],
       [3, 1],
       [3, 2]])

# loss function

In [289]:
from torch.autograd import Variable
import torch.nn.functional as F

In [290]:
def one_hot_encode(word_idx):
    x = torch.zeros(n_words).float()
    x[word_idx] = 1.0
    return x

In [291]:
embedding_size = 5

w1 = Variable(torch.randn((embedding_size, n_words)).float(), requires_grad = True)
w2 = Variable(torch.randn((n_words, embedding_size)).float(), requires_grad = True)

In [310]:
learning_rate = 0.001

for epoch in range(1000):
    loss_val = 0
    for idx_pairs in idx_pair_corpus:
        for center_word_idx, target_word_idx in idx_pairs:
            x = Variable(one_hot_encode(center_word_idx), requires_grad = False)
            y_true = Variable(torch.tensor([target_word_idx]))

            z1 = torch.matmul(w1, x)
            z2 = torch.matmul(w2, z1)

            log_softmax = F.log_softmax(z2, dim = 0)

            loss = F.nll_loss(log_softmax.view(1,-1), y_true)

            loss_val += loss.data
            loss.backward()

            w1.data -= learning_rate * w1.grad.data
            w2.data -= learning_rate * w2.grad.data

            w1.grad.data.zero_()
            w2.grad.data.zero_()

    if epoch % 100 == 0:    
        print('Loss at epo :%s, %s'%(epoch, loss_val/len(idx_pairs)))

Loss at epo :0, tensor(10.9632)
Loss at epo :100, tensor(10.8446)
Loss at epo :200, tensor(10.8273)
Loss at epo :300, tensor(10.8148)
Loss at epo :400, tensor(10.8045)
Loss at epo :500, tensor(10.7957)
Loss at epo :600, tensor(10.7881)
Loss at epo :700, tensor(10.7815)
Loss at epo :800, tensor(10.7756)
Loss at epo :900, tensor(10.7704)


In [197]:
y_true

tensor(2)

In [185]:
a,b = idx_pair

In [187]:
a
b

1

In [118]:
torch.matmul(embedding_matrix.permute(1,0), get_input_layer(0))

tensor([-0.0969, -1.3024, -1.3910,  0.0915,  0.2805])