In [15]:
from torch.autograd import Variable
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.optim as optim

In [16]:
# --- Model ---
embed_size = 2    # rows of each embedding vector; the plotting cell only runs when this is 2
context_size = 3  # NOTE(review): not read by any visible cell — confirm intended use

# --- Co-occurrence weighting (read by wf) ---
xmax = 2          # counts below this are down-weighted; counts at or above it get weight 1
alpha = 0.75      # exponent applied to count / xmax

# --- Optimisation ---
batch_size = 20   # co-occurrence pairs sampled per training step
l_rate = 1e-3     # learning rate handed to Adam
num_epochs = 2    # iterations of the outer training loop

In [17]:
# Training corpus: a list of documents; here a single multi-sentence English paragraph.
corpus = ["Golovkin won his first major world championship, the WBA interim middleweight title, by defeating Milton Núñez in 2010. The WBA elevated him to Regular champion status in the same year. In 2014, Golovkin was elevated to the status of WBA (Super) champion and successfully defended both his titles against Daniel Geale. Later that year he defeated Marco Antonio Rubio to win WBC interim middleweight title, and defeated David Lemieux for the IBF middleweight title in 2015. After Canelo Álvarez vacated his WBC middleweight title in 2016, Golovkin was elevated to full champion and held three of the four major world titles in boxing until being stripped by the IBF in 2018 for not fighting Serhiy Derevianchenko. Golovkin lost all his belts in 2018 following a loss to Alvarez but regained his IBF and IBO titles by defeating Derevianchenko in 2019."]

In [18]:
# Build the flat, lower-cased token list for the whole corpus.
# Each document is cut into sentences on "." and each sentence into words on
# whitespace. Calling split() with no separator collapses runs of whitespace
# and ignores leading/trailing blanks, so no empty-string tokens are produced
# (split(" ") would emit "" for the space after every full stop and for the
# empty chunk after the final one, polluting the vocabulary and the
# co-occurrence counts). Other punctuation stays attached to its word.
tokens = []
for document in corpus:
    for sentence in document.split("."):
        # extend() appends in place instead of rebuilding the list each time.
        tokens.extend(sentence.lower().split())

In [19]:
# Vocabulary: every distinct token exactly once.
# Sorting makes the order (and therefore every word index derived from it)
# identical across kernel restarts; plain set iteration order for strings
# varies from run to run because of hash randomization.
my_dict = sorted(set(tokens))

In [20]:
# Two-way lookup between vocabulary entries and their position in my_dict.
idx2word = dict(enumerate(my_dict))
word2idx = {word: position for position, word in idx2word.items()}

In [21]:
# Bigram count matrix: entry [a, b] is the number of times the word with
# index a is immediately followed by the word with index b in `tokens`.
ss = len(my_dict)
zero_matrix = np.zeros((ss, ss))
for current_word, next_word in zip(tokens, tokens[1:]):
    zero_matrix[word2idx[current_word], word2idx[next_word]] += 1

In [22]:
# Index pairs (row, col) of every non-zero count in zero_matrix, one pair per row.
coocs = np.argwhere(zero_matrix)

In [23]:
def wf(x):
    """Weight for a co-occurrence count.

    Args:
        x: the co-occurrence count.

    Returns:
        (x / xmax) ** alpha when x is below the module-level xmax,
        otherwise the constant 1.
    """
    return (x / xmax) ** alpha if x < xmax else 1

In [24]:
vocab_size = len(my_dict)
w_list_size = len(tokens)


def _random_params(shape):
    """Return one trainable tensor of `shape` per vocabulary word.

    Values are drawn from a normal distribution with mean 0 and standard
    deviation 0.01; every tensor is wrapped as a Variable with gradients enabled.
    """
    return [
        Variable(torch.from_numpy(np.random.normal(0, 0.01, shape)), requires_grad=True)
        for _ in range(vocab_size)
    ]


# Separate "left" and "right" parameter sets, created in the same order as
# before so the sequence of random draws is unchanged.
l_embed = _random_params((embed_size, 1))
r_embed = _random_params((embed_size, 1))
l_biases = _random_params(1)
r_biases = _random_params(1)

In [25]:
# Adam optimises every embedding vector and bias, left and right alike.
trainable_params = l_embed + r_embed + l_biases + r_biases
optimizer = optim.Adam(trainable_params, lr=l_rate)

In [26]:
def gen_batch():
    """Draw one random training batch of co-occurring word pairs.

    Picks batch_size distinct rows of `coocs` (sampling without replacement)
    and, for each (left, right) index pair, gathers the matching parameters
    and count.

    Returns:
        Five lists of length batch_size, in this order: left embeddings,
        right embeddings, co-occurrence counts from zero_matrix, left biases,
        right biases.
    """
    picked = np.random.choice(np.arange(len(coocs)), size=batch_size, replace=False)
    pairs = [tuple(coocs[k]) for k in picked]

    l_vecs = [l_embed[left] for left, _ in pairs]
    r_vecs = [r_embed[right] for _, right in pairs]
    covals = [zero_matrix[pair] for pair in pairs]
    l_v_bias = [l_biases[left] for left, _ in pairs]
    r_v_bias = [r_biases[right] for _, right in pairs]
    return l_vecs, r_vecs, covals, l_v_bias, r_v_bias

In [27]:
# Train the model
# The number of steps per epoch does not change between epochs, so compute it once.
num_batches = w_list_size // batch_size
for epoch in range(num_epochs):
    avg_loss = 0.0  # running mean of the batch losses within this epoch
    for batch in range(num_batches):
        optimizer.zero_grad()
        l_vecs, r_vecs, covals, l_v_bias, r_v_bias = gen_batch()
        # Per-pair residual: dot(left, right) + both biases - log(count).
        residuals = [
            torch.dot(l_vec.view(-1), r_vec.view(-1)) + l_bias + r_bias - np.log(coval)
            for l_vec, r_vec, coval, l_bias, r_bias
            in zip(l_vecs, r_vecs, covals, l_v_bias, r_v_bias)
        ]
        # Batch loss: squared residuals, each scaled by the weight of its count.
        loss = sum(
            torch.mul(residual ** 2, wf(coval))
            for residual, coval in zip(residuals, covals)
        )
        avg_loss += loss.data[0] / num_batches
        loss.backward()
        optimizer.step()

In [28]:
# Visualisation: scatter 10 randomly chosen words, only when embeddings are 2-D.
if embed_size == 2:
    sampled_inds = np.random.choice(np.arange(len(my_dict)), size=10, replace=False)
    for word_ind in sampled_inds:
        # A word's plotted position is the sum of its left and right embeddings.
        combined = l_embed[word_ind].data + r_embed[word_ind].data
        x, y = combined.numpy()[:, 0]
        plt.scatter(x, y)
        plt.annotate(
            my_dict[word_ind],
            xy=(x, y),
            xytext=(5, 2),
            textcoords='offset points',
            ha='right',
            va='bottom',
        )
    plt.savefig("test_pic.png")