In [None]:
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import plotly.express as px
from sklearn.manifold import TSNE



In [None]:
# Toy training corpus: two lowercase English passages without punctuation.
# Each list element is one long string; downstream cells split it on
# whitespace and treat it as a single "sentence".
# NOTE(review): "mv" in the first passage looks like a typo for "my". It is
# left unchanged because it is training data and fixing it would shift the
# vocabulary indices used later in the notebook.
corpus = [
          'let me tell you about mv best friend his name is yuriy we have known each other for ages we live in the same town and went to the same school now we study at the same university and though we study at different faculties we see each other almost every day my best friend is the first to come and support me in any difficult situation we have a lot in common we both do sports regularly that is because we want to be strong and look handsome we really look very much alike we have short dark hair grey eyes and a sport figure we also have many similar features of character we are merry smart and active', 
          'as yura is an easy going person he can easily make friends with anyone he likes communicating with new people yura likes travelling we often visit new places especially in summer we have already been to the crimea poltava and lvov this year we plan to visit the carpathian mountains yura is hobby is computer games he can spend hours in the virtual world as for me i like the internet where i find interesting information and make friends with people all over the world but my best friend lives in ukraine i am glad i have such a friend as yura'
]


In [None]:
# Whitespace-tokenize every corpus entry into a list of word tokens.
tokenized_corpus = list(map(str.split, corpus))

In [None]:
# Vocabulary: unique tokens in first-occurrence order.
# dict.fromkeys de-duplicates while preserving insertion order, which avoids
# the quadratic cost of testing membership against a growing list.
vocabulary = list(dict.fromkeys(
    token for sentence in tokenized_corpus for token in sentence
))

# Lookup tables in both directions between tokens and their integer ids.
word2idx = {word: idx for idx, word in enumerate(vocabulary)}
idx2word = dict(enumerate(vocabulary))

vocabulary_size = len(vocabulary)

In [None]:
# Context radius: how many neighbours on each side of a center word are
# paired with it.
window_size = 2
idx_pairs = []

for sentence in tokenized_corpus:
    token_ids = [word2idx[token] for token in sentence]
    n_tokens = len(token_ids)

    for center_pos, center_id in enumerate(token_ids):
        # Clamp the window to the sentence bounds up front rather than
        # skipping out-of-range positions one by one.
        first = max(center_pos - window_size, 0)
        stop = min(center_pos + window_size + 1, n_tokens)
        for context_pos in range(first, stop):
            if context_pos != center_pos:
                idx_pairs.append((center_id, token_ids[context_pos]))

# One row per (center id, context id) training pair.
idx_pairs = np.array(idx_pairs)

In [None]:
# v_embedding - center word
# u_embedding - context word

In [None]:
def get_input_layer(word_idx):
    """Return the one-hot encoding of a vocabulary index.

    Args:
        word_idx: position in the vocabulary whose entry is set to 1.

    Returns:
        A 1-D float64 tensor of length ``vocabulary_size`` that is zero
        everywhere except at ``word_idx``.
    """
    one_hot = torch.zeros(vocabulary_size, dtype=torch.float64)
    one_hot[word_idx] = 1.0
    return one_hot

In [None]:
# Seed the global torch RNG so the random initialisation below, and therefore
# the training run that follows, is reproducible across kernel restarts.
torch.manual_seed(0)

embedding_dim = 10
# Both matrices have shape (embedding_dim, vocabulary_size): one column per
# vocabulary entry. v_embedding holds center-word vectors, u_embedding holds
# context-word vectors.
v_embedding = torch.rand([embedding_dim, vocabulary_size], dtype=torch.float64, requires_grad=True)
u_embedding = torch.rand([embedding_dim, vocabulary_size], dtype=torch.float64, requires_grad=True)
learning_rate = 0.005
epochs_cnt = 500

In [None]:
# Train with plain SGD, taking one gradient step per (center, context) pair.
for epoch in range(epochs_cnt):
    loss_value = 0.0
    for center_word_idx, context_word_idx in idx_pairs:
        # Multiplying v_embedding by a one-hot vector only selects one column,
        # so index that column directly instead of building the vector.
        y1 = v_embedding[:, int(center_word_idx)]
        # nll_loss requires an int64 target. Build it explicitly rather than
        # inheriting NumPy's platform-dependent default integer width.
        output_vec = torch.tensor([int(context_word_idx)], dtype=torch.long)

        # Score every vocabulary word as a context candidate.
        y2 = torch.matmul(y1, u_embedding)
        log_softmax = F.log_softmax(y2, dim=0)

        loss = F.nll_loss(log_softmax.view(1, -1), output_vec)
        # .item() keeps the running total a plain float so the log line below
        # prints a number rather than a tensor repr.
        loss_value += loss.item()
        loss.backward()

        # Update the parameters in place without recording the step in autograd.
        with torch.no_grad():
            v_embedding -= learning_rate * v_embedding.grad
            u_embedding -= learning_rate * u_embedding.grad

        # Gradients accumulate across backward() calls, so reset them each step.
        v_embedding.grad.zero_()
        u_embedding.grad.zero_()
    if epoch % 10 == 0:
        print(f'Loss at epoch {epoch}: {loss_value/len(idx_pairs)}')

In [None]:
# Test: look up the most likely context word for a single center word.
# Index 31 is 'university' under this corpus's first-occurrence vocabulary order.
query_word_idx = 31

# Inference only, so skip building the autograd graph.
with torch.no_grad():
    input_vec = get_input_layer(query_word_idx)

    y1 = torch.matmul(v_embedding, input_vec)
    y2 = torch.matmul(y1, u_embedding)

    log_softmax = F.log_softmax(y2, dim=0)

# torch.argmax returns the first maximal index, the same choice a
# left-to-right scan with a strict comparison makes.
best_word_idx = int(torch.argmax(log_softmax))
best_word_val = log_softmax[best_word_idx]

idx2word[best_word_idx]

'though'

In [None]:
# Transpose to (vocabulary_size, embedding_dim) so each row is one word's
# vector, the sample-per-row layout that fit_transform takes.
numpy_v_emb = v_embedding.detach().numpy().transpose()

# random_state pins the random initialisation so the 2-D layout is the same
# on every run.
resized_v_emb = TSNE(n_components=2, learning_rate='auto',
                     init='random', random_state=0).fit_transform(numpy_v_emb)

In [None]:
# Split the 2-D projection into per-axis coordinate lists for plotting.
x_embed = [point[0] for point in resized_v_emb]
y_embed = [point[1] for point in resized_v_emb]

In [None]:
# Interactive scatter of the projected word vectors; hovering over a point
# shows its word. Title and axis labels let the figure stand on its own.
fig = px.scatter(
    x=x_embed,
    y=y_embed,
    hover_name=vocabulary,
    title='t-SNE projection of the center-word embeddings',
    labels={'x': 't-SNE dimension 1', 'y': 't-SNE dimension 2'},
)
fig.show()