# Task 1. Word2vec step by step

In [None]:
from collections import defaultdict
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

Определим софтмакс. 

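Заметим, что перед взятием экспоненты мы вычитаем максимальное значение массива: результат софтмакса от этого не меняется, но экспонента не переполняется (численная стабильность).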

In [None]:
def softmax(x):
    """Return the softmax of ``x``, normalised along axis 0.

    The global maximum of ``x`` is subtracted before exponentiation so
    that ``np.exp`` cannot overflow; the shift cancels in the ratio.
    """
    shifted = x - np.max(x)
    exponentials = np.exp(shifted)
    normaliser = exponentials.sum(axis=0)
    return exponentials / normaliser

##  Опишем класс word2vec

In [None]:
class word2vec():
    """Minimal NumPy implementation of a skip-gram word2vec model.

    The model is a two-layer network without non-linearity: a one-hot
    centre word is projected through ``w1`` (vocabulary x emb_dim) into a
    hidden vector, which is projected through ``w2`` (emb_dim x vocabulary)
    and passed through a softmax to predict the context words.

    Typical usage::

        model = word2vec()
        data = model.generate_skipgram_training_data(tokenised_corpus)
        model.train(data)
        model.vec_sim(model.word_vec(word), 3)

    ``forward_pass`` relies on a module-level ``softmax`` function.
    """

    def __init__(self, mode='skip-gram', emb_dim=5, window_size=2, lr=0.01, n_epochs=1000):
        """Store the hyperparameters.

        Args:
            mode: Name of the training scheme. Only skip-gram is
                implemented; the value is stored but not otherwise used.
            emb_dim: Dimensionality of the word vectors.
            window_size: Number of words taken on each side of the centre
                word as its context.
            lr: Learning rate of the gradient-descent updates.
            n_epochs: Number of passes over the training data.
        """
        self.mode = mode
        self.emb_dim = emb_dim
        self.window_size = window_size
        self.lr = lr
        self.epochs = n_epochs

    def word2onehot(self, word):
        """Return the one-hot encoding of ``word`` as a list of ints.

        Requires ``generate_skipgram_training_data`` to have been called,
        since it builds ``v_count`` and ``word2index``.

        Raises:
            KeyError: If ``word`` is not in the vocabulary.
        """
        ohe = [0] * self.v_count
        ohe[self.word2index[word]] = 1
        return ohe

    def generate_skipgram_training_data(self, corpus):
        """Build the vocabulary and the (centre, contexts) training pairs.

        Args:
            corpus: Iterable of sentences, each a sequence of tokens.

        Returns:
            A NumPy object array of shape ``(n_words, 2)``. Row ``k`` holds
            the one-hot list of the k-th centre word and the list of
            one-hot lists of its context words (empty for a one-word
            sentence).

        Side effects:
            Sets ``vocab``, ``v_count``, ``word2index`` and ``index2word``.
        """
        # Index words by first occurrence: iterating a set of strings would
        # give a different numbering on every interpreter run.
        ordered_vocab = list(
            dict.fromkeys(word for sentence in corpus for word in sentence)
        )
        self.vocab = set(ordered_vocab)
        self.v_count = len(ordered_vocab)  # dictionary size == one-hot length

        # lookup dicts
        self.word2index = {word: i for i, word in enumerate(ordered_vocab)}
        self.index2word = dict(enumerate(ordered_vocab))

        pairs = []
        for sentence in corpus:
            sent_len = len(sentence)
            for i, word in enumerate(sentence):
                # Clip the window to the sentence boundaries.
                first = max(0, i - self.window_size)
                last = min(sent_len, i + self.window_size + 1)
                w_context = [
                    self.word2onehot(sentence[j])
                    for j in range(first, last)
                    if j != i
                ]
                pairs.append((self.word2onehot(word), w_context))

        # The context lists have different lengths, so np.array(pairs) would
        # be ragged (a ValueError on recent NumPy). Fill an object array
        # explicitly to get a stable (n_words, 2) layout.
        training_data = np.empty((len(pairs), 2), dtype=object)
        for row, (w_input, w_context) in enumerate(pairs):
            training_data[row, 0] = w_input
            training_data[row, 1] = w_context
        return training_data

    def forward_pass(self, x):
        """Run the network on a one-hot input vector.

        Args:
            x: One-hot vector of the centre word (length ``v_count``).

        Returns:
            Tuple ``(y, h, u)``: softmax probabilities over the vocabulary,
            hidden-layer vector and raw output scores.
        """
        h = np.dot(self.w1.T, x)   # hidden layer: the row of w1 selected by x
        u = np.dot(self.w2.T, h)   # one score per vocabulary word
        y = softmax(u)
        return y, h, u

    def backprop(self, err, h, x):
        """Apply one gradient-descent step to ``w1`` and ``w2``.

        Args:
            err: Prediction error summed over the context words
                (length ``v_count``).
            h: Hidden-layer vector returned by ``forward_pass``.
            x: One-hot vector of the centre word.
        """
        # Both gradients are computed from the current weights before
        # either matrix is modified.
        dl_dw2 = np.outer(h, err)
        dl_dw1 = np.outer(x, np.dot(self.w2, err))

        # Update weights
        self.w1 = self.w1 - (self.lr * dl_dw1)
        self.w2 = self.w2 - (self.lr * dl_dw2)

    def train(self, training_data):
        """Train the model and record the average loss of every epoch.

        Args:
            training_data: Pairs produced by
                ``generate_skipgram_training_data``.

        Side effects:
            (Re)initialises ``w1`` and ``w2`` with uniform random values and
            fills ``losses`` with one value per epoch.
        """
        # initialize matrices
        self.w1 = np.random.uniform(-0.8, 0.8, (self.v_count, self.emb_dim))  # input (centre-word) embeddings
        self.w2 = np.random.uniform(-0.8, 0.8, (self.emb_dim, self.v_count))  # output (context-word) weights
        self.losses = []

        n_samples = len(training_data)

        for _ in range(self.epochs):
            loss = 0.0

            for w_input, w_context in training_data:
                w_context = np.array(w_context)
                if w_context.size == 0:
                    # A word without context (one-word sentence) carries no
                    # training signal and would break the gradient shapes.
                    continue

                y_pred, h, u = self.forward_pass(w_input)

                # Sum over context words of (y_pred - target) and of the
                # cross-entropy, expressed through the summed targets.
                context_sum = w_context.sum(axis=0)
                sum_context_error = len(w_context) * y_pred - context_sum

                self.backprop(sum_context_error, h, w_input)
                loss -= np.log(y_pred).dot(context_sum)

            # average loss per training sample (0.0 when there is no data)
            self.losses.append(loss / n_samples if n_samples else 0.0)

    def plot_loss_curve(self):
        """Plot the recorded per-epoch average loss with matplotlib."""
        plt.plot(range(len(self.losses)), self.losses)

    def word_vec(self, word):
        """Return the vector of ``word``: its ``w1`` row plus its ``w2`` column.

        Raises:
            KeyError: If ``word`` is not in the vocabulary.
        """
        w_index = self.word2index[word]
        v_w = self.w1[w_index] + self.w2[:, w_index]  # sum up two embeddings
        return v_w

    def vec_sim(self, input_vec, top_n):
        """Print the ``top_n`` vocabulary words most similar to ``input_vec``.

        Similarity is the cosine between ``input_vec`` and ``word_vec(word)``;
        a zero-norm vector is given similarity 0.0 instead of dividing by
        zero. Each result is printed as ``word similarity``.
        """
        input_norm = np.linalg.norm(input_vec)

        word_sim = {}
        # Iterate the index dict (insertion-ordered) so that ties are
        # reported in a reproducible order.
        for word in self.word2index:
            wv = self.word_vec(word)
            denominator = input_norm * np.linalg.norm(wv)
            if denominator:
                word_sim[word] = np.dot(input_vec, wv) / denominator
            else:
                word_sim[word] = 0.0

        words_sorted = sorted(word_sim.items(), key=lambda x: x[1], reverse=True)

        for word, sim in words_sorted[:top_n]:
            print(word, sim)


Если возникнут затруднения, то где-то в этой аудитории есть примеры решений ... 

<div class="img-with-text">
    <img src="https://g1-addtext.ft-uc.com/MjAxODEwMTQ/addtext_com_MTcxMzQwMTIyMDY.jpg" alt="""
        
    #Forward pass
    def forward_pass(self, x):
        h = np.dot(self.w1.T, x)
        u = np.dot(self.w2.T, h)
        y = softmax(u)
        return y, h, u
    
    # Backpropagation
    def backprop(self, err, h, x):        
        dl_dw2 = np.outer(h, err)  
        dl_dw1 = np.outer(x, np.dot(self.w2, err.T))
        # Update weights 
        self.w1 = self.w1 - (self.lr * dl_dw1)
        self.w2 = self.w2 - (self.lr * dl_dw2)
    
    """ />
</div>

## Посмотрим, что в итоге получилось на нашем любимом корпусе

In [None]:
# Toy corpus: four short phrases, each split on whitespace into a token list.
corpus = list(map(str.split, (
    'о боже мама мама я схожу с ума',
    'ее улыбка мама кругом голова',
    'о боже мама мама пьяный без вина',
    'ее улыбка мама самая самая',
)))

# Create the model with its default hyperparameters.
w2v = word2vec()

In [None]:
# Build the vocabulary from the corpus and generate the training data:
# one (center-word one-hot, list of context-word one-hots) pair per word.
training_data = w2v.generate_skipgram_training_data(corpus)

In [None]:
# Show the vocabulary: the set of unique words collected from the corpus.
w2v.vocab

In [None]:
# Show the word -> index lookup table.
w2v.word2index

In [None]:
# Show the index -> word lookup table.
w2v.index2word

In [None]:
training_data[0][0]  # the center word - input one-hot vector

In [None]:
training_data[0][1] # the context words - target one-hot vectors

In [None]:
%%time
# train word2vec model
w2v.train(training_data)

In [None]:
# Plot the average loss recorded for each training epoch.
w2v.plot_loss_curve()

In [None]:
# Look up the learned vector of one corpus word and display it.
vec = w2v.word_vec('самая')
vec

In [None]:
# Print the 3 vocabulary words with the highest cosine similarity to vec.
w2v.vec_sim(vec, 3)