### Import Required Packages

In [0]:
import numpy as np
from collections import defaultdict

### Create Required Functions

In [0]:
def softmax(v):
    """Return the softmax of a 1-D score vector.

    Parameters
    ----------
    v : array_like
        Raw scores (logits).

    Returns
    -------
    numpy.ndarray
        Non-negative values of the same shape as ``v`` that sum to 1.
    """
    v = np.asarray(v)
    if v.size == 0:
        # Nothing to normalise; np.max would raise on an empty array.
        return np.exp(v)
    # Subtracting the maximum leaves the result mathematically unchanged but
    # keeps every exponent <= 0, so np.exp cannot overflow to inf (which would
    # otherwise turn the division into inf/inf = nan for large scores).
    exps = np.exp(v - np.max(v))
    return exps / np.sum(exps)

### Create Skip-Gram Class

In [0]:
class Word2Vec:
    """Minimal skip-gram model trained with a full softmax and plain SGD.

    Typical usage: construct with a settings dict, call
    ``generate_training_data`` to build the vocabulary and training pairs,
    then ``train``; afterwards ``get_word_vec`` / ``word_sim`` query the
    learned input embeddings (rows of ``W1``).
    """

    def __init__(self, settings):
        """Store hyper-parameters.

        ``settings`` must provide the keys ``'n'`` (hidden layer size),
        ``'learning_rate'``, ``'epochs'`` and ``'window_size'``.
        """
        # n is hidden node size
        self.n = settings['n']

        self.eta = settings['learning_rate']
        self.epochs = settings['epochs']
        self.window_size = settings['window_size']

    def onehot(self, word):
        """Return ``word`` as a one-hot list of length ``len(self.vocab)``.

        Raises ``KeyError`` if ``word`` is not in the vocabulary.
        """
        vec = [0] * len(self.vocab)
        vec[self.word_index[word]] = 1
        return vec

    def generate_training_data(self, settings, corpus):
        """Build the vocabulary and the (target, context) training pairs.

        Parameters
        ----------
        settings : dict
            Unused; kept so existing callers do not break.
        corpus : iterable of sequences of str
            Tokenised sentences.

        Returns
        -------
        numpy.ndarray
            Object array of shape ``(N, 2)``. Column 0 holds the one-hot
            target word, column 1 the list of one-hot context words found
            within ``window_size`` positions of the target in its sentence.

        Side effects: sets ``vocab``, ``word_index``, ``index_word`` and ``V``.
        """
        self.vocab = sorted({word for sentence in corpus for word in sentence})
        self.word_index = {word: i for i, word in enumerate(self.vocab)}
        self.index_word = {i: word for i, word in enumerate(self.vocab)}

        # V is vocab size
        self.V = len(self.vocab)

        samples = []
        for sentence in corpus:
            sentence_length = len(sentence)
            for i, word in enumerate(sentence):
                # Clamp the window to the sentence boundaries.
                start = max(0, i - self.window_size)
                stop = min(sentence_length, i + self.window_size + 1)
                context = [
                    self.onehot(sentence[j])
                    for j in range(start, stop)
                    if j != i
                ]
                samples.append((self.onehot(word), context))

        # The number of context words varies per sample, so the nested list is
        # ragged; np.array() on it raises ValueError on recent NumPy versions.
        # Fill an explicit object array instead so each cell keeps its list.
        training_data = np.empty((len(samples), 2), dtype=object)
        for row, (target, context) in enumerate(samples):
            training_data[row, 0] = target
            training_data[row, 1] = context
        return training_data

    def forward_pass(self, x):
        """Run one forward pass for the one-hot input ``x``.

        Returns ``(y_c, h, u_c)``: the softmax output, the hidden layer
        activation and the raw output scores.
        """
        h = np.dot(np.transpose(self.W1), x)
        u_c = np.dot(np.transpose(self.W2), h)
        y_c = softmax(u_c)
        return y_c, h, u_c

    def back_prop(self, e, h, x):
        """Apply one SGD step to ``W1`` and ``W2``.

        ``e`` is the summed prediction error over the context words, ``h``
        the hidden activation and ``x`` the one-hot input word.
        """
        # Both gradients are computed from the current weights before either
        # matrix is updated.
        dl_dW2 = np.outer(h, e)
        dl_dW1 = np.outer(x, np.dot(self.W2, e.reshape(-1, 1)))

        self.W2 = self.W2 - self.eta * dl_dW2
        self.W1 = self.W1 - self.eta * dl_dW1

    def train(self, training_data):
        """Initialise the weights randomly and train for ``self.epochs`` epochs.

        ``training_data`` is an iterable of ``(target, context)`` pairs as
        produced by ``generate_training_data``. The summed loss of the epoch
        is kept in ``self.loss`` and printed roughly ten times per run.
        """
        self.W1 = np.random.uniform(-1, 1, (self.V, self.n))
        self.W2 = np.random.uniform(-1, 1, (self.n, self.V))

        # Integer reporting interval; avoids float modulo and is at least 1.
        report_every = max(1, self.epochs // 10)

        for epoch in range(self.epochs):
            self.loss = 0

            for x_t, x_c in training_data:
                if len(x_c) == 0:
                    # A one-word sentence has no context: there is nothing to
                    # predict, and the error term would collapse to a scalar.
                    continue

                y_pred, h, _ = self.forward_pass(x_t)

                # Sum of (prediction - one-hot) over every context word.
                EI = np.sum([np.subtract(y_pred, context_word) for context_word in x_c], axis=0)

                self.back_prop(EI, h, x_t)

                # Negative log-likelihood of the context words; uses the
                # prediction made before this step's weight update.
                self.loss += -np.sum([np.log(np.dot(y_pred, context_word)) for context_word in x_c])

            if epoch % report_every == 0:
                print('Epoch: ' + str(epoch) + ' Loss: ' + str(self.loss))

    def get_word_vec(self, word):
        """Return the learned input embedding (row of ``W1``) for ``word``."""
        word_idx = self.word_index[word]
        return self.W1[word_idx, :]

    def word_sim(self, word, top_n = 5):
        """Print the ``top_n`` vocabulary words most cosine-similar to ``word``.

        Each line shows a word and its cosine similarity, highest first.
        ``word`` itself is excluded.
        """
        word_vec = self.get_word_vec(word)
        word_norm = np.linalg.norm(word_vec)

        similarities = {}
        for vocab_word in self.word_index:
            if vocab_word == word:
                continue
            other_vec = self.get_word_vec(vocab_word)
            num = np.dot(word_vec, other_vec)
            denom = word_norm * np.linalg.norm(other_vec)
            similarities[vocab_word] = num / denom

        ranked = sorted(similarities.items(), key = lambda item: item[1], reverse = True)

        for neighbour, sim in ranked[:top_n]:
            print(neighbour, sim)

In [155]:
# Toy corpus: each inner list is one already-tokenised sentence.
corpus = [
    ['this', 'is', 'the', 'first', 'sentence'],
    ['this', 'is', 'the', 'second', 'sentence'],
    ['the', 'third', 'sentence', 'is', 'different', 'from', 'the', 'first', 'two'],
    ['so', 'is', 'the', 'fourth'],
]

# Hyper-parameters read by Word2Vec.__init__.
settings = {
    'n': 10,
    'learning_rate': .01,
    'epochs': 2000,
    'window_size': 2,
}

# Build the model, derive the training pairs from the corpus, then train.
w2v = Word2Vec(settings)
training_data = w2v.generate_training_data(settings, corpus)
w2v.train(training_data)


Epoch: 0 Loss: 196.14718056660485
Epoch: 200 Loss: 116.83386586821497
Epoch: 400 Loss: 116.15797649524616
Epoch: 600 Loss: 116.01218234845193
Epoch: 800 Loss: 115.94068324917843
Epoch: 1000 Loss: 115.89182065651333
Epoch: 1200 Loss: 115.85329889158156
Epoch: 1400 Loss: 115.8209280241729
Epoch: 1600 Loss: 115.79286460959213
Epoch: 1800 Loss: 115.76811224422065


In [156]:
# Print the five vocabulary words whose embeddings are closest to 'second'.
w2v.word_sim('second', top_n=5)

third 0.9093000152003493
different 0.5868957135620084
fourth 0.5644147557840082
this 0.5431390836968107
so 0.5047346072149316
