### Import Required Packages

In [0]:
import numpy as np
from collections import defaultdict

### Create Required Functions

In [0]:
def softmax(v):
    exps = np.exp(v)
    return exps / np.sum(exps)

### Create Skip-Gram Class

In [0]:
class SkipGram:
    
    def __init__(self, settings):
        # n is hidden node size
        self.n = settings['n']
        
        self.eta = settings['learning_rate']
        self.epochs = settings['epochs']
        self.window_size = settings['window_size']
        
    def onehot(self, word):
        vec = [0 for _ in range(len(self.vocab))]
        vec[self.word_index[word]] = 1
        return vec
        
    def generate_training_data(self, settings, corpus):
        
        word_counts = defaultdict(int)
        for row in corpus:
            for word in row:
                word_counts[word] += 1
                
        self.vocab = sorted(list(word_counts.keys()), reverse = False)
        self.word_index = {word: i for i, word in enumerate(self.vocab)}
        self.index_word = {i: word for i, word in enumerate(self.vocab)}
        
        # V is vocab size
        self.V = len(self.vocab)
        
        training_data = []
        for sentence in corpus:
            sentence_length = len(sentence)
            
            for i, word in enumerate(sentence):
                
                x_t = self.onehot(word)
                
                x_context = []
                for j in range(i - self.window_size, i + self.window_size + 1):
                    if j != i and j >= 0 and j < sentence_length:
                        x_context.append(self.onehot(sentence[j]))
                
                training_data.append([x_t, x_context])
        return np.array(training_data)
        
    def forward_pass(self, x):
        h = np.dot(np.transpose(self.W1), x)
        u_c = np.dot(np.transpose(self.W2), h)
        y_c = softmax(u_c)
        return y_c, h, u_c
    
    def back_prop(self, e, h, x):
        dl_dW2 = np.outer(h, e)
        dl_dW1 = np.outer(x, np.dot(self.W2, e.reshape(-1, 1)))
        
        self.W2 = self.W2 - self.eta * dl_dW2
        self.W1 = self.W1 - self.eta * dl_dW1
        pass
    
    def train(self, training_data):
        self.W1 = np.random.uniform(-1, 1, (self.V, self.n))
        self.W2 = np.random.uniform(-1, 1, (self.n, self.V))
        
        for i in range(self.epochs):
            self.loss = 0
            
            for x_t, x_c in training_data:
                y_pred, h, u = self.forward_pass(x_t)
                
                EI = np.sum([np.subtract(y_pred, word) for word in x_c], axis=0)
                
                self.back_prop(EI, h, x_t)
                
                self.loss += -np.sum([np.log(np.dot(y_pred, word)) for word in x_c])
            
            if i % (self.epochs / 10) == 0: print('Epoch: ' + str(i) + ' Loss: ' + str(self.loss))
        pass
    
    def get_word_vec(self, word):
        word_idx = self.word_index[word]
        return self.W1[word_idx, :]
    
    def word_sim(self, word, top_n = 5):
        word_vec = self.get_word_vec(word)
        
        word_sim = {}
        for vocab_word in self.word_index:
            if word != vocab_word:
                vec_w2 = self.get_word_vec(vocab_word)
                num = np.dot(word_vec, vec_w2)
                denom = np.linalg.norm(word_vec) * np.linalg.norm(vec_w2)
                word_sim[vocab_word] = num / denom
                
        words_sorted = sorted(word_sim.items(), key = lambda item: item[1], reverse = True)
        
        for word, sim in words_sorted[:top_n]:
            print(word, sim)

### Train Skip-Gram Model

In [5]:
corpus = [['this', 'is', 'the', 'first', 'sentence'],
          ['this', 'is', 'the', 'second', 'sentence'],
          ['the', 'third', 'sentence', 'is', 'different', 'from', 'the', 'first', 'two'],
          ['so', 'is', 'the', 'fourth']]


settings = {'n': 10, 'learning_rate': .01, 'epochs': 2000, 'window_size': 2}
w2v = SkipGram(settings)

training_data = w2v.generate_training_data(settings, corpus)

w2v.train(training_data)


Epoch: 0 Loss: 193.72658241217098
Epoch: 200 Loss: 116.96757315913926
Epoch: 400 Loss: 116.26246897117
Epoch: 600 Loss: 116.10599936148041
Epoch: 800 Loss: 116.02523817824081
Epoch: 1000 Loss: 115.96739267999725
Epoch: 1200 Loss: 115.92040033163948
Epoch: 1400 Loss: 115.88025338410868
Epoch: 1600 Loss: 115.845181137609
Epoch: 1800 Loss: 115.81419021261999


### Check Word Similarity

In [6]:
w2v.word_sim('second', 5)

third 0.8365993206690362
fourth 0.7177220681545701
this 0.44937228712435234
first 0.44589031511869737
different 0.3691023587339741


### Create Continuous Bag of Words Model

In [0]:
class CBOW:
    
    def __init__(self, settings):
        # n is hidden node size
        self.n = settings['n']
        
        self.eta = settings['learning_rate']
        self.epochs = settings['epochs']
        self.window_size = settings['window_size']
        
    def onehot(self, word):
        vec = [0 for _ in range(len(self.vocab))]
        vec[self.word_index[word]] = 1
        return vec
        
    def generate_training_data(self, settings, corpus):
        
        word_counts = defaultdict(int)
        for row in corpus:
            for word in row:
                word_counts[word] += 1
                
        self.vocab = sorted(list(word_counts.keys()), reverse = False)
        self.word_index = {word: i for i, word in enumerate(self.vocab)}
        self.index_word = {i: word for i, word in enumerate(self.vocab)}
        
        # V is vocab size
        self.V = len(self.vocab)
        
        training_data = []
        for sentence in corpus:
            sentence_length = len(sentence)
            
            for i, word in enumerate(sentence):
                
                x_t = self.onehot(word)
                
                x_context = []
                for j in range(i - self.window_size, i + self.window_size + 1):
                    if j != i and j >= 0 and j < sentence_length:
                        x_context.append(self.onehot(sentence[j]))
                
                training_data.append([x_t, x_context])
        return np.array(training_data)
        
    def forward_pass(self, c_list):
        c = np.sum(c_list, axis = 0)
        h = np.dot(np.transpose(self.W1), c)
        u_w = np.dot(np.transpose(self.W2), h)
        y_w = softmax(u_w)
        return y_w, h, u_w
    
    def back_prop(self, e, h, x_list):
        x = np.sum(x_list)
        dl_dW2 = np.outer(h, e)
        dl_dW1 = np.outer(x, np.dot(self.W2, e.reshape(-1, 1)))
        
        self.W2 = self.W2 - self.eta * dl_dW2
        self.W1 = self.W1 - self.eta * dl_dW1
        pass
    
    def train(self, training_data):
        self.W1 = np.random.uniform(-1, 1, (self.V, self.n))
        self.W2 = np.random.uniform(-1, 1, (self.n, self.V))
        
        for i in range(self.epochs):
            self.loss = 0
            
            for x_t, x_c in training_data:
                y_pred, h, u = self.forward_pass(x_c)
                
                EI = np.subtract(y_pred, x_t)
                
                self.back_prop(EI, h, x_c)
                
                self.loss += -1 * np.log(np.dot(y_pred, x_t))
            
            if i % (self.epochs / 10) == 0: print('Epoch: ' + str(i) + ' Loss: ' + str(self.loss))
        pass
    
    def get_word_vec(self, word):
        word_idx = self.word_index[word]
        return self.W1[word_idx, :]
    
    def word_sim(self, word, top_n = 5):
        word_vec = self.get_word_vec(word)
        
        word_sim = {}
        for vocab_word in self.word_index:
            if word != vocab_word:
                vec_w2 = self.get_word_vec(vocab_word)
                num = np.dot(word_vec, vec_w2)
                denom = np.linalg.norm(word_vec) * np.linalg.norm(vec_w2)
                word_sim[vocab_word] = num / denom
                
        words_sorted = sorted(word_sim.items(), key = lambda item: item[1], reverse = True)
        
        for word, sim in words_sorted[:top_n]:
            print(word, sim)

### Train CBOW Model

In [14]:
corpus = [['this', 'is', 'the', 'first', 'sentence'],
          ['this', 'is', 'the', 'second', 'sentence'],
          ['the', 'third', 'sentence', 'is', 'different', 'from', 'the', 'first', 'two'],
          ['so', 'is', 'the', 'fourth']]


settings = {'n': 10, 'learning_rate': .01, 'epochs': 20000, 'window_size': 2}
w2v = CBOW(settings)

training_data = w2v.generate_training_data(settings, corpus)

w2v.train(training_data)


Epoch: 0 Loss: 82.86428218609369
Epoch: 2000 Loss: 10.442705091118516
Epoch: 4000 Loss: 9.896170470942401
Epoch: 6000 Loss: 9.6445903860504
Epoch: 8000 Loss: 9.52388844895357
Epoch: 10000 Loss: 9.464740190624738
Epoch: 12000 Loss: 9.431717901409051
Epoch: 14000 Loss: 9.409964885133926
Epoch: 16000 Loss: 9.39398624743513
Epoch: 18000 Loss: 9.381637976518839


### Check Word Similarity

In [15]:
w2v.word_sim('second', 5)

third 0.5076415538150308
fourth 0.4988226933669053
this 0.45611406569040525
the 0.2525756156837082
first 0.20578864355360912
