In [27]:
import numpy as np
import matplotlib.pylab as plt
import random
from numpy import dot
from numpy.linalg import norm
import copy

# Turn testing to False while training your model.
# Please, turn it back to True while submitting your code.
_TESTING = False

In [4]:
a = np.array([1,2,3,4,5,6,7,8,9])

In [5]:
seed = 10417617 
_eps = 1e-5 # useful to avoid overflow while using cross entropy.

In [6]:

#line_li = []
#word_li = []
#line = f.readline()
#while line:
#    line_li.append([line.strip()])
#    word_li.extend(line.strip().split(' '))
#    line = f.readline()
#line = line_li[0]
#vocab = sorted(list(set(word_li)))
#f.close()

In [7]:
def weight_init(x_len,y_len):
    b = np.sqrt(6.0/(x_len+y_len))
    if _TESTING:
        np.random.seed(seed) 
    return np.random.normal(-b, b, (x_len, y_len))

In [8]:
def cosineSimilarity(a, b):
    cos_sim = dot(a, b)/(norm(a)*norm(b))
    return cos_sim

In [37]:
class CBOW:
    """
    This class is an implementation of the continous bag of words model by first principles.
    CBOW model tries to predict the center words from surrounding words.
    """
    def __init__(self, text, window, n, learning_rate=1e-4):
        """
        Recommended hyperparameters.
        Initialize self.n, self.text,self.window,self.lr,self.vocab,self.word2index, self.V and self.U
        n = desired size of word vectors
        window = size of window
        vocab = vocabulary of all words in the dataset -- make sure it is sorted.
        word2index = index for each word in the vocabulary 
        V: input vector matrix of shape [n, len(vocab)] (W)
        U: output vector matrix of shape [len(vocab),n] (W')
        # use weight_init to initialize weights
        """
        self.text = text
        self.temp_text = text.strip().split(' ')
        self.vocab = sorted(list(set(text.strip().split(' '))))
        self.n = n
        self.window = window
        self.lr = learning_rate
        self.word2index = dict(zip(self.vocab, [i for i in range(len(self.vocab))]))
        self.V = weight_init(n, len(self.vocab))
        self.U = weight_init(len(self.vocab), n)
        self.text_index = [self.word2index[i] for i in self.temp_text]
        
        self.train_left_X = []
        self.train_right_X = []
        self.train_Y = []
        for i in range(self.window, len(self.vocab) - self.window):
            self.train_left_X.append(self.temp_text[i - self.window : i])
            self.train_right_X.append(self.temp_text[i + 1 : i + self.window + 1])
            self.train_Y.append(self.text_index[i])
            # print(self.temp_text[i - self.window : i], self.temp_text[i], self.temp_text[i + 1 : i + self.window + 1])
        self.idx = [i for i in range(len(self.train_Y))]
         

    def one_hot_vector(self, index):
        """
        Function for one hot vector encoding.
        :input: a word index
        :return: one hot encoding
        """
        ret = np.zeros((1, len(self.vocab)))
        ret[0, index] = 1
        return ret[0]

    def get_vector_representation(self, one_hot=None):
        """"
        Function for vector representation from one-hot encoding.
        :input: one hot encoding
        :return: word vector
        """
        return (self.V.dot(np.expand_dims(one_hot, 1))).T[0]

    def get_average_context(self, left_context=None, right_context=None):
        """
        Function for average vector generation from surrounding context of current word.
        :input: surrounding context (left/right)
        :return: averaged vector representation
        """
        left_context.extend(right_context)
        all_onehot = [self.one_hot_vector(self.word2index[i]) for i in left_context]
        return self.get_vector_representation(np.stack(all_onehot).mean(0))

    def get_score(self, avg_vector=None):
        """
        Function for product score given an averaged vector in current context of the center word.
        :input: averaged vector
        :return: product score with matrix U.
        """
        return self.U.dot(avg_vector)

    def softmax(self, x):
        """
        Function to return the softmax output of given function.
        """
        exp_x = np.exp(x)
        softmax_x = exp_x / (np.sum(exp_x) + _eps)
        return softmax_x 

    def compute_cross_entropy_error(self, y, y_hat):
        """
        Given one hot encoding and the output of the softmax function, this function computes the
        cross entropy error.
        ---------
        :input y: one_hot encoding of the current center word
        :input y_hat: output of the softmax function
        :return: cross entropy error
        """
        return - np.log(y_hat + _eps) * y

    def compute_EH(self, e):
        """
        Function to compute the value of EH, 
        the sum of the output vectors of all words in the vocabulary,
        weighted by their prediction error.
        Look at https://arxiv.org/pdf/1411.2738.pdf for more information.
        ---------
        :input: e: prediction error. 
        :return: value of EH
        """
        self.eh = (self.U * np.expand_dims(e, 1)).sum(0)
        return self.eh

    def update_U(self, e, avg_vector): # avg_vector: hidden h
        """
        Given the cross entropy error occured in the current sample, this function updates the U matrix.
        :return self.U
        """
        self.U = self.U - self.lr * (np.expand_dims(e, 1).dot(np.expand_dims(avg_vector, 0)))
        return self.U

    def update_V(self, e, left_context=None, right_context=None):
        """
        This function updates the V matrix.
        :return: self.V
        NOTE: Do *not* divide by the context length (as mentioned in
        https://arxiv.org/pdf/1411.2738.pdf) You will need to either Take an appropriate sum. 
        or
        multiply your updates with the context length instead.
        based on your implementation.
        """
        eh = self.compute_EH(e)
        left_context.extend(right_context)
        all_index = [self.word2index[i] for i in left_context]
        self.V[:, all_index] -= self.lr * len(all_index) * np.expand_dims(eh, 1)
        return self.V

    def fit(self, epoch):
        for ep in range(epoch):
            total_err = 0
            random.shuffle(self.idx)
            for i in self.idx:
                train_left = self.train_left_X[i]
                
                train_right = self.train_right_X[i]
                train_target = self.train_Y[i]
                avg_vector = self.get_average_context(left_context=train_left, right_context=train_right)
                score = self.get_score(avg_vector)
                pred = self.softmax(score)
                e = pred - self.one_hot_vector(train_target)
                error = self.compute_cross_entropy_error(self.one_hot_vector(train_target), pred)
                total_err += error
                self.update_U(e, avg_vector)
                self.update_V(e, left_context=train_left, right_context=train_right)
            print('----- epoch: {} CE: {}------'.format(ep, total_err.sum() / len(self.idx)))
    
    def predict(self, word):
        index = self.word2index[word]
        return (self.U[index] + self.V[:, index]) / 2

In [38]:
f = open('./data.txt')
a = CBOW(f.readline(), 2, 100)
a.fit(1000)

----- epoch: 0 CE: 7.488471782588354------
----- epoch: 1 CE: 7.487727317427425------
----- epoch: 2 CE: 7.486729507498391------
----- epoch: 3 CE: 7.485487658415663------
----- epoch: 4 CE: 7.484029036849684------
----- epoch: 5 CE: 7.482333583013913------
----- epoch: 6 CE: 7.480427070295531------
----- epoch: 7 CE: 7.478310816980172------
----- epoch: 8 CE: 7.475939589838471------
----- epoch: 9 CE: 7.473352285250231------
----- epoch: 10 CE: 7.470537906979288------
----- epoch: 11 CE: 7.467500785591814------
----- epoch: 12 CE: 7.46422989741232------
----- epoch: 13 CE: 7.460701751778257------
----- epoch: 14 CE: 7.456941394527926------
----- epoch: 15 CE: 7.452944273802208------
----- epoch: 16 CE: 7.448661757567692------
----- epoch: 17 CE: 7.444119136127311------
----- epoch: 18 CE: 7.439303322536477------
----- epoch: 19 CE: 7.434255498230487------
----- epoch: 20 CE: 7.4288557172583385------
----- epoch: 21 CE: 7.423246721048898------
----- epoch: 22 CE: 7.417261667122427-----

----- epoch: 184 CE: 3.173442131691417------
----- epoch: 185 CE: 3.1434159350159563------
----- epoch: 186 CE: 3.113183893721283------
----- epoch: 187 CE: 3.0826206519816632------
----- epoch: 188 CE: 3.0517360130886444------
----- epoch: 189 CE: 3.0229620834946043------
----- epoch: 190 CE: 2.992461927524104------
----- epoch: 191 CE: 2.9633297423400533------
----- epoch: 192 CE: 2.93402718411701------
----- epoch: 193 CE: 2.9039954597590905------
----- epoch: 194 CE: 2.8749616098855877------
----- epoch: 195 CE: 2.8449587273446575------
----- epoch: 196 CE: 2.8162566495180315------
----- epoch: 197 CE: 2.788116239980124------
----- epoch: 198 CE: 2.758896190766958------
----- epoch: 199 CE: 2.7295897478310494------
----- epoch: 200 CE: 2.7015182746458772------
----- epoch: 201 CE: 2.673561733417366------
----- epoch: 202 CE: 2.6454415525800377------
----- epoch: 203 CE: 2.6169062285------
----- epoch: 204 CE: 2.588456446340016------
----- epoch: 205 CE: 2.560524090889976------
----

KeyboardInterrupt: 

In [39]:
similarity = []
ground = a.predict('dollar')
for word in a.vocab:
    similarity.append(cosineSimilarity(ground, a.predict(word)))
index = np.argsort(similarity)[-6:]
for i in index:
    print(a.vocab[i])

latter
sustained
supply
whole
centre
dollar


In [40]:
similarity = []
ground = a.predict('stock')
for word in a.vocab:
    similarity.append(cosineSimilarity(ground, a.predict(word)))
index = np.argsort(similarity)[-6:]
for i in index:
    print(a.vocab[i])

ratification
watch
fiveweek
south
global
stock


In [41]:
similarity = []
ground = a.predict('mortgage')
for word in a.vocab:
    similarity.append(cosineSimilarity(ground, a.predict(word)))
index = np.argsort(similarity)[-6:]
for i in index:
    print(a.vocab[i])

china
strategic
leaders
economics
commerce
mortgage


In [28]:
U = copy.deepcopy(a.U)

In [29]:
V = copy.deepcopy(a.V)