## Simple Tutorial On Word2vec implementation

### Specify the imports

In [23]:
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.utils import np_utils
import tensorflow as tf

### Tokenize the corpus

In [24]:
def tokenize(corpus):
    """Map every word of the corpus to an integer index.

    :param corpus: list of text strings, one entry per sentence/document
        (example: ['I am coming home tomorrow', 'The dog like to run after the ball'])
    :return corpus_tokenized: one list of word indices per input string, in the
        original word order (the example above gives
        [[2, 3, 4, 5, 6], [1, 7, 8, 9, 10, 11, 1, 12]])
    :return V: size of the vocabulary, i.e. number of distinct words (12 in the example)
    """
    word_indexer = Tokenizer()
    # Build the word -> index table from the whole corpus first.
    word_indexer.fit_on_texts(corpus)
    vocabulary_size = len(word_indexer.word_index)
    sequences = word_indexer.texts_to_sequences(corpus)
    return sequences, vocabulary_size

# Smoke test: tokenize two sentences; the notebook echoes the returned
# (index sequences, vocabulary size) tuple as the cell output.
tokenize(['I am coming home tomorrow', 'The dog like to run after the ball'])

([[2, 3, 4, 5, 6], [1, 7, 8, 9, 10, 11, 1, 12]], 12)

### Compute the One-hot-Encoding representation of the Corpus

In [30]:
def to_categorical(y, num_classes=None):
    """One-hot encode integer class ids.

    # Arguments
        y: class ids (scalar, list or array of integers). For a
            multi-dimensional input whose last axis has length 1,
            that axis is dropped before encoding.
        num_classes: length of the one-hot axis. When falsy it is
            taken to be `max(y) + 1`.
    # Returns
        A float array with one extra trailing axis of length
        `num_classes`, holding 1 at each class id and 0 elsewhere.
    """
    class_ids = np.array(y, dtype='int')
    leading_shape = class_ids.shape
    # A column of ids, shape (..., 1), is encoded as if it had shape (...).
    if len(leading_shape) > 1 and leading_shape[-1] == 1:
        leading_shape = tuple(leading_shape[:-1])

    flat_ids = class_ids.ravel()
    if not num_classes:
        num_classes = np.max(flat_ids) + 1

    count = flat_ids.shape[0]
    one_hot = np.zeros((count, num_classes))
    # Row r gets its single 1 in column flat_ids[r].
    one_hot[np.arange(count), flat_ids] = 1
    return np.reshape(one_hot, leading_shape + (num_classes,))

def corpus2io(corpus_tokenized, V, window_size):
    """Yield one-hot (context, center) pairs for every word position.

    The one-hot arrays are built directly with NumPy so that the context
    array is always 3-D: a position with exactly one context word still
    yields shape (1, 1, V), which callers that unpack three dimensions
    from `x.shape` rely on.

    # Arguments
        corpus_tokenized: list of sentences, each a list of word indices
            starting at 1 (index `w` is encoded at one-hot position `w - 1`)
        V: vocabulary size, i.e. the length of every one-hot vector
        window_size: number of positions taken on each side of the center word
    # Yields
        (x, y) where x is a float array of shape (1, n_context, V) holding the
        one-hot context words in sentence order, and y is a float array of
        shape (V,) holding the one-hot center word
    """
    for words in corpus_tokenized:
        L = len(words)
        for index, word in enumerate(words):
            # Clip the symmetric window to the sentence boundaries.
            start = max(0, index - window_size)
            stop = min(L, index + window_size + 1)
            context_ids = np.array(
                [words[i] - 1 for i in range(start, stop) if i != index],
                dtype=int,
            )
            n_context = context_ids.shape[0]

            x = np.zeros((1, n_context, V))
            x[0, np.arange(n_context), context_ids] = 1

            y = np.zeros(V)
            y[word - 1] = 1
            yield (x, y)

def testOneHotEncoding():
    """Print the one-hot center/context pair for every word of a sample sentence."""
    sentences = ["I like playing football with my friends"]
    half_window = 2
    token_ids, vocab_size = tokenize(sentences)
    samples = corpus2io(token_ids, vocab_size, half_window)
    for step, (context_onehot, center_onehot) in enumerate(samples):
        print(step, "\n center word =", center_onehot, "\n context words =\n", context_onehot)

# Run the one-hot demo; the printed (center word, context words) pairs follow as cell output.
testOneHotEncoding()

0 
 center word = [1. 0. 0. 0. 0. 0. 0.] 
 context words =
 [[[0. 1. 0. 0. 0. 0. 0.]
  [0. 0. 1. 0. 0. 0. 0.]]]
1 
 center word = [0. 1. 0. 0. 0. 0. 0.] 
 context words =
 [[[1. 0. 0. 0. 0. 0. 0.]
  [0. 0. 1. 0. 0. 0. 0.]
  [0. 0. 0. 1. 0. 0. 0.]]]
2 
 center word = [0. 0. 1. 0. 0. 0. 0.] 
 context words =
 [[[1. 0. 0. 0. 0. 0. 0.]
  [0. 1. 0. 0. 0. 0. 0.]
  [0. 0. 0. 1. 0. 0. 0.]
  [0. 0. 0. 0. 1. 0. 0.]]]
3 
 center word = [0. 0. 0. 1. 0. 0. 0.] 
 context words =
 [[[0. 1. 0. 0. 0. 0. 0.]
  [0. 0. 1. 0. 0. 0. 0.]
  [0. 0. 0. 0. 1. 0. 0.]
  [0. 0. 0. 0. 0. 1. 0.]]]
4 
 center word = [0. 0. 0. 0. 1. 0. 0.] 
 context words =
 [[[0. 0. 1. 0. 0. 0. 0.]
  [0. 0. 0. 1. 0. 0. 0.]
  [0. 0. 0. 0. 0. 1. 0.]
  [0. 0. 0. 0. 0. 0. 1.]]]
5 
 center word = [0. 0. 0. 0. 0. 1. 0.] 
 context words =
 [[[0. 0. 0. 1. 0. 0. 0.]
  [0. 0. 0. 0. 1. 0. 0.]
  [0. 0. 0. 0. 0. 0. 1.]]]
6 
 center word = [0. 0. 0. 0. 0. 0. 1.] 
 context words =
 [[[0. 0. 0. 0. 1. 0. 0.]
  [0. 0. 0. 0. 0. 1. 0.]]]


### Softmax Evaluation

In [26]:
def softmax(x):
    """Turn a vector of scores into softmax probabilities.

    # Arguments
        x: numpy array/list of scores
    # Returns
        array of exponentiated scores normalised by their sum along axis 0
    """
    # Subtracting the largest score keeps np.exp from overflowing and
    # does not change the normalised result.
    shifted = np.subtract(x, np.max(x))
    exponentials = np.exp(shifted)
    normaliser = exponentials.sum(axis=0)
    return exponentials / normaliser

### Continuous-Bag-Of-Words Implementation

In [33]:
def cbow(context, label, W1, W2, loss, eta):
    """
    Run one gradient-descent step of the Continuous-Bag-of-Words Word2Vec model.

    The averaged context vector is projected through W1 and W2, the scores are
    turned into softmax probabilities, and both weight matrices are moved
    against the gradient of the cross-entropy loss for the center word.

    :param context: one-hot context words, one row per word (these represent the inputs)
    :param label: one-hot center word (this represents the label)
    :param W1: weights from the input to the hidden layer, shape (V, N)
    :param W2: weights from the hidden to the output layer, shape (N, V)
    :param loss: float that represents the current value of the loss function
    :param eta: learning rate
    :return: updated W1, updated W2 and the loss increased by this example's
        cross-entropy; the input weight arrays are not modified in place
    """
    x = np.mean(context, axis=0)
    h = np.dot(W1.T, x)
    u = np.dot(W2.T, h)

    # One max-shifted exponential serves both the softmax and the
    # log-normaliser, so large scores cannot overflow np.exp.
    u_max = np.max(u)
    exp_u = np.exp(u - u_max)
    normaliser = exp_u.sum()
    y_pred = exp_u / normaliser

    e = y_pred - label
    dW2 = np.outer(h, e)
    dW1 = np.outer(x, np.dot(W2, e))
    new_W1 = W1 - eta * dW1
    new_W2 = W2 - eta * dW2

    # Cross-entropy: -u[center] + log(sum(exp(u))). The dot product with the
    # one-hot label picks out the center word's score as a scalar.
    loss += -np.dot(label, u) + u_max + np.log(normaliser)
    return new_W1, new_W2, loss

def testCbowImplementation():
    """Run one CBOW pass over a one-sentence corpus, printing weights and loss after every step."""
    # Demo settings
    sentences = ["I like playing football with my friends"]  # example text corpus
    hidden_dim = 2     # dimensionality of the hidden layer
    half_window = 2    # words taken on each side of the center word
    learning_rate = 0.1
    token_ids, vocab_size = tokenize(sentences)

    # Seeded random weights so the printed run is reproducible; loss starts at zero.
    np.random.seed(100)
    W1 = np.random.rand(vocab_size, hidden_dim)
    W2 = np.random.rand(hidden_dim, vocab_size)
    running_loss = 0.

    samples = corpus2io(token_ids, vocab_size, half_window)
    for step, (context, label) in enumerate(samples):
        print("W1.shape: {}".format(W1.shape))
        # Drop the leading axis: (1, n_context, V) -> (n_context, V).
        _, n_context, n_vocab = context.shape
        context = context.reshape((n_context, n_vocab))
        print("context.shape: {}".format(context.shape))
        W1, W2, running_loss = cbow(context, label, W1, W2, running_loss, learning_rate)
        print("Training example #{} \n-------------------- \n\n \t label = {}, \n \t context = {}".format(step, label, context))
        print("\t W1 = {}\n\t W2 = {} \n\t loss = {}\n".format(W1, W2, running_loss))
        
# Run the CBOW demo; the per-example weights and accumulated loss follow as cell output.
testCbowImplementation()

W1.shape: (7, 2)
context.shape: (2, 7)
Training example #0 
-------------------- 

 	 label = [1. 0. 0. 0. 0. 0. 0.], 
 	 context = [[0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0.]]
	 W1 = [[ 0.54340494  0.27836939]
 [ 0.40739238  0.86832668]
 [-0.01240636  0.14511967]
 [ 0.67074908  0.82585276]
 [ 0.13670659  0.57509333]
 [ 0.89132195  0.20920212]
 [ 0.18532822  0.10837689]]
	 W2 = [[2.37522044e-01 9.74588882e-01 8.08598345e-01 1.69452859e-01
  8.13081663e-01 2.71730692e-01 4.28973634e-01]
 [9.80158449e-01 8.08565553e-01 3.29167094e-01 1.69808842e-01
  3.65755980e-01 4.13558689e-04 2.46279033e-01]] 
	 loss = 1.7750419435896188

W1.shape: (7, 2)
context.shape: (3, 7)
Training example #1 
-------------------- 

 	 label = [0. 1. 0. 0. 0. 0. 0.], 
 	 context = [[1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0.]]
	 W1 = [[5.56639604e-01 2.89686158e-01]
 [4.07392378e-01 8.68326677e-01]
 [8.28305801e-04 1.56436438e-01]
 [6.83983747e-01 8.37169528e-01]
 [1.36706590e-01 5.7

### Skip-Gram Implementation

In [36]:
def skipgram(context, x, W1, W2, loss, eta):
    """
    Run one gradient-descent step of the Skip-Gram Word2Vec model.

    The center word is projected through W1 and W2, the scores are turned into
    softmax probabilities, and both weight matrices are moved against the
    gradient of the cross-entropy loss summed over all context words.

    :param context: one-hot context words, one row per word (these represent the labels)
    :param x: one-hot center word (this represents the input)
    :param W1: weights from the input to the hidden layer, shape (V, N)
    :param W2: weights from the hidden to the output layer, shape (N, V)
    :param loss: float that represents the current value of the loss function
    :param eta: learning rate
    :return: updated W1, updated W2 and the loss increased by this example's
        summed cross-entropy; the input weight arrays are not modified in place
    """
    h = np.dot(W1.T, x)
    u = np.dot(W2.T, h)

    # One max-shifted exponential serves both the softmax and the
    # log-normaliser, so large scores cannot overflow np.exp.
    u_max = np.max(u)
    exp_u = np.exp(u - u_max)
    normaliser = exp_u.sum()
    y_pred = exp_u / normaliser

    # Prediction error summed over the context words; computed once and
    # reused for both gradients.
    e_sum = np.sum([y_pred - label for label in context], axis=0)
    dW2 = np.outer(h, e_sum)
    dW1 = np.outer(x, np.dot(W2, e_sum))
    new_W1 = W1 - eta * dW1
    new_W2 = W2 - eta * dW2

    # Summed cross-entropy: -sum_c u[c] + C * log(sum(exp(u))). Each dot
    # product with a one-hot label picks out that context word's score.
    context_scores = np.sum([np.dot(label, u) for label in context])
    loss += -context_scores + len(context) * (u_max + np.log(normaliser))
    return new_W1, new_W2, loss

def testSkipGramImplementation():
    """Run one Skip-Gram pass over a one-sentence corpus, printing weights and loss after every step."""
    # Demo settings
    sentences = ["I like playing football with my friends"]  # example text corpus
    hidden_dim = 2     # dimensionality of the hidden layer
    half_window = 2    # words taken on each side of the center word
    learning_rate = 0.1
    token_ids, vocab_size = tokenize(sentences)

    # Seeded random weights so the printed run is reproducible; loss starts at zero.
    np.random.seed(100)
    W1 = np.random.rand(vocab_size, hidden_dim)
    W2 = np.random.rand(hidden_dim, vocab_size)
    running_loss = 0.

    samples = corpus2io(token_ids, vocab_size, half_window)
    for step, (context, label) in enumerate(samples):
        print("W1.shape: {}".format(W1.shape))
        # Drop the leading axis: (1, n_context, V) -> (n_context, V).
        _, n_context, n_vocab = context.shape
        context = context.reshape((n_context, n_vocab))
        print("context.shape: {}".format(context.shape))
        # The center word (`label`) is the model input; the context rows are the targets.
        W1, W2, running_loss = skipgram(context, label, W1, W2, running_loss, learning_rate)
        print("Training example #{} \n-------------------- \n\n \t label = {}, \n \t context = {}".format(step, label, context))
        print("\t W1 = {}\n\t W2 = {} \n\t loss = {}\n".format(W1, W2, running_loss))
        
# Run the Skip-Gram demo; the per-example weights and accumulated loss follow as cell output.
testSkipGramImplementation()

W1.shape: (7, 2)
context.shape: (2, 7)
Training example #0 
-------------------- 

 	 label = [1. 0. 0. 0. 0. 0. 0.], 
 	 context = [[0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0.]]
	 W1 = [[0.60454916 0.30236044]
 [0.42451759 0.84477613]
 [0.00471886 0.12156912]
 [0.67074908 0.82585276]
 [0.13670659 0.57509333]
 [0.89132195 0.20920212]
 [0.18532822 0.10837689]]
	 W2 = [[ 2.04840301e-01  1.01127493e+00  8.48700217e-01  1.60239872e-01
   7.98680093e-01  2.62275625e-01  4.17937082e-01]
 [ 9.32418944e-01  8.34375540e-01  3.55074638e-01  1.69416325e-01
   3.63844467e-01 -3.55302092e-04  2.45373897e-01]] 
	 loss = 3.447952656315579

W1.shape: (7, 2)
context.shape: (3, 7)
Training example #1 
-------------------- 

 	 label = [0. 1. 0. 0. 0. 0. 0.], 
 	 context = [[1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0.]]
	 W1 = [[0.60454916 0.30236044]
 [0.36581851 0.83603688]
 [0.00471886 0.12156912]
 [0.67074908 0.82585276]
 [0.13670659 0.57509333]
 [0.89132195 0.20920212]
 [0

In [15]:
# Scratch check: negative indexing and len() on a list, and ravel() on a 1-D integer array.
l = [*range(10)]
ll = np.array(l, dtype='int')
print("l[-1] = {0}\tlen(l) = {1}".format(l[-1], len(l)))
print("ll = {0}\nll.ravel() = {1}".format(ll, ll.ravel()))

l[-1] = 9	len(l) = 10
ll = [0 1 2 3 4 5 6 7 8 9]
ll.ravel() = [0 1 2 3 4 5 6 7 8 9]
