## Simple Tutorial On Word2Vec Implementation with Keras

### Specify the library imports

In [1]:
from keras.models import Sequential
from keras.layers import (Dense) 
from keras.preprocessing.text import (one_hot, Tokenizer)
import numpy as np

Using TensorFlow backend.


### Implementation

In [4]:
# Toy corpus; '.' separates sentences. Lower-cased immediately so that
# differently capitalised spellings of a word compare equal later on.
corpus_raw = 'He is the king . The king is royal . She is the royal  queen '.lower()

In [5]:
# To ensure repeatable outcomes: seed NumPy's global random number generator.
# NOTE(review): only NumPy is seeded here; if the Keras backend draws its
# random numbers from its own generator it needs separate seeding -- confirm.
seed = 10
np.random.seed(seed)

def getWord2IntAndInt2Word(corpus):
    """
    Builds the vocabulary of a corpus together with the word <-> integer lookup tables.

    The corpus is split on whitespace; the standalone '.' token (the sentence
    delimiter) is excluded from the vocabulary. Ids are assigned in sorted word
    order so the mapping is the same on every run: iterating a plain set of
    strings is not order-stable between interpreter sessions.
    # Arguments:
        corpus: The entire corpus as one string
    # Returns:
        V: Vocabulary size
        word_2_int: dict mapping each word in the corpus to its integer code
        int_2_word: dict mapping each integer code back to its word
    """
    # De-duplicate with a set, then sort to get a deterministic id assignment.
    vocabulary = sorted({word for word in corpus.split() if word != '.'})
    word_2_int = {token: i for i, token in enumerate(vocabulary)}
    int_2_word = {i: token for i, token in enumerate(vocabulary)}
    V = len(vocabulary)
    return V, word_2_int, int_2_word

def corpus2ListOfSentences(corpus):
    """
    Splits a corpus into sentences and each sentence into its words.
    # Arguments:
        corpus: The entire corpus; sentences are separated by '.'
    # Returns:
        sentence_list: List of sentences, each sentence being a list of words
    """
    # str.split() without arguments also swallows repeated/leading/trailing blanks.
    return [sentence.split() for sentence in corpus.split('.')]

def createListOfWordAndItsNeighbors(sentences, window=2):
    """
    Creates a list of [word, neighbor] pairs for every word in every sentence.

    For each word, up to `window` words to its left and up to `window` words to
    its right (within the same sentence) are paired with it.
    # Arguments:
        sentences: list of sentences, each sentence being a list of words
        window: number of neighboring words taken on each side of a word
    # Returns:
        data: list of two-element lists [word, neighbor_word]
    """
    data = []
    for sentence in sentences:
        for i, word in enumerate(sentence):
            # Slice ends are exclusive, so the right edge must be i + window + 1
            # to include `window` words to the right; clamp both edges to the sentence.
            start = max(i - window, 0)
            stop = min(i + window + 1, len(sentence))
            for neighbor_word in sentence[start:stop]:
                # Drops the center word itself (and any repetition of it inside the window).
                if word != neighbor_word:
                    data.append([word, neighbor_word])
    return data

def toOneHotEncoding(data_index, size):
    """
    Builds a one-hot vector: zeros everywhere except a 1 at the given position.
    # Arguments:
        data_index: position in the vector that is set to 1
        size: size of the one-hot-encoding vector
    # Returns:
        vec: the one-hot-encoded vector
    """
    encoded = np.zeros(size)
    encoded[data_index] = 1
    return encoded

def createFeaturesAndLabelData(context_and_target_words, word_2_int, vocab_size):
    """
    Turns [word, neighbor] pairs into two arrays of one-hot-encoded vectors.
    # Arguments:
        context_and_target_words: list of pairs; element 0 is a word, element 1 one of its neighbors
        word_2_int: mapping of words to integers
        vocab_size: vocabulary size (length of every one-hot vector)
    # Returns:
        X: features, the one-hot encoding of element 0 of every pair
        y: labels, the one-hot encoding of element 1 of every pair
    """
    # Encode both members of a pair together, pair by pair.
    encoded_pairs = [
        (
            toOneHotEncoding(word_2_int[pair[0]], vocab_size),
            toOneHotEncoding(word_2_int[pair[1]], vocab_size),
        )
        for pair in context_and_target_words
    ]
    features = [feature for feature, _ in encoded_pairs]
    labels = [label for _, label in encoded_pairs]
    return np.asarray(features), np.asarray(labels)

def createNeuralNetModel(vocab_dim, embedding_dim):
    """
    Builds the (uncompiled) two-layer dense network used to learn the embeddings.
    # Arguments:
        vocab_dim: vocabulary size; width of the one-hot input and of the softmax output
        embedding_dim: number of units in the hidden (embedding) layer
    # Returns:
        model: a Sequential model; the caller is responsible for compiling it
    """
    model = Sequential()
    # Hidden layer with linear activation: its weights project a one-hot word
    # vector down to embedding_dim values.
    # `kernel_initializer` is the Keras 2 name of the former `init` argument.
    model.add(Dense(embedding_dim, input_dim=vocab_dim, kernel_initializer='random_uniform', activation='linear'))
    # Output layer: softmax over the whole vocabulary.
    model.add(Dense(vocab_dim, kernel_initializer='random_uniform', activation='softmax'))
    return model

def computeEmbeddingsViaNeuralNet(X, y, vocab_dim, embedding_dim, epochs=150, batch_size=10):
    """
    Trains the network on the one-hot pairs, prints its accuracy and returns the embeddings.
    # Arguments:
        X: features (one-hot-encoded words)
        y: labels (one-hot-encoded neighboring words)
        vocab_dim: vocabulary size
        embedding_dim: size of the hidden (embedding) layer
        epochs: number of training epochs
        batch_size: number of samples per gradient update
    # Returns:
        embeddings: kernel (weight matrix) of the first Dense layer
    """
    model = createNeuralNetModel(vocab_dim, embedding_dim)
    # The output is a softmax over the vocabulary and the targets are one-hot,
    # so the matching loss is categorical (not binary) cross-entropy.
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(X, y, epochs=epochs, batch_size=batch_size)
    # Evaluate on the training labels `y` (the former `Y` was an undefined name).
    scores = model.evaluate(X, y)
    print("{0} {1:.2f}".format(model.metrics_names[1], scores[1]*100))
    # get_weights() of a Dense layer returns [kernel, bias]; the kernel holds the embeddings.
    return model.layers[0].get_weights()[0]
    
# Size of the hidden (embedding) layer passed to createNeuralNetModel below.
embedding_dim = 5   
# Vocabulary size plus the word -> id and id -> word lookup tables.
vocab_size, word_2_int, int_2_word = getWord2IntAndInt2Word(corpus_raw)
# Corpus as a list of sentences, each a list of words.
sentences = corpus2ListOfSentences(corpus_raw)
# [word, neighbor] pairs (default window).
data = createListOfWordAndItsNeighbors(sentences)
# One-hot-encoded features and labels built from the pairs.
X, y = createFeaturesAndLabelData(data, word_2_int, vocab_size)
# Only builds the network; it is neither compiled nor trained in this cell.
model = createNeuralNetModel(vocab_size, embedding_dim)



TypeError: softmax() got an unexpected keyword argument 'axis'

In [6]:
def tokenize(corpus):
    """
    Integer-encodes a corpus with a Keras Tokenizer fitted on that same corpus.
    # Arguments:
        corpus: list of texts
    # Returns:
        corpus_tokenized: the texts converted to sequences of integers
        V: number of entries in the fitted tokenizer's word index
    """
    keras_tokenizer = Tokenizer()
    keras_tokenizer.fit_on_texts(corpus)
    sequences = keras_tokenizer.texts_to_sequences(corpus)
    vocabulary_size = len(keras_tokenizer.word_index)
    return sequences, vocabulary_size

# Quick check of tokenize() on a one-sentence corpus; the cell displays the
# returned (sequences, vocabulary size) tuple.
corpus = ["I like playing football with my friends"]
tokenize(corpus)

([[1, 2, 3, 4, 5, 6, 7]], 7)

In [7]:
# User-defined parameters
corpus = ["I like playing football with my friends"] # our example text corpus (a list holding one sentence)
N = 2 # assume that the hidden layer has dimensionality = 2
window_size = 2 # window size (symmetrical)
eta = 0.1 # learning rate
# Integer-encode the corpus; V is the size of the tokenizer's word index.
corpus_tokenized, V = tokenize(corpus)

In [3]:
# Upgrades keras from inside the notebook; the pip and tensorflow upgrades are left disabled.
# NOTE(review): the upgrade is unpinned and only keras is upgraded, not the backend.
# The "softmax() got an unexpected keyword argument 'axis'" TypeError recorded earlier
# in this notebook may stem from a keras/backend version mismatch -- confirm, then pin
# compatible versions.
#!pip install pip --upgrade
#!pip install tensorflow --upgrade
!pip install keras --upgrade

Requirement already up-to-date: keras in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (2.2.0)
Requirement not upgraded as not directly required: numpy>=1.9.1 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from keras) (1.14.5)
Requirement not upgraded as not directly required: scipy>=0.14 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from keras) (0.19.1)
Requirement not upgraded as not directly required: six>=1.9.0 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from keras) (1.11.0)
Requirement not upgraded as not directly required: pyyaml in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from keras) (3.12)
Requirement not upgraded as not directly required: h5py in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from keras) (2.8.0)
Requirement not upgraded as not directly required: keras_applications==1.0.2 in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (from keras) (1.0.2)
Requirement not upgraded as not directly