## Word embeddings with Keras: word2vec (CBOW and skip-gram models) and GloVe

In [51]:
# import the necessary libraries
from keras.layers import Merge
from keras.layers.embeddings import Embedding
from keras.layers.core import Dense,Reshape, Lambda
from keras.models import Sequential
import keras.backend as K

# Number of rows in every embedding table below; also the width of the
# CBOW model's softmax output layer.
vocab_size = 5000
# Length of each learned word vector (the embedding output dimension).
embedding_size = 300
# The CBOW model takes window_size * 2 word ids per example
# (presumably window_size context words on each side of the target).
window_size = 1

### Define the Context model and the Word model

In [58]:
# Word (target) branch: looks up a single word id in its own embedding table
# and reshapes the (1, embedding_size) result to a flat embedding_size vector.
word_model = Sequential()
word_model.add(Embedding(vocab_size, embedding_size, embeddings_initializer='glorot_uniform', input_length=1))
word_model.add(Reshape((embedding_size,)))

# Context branch: same architecture as the word branch, but with a separate
# embedding table, so target and context vectors are learned independently.
context_model = Sequential()
context_model.add(Embedding(vocab_size, embedding_size, embeddings_initializer='glorot_uniform', input_length=1))
context_model.add(Reshape((embedding_size,)))

# Combine the two branches with a dot product, then squash the resulting
# scalar through a single sigmoid unit.
model = Sequential()
model.add(Merge([word_model, context_model], mode="dot"))
# Use the Keras 2 keyword `kernel_initializer` (the old `init` spelling is a
# legacy alias), matching the `*_initializer` keywords used elsewhere in this
# notebook.
model.add(Dense(1, kernel_initializer="glorot_uniform", activation="sigmoid"))
model.compile(loss="mean_squared_error",optimizer="adam")
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
merge_6 (Merge)              (None, 1)                 0         
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 2         
Total params: 3,000,002
Trainable params: 3,000,002
Non-trainable params: 0
_________________________________________________________________




### Read the weights of the word embedding layer, which are the vectors we are interested in

In [64]:
# Walk down from the combined model to the word branch's embedding table.
# The Merge layer was the first layer added to `model`.
merge_layer = model.layers[0]
# The word branch was listed first when the Merge layer was built
# (assumes Merge.layers keeps that order - TODO confirm).
word_model = merge_layer.layers[0]
# The Embedding layer is the first layer of the word branch.
word_embed_layer = word_model.layers[0]
# get_weights() returns a list of arrays; the first entry is taken as the
# embedding matrix.
weights = word_embed_layer.get_weights()[0]
# Display the matrix dimensions as the cell output.
weights.shape


(5000, 300)

### This section handles and preprocesses the text to create the skip-grams

In [12]:
from keras.preprocessing.text import *
from keras.preprocessing.sequence import skipgrams

# Toy one-sentence corpus used by the tokenizer and skip-gram cells below.
text = "I love green eggs and ham"

In [44]:
# Fit a tokenizer on the sample text and derive both lookup directions:
# word -> id straight from the tokenizer, id -> word by inverting it.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
word2id = tokenizer.word_index
id2word = dict(zip(word2id.values(), word2id.keys()))

# Show each mapping followed by its size.
for mapping in (word2id, id2word):
    print(mapping)
    print(len(mapping))

{'ham': 1, 'eggs': 2, 'and': 3, 'i': 4, 'green': 5, 'love': 6}
6
{1: 'ham', 2: 'eggs', 3: 'and', 4: 'i', 5: 'green', 6: 'love'}
6


### Create the skipgrams for the given text

In [48]:
# Split the sentence into words and map each word to its id from the dict.
wids = [word2id[w] for w in text_to_word_sequence(text)]
print(wids)

# Create the skip-gram (word, context) pairs and their 0/1 labels.
# The Tokenizer assigns ids starting at 1, and skipgrams expects
# vocabulary_size to be the largest word index + 1: it draws negative context
# ids from 1..vocabulary_size-1. Passing len(word2id) would therefore never
# sample the highest id as a negative example.
# window_size is left at its default of 4; pass window_size=... to change it.
pairs, labels = skipgrams(wids, len(word2id) + 1)
print("length of pairs - ", len(pairs))
print("Length of labels - ", len(labels))
print()

# Print up to 20 examples (fewer if not that many pairs were generated).
print("Text is - ",text)
print()
for i in range(min(20, len(pairs))):
    print(id2word[pairs[i][0]], pairs[i][0]," : ",
         id2word[pairs[i][1]], pairs[i][1],
         " -> ", labels[i] )

[4, 6, 5, 2, 3, 1]
length of pairs -  56
Length of labels -  56

Text is -  I love green eggs and ham

love 6  :  ham 1  ->  1
and 3  :  eggs 2  ->  0
eggs 2  :  green 5  ->  1
eggs 2  :  green 5  ->  0
eggs 2  :  and 3  ->  0
love 6  :  and 3  ->  1
love 6  :  green 5  ->  0
i 4  :  eggs 2  ->  1
and 3  :  ham 1  ->  1
love 6  :  i 4  ->  1
eggs 2  :  ham 1  ->  0
love 6  :  green 5  ->  0
i 4  :  eggs 2  ->  0
green 5  :  and 3  ->  0
eggs 2  :  i 4  ->  1
green 5  :  green 5  ->  0
ham 1  :  eggs 2  ->  0
ham 1  :  ham 1  ->  0
ham 1  :  green 5  ->  1
love 6  :  and 3  ->  0


### Define the CBOW model

In [65]:
# CBOW network: embed the 2 * window_size context word ids, average their
# vectors along the sequence axis, and predict a distribution over the
# vocabulary with a softmax layer.
model = Sequential([
    Embedding(vocab_size, embedding_size,
              embeddings_initializer='glorot_uniform',
              input_length=2 * window_size),
    Lambda(lambda context_vectors: K.mean(context_vectors, axis=1),
           output_shape=(embedding_size,)),
    Dense(vocab_size, kernel_initializer='glorot_uniform', activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy')
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_18 (Embedding)     (None, 2, 300)            1500000   
_________________________________________________________________
lambda_4 (Lambda)            (None, 300)               0         
_________________________________________________________________
dense_10 (Dense)             (None, 5000)              1505000   
Total params: 3,005,000
Trainable params: 3,005,000
Non-trainable params: 0
_________________________________________________________________


### Weights of the embedding layer from the CBOW model

In [67]:
# The Embedding layer is the first layer of the CBOW model; the first array
# returned by get_weights() is taken as its embedding matrix.
cbow_embedding_layer = model.layers[0]
weights = cbow_embedding_layer.get_weights()[0]
weights.shape

(5000, 300)