## This notebook explores word embeddings using Keras (covers GloVe models and pre-trained embeddings)

In [96]:
# import the necessary libraries
from keras.layers import Merge
from keras.layers.embeddings import Embedding
from keras.layers.core import Dense,Reshape, Lambda
from keras.models import Sequential
import keras.backend as K

# number of rows (input_dim) of every Embedding layer built below
vocab_size = 5000
# length of each embedding vector (output_dim of the Embedding layers)
embedding_size = 300
# the CBOW model below takes window_size*2 word ids per example
window_size = 1

### Define the Context model and the Word model

In [58]:
# Two parallel branches, each mapping a single word id to an embedding vector.

# word model definition: one word id in, one embedding_size-long vector out
word_model = Sequential()
word_model.add(Embedding(vocab_size, embedding_size, embeddings_initializer='glorot_uniform', input_length=1))
# input_length=1 leaves a length-1 sequence axis; reshape it away
word_model.add(Reshape((embedding_size,)))

# Context model definition: same architecture, separate weights
context_model = Sequential()
context_model.add(Embedding(vocab_size, embedding_size, embeddings_initializer='glorot_uniform', input_length=1))
context_model.add(Reshape((embedding_size,)))

# add dot product and then a dense layer with sigmoid activation
model = Sequential()
model.add(Merge([word_model, context_model], mode="dot"))
# kernel_initializer is the Keras 2 spelling of the legacy `init` argument; it
# matches the embeddings_initializer / kernel_initializer usage elsewhere in this notebook
model.add(Dense(1, kernel_initializer="glorot_uniform", activation="sigmoid"))
model.compile(loss="mean_squared_error", optimizer="adam")
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
merge_6 (Merge)              (None, 1)                 0         
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 2         
Total params: 3,000,002
Trainable params: 3,000,002
Non-trainable params: 0
_________________________________________________________________




### This section reads the weights of the embedding layer, which are the word vectors we are interested in

In [64]:
# Walk down from the combined model to the word branch's Embedding layer:
# first layer of `model` is the Merge layer, whose first sub-model is the word model.
merge_layer = model.layers[0]
word_model = merge_layer.layers[0]
# the Embedding layer was the first layer added to the word model
word_embed_layer = word_model.layers[0]
# first entry of get_weights() is taken as the embedding matrix
weights = word_embed_layer.get_weights()[0]
# last expression of the cell: displays the matrix shape
weights.shape


(5000, 300)

### This section handles and preprocesses the text to create the skip-grams

In [12]:
# Import only the names this notebook uses rather than `import *`, so it is
# obvious where Tokenizer and text_to_word_sequence come from.
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import skipgrams

# Toy one-sentence corpus used for the tokenizer and skip-gram demonstrations below.
text = "I love green eggs and ham"

In [44]:
# Build the two lookup tables: word -> id (from the fitted tokenizer) and its inverse id -> word.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
word2id = tokenizer.word_index
id2word = dict((idx, word) for word, idx in word2id.items())

# Show each mapping followed by its number of entries.
for mapping in (word2id, id2word):
    print(mapping)
    print(len(mapping))

{'ham': 1, 'eggs': 2, 'and': 3, 'i': 4, 'green': 5, 'love': 6}
6
{1: 'ham', 2: 'eggs', 3: 'and', 4: 'i', 5: 'green', 6: 'love'}
6


### Create the skipgrams for the given text

In [48]:
# split the sentence into a list of words and look up each word's id in the dict
wids = [word2id[w] for w in text_to_word_sequence(text)]
print(wids)

# Create the skipgrams for the text; skipgrams() uses window_size=4 unless told otherwise.
# The Tokenizer ids start at 1, so vocabulary_size has to be (largest id + 1):
# passing len(word2id) would stop the highest id from ever being drawn as a negative sample.
pairs, labels = skipgrams(wids, len(word2id) + 1)
print("length of pairs - ", len(pairs))
print("Length of labels - ", len(labels))
print()

# print up to 20 examples: label 1 marks a pair taken from the text window,
# label 0 marks a randomly drawn (negative) pair
print("Text is - ",text)
print()
# slicing + zip never indexes past the end, even if fewer than 20 pairs were generated
for (target_id, context_id), label in zip(pairs[:20], labels[:20]):
    print(id2word[target_id], target_id, " : ",
          id2word[context_id], context_id,
          " -> ", label)

[4, 6, 5, 2, 3, 1]
length of pairs -  56
Length of labels -  56

Text is -  I love green eggs and ham

love 6  :  ham 1  ->  1
and 3  :  eggs 2  ->  0
eggs 2  :  green 5  ->  1
eggs 2  :  green 5  ->  0
eggs 2  :  and 3  ->  0
love 6  :  and 3  ->  1
love 6  :  green 5  ->  0
i 4  :  eggs 2  ->  1
and 3  :  ham 1  ->  1
love 6  :  i 4  ->  1
eggs 2  :  ham 1  ->  0
love 6  :  green 5  ->  0
i 4  :  eggs 2  ->  0
green 5  :  and 3  ->  0
eggs 2  :  i 4  ->  1
green 5  :  green 5  ->  0
ham 1  :  eggs 2  ->  0
ham 1  :  ham 1  ->  0
ham 1  :  green 5  ->  1
love 6  :  and 3  ->  0


### Define the CBOW (continuous bag-of-words) model

In [65]:
# CBOW network: embed the window_size*2 input word ids, average the embeddings
# over the sequence axis, then apply a softmax over the vocab_size outputs.
model = Sequential([
    Embedding(
        input_dim=vocab_size,
        output_dim=embedding_size,
        embeddings_initializer='glorot_uniform',
        input_length=window_size*2,
    ),
    # axis 1 is the sequence axis of the (batch, window_size*2, embedding_size) tensor
    Lambda(lambda embedded: K.mean(embedded, axis=1), output_shape=(embedding_size, )),
    Dense(vocab_size, kernel_initializer='glorot_uniform', activation='softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer='adam')
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_18 (Embedding)     (None, 2, 300)            1500000   
_________________________________________________________________
lambda_4 (Lambda)            (None, 300)               0         
_________________________________________________________________
dense_10 (Dense)             (None, 5000)              1505000   
Total params: 3,005,000
Trainable params: 3,005,000
Non-trainable params: 0
_________________________________________________________________


### Weights of the embedding layer from the CBOW model

In [67]:
# The Embedding layer is the first layer of the CBOW model; the first entry of
# get_weights() is taken as its embedding matrix.
weights = model.layers[0].get_weights()[0]
# last expression of the cell: displays the matrix shape
weights.shape

(5000, 300)

## Using Third Party implementations of Word2Vec - Use Gensim

In [108]:
from gensim.models import keyedvectors
import os
import logging
from gensim.models import word2vec

In [99]:
# create sentences of maxlength from the text8 corpus

class text2sentences(object):
    """Re-iterable view of a text file as consecutive lists of words.

    Each iteration re-reads the file, splits it on whitespace and yields the
    words in lists of ``maxlength`` items; the final list may be shorter.

    Args:
        fname: path of the text file to read.
        maxlength: number of words per yielded list.
    """

    def __init__(self, fname, maxlength):
        self.fname = fname
        self.maxlength = maxlength

    def __iter__(self):
        # Read the file handed to the constructor rather than a path rebuilt from
        # a notebook-level global, so the object works wherever it is created.
        with open(self.fname, "r") as ftext:
            # split() with no argument collapses any whitespace run, so leading
            # spaces or a trailing newline do not produce "" or "word\n" tokens.
            text = ftext.read().split()
        words = []
        for word in text:
            if len(words) >= self.maxlength:
                yield words
                words = []
            words.append(word)
        # Emit the remainder only if there is one (nothing is yielded for an empty file).
        if words:
            yield words

### Create sentences of at most `maxlength` words using the class defined above

In [104]:
# Configure the root logger so INFO-level records are shown with a timestamp and level.
LOG_FORMAT = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)

# Directory that holds the text8 corpus (machine-specific absolute path).
DATA_DIR = "/Users/tkmacl9/Desktop/FastAIDLCourse/text8_word2vec/"
corpus_path = os.path.join(DATA_DIR, "text8")
# Corpus iterator that yields 50 words per sentence.
sentences = text2sentences(corpus_path, 50)

### Train the model, normalize its vectors, save it, and load it back

In [101]:
# Train a gensim Word2Vec model on the sentence iterator with 300-dimensional vectors;
# min_count=30 is gensim's frequency cut-off for keeping a word in the vocabulary.
model = word2vec.Word2Vec(sentences, size=300, min_count=30)

In [102]:
# normalize the model parameters in place and save the model
model.init_sims(replace=True)
# build the path from DATA_DIR rather than repeating the absolute directory literal
model.save(os.path.join(DATA_DIR, "word2vec_gensim.bin"))

In [106]:
# load the saved model; the path is built from DATA_DIR rather than repeating
# the absolute directory literal
model = word2vec.Word2Vec.load(os.path.join(DATA_DIR, "word2vec_gensim.bin"))

### Use the loaded model to look up the embedding vector of a word, find its most similar words, and score the similarity between word pairs

In [112]:
# Query through model.wv (the KeyedVectors that hold the trained vectors);
# indexing the Word2Vec model directly is a deprecated shortcut in gensim.
woman_vec = model.wv['woman']
print(woman_vec)
print("")
print(woman_vec.shape)  # shape of the embedding vector is 300
print("")

# print most similar words
print(model.wv.most_similar("woman"))
print("")

[ -1.02323689e-01  -4.41231430e-02   3.95736434e-02  -3.74355279e-02
  -7.39789829e-02   2.88131237e-02  -3.40031683e-02   7.72187039e-02
  -5.08850664e-02   1.06910309e-02   5.06884567e-02   5.08938171e-02
  -1.23202115e-01  -4.53161970e-02  -3.49204540e-02  -1.31303193e-02
  -1.43687138e-02   7.93975964e-02   6.11007363e-02   2.59787682e-02
  -3.98292840e-02  -3.16743110e-03  -2.58806758e-02  -1.08963745e-02
  -1.30261825e-02  -4.39308584e-03  -2.38791425e-02   1.70451522e-01
  -8.04974809e-02   6.84786439e-02   7.21806735e-02  -4.48552333e-02
   6.35374635e-02  -2.24758815e-02  -6.36098534e-02   6.69187028e-03
  -4.98502031e-02  -2.35009082e-02   6.53387234e-02   1.03983983e-01
  -4.37662974e-02  -3.59887369e-02  -3.01812459e-02  -7.45489746e-02
   3.52005512e-02   8.18186328e-02   7.92362075e-03   1.09489933e-01
   4.34352420e-02   5.51684387e-03   8.38563871e-03  -1.33026019e-02
  -2.11742762e-02   7.18761533e-02   6.18180856e-02   1.10621611e-03
  -7.57084042e-02   1.82901308e-01

In [113]:
# find the 10 words most similar to woman and king, but not like man
# (queried through model.wv; calling most_similar on the model itself is deprecated in gensim)
model.wv.most_similar(positive=['woman', 'king'], negative=['man'], topn=10)

[('queen', 0.6340517997741699),
 ('empress', 0.5706495642662048),
 ('isabella', 0.5673955678939819),
 ('princess', 0.5533063411712646),
 ('prince', 0.5482945442199707),
 ('elizabeth', 0.5417065620422363),
 ('throne', 0.5405027270317078),
 ('daughter', 0.5301916003227234),
 ('monarch', 0.5251233577728271),
 ('regent', 0.5207557082176208)]

In [117]:
# find the similarity score between 2 words, for several word pairs
# (queried through model.wv; calling similarity on the model itself is deprecated in gensim)
word_pairs = [
    ('girl', 'woman'),
    ('girl', 'man'),
    ('queen', 'king'),
    ('car', 'bus'),
    ('man', 'car'),
]
for first_word, second_word in word_pairs:
    print(model.wv.similarity(first_word, second_word))
    print()

0.712457985375

0.594207825088

0.637808347491

0.486791184796

0.15266906536

