<a href="https://colab.research.google.com/github/python-freak/CBOW-Implementation-/blob/master/Word2Vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#  Implementation of Continuous bag of Words Model

#### Importing dependencies

In [0]:
from keras.preprocessing import text
from keras.utils.np_utils import to_categorical
from keras.layers import Embedding , Dense , Flatten , Lambda
from keras.preprocessing.sequence import pad_sequences
import numpy as np
import keras.backend as k
from keras.models import Sequential
import re
import pandas as pd

In [6]:
import nltk
nltk.download('gutenberg')

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


True

In [0]:
from nltk.corpus import gutenberg

In [8]:
gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [0]:
raw_text = gutenberg.raw('bible-kjv.txt')
raw_text = re.sub('\n',' ',raw_text)

In [0]:
import re 
corpus = re.compile('[0-9]+:[0-9]+').split(raw_text)

In [0]:
len(corpus)
corpus = corpus[:10000]

In [0]:
tokenizer = text.Tokenizer(num_words = 5000)
tokenizer.fit_on_texts(corpus)
word2id = tokenizer.word_index
word2id['<PAD>'] = 0

In [70]:
len(word2id)

6823

In [71]:
id2word = {id:word for word , id in word2id.items()}
type(word2id)

dict

In [0]:
vocab_size = len(word2id)
embedding_size = 128
window_size = 2
num_epochs = 5

In [0]:
wids = [[word2id[word] for word in text.text_to_word_sequence(doc)] for doc in corpus]

In [74]:
len(wids[4])

17

##  defining function to generate context and target word pairs

In [0]:
def generate_context_target_word_pairs(corpus , vocab_size , window_size):
    context_length = 2*window_size
    for words in corpus:
        sequence_length = len(words)
        for index , word in enumerate(words):
            start = index - window_size
            end = index + window_size + 1 
            context_words = []
            label_word = []
            context_words.append([words[i] for i in range(start,end) if 0 <= i < sequence_length and i != index])
            label_word.append(word)
            #x = context_words
            x = pad_sequences(context_words , maxlen = context_length , padding = 'pre' , truncating = 'pre')
            y = to_categorical(label_word , vocab_size)
            yield(x,y)
        

##  Generating Context and target pairs

In [76]:

i = 0
for x,y in generate_context_target_word_pairs(corpus = wids , vocab_size = vocab_size , window_size = window_size):
    if 0 not in x[0]:
        
        print("Context Words :" , [id2word[i] for i in x[0]] , '-> Target Word :' , id2word[np.argmax(y)])
        if i == 20:
            break
        i = i + 1


Context Words : ['the', 'king', 'bible', 'the'] -> Target Word : james
Context Words : ['king', 'james', 'the', 'old'] -> Target Word : bible
Context Words : ['james', 'bible', 'old', 'testament'] -> Target Word : the
Context Words : ['bible', 'the', 'testament', 'of'] -> Target Word : old
Context Words : ['the', 'old', 'of', 'the'] -> Target Word : testament
Context Words : ['old', 'testament', 'the', 'king'] -> Target Word : of
Context Words : ['testament', 'of', 'king', 'james'] -> Target Word : the
Context Words : ['of', 'the', 'james', 'bible'] -> Target Word : king
Context Words : ['the', 'king', 'bible', 'the'] -> Target Word : james
Context Words : ['king', 'james', 'the', 'first'] -> Target Word : bible
Context Words : ['james', 'bible', 'first', 'book'] -> Target Word : the
Context Words : ['bible', 'the', 'book', 'of'] -> Target Word : first
Context Words : ['the', 'first', 'of', 'moses'] -> Target Word : book
Context Words : ['first', 'book', 'moses', 'called'] -> Target Wo

In [77]:
import keras.backend as K

model = Sequential()
model.add(Embedding(input_dim = vocab_size , output_dim = embedding_size , input_length = window_size * 2))
model.add(Lambda(lambda x: K.mean(x , axis  = 1) , output_shape = (embedding_size ,)))
model.add(Dense(output_dim = vocab_size , activation = 'softmax'))
model.compile(loss = 'categorical_crossentropy' , optimizer = 'rmsprop')

print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 4, 128)            873344    
_________________________________________________________________
lambda_4 (Lambda)            (None, 128)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 6823)              880167    
Total params: 1,753,511
Trainable params: 1,753,511
Non-trainable params: 0
_________________________________________________________________
None


  


#  Training the Model

In [78]:
for epoch in range(num_epochs):
    loss = 0
    i = 0 
    for x , y in generate_context_target_word_pairs(corpus = wids , vocab_size = vocab_size , window_size = window_size):
        i = i + 1
        loss+=model.train_on_batch(x,y)
    print('Epoch :' , epoch , '-> Loss :' , loss)
    print()
            

Epoch : 0 -> Loss : 2187087.5277019152

Epoch : 1 -> Loss : 2326781.2880545533

Epoch : 2 -> Loss : 2341920.4156878483

Epoch : 3 -> Loss : 2296029.482446004

Epoch : 4 -> Loss : 2271344.861756953



In [87]:
weights = model.get_weights()[0]
weights = weights[1:]
print(weights.shape)
pd.DataFrame(weights, index=list(id2word.values())[1:]).head()

(6822, 128)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,118,119,120,121,122,123,124,125,126,127
and,1.324713,-2.262283,1.599976,2.406861,-2.780106,-2.143501,2.811772,-1.582393,3.211106,-1.832466,...,-1.941061,2.641529,2.236893,-2.192495,1.879202,-1.396138,-1.442995,-2.767183,1.814842,1.900154
of,4.704708,-2.103935,4.209386,3.17201,-1.985346,-2.742967,2.615588,-3.135289,2.659963,-2.564027,...,-2.873548,3.318985,2.71585,-2.70072,2.632065,-2.791503,-3.126455,-2.987996,3.810323,2.924502
to,2.862848,-1.886645,3.289742,2.582556,-1.442604,-2.30283,2.404355,-0.168917,2.604083,-3.022157,...,-1.886205,0.728539,2.210032,-2.229585,1.911665,-2.586068,-2.152974,-2.254407,2.624727,1.430733
in,2.384757,-1.404098,1.982178,2.229606,-0.802226,-1.819652,1.763981,-1.855353,2.099869,-2.373384,...,-0.535964,0.689754,1.109069,-1.664946,1.31529,-2.239972,-1.253184,-1.127031,1.344183,1.79683
that,2.181835,-1.50915,2.057486,1.349586,-0.160132,-1.812121,1.953357,-1.343929,2.245456,-1.927707,...,-0.380958,0.703841,0.426059,-1.262542,0.852948,-2.644204,-0.631043,-1.140461,1.523414,0.772313


In [92]:
from sklearn.metrics.pairwise import euclidean_distances

# compute pairwise distance matrix
distance_matrix = euclidean_distances(weights)
print(distance_matrix.shape)

# view contextually similar words
similar_words = {search_term: [id2word[idx] for idx in distance_matrix[word2id[search_term]-1].argsort()[1:6]+1] 
                   for search_term in ['god', 'noah', 'egypt', 'commanded']}

similar_words

(6822, 6822)


{'commanded': ['sent', 'more', 'delivered', 'given', 'bless'],
 'egypt': ['canaan', 'whither', 'gilead', 'way', 'goshen'],
 'god': ['because', 'against', 'out', 'hath', 'or'],
 'noah': ['azariah', 'mahlon', 'shem', 'obadiah', 'hadad']}