Name: **Pranav Bhagwat**<br>
Div: **BE-9**<br>
Roll no: **43161**<br>
Title: **Assignment 5: Implement the Continuous Bag of Words (CBOW) Model**<br>

In [2]:
#importing libraries
from keras.preprocessing import text
from keras.utils import np_utils
from keras.preprocessing import sequence
from keras.preprocessing.sequence import pad_sequences
import numpy as np
import pandas as pd

In [3]:
#taking random sentences as data
data = """Deep learning (also known as deep structured learning) is part of a broader family of machine learning methods based on artificial neural networks with representation learning. Learning can be supervised, semi-supervised or unsupervised. 
Deep-learning architectures such as deep neural networks, deep belief networks, deep reinforcement learning, recurrent neural networks, convolutional neural networks and Transformers have been applied to fields including computer vision, speech recognition, natural language processing, machine translation, bioinformatics, drug design, medical image analysis, climate science, material inspection and board game programs, where they have produced results comparable to and in some cases surpassing human expert performance.
"""
dl_data = data.split()

In [4]:
dl_data

['Deep',
 'learning',
 '(also',
 'known',
 'as',
 'deep',
 'structured',
 'learning)',
 'is',
 'part',
 'of',
 'a',
 'broader',
 'family',
 'of',
 'machine',
 'learning',
 'methods',
 'based',
 'on',
 'artificial',
 'neural',
 'networks',
 'with',
 'representation',
 'learning.',
 'Learning',
 'can',
 'be',
 'supervised,',
 'semi-supervised',
 'or',
 'unsupervised.',
 'Deep-learning',
 'architectures',
 'such',
 'as',
 'deep',
 'neural',
 'networks,',
 'deep',
 'belief',
 'networks,',
 'deep',
 'reinforcement',
 'learning,',
 'recurrent',
 'neural',
 'networks,',
 'convolutional',
 'neural',
 'networks',
 'and',
 'Transformers',
 'have',
 'been',
 'applied',
 'to',
 'fields',
 'including',
 'computer',
 'vision,',
 'speech',
 'recognition,',
 'natural',
 'language',
 'processing,',
 'machine',
 'translation,',
 'bioinformatics,',
 'drug',
 'design,',
 'medical',
 'image',
 'analysis,',
 'climate',
 'science,',
 'material',
 'inspection',
 'and',
 'board',
 'game',
 'programs,',
 'where

In [5]:
#tokenization
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(dl_data)
word2id = tokenizer.word_index

word2id['PAD'] = 0
id2word = {v:k for k, v in word2id.items()}
wids = [[word2id[w] for w in text.text_to_word_sequence(doc)] for doc in dl_data]

vocab_size = len(word2id)
embed_size = 100
window_size = 2 

print('Vocabulary Size:', vocab_size)
print('Vocabulary Sample:', list(word2id.items())[:10])

Vocabulary Size: 75
Vocabulary Sample: [('learning', 1), ('deep', 2), ('networks', 3), ('neural', 4), ('and', 5), ('as', 6), ('of', 7), ('machine', 8), ('supervised', 9), ('have', 10)]


In [6]:
#generating (context word, target/label word) pairs
def generate_context_word_pairs(corpus, window_size, vocab_size):
    context_length = window_size*2
    for words in corpus:
        sentence_length = len(words)
        for index, word in enumerate(words):
            context_words = []
            label_word   = []            
            start = index - window_size
            end = index + window_size + 1
            
            context_words.append([words[i] 
                                 for i in range(start, end) 
                                 if 0 <= i < sentence_length 
                                 and i != index])
            label_word.append(word)

            x = pad_sequences(context_words, maxlen=context_length)
            y = np_utils.to_categorical(label_word, vocab_size)
            yield (x, y)
            
i = 0
for x, y in generate_context_word_pairs(corpus=wids, window_size=window_size, vocab_size=vocab_size):
    if 0 not in x[0]:
        # print('Context (X):', [id2word[w] for w in x[0]], '-> Target (Y):', id2word[np.argwhere(y[0])[0][0]])
    
        if i == 10:
            break
        i += 1

In [7]:
#model building
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Embedding, Lambda

cbow = Sequential()
cbow.add(Embedding(input_dim=vocab_size, output_dim=embed_size, input_length=window_size*2))
cbow.add(Lambda(lambda x: K.mean(x, axis=1), output_shape=(embed_size,)))
cbow.add(Dense(vocab_size, activation='softmax'))
cbow.compile(loss='categorical_crossentropy', optimizer='rmsprop')

print(cbow.summary())

# from IPython.display import SVG
# from keras.utils.vis_utils import model_to_dot

# SVG(model_to_dot(cbow, show_shapes=True, show_layer_names=False, rankdir='TB').create(prog='dot', format='svg'))

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 4, 100)            7500      
_________________________________________________________________
lambda (Lambda)              (None, 100)               0         
_________________________________________________________________
dense (Dense)                (None, 75)                7575      
Total params: 15,075
Trainable params: 15,075
Non-trainable params: 0
_________________________________________________________________
None


In [8]:
for epoch in range(1, 6):
    loss = 0.
    i = 0
    for x, y in generate_context_word_pairs(corpus=wids, window_size=window_size, vocab_size=vocab_size):
        i += 1
        loss += cbow.train_on_batch(x, y)
        if i % 100000 == 0:
            print('Processed {} (context, word) pairs'.format(i))

    print('Epoch:', epoch, '\tLoss:', loss)
    print()

Epoch: 1 	Loss: 433.66469502449036

Epoch: 2 	Loss: 428.65322709083557

Epoch: 3 	Loss: 425.0849435329437

Epoch: 4 	Loss: 421.8628113269806

Epoch: 5 	Loss: 419.53119826316833



In [9]:
weights = cbow.get_weights()[0]
weights = weights[1:]
print(weights.shape)

pd.DataFrame(weights, index=list(id2word.values())[1:]).head()

(74, 100)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
deep,-0.030829,0.057311,-0.054094,0.036097,0.009418,-0.015365,-0.047589,0.000933,0.032222,-0.033904,...,0.001477,0.023165,-0.000979,0.005149,0.060412,0.043617,-0.060412,-0.004989,0.028167,-0.031081
networks,0.011113,0.006029,-0.041116,-0.060279,-0.0462,-0.015187,0.025729,0.003789,-0.001318,-0.046293,...,-0.011087,0.055135,-0.044276,-0.014142,-0.012076,0.051754,-0.063564,-0.016294,-0.054247,-0.051415
neural,0.003824,0.039844,-0.029943,-0.014061,0.03,0.028274,-0.013332,0.021909,0.002368,-0.00898,...,-0.007081,-0.021345,0.034435,0.047346,-0.026097,0.015146,-0.045184,-0.027339,-0.032168,-0.0046
and,0.014435,-0.014025,0.034986,0.038413,-0.011329,-0.010182,0.013403,0.009367,0.022211,-0.027106,...,-0.030106,-0.01029,0.046515,-0.017163,0.045547,-0.01903,-0.016205,0.000508,-0.004742,0.010775
as,0.046663,-0.008323,-0.02172,0.030659,-0.047511,-0.025712,-0.024707,0.038238,-0.026829,-0.01678,...,0.031577,-0.031058,0.026363,-0.032717,-0.017596,-0.045309,-0.047227,0.049841,-0.008256,0.006245


In [10]:
from sklearn.metrics.pairwise import euclidean_distances

distance_matrix = euclidean_distances(weights)
print(distance_matrix.shape)

similar_words = {search_term: [id2word[idx] for idx in distance_matrix[word2id[search_term]-1].argsort()[1:6]+1] 
                   for search_term in ['deep']}

similar_words

(74, 74)


{'deep': ['medical', 'computer', 'speech', 'can', 'to']}