In [33]:
# importing libraries
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer # Import Tokenizer from tensorflow.keras.preprocessing.text
from tensorflow.keras.utils import pad_sequences, to_categorical
import numpy as np
import pandas as pd

In [35]:
#taking random sentences as data
data = """Deep learning (also known as deep structured learning) is part of a broader family of machine learning methods based on artificial neural networks with representation learning. Learning can be supervised, semi-supervised or unsupervised. 
Deep-learning architectures such as deep neural networks, deep belief networks, deep reinforcement learning, recurrent neural networks, convolutional neural networks and Transformers have been applied to fields including computer vision, speech recognition, natural language processing, machine translation, bioinformatics, drug design, medical image analysis, climate science, material inspection and board game programs, where they have produced results comparable to and in some cases surpassing human expert performance.
"""
dl_data = data.split()

In [37]:
#tokenization
#Tokenizer was already imported from tensorflow.keras.preprocessing.text
tokenizer = Tokenizer() #Calling Tokenizer directly 
tokenizer.fit_on_texts(dl_data)
word2id = tokenizer.word_index

word2id['PAD'] = 0
id2word = {v:k for k, v in word2id.items()}
#Calling text_to_word_sequence directly on tensorflow.keras.preprocessing.text
wids = [[word2id[w] for w in tf.keras.preprocessing.text.text_to_word_sequence(doc)] for doc in dl_data] 

vocab_size = len(word2id)
embed_size = 100
window_size = 2 

print('Vocabulary Size:', vocab_size)
print('Vocabulary Sample:', list(word2id.items())[:10])

Vocabulary Size: 75
Vocabulary Sample: [('learning', 1), ('deep', 2), ('networks', 3), ('neural', 4), ('and', 5), ('as', 6), ('of', 7), ('machine', 8), ('supervised', 9), ('have', 10)]


In [39]:
#generating (context word, target/label word) pairs
def generate_context_word_pairs(corpus, window_size, vocab_size):
    context_length = window_size*2
    for words in corpus:
        sentence_length = len(words)
        for index, word in enumerate(words):
            context_words = []
            label_word   = []            
            start = index - window_size
            end = index + window_size + 1
            
            context_words.append([words[i] 
                                 for i in range(start, end) 
                                 if 0 <= i < sentence_length 
                                 and i != index])
            label_word.append(word)

            x = pad_sequences(context_words, maxlen=context_length)
            y = to_categorical(label_word, vocab_size)
            yield (x, y)
            


In [41]:
#model building
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Embedding, Lambda

cbow = Sequential()
cbow.add(Embedding(input_dim=vocab_size, output_dim=embed_size, input_length=window_size*2))
cbow.add(Lambda(lambda x: tf.reduce_mean(x, axis=1), output_shape=(embed_size,)))
cbow.add(Dense(vocab_size, activation='softmax'))
cbow.compile(loss='categorical_crossentropy', optimizer='rmsprop')

print(cbow.summary())

# from IPython.display import SVG
# from keras.utils.vis_utils import model_to_dot

# SVG(model_to_dot(cbow, show_shapes=True, show_layer_names=False, rankdir='TB').create(prog='dot', format='svg'))



None


In [43]:
for epoch in range(1, 6):
    loss = 0.
    i = 0
    for x, y in generate_context_word_pairs(corpus=wids, window_size=window_size, vocab_size=vocab_size):
        i += 1
        loss += cbow.train_on_batch(x, y)
        if i % 100000 == 0:
            print('Processed {} (context, word) pairs'.format(i))

    print('Epoch:', epoch, '\tLoss:', loss)
    print()

Epoch: 1 	Loss: 430.84026861190796

Epoch: 2 	Loss: 430.6036448478699

Epoch: 3 	Loss: 428.8011283874512

Epoch: 4 	Loss: 427.0268521308899

Epoch: 5 	Loss: 425.5257782936096



In [45]:
weights = cbow.get_weights()[0]
weights = weights[1:]
print(weights.shape)

pd.DataFrame(weights, index=list(id2word.values())[1:]).head()

(74, 100)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
deep,0.002267,-0.007789,0.017094,0.038082,-0.024031,0.063732,-0.044413,0.00181,0.024153,-0.064489,...,-0.039946,0.005762,-0.027129,0.023011,0.064948,0.023818,0.014393,0.009237,0.001289,-0.034359
networks,-0.018856,-0.030122,0.018339,0.039654,-0.038074,0.002258,0.058217,-0.058789,-0.037672,0.031065,...,0.029049,-0.031998,0.010577,-0.063889,-0.025821,0.026566,-0.012046,-0.022789,0.03768,-0.004106
neural,0.0072,-0.001533,0.047119,-0.018791,0.000139,-0.031527,-0.042072,-0.012666,0.002303,0.020236,...,0.008419,0.01436,0.013458,0.030976,-0.001089,-0.021181,0.040395,-0.033758,-0.035054,-0.011201
and,-0.037502,-0.029883,0.040434,0.008709,0.048017,0.041638,0.034919,-0.026568,0.032635,0.011609,...,-0.002362,0.00972,-0.020262,-0.046418,0.018378,-0.007239,-0.012327,0.031023,-0.019619,0.006119
as,0.008086,-0.013236,-0.001055,0.032365,0.044762,-0.029319,-0.009214,0.041468,-0.019296,-0.013002,...,-0.024535,-0.005102,0.036546,0.014987,-0.004683,0.011302,-0.019502,-0.031948,0.022799,0.037477
