In [2]:
from keras.preprocessing import text
from keras.utils import np_utils
from keras.preprocessing import sequence



In [16]:
data = ["Deep learning also known as deep structured learning", 
"is part of a broader family of machine learning methods based", 
"on artificial neural networks with representation learning", 
"Learning can be supervised, semi-supervised or unsupervised",
"Deep-learning architectures such as deep neural networks", 
"deep belief networks, deep reinforcement learning", 
"recurrent neural networks, convolutional neural networks and Transformers have been applied to fields including computer vision  speech recognition, natural language processing, machine translation", 
"where they have produced results comparable to and in some cases surpassing human expert performance"
]
# dl_data = data.split()

In [17]:
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(dl_data)
word2id = tokenizer.word_index

# build vocabulary of unique words
word2id['PAD'] = 0
id2word = {v:k for k, v in word2id.items()}
wids = [[word2id[w] for w in text.text_to_word_sequence(doc)] for doc in data]

vocab_size = len(word2id)
embed_size = 100
window_size = 2 # context window size

print('Vocabulary Size:', vocab_size)
print('Vocabulary Sample:', list(word2id.items())[:10])

Vocabulary Size: 75
Vocabulary Sample: [('learning', 1), ('deep', 2), ('networks', 3), ('neural', 4), ('and', 5), ('as', 6), ('of', 7), ('machine', 8), ('supervised', 9), ('have', 10)]


In [18]:
from keras.utils import pad_sequences
def generate_context_word_pairs(corpus, window_size, vocab_size):
    context_length = window_size*2
    for words in corpus:
        sentence_length = len(words)
        for index, word in enumerate(words):
            context_words = []
            label_word   = []            
            start = index - window_size
            end = index + window_size + 1
            
            context_words.append([words[i] 
                                 for i in range(start, end) 
                                 if 0 <= i < sentence_length 
                                 and i != index])
            label_word.append(word)

            x = pad_sequences(context_words, maxlen=context_length)
            y = np_utils.to_categorical(label_word, vocab_size)
            yield (x, y)

In [21]:
import numpy as np
i = 0
for x, y in generate_context_word_pairs(corpus=wids, window_size=window_size, vocab_size=vocab_size):
#     print(x, y)
    if 0 not in x[0]:
        print('Context (X):', [id2word[w] for w in x[0]], '-> Target (Y):', id2word[np.argwhere(y[0])[0][0]])
    
        if i == 10:
            break
        i += 1

Context (X): ['deep', 'learning', 'known', 'as'] -> Target (Y): also
Context (X): ['learning', 'also', 'as', 'deep'] -> Target (Y): known
Context (X): ['also', 'known', 'deep', 'structured'] -> Target (Y): as
Context (X): ['known', 'as', 'structured', 'learning'] -> Target (Y): deep
Context (X): ['is', 'part', 'a', 'broader'] -> Target (Y): of
Context (X): ['part', 'of', 'broader', 'family'] -> Target (Y): a
Context (X): ['of', 'a', 'family', 'of'] -> Target (Y): broader
Context (X): ['a', 'broader', 'of', 'machine'] -> Target (Y): family
Context (X): ['broader', 'family', 'machine', 'learning'] -> Target (Y): of
Context (X): ['family', 'of', 'learning', 'methods'] -> Target (Y): machine
Context (X): ['of', 'machine', 'methods', 'based'] -> Target (Y): learning


In [23]:
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Embedding, Lambda

# build CBOW architecture
cbow = Sequential()
cbow.add(Embedding(input_dim=vocab_size, output_dim=embed_size, input_length=window_size*2))
cbow.add(Lambda(lambda x: K.mean(x, axis=1), output_shape=(embed_size,)))
cbow.add(Dense(vocab_size, activation='softmax'))
cbow.compile(loss='categorical_crossentropy', optimizer='rmsprop')

# view model summary
print(cbow.summary())

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 4, 100)            7500      
                                                                 
 lambda_1 (Lambda)           (None, 100)               0         
                                                                 
 dense_1 (Dense)             (None, 75)                7575      
                                                                 
Total params: 15,075
Trainable params: 15,075
Non-trainable params: 0
_________________________________________________________________
None


In [24]:
for epoch in range(1, 50):
    loss = 0.
    i = 0
    for x, y in generate_context_word_pairs(corpus=wids, window_size=window_size, vocab_size=vocab_size):
        i += 1
        loss += cbow.train_on_batch(x, y)
        if i % 100000 == 0:
            print('Processed {} (context, word) pairs'.format(i))

    print('Epoch:', epoch, '\tLoss:', loss)
    print()

Epoch: 1 	Loss: 370.9859700202942

Epoch: 2 	Loss: 365.95088243484497

Epoch: 3 	Loss: 360.81728649139404

Epoch: 4 	Loss: 354.84530425071716

Epoch: 5 	Loss: 347.90175580978394

Epoch: 6 	Loss: 340.07426285743713

Epoch: 7 	Loss: 331.6740012168884

Epoch: 8 	Loss: 323.16635966300964

Epoch: 9 	Loss: 315.0267950296402

Epoch: 10 	Loss: 307.56587851047516

Epoch: 11 	Loss: 300.8361974954605

Epoch: 12 	Loss: 294.68997848033905

Epoch: 13 	Loss: 288.92508405447006

Epoch: 14 	Loss: 283.3787097334862

Epoch: 15 	Loss: 277.96111285686493

Epoch: 16 	Loss: 272.6375021934509

Epoch: 17 	Loss: 267.39986634254456

Epoch: 18 	Loss: 262.2533531188965

Epoch: 19 	Loss: 257.2086030244827

Epoch: 20 	Loss: 252.2792191505432

Epoch: 21 	Loss: 247.4753773212433

Epoch: 22 	Loss: 242.80450057983398

Epoch: 23 	Loss: 238.27022087574005

Epoch: 24 	Loss: 233.8772838115692

Epoch: 25 	Loss: 229.62362802028656

Epoch: 26 	Loss: 225.50446462631226

Epoch: 27 	Loss: 221.51664543151855

Epoch: 28 	Loss: 217.

In [26]:
import pandas as pd
weights = cbow.get_weights()[0]
weights = weights[1:]
print(weights.shape)

pd.DataFrame(weights, index=list(id2word.values())[1:]).head()

(74, 100)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
deep,-0.361941,0.731627,0.239838,-0.531324,-0.047572,0.423992,0.029935,-0.11734,0.682427,-0.012576,...,-0.154055,-0.311902,0.21504,-0.053152,-0.465848,-0.186827,-0.131936,0.609785,-0.010017,0.745071
networks,0.491174,-0.180589,0.282849,-0.213955,-0.121375,0.487923,0.510078,-0.157923,0.16306,0.383088,...,-0.484411,-0.257293,0.248224,0.164831,0.715166,-0.475854,-0.674728,-0.776752,-0.227652,-0.530007
neural,0.141732,0.227724,-0.496395,-0.302921,0.086413,0.37935,0.361062,0.326057,-0.03955,-0.077444,...,0.806426,0.144321,0.477996,-0.104747,-0.834805,0.79936,-0.066161,0.337029,-0.684721,0.493491
and,0.169238,0.034286,-0.065764,-0.162544,-0.519765,-0.532053,0.369184,0.325808,0.105479,0.56838,...,-0.207473,0.022182,0.142096,-0.383148,-0.540944,-0.025271,0.476111,-0.004251,-0.621524,-0.243947
as,0.068114,0.071128,0.080918,-0.198025,-0.190036,-0.055908,-0.338354,-0.281027,-0.422187,-0.10551,...,-0.264047,-0.070758,0.37557,0.088351,-0.044989,0.018857,0.156525,-0.378772,0.033704,0.002676


In [28]:
from sklearn.metrics.pairwise import euclidean_distances

# compute pairwise distance matrix
distance_matrix = euclidean_distances(weights)
print(distance_matrix.shape)

# view contextually similar words
similar_words = {search_term: [id2word[idx] for idx in distance_matrix[word2id[search_term]-1].argsort()[1:6]+1] 
                   for search_term in ['deep', 'unsupervised']}

similar_words

(74, 74)


{'deep': ['representation', 'with', 'such', 'known', 'recurrent'],
 'unsupervised': ['medical', 'science', 'board', 'inspection', 'analysis']}