In [1]:
import numpy as np
from numpy import dot
from gensim import matutils
from gensim.corpora import Dictionary
from keras.models import Model, load_model

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
def cosine_similarity(doc_vec1, doc_vec2):
    """Return the cosine similarity between two vectors.

    Each input is passed through ``matutils.unitvec`` and the dot product
    of the two results is returned.

    Adapted from gensim.models.keyedvectors.Doc2VecKeyedVectors.
    """
    unit_first = matutils.unitvec(doc_vec1)
    unit_second = matutils.unitvec(doc_vec2)
    return dot(unit_first, unit_second)

def token_vec(token, vocab, W):
    """Return the row of ``W`` that belongs to ``token``.

    ``vocab.token2id`` maps the token to an index, and that index is used
    to select the entry of ``W``. A token missing from ``vocab.token2id``
    propagates whatever error the mapping lookup raises.
    """
    return W[vocab.token2id[token]]

def most_similar(token, vocab, W, topn=5):
    """Rank the rows of ``W`` by cosine similarity to the vector of ``token``.

    Args:
        token: Word to look up; resolved through ``token_vec``.
        vocab: Object exposing a ``token2id`` mapping (token -> row of ``W``).
        W: 2-D NumPy array holding one vector per row.
        topn: Number of best matches to return.

    Returns:
        A one-shot ``zip`` iterator of ``(row_index, similarity)`` pairs,
        most similar first. Every row is scored, so the query token's own
        row is part of the ranking.

    Raises:
        Whatever ``token_vec`` raises for an unknown token (``KeyError``
        when ``vocab.token2id`` is a dict).
    """
    vec = token_vec(token, vocab, W)

    # Score all rows with one matrix-vector product instead of calling
    # cosine_similarity() once per row, which re-normalised the unchanged
    # query vector on every iteration.
    scale = np.linalg.norm(W, axis=1) * np.linalg.norm(vec)
    # A zero scale means the row (or the query) is all zeros, so the dot
    # product is 0 as well; dividing by 1 keeps the similarity at 0, which
    # is what the per-row unit-vector computation produced.
    scale = np.where(scale > 0, scale, 1)
    similarities = np.dot(W, vec) / scale

    word_indices = matutils.argsort(similarities, topn, reverse=True)
    word_similarities = similarities[word_indices]
    return zip(word_indices, word_similarities)

def plot_similar_words(word, vocab, W, topn=10):
    """Print the ``topn`` nearest neighbours of ``word`` with their scores.

    Despite the name, nothing is plotted: a header line is printed,
    followed by one ``token (similarity)`` line per result returned by
    ``most_similar``. Each index is turned back into a token via
    ``vocab[index]``.
    """
    header = 'Words similar to "%s":' % word
    print(header)
    for neighbour_id, score in most_similar(word, vocab, W, topn):
        line = '%s (%f)' % (vocab[neighbour_id], score)
        print(line)


In [4]:
# Locations of the saved gensim Dictionary and the saved Keras model.
# NOTE(review): both are hard-coded under /tmp; adjust for your environment.
vocab_path = '/tmp/vocab.pkl'
model_path = '/tmp/w2v_enwiki_vocab=10000_model.h5'

# Restore the gensim Dictionary; later cells use it for token <-> id lookups.
vocab = Dictionary.load(vocab_path)

# Load the saved Keras model from disk.
model = load_model(model_path)

# Extract the word vector matrix W: the first weight array of layer index 1.
# NOTE(review): assumes model.layers[1] is the embedding layer - confirm
# against the model definition.
embedding_layer = model.layers[1]
W = embedding_layer.get_weights()[0]

# Report the shape of W as a quick sanity check.
print('W shape:', W.shape)

W shape: (10000, 100)


In [5]:
# Query words to inspect; edit this list rather than copy-pasting calls.
query_words = ['london', 'berlin', 'monday', 'morning', 'strong', 'blue']

for position, query_word in enumerate(query_words):
    # Blank line between result groups, but not before the first one.
    if position:
        print()
    plot_similar_words(query_word, vocab, W, topn=10)

Words similar to "london":


  if np.issubdtype(vec.dtype, np.int):


london (1.000000)
manchester (0.634632)
liverpool (0.607274)
edinburgh (0.585272)
birmingham (0.582786)
bristol (0.548277)
sheffield (0.544674)
dublin (0.534518)
leeds (0.534092)
brighton (0.533358)

Words similar to "berlin":
berlin (1.000000)
hamburg (0.811339)
frankfurt (0.802908)
dresden (0.792602)
vienna (0.773817)
munich (0.760517)
stuttgart (0.744570)
cologne (0.738208)
leipzig (0.727342)
prague (0.723992)

Words similar to "monday":
monday (1.000000)
thursday (0.885920)
tuesday (0.878971)
wednesday (0.870507)
saturday (0.838684)
friday (0.799223)
midnight (0.658124)
sunday (0.630880)
utc (0.601302)
afternoon (0.599218)

Words similar to "morning":
morning (1.000000)
afternoon (0.734455)
evening (0.709340)
midnight (0.613252)
night (0.573420)
friday (0.571411)
pm (0.562850)
thursday (0.529090)
saturday (0.525491)
wednesday (0.514147)

Words similar to "strong":
strong (1.000000)
significant (0.446728)
enthusiastic (0.446149)
important (0.444043)
enormous (0.439374)
powerful (0.4