# word2vec model and sample queries

See Mikolov et al. (2013) for a description of the method.

01/08/2020: Initial version

In [None]:
# load required packages
import gensim
from sklearn.decomposition import PCA
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline

In [None]:
# Load the pre-trained Google News word2vec vectors (binary word2vec format).
# NOTE: Word2Vec.load_word2vec_format is deprecated in gensim and no longer
# loads vectors; KeyedVectors.load_word2vec_format is the supported loader and
# returns the query-only vector store that the rest of this notebook uses.
google_model = gensim.models.KeyedVectors.load_word2vec_format(
    '../models/google-vectors.w2v',
    binary=True,
)

In [None]:
# Make the vocabulary mapping available.
# gensim 4 replaced the `.vocab` attribute with `.key_to_index` (accessing
# `.vocab` there raises AttributeError), so prefer the new attribute and fall
# back to the old one on gensim 3.x and earlier.
# NOTE: the mapping's values differ between versions; only its keys and its
# length are portable, and this notebook only uses len(vocab).
vocab = getattr(google_model, "key_to_index", None)
if vocab is None:
    vocab = google_model.vocab

In [None]:
# report the number of entries in the vocabulary
print(f"vocab: {len(vocab)}")

In [None]:
# simple queries first, single word:
# ask the model for the 20 entries (topn=20) most similar to the term "self";
# as the last expression of the cell, the result is shown as the cell output
google_model.most_similar("self",topn=20)

In [None]:
# Several related terms can be combined into one query to produce
# "better" results. `query` is kept as a variable so later cells can reuse it.
query = [
    "self",
    "individual",
    "sovereign",
    "person",
]
google_model.most_similar(positive=query, topn=20)

In [None]:
# Extract the data to plot for a query (reuses the "query" variable defined
# in an earlier cell): the query's 20 nearest neighbours in semantic space
# and the embedding vector of each neighbour.

response = google_model.most_similar(query, topn=20)

# The first element of every response entry is the neighbouring word.
words = [entry[0] for entry in response]

# Index the model directly (the same access style as google_model["self"]
# elsewhere in this notebook) instead of going through `.wv`: that attribute
# belongs to full Word2Vec models and is deprecated or absent on a
# KeyedVectors object, depending on the gensim version.
neighbor_list = [google_model[word] for word in words]

In [None]:
# Reduce the neighbour vectors to two PCA components and draw them as a
# labelled scatter plot.
pca = PCA(n_components=2)
plot_data = pca.fit_transform(neighbor_list)

# column 0 is used for the horizontal axis, column 1 for the vertical axis
xs, ys = plot_data[:, 0], plot_data[:, 1]

fig = plt.figure(figsize=(20, 15))
plt.clf()
plt.scatter(xs, ys, marker='o')

# label each point with its word, offset 3 points from the marker
for w, x, y in zip(words, xs, ys):
    plt.annotate(
        w,
        xy=(x, y),
        xytext=(3, 3),
        textcoords='offset points',
        ha='left',
        va='top',
    )

In [None]:
# Terms can also be subtracted from a query by passing them as `negative`:
google_model.most_similar(
    topn=20,
    negative=['virtue'],
    positive=["self", "individual", "sovereign", "person"],
)

In [None]:
# this is the analogical reasoning task from the paper:
# 'woman' and 'king' are passed as positive terms and 'man' as a negative
# term; no topn is given, so the library's default result count applies
google_model.most_similar(positive=['woman', 'king'], negative=['man'])

In [None]:
# access vectors directly: indexing the model with a word looks up that
# word's vector; as the cell's last expression it is shown as the output
google_model["self"]