In [2]:
import gensim
import numpy as np
from nltk.tokenize import WordPunctTokenizer

In [3]:
# Load the corpus: each line of the file is kept as one question string.
with open("./quora.txt", encoding="utf8") as file:
    data = file.readlines()
print(f"{len(data) / 1e6:.2f}M questions loaded")

# Lowercase every question, then split it into tokens.
tokenizer = WordPunctTokenizer()
data_tokens = [tokenizer.tokenize(question.lower()) for question in data]

# Corpus size measured in tokens, summed over all questions.
num_words = sum(map(len, data_tokens))
print(f"Total number of words: {num_words / 1e6:.2f}M")

0.54M questions loaded
Total number of words: 7.13M


In [4]:
# Fit Word2Vec on the tokenized questions and keep only the `.wv`
# attribute (the trained word vectors); the rest of the model object
# is not retained under any name.
model = gensim.models.Word2Vec(
    sentences=data_tokens,
    window=5,
    min_count=5,
    vector_size=50,
).wv

In [5]:
import bokeh.models as bm
import bokeh.plotting as pl
from bokeh.io import output_notebook
# Direct Bokeh output to the notebook so that `pl.show(...)` renders inline.
output_notebook()

def draw_vectors(x, y, radius=10, alpha=0.25, color='blue', width=600, height=400, show=True, **kwargs):
    """Draw an interactive scatter plot with auxiliary info shown on hover.

    Args:
        x, y: point coordinates, stored as the 'x' and 'y' data columns.
        radius: forwarded to the scatter glyph as its ``size``.
        alpha: forwarded to the scatter glyph as its ``alpha``.
        color: a single color string (repeated ``len(x)`` times) or,
            presumably, a per-point sequence of colors used as-is.
        width, height: forwarded to ``pl.figure``.
        show: when truthy, the figure is displayed via ``pl.show``.
        **kwargs: extra data columns; each key also becomes a hover
            tooltip entry of the form ``(key, "@key")``.

    Returns:
        The figure object created by ``pl.figure``.
    """
    # Broadcast a single color name to one entry per point.
    if isinstance(color, str):
        color = [color] * len(x)

    # Extra columns are merged last, so they win on a key collision.
    columns = {'x': x, 'y': y, 'color': color}
    columns.update(kwargs)
    data_source = bm.ColumnDataSource(columns)

    fig = pl.figure(active_scroll='wheel_zoom', width=width, height=height)
    fig.scatter('x', 'y', size=radius, color='color', alpha=alpha, source=data_source)

    hover = bm.HoverTool(tooltips=[(name, "@" + name) for name in kwargs])
    fig.add_tools(hover)

    if show:
        pl.show(fig)
    return fig

In [6]:
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler

# Take the first 1000 keys of the trained vectors and stack their
# embeddings into a single (n_words, dim) array.
words = model.index_to_key[:1000]
word_vectors = np.array([model.get_vector(w) for w in words])

# Project to 2-D with t-SNE. t-SNE is stochastic, so the seed is pinned
# to make the layout reproducible across "Restart & Run All".
word_tsne = TSNE(n_components=2, random_state=42).fit_transform(word_vectors)
# Standardize each output axis (zero mean, unit variance) before plotting.
word_tsne = StandardScaler().fit_transform(word_tsne)
# Trailing `;` keeps the returned figure's repr out of the cell output;
# the plot itself is still rendered by draw_vectors (show=True).
draw_vectors(word_tsne[:, 0], word_tsne[:, 1], words=words);