# Word2vec model using Gensim

1. pip install gensim, tsne, bokeh
2. Download a sample text corpus: http://wortschatz.uni-leipzig.de/en/download/
3. Text preprocessing iterator
4. Train the word2vec model
5. Sanity check
6. t-SNE
7. Plot with bokeh

In [2]:
import logging
import os.path
import sys
from gensim.models import Word2Vec

In [3]:
class sentences_iter(object):
    """Re-iterable stream of tokenised sentences read from a directory.

    Every regular file directly inside ``dirname`` is read line by line, and
    each line is yielded as a list of whitespace-separated tokens. Because
    ``__iter__`` starts a fresh pass each time it is called, the same object
    can be iterated more than once.
    """

    def __init__(self, dirname):
        """Remember the corpus directory; nothing is read until iteration.

        Args:
            dirname: Path of the directory that holds the text files.
        """
        self.dirname = dirname

    def __iter__(self):
        """Yield the tokens of every line of every file in the directory.

        Yields:
            list of str: Result of ``line.split()`` for one line (an empty
            list for a blank line).
        """
        for fname in os.listdir(self.dirname):
            path = os.path.join(self.dirname, fname)
            # Sub-directories and other non-files cannot be opened as text;
            # skip them instead of aborting the whole pass.
            if not os.path.isfile(path):
                continue
            # The context manager closes each file once it has been read (or
            # when the generator is closed), so handles do not pile up.
            with open(path) as handle:
                for line in handle:
                    yield line.split()

In [4]:
# Stream tokenised sentences from the corpus directory instead of loading
# the whole corpus into memory.
sentences = sentences_iter('eng_news_2005_100K/')

# Train word2vec on that stream; the keyword arguments are passed to
# gensim's Word2Vec unchanged.
model = Word2Vec(
    sentences,
    size=100,
    window=20,
    min_count=5,
    hs=1,
)

In [5]:
# Sanity check: list the nearest neighbours of "world" in the embedding space.
# Query through model.wv, because calling most_similar directly on the model
# is deprecated in gensim and emits a warning.
model.wv.most_similar('world')

  """Entry point for launching an IPython kernel.


[('greatest', 0.6399431228637695),
 ('world,', 0.6276425123214722),
 ('nation', 0.6247788071632385),
 ('Europe', 0.596915602684021),
 ('country', 0.5723059773445129),
 ('history', 0.5503307580947876),
 ('Europe.', 0.5425519943237305),
 ('success', 0.5399937033653259),
 ('best', 0.5368356108665466),
 ('century', 0.5160427093505859)]

In [6]:
# Display the shape of the learned embedding matrix; the second dimension
# matches the vector size requested when the model was trained.
model.wv.syn0.shape

(27953, 100)

In [7]:
import numpy as np
import datetime
from tsne import bh_sne
from bokeh.plotting import figure, output_file, show, ColumnDataSource
from bokeh.models import HoverTool, BoxSelectTool

In [8]:
# Copy the embedding matrix into a double-precision array for the t-SNE step
# (presumably because bh_sne expects float64 input -- confirm against tsne docs).
vectors = np.array(model.wv.syn0, dtype=np.float64)

In [9]:
# Run t-SNE (bh_sne) over all of the word vectors and time how long it takes.
start = datetime.datetime.now()
vis_data = bh_sne(vectors)
end = datetime.datetime.now()
# The parenthesised single-argument form prints the same text on Python 2 and 3.
print("tsne in {}".format(end - start))
# Takes roughly five minutes for this vocabulary.

tsne in 0:05:17.579270


In [10]:
# Vocabulary words taken from the model's index2word list; they label the
# plotted points (assumes the list order matches the rows of the embedding
# matrix -- confirm against the gensim docs).
words = model.wv.index2word

In [11]:
# Plot the t-SNE coordinates as an interactive scatter plot written to an
# HTML file; hovering over a point shows the word it represents.
vis_x = vis_data[:, 0]
vis_y = vis_data[:, 1]

output_file("word2vec_tsne.html")
# Reuse the column slices computed above rather than slicing vis_data again.
source = ColumnDataSource(data=dict(
    x=vis_x,
    y=vis_y,
    desc=words,
))

# "@desc" pulls the tooltip text from the "desc" column of the data source.
hover = HoverTool(tooltips=[
    ("word", "@desc"),
])

# The title names the model that is actually plotted (word2vec, not GloVe).
p = figure(plot_width=500, plot_height=500, tools=[hover, "wheel_zoom"],
           title="Word2vec t-SNE")

p.circle('x', 'y', size=20, source=source)
show(p)