# Creating Word Vectors with word2vec, Reducing Their Dimensionality with t-SNE, and Visualizing Them with Bokeh

In this notebook, I create word vectors from a corpus (Project Gutenberg), reduce them to two dimensions with t-SNE, and visualize the result with Bokeh.

#### Load dependencies

In [None]:
import nltk
from nltk import word_tokenize, sent_tokenize
import gensim
from gensim.models.word2vec import Word2Vec
from sklearn.manifold import TSNE
import pandas as pd
from bokeh.io import output_notebook
from bokeh.plotting import show, figure
%matplotlib inline

#### Load data

In [None]:
from nltk.corpus import gutenberg

In [None]:
# number of file ids exposed by the NLTK Gutenberg corpus reader
len(gutenberg.fileids())

In [None]:
# list the file ids themselves
gutenberg.fileids()

#### Tokenize text

In [None]:
# gutenberg.sents() is a convenient method that handles newlines and tokenizes
# the corpus into sentences and words in one shot
gberg_sents = gutenberg.sents()

In [None]:
# inspect one tokenized sentence
gberg_sents[4]

#### Run word2vec

There are many parameters on this constructor; a few noteworthy arguments you may wish to configure are:

size: (default 100) The number of dimensions of the embedding, i.e. the length of the dense vector used to represent each token (word).

window: (default 5) The maximum distance between a target word and words around the target word.

min_count: (default 5) The minimum number of occurrences a word needs to be included in training; words that occur fewer times than this are ignored.

workers: (default 3) The number of threads to use while training.

sg: (default 0 or CBOW) The training algorithm, either CBOW (0) or skip gram (1).

In [None]:
# Train a 64-dimensional skip-gram model on the tokenized sentences.
# NOTE: per the gensim docs, `seed` alone does not make training fully
# repeatable when more than one worker thread is used (workers=1 is required).
model = Word2Vec(
    sentences=gberg_sents,
    size=64,       # embedding dimensionality
    sg=1,          # 1 = skip-gram, 0 = CBOW
    window=10,     # max distance between target and context word
    min_count=5,   # drop words seen fewer than 5 times
    seed=42,
)

In [None]:
# write the trained model to disk so it can be reloaded without re-training
model.save('raw_gutenberg_model.w2v')

#### Explore model

In [None]:
# Start here to skip re-training: reload the model saved to disk.
model = Word2Vec.load('raw_gutenberg_model.w2v')

In [None]:
# look the vector up through the KeyedVectors object (model.wv);
# indexing the Word2Vec model directly is deprecated in gensim
model.wv['dog']

In [None]:
# vector length should equal the `size` the model was trained with;
# use model.wv because indexing the model directly is deprecated in gensim
len(model.wv['dog'])

In [None]:
# nearest neighbours of 'dog' as (word, cosine similarity) pairs —
# higher scores mean more similar (these are similarities, not distances)
model.wv.most_similar('dog')

In [None]:
# cosine similarity between two words, queried through model.wv
# (the model-level shortcut is deprecated in gensim)
model.wv.similarity('father', 'dog')

#### Reduce word vector dimensionality with t-SNE

In [None]:
# the model's vocabulary mapping, keyed by token (output is large)
model.wv.vocab

In [None]:
# vocabulary size
len(model.wv.vocab)

In [None]:
# stack the vector of every vocabulary token into one matrix, in vocab
# iteration order; go through model.wv because indexing the Word2Vec model
# directly is deprecated in gensim
X = model.wv[model.wv.vocab]

In [None]:
# scikit-learn documents 250 as the minimum n_iter; 1000 is the default.
# random_state pins the otherwise random layout so re-runs produce the same map.
tsne = TSNE(n_components=2, n_iter=1000, random_state=42)

In [None]:
# fit t-SNE and project the word vectors to two dimensions
X_2d = tsne.fit_transform(X)

In [None]:
# peek at the first five 2-D points
X_2d[0:5]

In [None]:
# create DataFrame for storing results and plotting
coords_df = pd.DataFrame(X_2d, columns=['x', 'y'])
# X was built by iterating model.wv.vocab, so its keys line up row-for-row
# with X_2d; materialize the keys view as a list so pandas receives a plain
# sequence rather than a dict view
coords_df['token'] = list(model.wv.vocab.keys())

In [None]:
# sanity-check the first few rows
coords_df.head()

In [None]:
# save the 2-D coordinates so the plotting cells can reload them from disk
coords_df.to_csv('raw_gutenberg_tsne.csv', index=False)

#### Visualize 2D representation of word vectors

In [None]:
# reload the saved coordinates; keep_default_na=False stops pandas from
# turning tokens that look like missing-value markers (e.g. "NA", "null",
# "nan") into float NaN, which would corrupt the token labels
coords_df = pd.read_csv('raw_gutenberg_tsne.csv', keep_default_na=False)

In [None]:
# static overview of the whole embedding: one faint dot per token
_ = coords_df.plot.scatter(
    x='x',
    y='y',
    figsize=(12, 12),
    marker='.',
    s=10,
    alpha=0.2,
)

In [None]:
# output bokeh plots inline in the notebook
output_notebook()

In [None]:
# plot only a random subset of tokens to keep the text plot readable;
# cap n at the number of rows so a small vocabulary does not raise, and fix
# random_state so re-runs show the same words
subset_df = coords_df.sample(n=min(5000, len(coords_df)), random_state=42)

In [None]:
# draw each sampled token as a text glyph at its 2-D coordinates
p = figure(plot_width=800, plot_height=800)
_ = p.text(
    x=subset_df.x,
    y=subset_df.y,
    text=subset_df.token,
)

In [None]:
# render the bokeh figure
show(p)