# Creating Word Vectors with word2vec

Let's start with NLTK

#### Load Dependencies

In [None]:
import nltk

from nltk.tokenize import word_tokenize, sent_tokenize

import gensim
from gensim.models.word2vec import Word2Vec
from sklearn.manifold import TSNE

import pandas as pd

from bokeh.io import output_notebook
from bokeh.plotting import show, figure

%matplotlib inline

In [None]:
# Fetch NLTK's 'punkt' data, the sentence-tokenizer model that sent_tokenize loads.
nltk.download('punkt')

#### Load Data

In [None]:
# Fetch NLTK's 'gutenberg' corpus so that nltk.corpus.gutenberg can be read below.
nltk.download('gutenberg')

In [None]:
from nltk.corpus import gutenberg

In [None]:
# List the file ids in the corpus; the cells below select a subset of these by name.
gutenberg.fileids()

#### Tokenize Text

In [None]:
# Resource constraints: only 12 of the 18 Gutenberg books are used here.
# Read their raw text as one string, then split that string into sentences.
raw_text = gutenberg.raw(
    fileids=[
        'austen-sense.txt',
        'blake-poems.txt',
        'bryant-stories.txt',
        'burgess-busterbrown.txt',
        'carroll-alice.txt',
        'chesterton-ball.txt',
        'chesterton-brown.txt',
        'chesterton-thursday.txt',
        'edgeworth-parents.txt',
        'melville-moby_dick.txt',
        'milton-paradise.txt',
        'shakespeare-caesar.txt',
    ]
)
gberg_sent_tokens = sent_tokenize(raw_text)

In [None]:
# Peek at the first five sentence strings produced by sent_tokenize.
gberg_sent_tokens[0:5]

In [None]:
# Show the second sentence (index 1) on its own.
gberg_sent_tokens[1]

In [None]:
# Break the second sentence into word-level tokens.
word_tokenize(gberg_sent_tokens[1])

In [None]:
# Pick out the token at index 14 (the 15th) of the second sentence.
word_tokenize(gberg_sent_tokens[1])[14]

In [None]:
# Resource constraints: only 12 of the 18 Gutenberg books are used here.
# The corpus reader's sents() does the sentence and word splitting in one call,
# replacing the manual sent_tokenize / word_tokenize steps shown above.
gberg_sents = gutenberg.sents(
    fileids=[
        'austen-sense.txt',
        'blake-poems.txt',
        'bryant-stories.txt',
        'burgess-busterbrown.txt',
        'carroll-alice.txt',
        'chesterton-ball.txt',
        'chesterton-brown.txt',
        'chesterton-thursday.txt',
        'edgeworth-parents.txt',
        'melville-moby_dick.txt',
        'milton-paradise.txt',
        'shakespeare-caesar.txt',
    ],
)

In [None]:
# Peek at the first five sentences, each already split into tokens.
gberg_sents[0:5]

In [None]:
# Token at index 14 (the 15th) of the sentence at index 4 (the fifth).
gberg_sents[4][14]

In [None]:
# No fileids are passed here, so this is not restricted to the 12-book subset
# used elsewhere in this notebook.
gutenberg.words()

In [None]:
# Resource constraints: only 12 of the 18 Gutenberg books are used here.
# Count the word tokens in that subset.
subset_words = gutenberg.words(
    fileids=[
        'austen-sense.txt',
        'blake-poems.txt',
        'bryant-stories.txt',
        'burgess-busterbrown.txt',
        'carroll-alice.txt',
        'chesterton-ball.txt',
        'chesterton-brown.txt',
        'chesterton-thursday.txt',
        'edgeworth-parents.txt',
        'melville-moby_dick.txt',
        'milton-paradise.txt',
        'shakespeare-caesar.txt',
    ]
)
len(subset_words)

#### Run Word2Vec

In [None]:
# Train word2vec on the tokenised sentences of the 12-book subset.
model = Word2Vec(
    sentences=gberg_sents,
    size=64,      # number of dimensions in each word vector
    sg=1,         # 1 selects the skip-gram architecture
    window=10,    # 10 context words either side of the target, 20 in total
    min_count=5,  # ignore words that occur fewer than 5 times
    seed=42,      # NOTE(review): gensim documents full reproducibility only with workers=1
    workers=2,    # number of training threads
)

In [None]:
# Saving the model is optional; it is done here as a demonstration. The file
# written below is the one the next section loads back.
model.save('raw_gutenberg_model.w2v')

#### Explore the Model

In [None]:
# Reload the model from the file written by the save cell above.
model = Word2Vec.load('raw_gutenberg_model.w2v')

In [None]:
# Look up the learned vector for 'house'. Indexing goes through model.wv (the
# keyed vectors) because indexing the Word2Vec model directly is deprecated.
model.wv['house']

In [None]:
# Length of one word vector; should equal the `size` passed at training time.
# Uses model.wv because indexing the Word2Vec model directly is deprecated.
len(model.wv['house'])

In [None]:
# Nearest neighbours of 'house' in the embedding space. Called on model.wv
# because the model-level most_similar wrapper is deprecated.
model.wv.most_similar('house')

In [None]:
# Nearest neighbours of 'think'. Called on model.wv because the model-level
# most_similar wrapper is deprecated.
model.wv.most_similar('think')

In [None]:
# Nearest neighbours of 'day'. Called on model.wv because the model-level
# most_similar wrapper is deprecated.
model.wv.most_similar('day')

In [None]:
# Nearest neighbours of 'father'. Called on model.wv because the model-level
# most_similar wrapper is deprecated.
model.wv.most_similar('father')

In [None]:
# Ask which of the four words fits least with the others. Called on model.wv
# because the model-level doesnt_match wrapper is deprecated.
model.wv.doesnt_match('mother father daughter house'.split())

In [None]:
# Similarity score between the 'father' and 'mother' vectors. Called on
# model.wv because the model-level similarity wrapper is deprecated.
model.wv.similarity('father', 'mother')

In [None]:
# Analogy query: 'father' and 'woman' contribute positively, 'man' negatively.
# Called on model.wv because the model-level most_similar wrapper is deprecated.
model.wv.most_similar(positive=['father', 'woman'], negative=['man'])

In [None]:
# Analogy query: 'son' and 'woman' contribute positively, 'man' negatively.
# Called on model.wv because the model-level most_similar wrapper is deprecated.
model.wv.most_similar(positive=['son', 'woman'], negative=['man'])

In [None]:
# Analogy query: 'husband' and 'woman' contribute positively, 'man' negatively.
# Called on model.wv because the model-level most_similar wrapper is deprecated.
model.wv.most_similar(positive=['husband', 'woman'], negative=['man'])

In [None]:
# Analogy query: 'king' and 'woman' contribute positively, 'man' negatively.
# topn=50 widens the result list to the 50 best matches. Called on model.wv
# because the model-level most_similar wrapper is deprecated.
model.wv.most_similar(positive=['king', 'woman'], negative=['man'], topn=50)

#### Reduce word vector dimensionality with t-SNE

t-Distributed Stochastic Neighbor Embedding

In [None]:
# Number of distinct tokens the trained model holds vectors for.
len(model.wv.vocab)

In [None]:
# Stack the vector of every vocabulary token into one matrix, one row per token
# in vocabulary iteration order. Indexing goes through model.wv because
# indexing the Word2Vec model directly is deprecated.
X = model.wv[model.wv.vocab]

In [None]:
# Configure t-SNE to produce three output dimensions. n_iter caps the number of
# optimisation iterations; NOTE(review): 250 is very low, so the layout may be
# far from converged. Confirm this is a deliberate speed trade-off.
tsne = TSNE(n_components=3, n_iter=250)

In [None]:
# Project the word vectors with the t-SNE object configured above.
# NOTE: despite the `_2d` name, that object was built with n_components=3,
# so the result has three columns.
X_2d = tsne.fit_transform(X)

In [None]:
# Name the three t-SNE output columns and attach the token for each row.
# The tokens are listed in vocabulary iteration order, the same order in which
# the rows of X were gathered.
coords_df = pd.DataFrame(X_2d, columns=list('xyz')).assign(
    token=list(model.wv.vocab)
)

In [None]:
# Sanity-check the first rows of the coordinate table.
coords_df.head()

In [None]:
# Write the coordinates to disk; index=False leaves the row index out of the CSV.
coords_df.to_csv('raw_gutenberg_tsne.csv', index=False)

#### Visualise 2D representation of word vectors

In [None]:
# Reload the saved coordinates into `coords_df`, the name the cells below read.
# The original bound the result to a misspelled name that nothing else used,
# so the CSV reload had no effect on the plots.
coords_df = pd.read_csv('raw_gutenberg_tsne.csv')

In [None]:
# Inspect the first rows of the coordinate table before plotting.
coords_df.head()

In [None]:
# Static scatter of the 'x' and 'y' columns; the 'z' column is not drawn.
# DataFrame.plot.scatter is two-dimensional: its third positional parameter is
# the marker size `s`, so passing 'z' there collided with s=10, and figsize
# takes (width, height) rather than three values.
_ = coords_df.plot.scatter('x', 'y', figsize=(8, 8), marker='o', s=10, alpha=0.2)

In [None]:
# Direct bokeh output to the notebook so show() renders inline.
output_notebook()

In [None]:
# Draw 1000 random rows to plot as text labels. No random_state is given, so
# the selection differs between runs.
subset_df = coords_df.sample(n=1000)

In [None]:
# 600x600 bokeh canvas with each sampled token drawn as text at its (x, y) position.
p = figure(plot_width=600, plot_height=600)
p.text(
    x=subset_df['x'],
    y=subset_df['y'],
    text=subset_df['token'],
)

In [None]:
# Render the bokeh figure built in the previous cell.
show(p)