# Creating Word Vectors with word2vec

In this notebook, we create word vectors from a corpus of public-domain books, a selection from [Project Gutenberg](https://www.gutenberg.org/).

### Load Dependencies

In [1]:
import nltk
from nltk import word_tokenize, sent_tokenize
import gensim
from gensim.models.word2vec import Word2Vec
from sklearn.manifold import TSNE
import pandas as pd
from bokeh.io import output_notebook
from bokeh.plotting import show, figure

In [3]:
nltk.download('punkt') # English - language sentence tokenizer (not all periods end sentences: not all sentences )

[nltk_data] Downloading package punkt to C:\Users\ANILESH
[nltk_data]     PRAJAPATI\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

### Load Data

In [4]:
nltk.download('gutenberg')

[nltk_data] Downloading package gutenberg to C:\Users\ANILESH
[nltk_data]     PRAJAPATI\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\gutenberg.zip.


True

In [5]:
from nltk.corpus import gutenberg

In [6]:
len(gutenberg.fileids())

18

In [7]:
gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

### Tokenize Text

In [8]:
gberg_sent_tokens = sent_tokenize(gutenberg.raw())

In [9]:
gberg_sent_tokens[0:6]

['[Emma by Jane Austen 1816]\n\nVOLUME I\n\nCHAPTER I\n\n\nEmma Woodhouse, handsome, clever, and rich, with a comfortable home\nand happy disposition, seemed to unite some of the best blessings\nof existence; and had lived nearly twenty-one years in the world\nwith very little to distress or vex her.',
 "She was the youngest of the two daughters of a most affectionate,\nindulgent father; and had, in consequence of her sister's marriage,\nbeen mistress of his house from a very early period.",
 'Her mother\nhad died too long ago for her to have more than an indistinct\nremembrance of her caresses; and her place had been supplied\nby an excellent woman as governess, who had fallen little short\nof a mother in affection.',
 "Sixteen years had Miss Taylor been in Mr. Woodhouse's family,\nless as a governess than a friend, very fond of both daughters,\nbut particularly of Emma.",
 'Between _them_ it was more the intimacy\nof sisters.',
 "Even before Miss Taylor had ceased to hold the nominal

In [10]:
gberg_sent_tokens[1]

"She was the youngest of the two daughters of a most affectionate,\nindulgent father; and had, in consequence of her sister's marriage,\nbeen mistress of his house from a very early period."

In [11]:
word_tokenize(gberg_sent_tokens[1])

['She',
 'was',
 'the',
 'youngest',
 'of',
 'the',
 'two',
 'daughters',
 'of',
 'a',
 'most',
 'affectionate',
 ',',
 'indulgent',
 'father',
 ';',
 'and',
 'had',
 ',',
 'in',
 'consequence',
 'of',
 'her',
 'sister',
 "'s",
 'marriage',
 ',',
 'been',
 'mistress',
 'of',
 'his',
 'house',
 'from',
 'a',
 'very',
 'early',
 'period',
 '.']

In [12]:
word_tokenize(gberg_sent_tokens[1])[14]

'father'

In [13]:
# a convenient method that handles newlines , as well as tokenizing sentences and words in one shot 
gberg_sents = gutenberg.sents()

In [14]:
gberg_sents[0:6]

[['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']'],
 ['VOLUME', 'I'],
 ['CHAPTER', 'I'],
 ['Emma',
  'Woodhouse',
  ',',
  'handsome',
  ',',
  'clever',
  ',',
  'and',
  'rich',
  ',',
  'with',
  'a',
  'comfortable',
  'home',
  'and',
  'happy',
  'disposition',
  ',',
  'seemed',
  'to',
  'unite',
  'some',
  'of',
  'the',
  'best',
  'blessings',
  'of',
  'existence',
  ';',
  'and',
  'had',
  'lived',
  'nearly',
  'twenty',
  '-',
  'one',
  'years',
  'in',
  'the',
  'world',
  'with',
  'very',
  'little',
  'to',
  'distress',
  'or',
  'vex',
  'her',
  '.'],
 ['She',
  'was',
  'the',
  'youngest',
  'of',
  'the',
  'two',
  'daughters',
  'of',
  'a',
  'most',
  'affectionate',
  ',',
  'indulgent',
  'father',
  ';',
  'and',
  'had',
  ',',
  'in',
  'consequence',
  'of',
  'her',
  'sister',
  "'",
  's',
  'marriage',
  ',',
  'been',
  'mistress',
  'of',
  'his',
  'house',
  'from',
  'a',
  'very',
  'early',
  'period',
  '.'],
 ['Her',
  'mother',
  'h

In [15]:
gberg_sents[4]

['She',
 'was',
 'the',
 'youngest',
 'of',
 'the',
 'two',
 'daughters',
 'of',
 'a',
 'most',
 'affectionate',
 ',',
 'indulgent',
 'father',
 ';',
 'and',
 'had',
 ',',
 'in',
 'consequence',
 'of',
 'her',
 'sister',
 "'",
 's',
 'marriage',
 ',',
 'been',
 'mistress',
 'of',
 'his',
 'house',
 'from',
 'a',
 'very',
 'early',
 'period',
 '.']

In [16]:
gberg_sents[4][14]

'father'

In [17]:
# another convenient method that we don't immediately need:
gutenberg.words()

['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']', ...]

In [24]:
gutenberg.words() #is analogous to the following line , which need not be run:
word_tokenize(gutenberg.raw())

['[',
 'Emma',
 'by',
 'Jane',
 'Austen',
 '1816',
 ']',
 'VOLUME',
 'I',
 'CHAPTER',
 'I',
 'Emma',
 'Woodhouse',
 ',',
 'handsome',
 ',',
 'clever',
 ',',
 'and',
 'rich',
 ',',
 'with',
 'a',
 'comfortable',
 'home',
 'and',
 'happy',
 'disposition',
 ',',
 'seemed',
 'to',
 'unite',
 'some',
 'of',
 'the',
 'best',
 'blessings',
 'of',
 'existence',
 ';',
 'and',
 'had',
 'lived',
 'nearly',
 'twenty-one',
 'years',
 'in',
 'the',
 'world',
 'with',
 'very',
 'little',
 'to',
 'distress',
 'or',
 'vex',
 'her',
 '.',
 'She',
 'was',
 'the',
 'youngest',
 'of',
 'the',
 'two',
 'daughters',
 'of',
 'a',
 'most',
 'affectionate',
 ',',
 'indulgent',
 'father',
 ';',
 'and',
 'had',
 ',',
 'in',
 'consequence',
 'of',
 'her',
 'sister',
 "'s",
 'marriage',
 ',',
 'been',
 'mistress',
 'of',
 'his',
 'house',
 'from',
 'a',
 'very',
 'early',
 'period',
 '.',
 'Her',
 'mother',
 'had',
 'died',
 'too',
 'long',
 'ago',
 'for',
 'her',
 'to',
 'have',
 'more',
 'than',
 'an',
 'indistin

In [25]:
# our Gutenberg corpus is 2.6m words in length:
len(gutenberg.words())

2621613

### Run word2vec

In [31]:
model = Word2Vec(sentences=gberg_sents, vector_size=64, sg=1, window=10, min_count=5, workers=8)

In [32]:
model.save('raw_gutenberg_model.w2v')

### Explore Model

In [33]:
# skip re-training the model with the next line:
model = gensim.models.Word2Vec.load('raw_gutenberg_model.w2v')

In [34]:
model.wv['dog']

array([ 0.07371113, -0.39823112,  0.15164585,  0.33159667, -0.33959627,
       -0.2109201 ,  0.19396636, -0.70013523, -0.08342089,  0.34735334,
        0.40017962, -0.4757135 ,  0.00576881, -0.13750748, -0.09546565,
        0.9785357 , -0.3065237 ,  0.06141578, -0.06449086,  0.3282374 ,
        0.09532353,  0.32525456,  0.08579792, -0.03152066,  0.02432476,
        0.3698776 , -0.43070462,  0.0625393 , -0.03621057,  0.16643108,
       -0.14666396, -0.11954744, -0.26359183, -0.14507982, -0.05043266,
        0.19107898,  0.3682436 ,  0.1852998 ,  0.2348385 , -0.34083357,
       -0.16756369,  0.02325815,  0.0638731 , -0.14611363,  0.1330549 ,
       -0.33867782,  0.12053788, -0.14123784,  0.14287344,  0.4305418 ,
       -0.10724271, -0.2368042 ,  0.41739976,  0.17906795,  0.19773248,
        0.6709983 , -0.06531896, -0.27440435, -0.12591048,  0.24137743,
       -0.26478454,  0.01176239,  0.17038013, -0.53574777], dtype=float32)

In [35]:
len(model.wv['dog'])

64

In [36]:
model.wv.most_similar('dog') # distance

[('puppy', 0.8116813898086548),
 ('sweeper', 0.7771533727645874),
 ('thief', 0.7704423666000366),
 ('cage', 0.7686753273010254),
 ('chimney', 0.7647938132286072),
 ('boy', 0.7570463418960571),
 ('kick', 0.7533022165298462),
 ('bullet', 0.7473858594894409),
 ('Tom', 0.7452253699302673),
 ('Thief', 0.7434250712394714)]

In [37]:
model.wv.most_similar('think')

[('manage', 0.8595181107521057),
 ('contradict', 0.8464229106903076),
 ('suppose', 0.8390563726425171),
 ('Mamma', 0.8298581838607788),
 ('NOW', 0.8087869882583618),
 ('pretend', 0.8055347800254822),
 ('_you_', 0.8045792579650879),
 ('_not_', 0.8036143779754639),
 ('injure', 0.80242919921875),
 ('really', 0.7997591495513916)]

In [38]:
model.wv.most_similar('day')

[('morning', 0.7794086337089539),
 ('time', 0.759593665599823),
 ('night', 0.7261411547660828),
 ('week', 0.6999301910400391),
 ('month', 0.6807558536529541),
 ('feasting', 0.6794111728668213),
 ('year', 0.6671295762062073),
 ('fourteenth', 0.662652313709259),
 ('evening', 0.6555370688438416),
 ('Adar', 0.6551826000213623)]

In [39]:
model.wv.most_similar('father')

[('brother', 0.8479248285293579),
 ('mother', 0.8445989489555359),
 ('sister', 0.795856773853302),
 ('wife', 0.7724902033805847),
 ('daughter', 0.7721859812736511),
 ('Amnon', 0.7406697869300842),
 ('younger', 0.7342216968536377),
 ('servant', 0.7323956489562988),
 ('David', 0.7276148796081543),
 ('master', 0.716538667678833)]

In [40]:
model.wv.doesnt_match("mother father daughter dog".split())

'dog'

In [41]:
model.wv.similarity('father', 'dog')

0.45021367

In [42]:
# close, but not quite : distinctly in female direction:
model.wv.most_similar(positive=['father', 'woman'], negative=['man'])

[('husband', 0.7820450663566589),
 ('daughter', 0.761955738067627),
 ('wife', 0.7617030143737793),
 ('sister', 0.753807544708252),
 ('mother', 0.7433029413223267),
 ('brother', 0.7084606289863586),
 ('Sarah', 0.687145471572876),
 ('daughters', 0.6789749264717102),
 ('Rachel', 0.6772615313529968),
 ('younger', 0.669572651386261)]

In [43]:
# more confident about this one:
model.wv.most_similar(positive=['son', 'woman'], negative=['man'])

[('Leah', 0.7397541999816895),
 ('wife', 0.7382063269615173),
 ('firstborn', 0.7216333150863647),
 ('Sarah', 0.7158926725387573),
 ('Rachel', 0.7146885991096497),
 ('Onan', 0.7087196111679077),
 ('Hagar', 0.7061285376548767),
 ('Abram', 0.7060638070106506),
 ('Esau', 0.7030718326568604),
 ('Caleb', 0.7022258043289185)]

In [44]:
model.wv.most_similar(positive=['husband', 'woman'], negative=['man'])

[('wife', 0.7173700332641602),
 ('sister', 0.676688551902771),
 ('daughter', 0.6702681183815002),
 ('conceived', 0.6600360870361328),
 ('nurse', 0.6523160338401794),
 ('maid', 0.6427376866340637),
 ('mother', 0.6416106820106506),
 ('child', 0.6387804746627808),
 ('widow', 0.627594530582428),
 ('whoredoms', 0.6150249242782593)]

In [46]:
model.wv.most_similar(positive=['king', 'woman'], negative=['man'] , topn=30)

[('Sarah', 0.7041272521018982),
 ('Leah', 0.667900025844574),
 ('Pharaoh', 0.65855872631073),
 ('Rachel', 0.6542605757713318),
 ('Abram', 0.6417808532714844),
 ('Hanun', 0.639335036277771),
 ('Babylon', 0.6373266577720642),
 ('daughter', 0.6367177963256836),
 ('Rebekah', 0.6351458430290222),
 ('Bilhah', 0.6327655911445618),
 ('Solomon', 0.6326667666435242),
 ('Padanaram', 0.6302077770233154),
 ('Onan', 0.6286587119102478),
 ('Hagar', 0.6282020807266235),
 ('David', 0.627619206905365),
 ('Vashti', 0.6249998211860657),
 ('Sarai', 0.6234779953956604),
 ('queen', 0.6212396025657654),
 ('Judah', 0.6196826696395874),
 ('Zilpah', 0.6180209517478943),
 ('Ephron', 0.617994487285614),
 ('Laban', 0.6152968406677246),
 ('Esther', 0.6144556403160095),
 ('Bethuel', 0.6137859225273132),
 ('Hamor', 0.6115031838417053),
 ('Rahab', 0.6112484931945801),
 ('sware', 0.6103159189224243),
 ('Joseph', 0.6087198853492737),
 ('birthright', 0.6076738834381104),
 ('magicians', 0.6072992086410522)]

In [47]:
# impressive for such a small data set , without any cleaning , e.g., to lower case (covered next) 

## Reduce Word Vector dimensionalit with t-SNE

In [50]:
# len(model.wv.vocab()) 
'''
Note:- vocab dict became key_to_index for looking up a key's integer index, 
or get_vecattr() and set_vecattr() for other per-key attributes:
'''
len(model.wv)

17011

In [67]:
X = model.wv[model.wv.key_to_index]

In [68]:
tsne = TSNE(n_components=2, n_iter=1000) # 200 is minimum iter: default is 1000

In [69]:
X_2d = tsne.fit_transform(X)

In [70]:
X_2d[0:5]

array([[-34.291916, -46.42284 ],
       [-33.54891 , -46.200397],
       [-33.8369  , -46.440372],
       [-43.324276, -32.800526],
       [-33.300735, -46.70541 ]], dtype=float32)

In [71]:
# creating DataFrame for storing results and plotting
coords_df = pd.DataFrame(X_2d, columns=['x', 'y'])
coords_df['token'] = model.wv.key_to_index

In [72]:
coords_df.head()

Unnamed: 0,x,y,token
0,-34.291916,-46.42284,","
1,-33.548908,-46.200397,the
2,-33.836899,-46.440372,and
3,-43.324276,-32.800526,.
4,-33.300735,-46.70541,of


In [73]:
coords_df.to_csv('raw_gutenberg_tsne.csv', index=False)

## Visualize 2D Representation of Word Vectors

In [74]:
coords_df = pd.read_csv('raw_gutenberg_tsne.csv')

In [75]:
output_notebook()

In [76]:
subset_df = coords_df.sample(n=5000)

In [77]:
p = figure(plot_width=800, plot_height=800)
_= p.text(x=subset_df.x, y=subset_df.y, text=subset_df.token)

In [78]:
show(p)