# Creating Word Vectors with word2vec

Let's start with NLTK

#### Load Dependencies

In [1]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
import gensim
from gensim.models.word2vec import Word2Vec
from sklearn.manifold import TSNE
import pandas as pd
from bokeh.io import output_notebook
from bokeh.plotting import show, figure
%matplotlib inline

In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

#### Load Data

In [3]:
nltk.download('gutenberg')

[nltk_data] Downloading package gutenberg to /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.


True

In [4]:
from nltk.corpus import gutenberg

In [5]:
gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

#### Tokenize Text

In [6]:
# Due to lack of resources, I'm not working with the full Gutenberg dataset (18 books).
gberg_sent_tokens = sent_tokenize(gutenberg.raw(fileids=['austen-sense.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt']))

In [7]:
gberg_sent_tokens[0:5]

['[Sense and Sensibility by Jane Austen 1811]\n\nCHAPTER 1\n\n\nThe family of Dashwood had long been settled in Sussex.',
 'Their estate was large, and their residence was at Norland Park,\nin the centre of their property, where, for many generations,\nthey had lived in so respectable a manner as to engage\nthe general good opinion of their surrounding acquaintance.',
 'The late owner of this estate was a single man, who lived\nto a very advanced age, and who for many years of his life,\nhad a constant companion and housekeeper in his sister.',
 'But her death, which happened ten years before his own,\nproduced a great alteration in his home; for to supply\nher loss, he invited and received into his house the family\nof his nephew Mr. Henry Dashwood, the legal inheritor\nof the Norland estate, and the person to whom he intended\nto bequeath it.',
 "In the society of his nephew and niece,\nand their children, the old Gentleman's days were\ncomfortably spent."]

In [8]:
gberg_sent_tokens[1]

'Their estate was large, and their residence was at Norland Park,\nin the centre of their property, where, for many generations,\nthey had lived in so respectable a manner as to engage\nthe general good opinion of their surrounding acquaintance.'

In [9]:
word_tokenize(gberg_sent_tokens[1])

['Their',
 'estate',
 'was',
 'large',
 ',',
 'and',
 'their',
 'residence',
 'was',
 'at',
 'Norland',
 'Park',
 ',',
 'in',
 'the',
 'centre',
 'of',
 'their',
 'property',
 ',',
 'where',
 ',',
 'for',
 'many',
 'generations',
 ',',
 'they',
 'had',
 'lived',
 'in',
 'so',
 'respectable',
 'a',
 'manner',
 'as',
 'to',
 'engage',
 'the',
 'general',
 'good',
 'opinion',
 'of',
 'their',
 'surrounding',
 'acquaintance',
 '.']

In [10]:
word_tokenize(gberg_sent_tokens[1])[14]

'the'

In [11]:
# Due to lack of resources, I'm not working with the full Gutenberg dataset (18 books).
gberg_sents = gutenberg.sents(fileids=['austen-sense.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt'])

In [12]:
gberg_sents[0:5]

[['[', 'Sense', 'and', 'Sensibility', 'by', 'Jane', 'Austen', '1811', ']'],
 ['CHAPTER', '1'],
 ['The',
  'family',
  'of',
  'Dashwood',
  'had',
  'long',
  'been',
  'settled',
  'in',
  'Sussex',
  '.'],
 ['Their',
  'estate',
  'was',
  'large',
  ',',
  'and',
  'their',
  'residence',
  'was',
  'at',
  'Norland',
  'Park',
  ',',
  'in',
  'the',
  'centre',
  'of',
  'their',
  'property',
  ',',
  'where',
  ',',
  'for',
  'many',
  'generations',
  ',',
  'they',
  'had',
  'lived',
  'in',
  'so',
  'respectable',
  'a',
  'manner',
  'as',
  'to',
  'engage',
  'the',
  'general',
  'good',
  'opinion',
  'of',
  'their',
  'surrounding',
  'acquaintance',
  '.'],
 ['The',
  'late',
  'owner',
  'of',
  'this',
  'estate',
  'was',
  'a',
  'single',
  'man',
  ',',
  'who',
  'lived',
  'to',
  'a',
  'very',
  'advanced',
  'age',
  ',',
  'and',
  'who',
  'for',
  'many',
  'years',
  'of',
  'his',
  'life',
  ',',
  'had',
  'a',
  'constant',
  'companion',
  'and'

In [13]:
gberg_sents[4][14]

'a'

In [14]:
gutenberg.words()

['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']', ...]

In [15]:
# Due to lack of resources, I'm not working with the full Gutenberg dataset (18 books).
len(gutenberg.words(fileids=['austen-sense.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt']))

1104978

#### Run Word2Vec

In [16]:
# size == dimensions
# window 10: 20 context words, 10 to the left and 10 to the right
model = Word2Vec(sentences=gberg_sents, size=64, sg=1, window=10, min_count=5, seed=42, workers=2)

In [17]:
# We don't have to save the model if we don't want to. It's being done here as demonstration.
model.save('raw_gutenberg_model.w2v')

#### Explore the Model

In [18]:
model = Word2Vec.load('raw_gutenberg_model.w2v')

In [19]:
model['house']

  """Entry point for launching an IPython kernel.


array([ 0.00253844, -0.28362033, -0.05020804,  0.243067  ,  0.22758678,
       -0.03445566,  0.05544896, -0.6400096 ,  0.7421818 , -0.22936262,
       -0.28277883,  0.09234599,  0.15808737, -0.41168603, -0.1746989 ,
       -0.09412833,  0.2830225 , -0.0023677 ,  0.16449161,  0.04813597,
       -0.33191484, -0.18412393,  0.3233978 , -0.18004127, -0.15361878,
        0.0461226 , -0.2623931 ,  0.07534794, -0.16722617, -0.03013645,
       -0.06027886, -0.3076095 , -0.77084535,  0.00415463,  0.10246416,
       -0.20290059,  0.41379207,  0.19709958, -0.00493711,  0.3623045 ,
        0.47710043,  0.26188043, -0.4200422 , -0.37100074,  0.3208747 ,
        0.52022797, -0.13591456, -0.29092348, -0.00981716, -0.07570454,
       -0.30985105,  0.53540057, -0.15808363,  0.19505948, -0.0456162 ,
        0.33518425,  0.11240716,  0.26155046, -0.11950276, -0.5990342 ,
       -0.35793218, -0.15128244,  0.33151767,  0.24814092], dtype=float32)

In [20]:
len(model['house'])

  """Entry point for launching an IPython kernel.


64

In [21]:
model.most_similar('house')

  """Entry point for launching an IPython kernel.


[('cottage', 0.8514493703842163),
 ('court', 0.7917842864990234),
 ('dining', 0.7778569459915161),
 ('carriage', 0.7735164761543274),
 ('park', 0.7682986259460449),
 ('Harley', 0.7620264291763306),
 ('room', 0.7617365121841431),
 ('parlour', 0.7609195709228516),
 ('Allenham', 0.7596607804298401),
 ('Exeter', 0.7549090385437012)]

In [22]:
model.most_similar('think')

  """Entry point for launching an IPython kernel.


[('suppose', 0.8686421513557434),
 ('understand', 0.8406089544296265),
 ('remember', 0.8331885933876038),
 ('know', 0.8296819925308228),
 ('manage', 0.823654294013977),
 ('guess', 0.8217471837997437),
 ('NOW', 0.816806435585022),
 ('impertinent', 0.8160394430160522),
 ('commit', 0.8098946809768677),
 ('MUST', 0.8077293634414673)]

In [23]:
model.most_similar('day')

  """Entry point for launching an IPython kernel.


[('morning', 0.8520458340644836),
 ('night', 0.8025751709938049),
 ('evening', 0.7523692846298218),
 ('year', 0.7284113168716431),
 ('summer', 0.7225607633590698),
 ('month', 0.6959215998649597),
 ('week', 0.6941913962364197),
 ('spend', 0.683414876461029),
 ('eleven', 0.6771047711372375),
 ('May', 0.6760200262069702)]

In [24]:
model.most_similar('father')

  """Entry point for launching an IPython kernel.


[('brother', 0.8279287219047546),
 ('wife', 0.8140286207199097),
 ('mother', 0.8138506412506104),
 ('master', 0.7999870181083679),
 ('cousin', 0.79924476146698),
 ('uncle', 0.7981894612312317),
 ('son', 0.7844958901405334),
 ('Maurice', 0.7770201563835144),
 ('child', 0.7729185819625854),
 ('Fanny', 0.7524842619895935)]

In [25]:
model.doesnt_match('mother father daughter house'.split())

  """Entry point for launching an IPython kernel.


'house'

In [26]:
model.similarity('father', 'mother')

  """Entry point for launching an IPython kernel.


0.8138506563047428

In [27]:
model.most_similar(positive=['father', 'woman'], negative=['man'])

  """Entry point for launching an IPython kernel.


[('brother', 0.815841794013977),
 ('mother', 0.8129395246505737),
 ('husband', 0.7806116938591003),
 ('daughter', 0.770318865776062),
 ('daughters', 0.7428721189498901),
 ('Fanny', 0.7407931089401245),
 ('sister', 0.7365231513977051),
 ('Lady', 0.7356016039848328),
 ('sisters', 0.7349551916122437),
 ('Susan', 0.7319581508636475)]

In [28]:
model.most_similar(positive=['son', 'woman'], negative=['man'])

  """Entry point for launching an IPython kernel.


[('daughter', 0.8234765529632568),
 ('eldest', 0.7831596732139587),
 ('brother', 0.7709355354309082),
 ('husband', 0.7619892954826355),
 ('daughters', 0.7532057762145996),
 ('Lady', 0.7251722812652588),
 ('wife', 0.7176585793495178),
 ('invited', 0.7099258899688721),
 ('sisters', 0.7092226147651672),
 ('widow', 0.7049161195755005)]

In [29]:
model.most_similar(positive=['husband', 'woman'], negative=['man'])

  """Entry point for launching an IPython kernel.


[('sister', 0.8065283894538879),
 ('mother', 0.8033996820449829),
 ('daughter', 0.7897864580154419),
 ('brother', 0.7847374081611633),
 ('daughters', 0.7702164649963379),
 ('sisters', 0.7588099241256714),
 ('herself', 0.7522388696670532),
 ('Fanny', 0.7458125352859497),
 ('eldest', 0.7438929080963135),
 ('Mary', 0.7120168209075928)]

In [31]:
model.most_similar(positive=['king', 'woman'], negative=['man'], topn=50)

  """Entry point for launching an IPython kernel.


[('eldest', 0.754181981086731),
 ('daughters', 0.7207632660865784),
 ('visit', 0.7073965072631836),
 ('daughter', 0.6980539560317993),
 ('widow', 0.697096049785614),
 ('husband', 0.6945968866348267),
 ('invited', 0.6940564513206482),
 ('elder', 0.6931315660476685),
 ('attending', 0.6923259496688843),
 ('son', 0.6889738440513611),
 ('Margaret', 0.6815564036369324),
 ('wife', 0.6785577535629272),
 ('lively', 0.6768495440483093),
 ('youngest', 0.676306426525116),
 ('provided', 0.6734195947647095),
 ('Fanny', 0.6722611784934998),
 ('Exeter', 0.6707691550254822),
 ('taught', 0.6687226891517639),
 ('brother', 0.66819167137146),
 ('charity', 0.6665070056915283),
 ('industrious', 0.6651074886322021),
 ('honoured', 0.6637418270111084),
 ('Gray', 0.6606661081314087),
 ('Robert', 0.660377025604248),
 ('Sicily', 0.660137951374054),
 ('robbed', 0.6592308282852173),
 ('health', 0.6570944786071777),
 ('entreaty', 0.6570299863815308),
 ('mother', 0.655613899230957),
 ('Lady', 0.6554924249649048),
 ('M

#### Reduce word vector dimensionality with t-SNE

t-Distributed Stochastic Name Embedding

In [32]:
len(model.wv.vocab)

11319

In [33]:
X = model[model.wv.vocab]

  """Entry point for launching an IPython kernel.


In [47]:
tsne = TSNE(n_components=3, n_iter=250)

In [48]:
X_2d = tsne.fit_transform(X)

In [49]:
coords_df = pd.DataFrame(X_2d, columns=['x', 'y', 'z'])
coords_df['token'] = model.wv.vocab.keys()

In [50]:
coords_df.head()

Unnamed: 0,x,y,z,token
0,0.391134,0.762717,-0.833366,[
1,0.204031,-0.438965,0.127528,and
2,-0.017161,-0.111368,0.0896,by
3,0.389705,0.763152,-0.832475,]
4,0.399254,0.751378,-0.832394,CHAPTER


In [51]:
coords_df.to_csv('raw_gutenberg_tsne.csv', index=False)

#### Visualise 2D representation of word vectors

In [52]:
coorrds_df = pd.read_csv('raw_gutenberg_tsne.csv')

In [53]:
coords_df.head()

Unnamed: 0,x,y,z,token
0,0.391134,0.762717,-0.833366,[
1,0.204031,-0.438965,0.127528,and
2,-0.017161,-0.111368,0.0896,by
3,0.389705,0.763152,-0.832475,]
4,0.399254,0.751378,-0.832394,CHAPTER


In [56]:
_ = coords_df.plot.scatter('x', 'y', 'z', figsize=(8,8,8), marker='o', s=10, alpha=0.2)

TypeError: scatter() got multiple values for argument 's'

In [43]:
output_notebook()

In [44]:
subset_df = coords_df.sample(n=1000)

In [45]:
p = figure(plot_width=600, plot_height=600)
p.text(x=subset_df.x, y=subset_df.y, text=subset_df.token)

In [46]:
show(p)