In [15]:
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re

from nltk.corpus import gutenberg, stopwords
import gensim
from gensim.models import word2vec


# word2vec
- shallow neural network model for converting words to vectors using distributed representation
- LSA creates vector representations of sentences based on words in them, word2vec creates representations of individual words, based on words around them
- used when computers need to parse requests written by humans
- 2 options, inverses of each other
    - Continuous bag of words (CBOW)
        - Identity of word predicted using words near it in a sentence
    - skip-gram
        - Identities of words are predicted from the word they surround, works better for large corpuses
- 2 approaches to "pushing" vectors apart
    - negative sampling: each time a word pulled toward some neighbors, the vectors for randomly chosen small set of other words pushed away
    - hierarchical softmax: every neighboring word pulled closer or farther from a subset of words chosen based on a tree of possibilities

In [13]:
# Utility function to clean text
def text_cleaner(text):
    
    # Visual inspection shows spaCy does not recognize the double dash'--'
    # Get rid of it now
    
    text = re.sub(r'--', ' ', text)
    
    # Get rid of headings in square brackets
    text = re.sub('[\[].*?[\]]', '', text)
    
    # Get rid of chapter titles
    text = re.sub(r'Chapter \d+', '', text)
    
    # Get rid of extra whitespace
    text = ' '.join(text.split())
    
    return text

# Import all the Austen in the Project Gutenberg corpus
austen = ''
for novel in ['persuasion', 'emma', 'sense']:
    work = gutenberg.raw('austen-' + novel + '.txt')
    austen = austen + work
    
# Clean the data
austen_clean = text_cleaner(austen)

In [25]:
# Parse the data.  This can take some time
nlp = spacy.load('en')
austen_doc1 = nlp(austen_clean[:1000000])
austen_doc2 = nlp(austen_clean[1000000:2000000])
austen_doc3 = nlp(austen_clean[2000000:])

In [24]:
austen_doc1

Sir Walter Elliot, of Kellynch Hall, in Somersetshire, was a man who, for his own amusement, never took up any book but the Baronetage; there he found occupation for an idle hour, and consolation in a distressed one; there his faculties were roused into admiration and respect, by contemplating the limited remnant of the earliest patents; there any unwelcome sensations, arising from domestic affairs changed naturally into pity and contempt as he turned over the almost endless creations of the last century; and there, if every other leaf were powerless, he could read his own history with an interest which never failed. This was the page at which the favourite volume always opened: "ELLIOT OF KELLYNCH HALL. "Walter Elliot, born March 1, 1760, married, July 15, 1784, Elizabeth, daughter of James Stevenson, Esq. of South Park, in the county of Gloucester, by which lady (who died 1800) he has issue Elizabeth, born June 1, 1785; Anne, born August 9, 1787; a still-born son, November 5, 1789; M

In [26]:
# Organize the parsed doc into sentences, while filtering out punctuation
# and stop words

sentences = []
for sentence in austen_doc1.sents:
    sentence = [
        token.lemma_.lower()
        for token in sentence
        if not token.is_stop
        and not token.is_punct
    ]
    sentences.append(sentence)
    
for sentence in austen_doc2.sents:
    sentence = [
        token.lemma_.lower()
        for token in sentence
        if not token.is_stop
        and not token.is_punct
    ]
    sentences.append(sentence)
    
for sentence in austen_doc3.sents:
    sentence = [
        token.lemma_.lower()
        for token in sentence
        if not token.is_stop
        and not token.is_punct
    ]
    sentences.append(sentence)
    

In [27]:
print(sentences[20])
print('We have {} sentences and {} tokens.'.format(len(sentences), len(austen_clean)))

['for', 'daughter', 'eld', 'give', 'thing', 'tempt']
We have 17854 sentences and 2006272 tokens.


In [29]:
model = word2vec.Word2Vec(
    sentences,    # Number  of threads to run in parralel
    min_count=10, # Min word count threshold
    window=6,     # Number of words around target word to consider
    sg=0,         # Use CBOW bc our corpus is small
    sample=1e-3,  # Penalize frequent words
    size=300,     # Word vector length
    hs=1          # Use hierarchical softmax
)

print('done!')

done!


In [30]:
# List of words in model
vocab = model.wv.vocab.keys()

print(model.wv.most_similar(positive=['lady', 'man'], negative=['woman']))

# Similarity is calculated using the cosine, so again 1 is total
# similarity and 0 is no similarity

[('people', 0.6582650542259216), ('daughter', 0.5567139387130737), ('introduction', 0.4640844464302063), ('prejudice', 0.4414145350456238), ('pleasing', 0.43370333313941956), ('thousand', 0.4336819648742676), ('person', 0.42792898416519165), ('joke', 0.4169720411300659), ('visit', 0.41186773777008057), ('friend', 0.40585067868232727)]


In [32]:

# Similarity is calculated using the cosine, so again 1 is total
# similarity and 0 is no similarity.
print(model.wv.similarity('loud', 'aloud'))
print(model.wv.similarity('mr', 'mrs'))

# One of these things is not like the other...
print(model.wv.doesnt_match("breakfast marriage dinner lunch".split()))

0.6767084
0.07640822
marriage


# Drill 0: Modify hyperparameters

In [46]:
model = word2vec.Word2Vec(
    sentences,    # Number  of threads to run in parralel
    min_count=10, # Min word count threshold
    window=10,    # Number of words around target word to consider
    sg=0,         # Use CBOW bc our corpus is small
    sample=1e-2,  # Penalize frequent words
    size=300,     # Word vector length
    hs=1          # Use hierarchical softmax
)

# List of words in model
vocab = model.wv.vocab.keys()

print(model.wv.most_similar(positive=['lady', 'man'], negative=['woman']))

# Similarity is calculated using the cosine, so again 1 is total
# similarity and 0 is no similarity.
print(model.wv.similarity('loud', 'aloud'))
print(model.wv.similarity('mr', 'mrs'))

# One of these things is not like the other...
print(model.wv.doesnt_match("breakfast marriage dinner lunch".split()))

[('farmer', 0.5917782783508301), ('rapid', 0.5804653167724609), ('husband', 0.5533252954483032), ('visit', 0.5447976589202881), ('house', 0.5397225618362427), ('people', 0.5167005658149719), ('settle', 0.5030304193496704), ('entertain', 0.4993135631084442), ('indisposition', 0.4963826537132263), ('brother', 0.48956024646759033)]
0.63302493
0.3267087
marriage


Husband and brother are at least the right gender for the analogy.

# Drill 1: Word2Vec on 100B+ Words
Because the models are so large, however, you may run into memory problems or crash the kernel. If you can't get a pretrained model to run locally, check out this interactive web app of the Google News model instead. https://rare-technologies.com/word2vec-tutorial/#bonus_app

In [None]:
model = gensim.models.KeyedVectors.load_word2vec_format ('./model/GoogleNews-vectors-negative300.bin', binary=True)