In [5]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import gutenberg, stopwords

In [6]:
# Utility function to clean text.
def text_cleaner(text):
    
    # Visual inspection shows spaCy does not recognize the double dash '--'.
    # Better get rid of it now!
    text = re.sub(r'--',' ',text)
    
    # Get rid of headings in square brackets.
    text = re.sub("[\[].*?[\]]", "", text)
    
    # Get rid of chapter titles.
    text = re.sub(r'Chapter \d+','',text)
    text = re.sub(r'CHAPTER \d+', '', text)
    
    # Get rid of extra whitespace.
    text = ' '.join(text.split())
    
    return text


# Import all the Austen in the Project Gutenberg corpus. Reduce size to make it more manageable
work1 = gutenberg.raw('austen-persuasion.txt')
work2 = gutenberg.raw('austen-emma.txt')
work3 = gutenberg.raw('austen-sense.txt')

In [7]:
persuasion = text_cleaner(work1)
emma = text_cleaner(work2)
sense = text_cleaner(work3)

In [8]:
emma = emma[:len(persuasion)]
sense = sense[:len(persuasion)]
austen_clean = text_cleaner(persuasion+emma+sense)

In [9]:
#default max length is 1000000. set max length size
print(len(austen_clean))

1388454


In [10]:
# Parse the data. This can take some time.
nlp = spacy.load('en')
nlp.max_length = 10000000

austen_doc = nlp(austen_clean)

In [15]:
# Organize the parsed doc into sentences, while filtering out punctuation
# and stop words, and converting words to lower case lemmas.
sentences = []
for sentence in austen_doc.sents:
    sentence = [
        token.lemma_.lower()
        for token in sentence
        if not token.is_stop
        and not token.is_punct
    ]
    sentences.append(sentence)


print(sentences[20])
print('We have {} sentences and {} tokens.'.format(len(sentences), len(austen_clean)))

['for', 'daughter', 'eld', 'give', 'thing', 'tempt']
We have 11856 sentences and 1388454 tokens.


In [12]:
import gensim
from gensim.models import word2vec

model = word2vec.Word2Vec(
    sentences,
    workers=4,     # Number of threads to run in parallel (if your computer does parallel processing).
    min_count=10,  # Minimum word count threshold.
    window=6,      # Number of words around target word to consider.
    sg=0,          # Use CBOW because our corpus is small.
    sample=1e-3 ,  # Penalize frequent words.
    size=300,      # Word vector length.
    hs=1           # Use hierarchical softmax.
)

print('done!')



done!


In [26]:
# List of words in model.
vocab = model.wv.vocab.keys()

def similarity(model):
    print(model.wv.most_similar(positive=['lady', 'woman'], negative=['man']))

    # Similarity is calculated using the cosine, so again 1 is total
    # similarity and 0 is no similarity.
    print(model.wv.similarity('buy', 'by'))
    print(model.wv.similarity('mr', 'mrs'))

similarity(model)
# One of these things is not like the other...
print(model.wv.doesnt_match("breakfast marriage dinner lunch".split()))

[('heir', 0.8128623962402344), ('daughter', 0.8055077195167542), ('true', 0.7997446060180664), ('choice', 0.7820346355438232), ('law', 0.7776310443878174), ('bath', 0.7690468430519104), ('human', 0.7681286334991455), ('communication', 0.7576424479484558), ('frank', 0.7565971612930298), ('acceptable', 0.7460526823997498)]
0.8569384726542572
0.7251773912831494
marriage


In [27]:
model = word2vec.Word2Vec(
    sentences,
    workers=4,     # Number of threads to run in parallel (if your computer does parallel processing).
    min_count=5,  # Minimum word count threshold.
    window=6,      # Number of words around target word to consider.
    sg=0,          # Use CBOW because our corpus is small.
    sample=1e-3 ,  # Penalize frequent words.
    size=300,      # Word vector length.
    hs=1           # Use hierarchical softmax.
)
similarity(model)

[('beginning', 0.8566043972969055), ('harley', 0.8235278129577637), ('recommendation', 0.8199771642684937), ('sign', 0.8196719884872437), ('prize', 0.8162989020347595), ('thoughtful', 0.8153204917907715), ('acceptable', 0.8109824061393738), ('history', 0.8078625202178955), ('conduit', 0.8036626577377319), ('widow', 0.8019697070121765)]
0.838905868814873
0.5686843165340472


In [28]:
model = word2vec.Word2Vec(
    sentences,
    workers=4,     # Number of threads to run in parallel (if your computer does parallel processing).
    min_count=10,  # Minimum word count threshold.
    window=10,      # Number of words around target word to consider.
    sg=0,          # Use CBOW because our corpus is small.
    sample=1e-3 ,  # Penalize frequent words.
    size=300,      # Word vector length.
    hs=1           # Use hierarchical softmax.
)
similarity(model)

[('bath', 0.8551249504089355), ('daughter', 0.8461683392524719), ('croft', 0.8383054733276367), ('clay', 0.8378781676292419), ('opposite', 0.8315855264663696), ('widow', 0.830804705619812), ('miss', 0.829302191734314), ('heir', 0.8132860064506531), ('mr', 0.808639645576477), ('choice', 0.8035989999771118)]
0.814290933430577
0.7232964427946919


In [29]:
model = word2vec.Word2Vec(
    sentences,
    workers=4,     # Number of threads to run in parallel (if your computer does parallel processing).
    min_count=10,  # Minimum word count threshold.
    window=3,      # Number of words around target word to consider.
    sg=0,          # Use CBOW because our corpus is small.
    sample=1e-3 ,  # Penalize frequent words.
    size=300,      # Word vector length.
    hs=1           # Use hierarchical softmax.
)
similarity(model)

[('heir', 0.7823143601417542), ('widow', 0.7668845653533936), ('pray', 0.7569032311439514), ('communication', 0.751582145690918), ('true', 0.7458893656730652), ('frank', 0.7388154864311218), ('human', 0.7358202934265137), ('maid', 0.7288565635681152), ('son', 0.7178120613098145), ('mr', 0.7046736478805542)]
0.8370693176838317
0.6709115194380956


In [30]:
model = word2vec.Word2Vec(
    sentences,
    workers=4,     # Number of threads to run in parallel (if your computer does parallel processing).
    min_count=10,  # Minimum word count threshold.
    window=8,      # Number of words around target word to consider.
    sg=0,          # Use CBOW because our corpus is small.
    sample=1e-3 ,  # Penalize frequent words.
    size=400,      # Word vector length.
    hs=1           # Use hierarchical softmax.
)
similarity(model)

[('heir', 0.9447368383407593), ('bath', 0.9021087884902954), ('croft', 0.8630268573760986), ('opposite', 0.8511704206466675), ('mr', 0.8423051238059998), ('colonel', 0.8261034488677979), ('buildings', 0.8179928064346313), ('middletons', 0.8175707459449768), ('frank', 0.8147050142288208), ('elizabeth', 0.8140933513641357)]
0.7668913869669862
0.6885310984822481


In [31]:
model = word2vec.Word2Vec(
    sentences,
    workers=4,     # Number of threads to run in parallel (if your computer does parallel processing).
    min_count=10,  # Minimum word count threshold.
    window=8,      # Number of words around target word to consider.
    sg=0,          # Use CBOW because our corpus is small.
    sample=1e-3 ,  # Penalize frequent words.
    size=250,      # Word vector length.
    hs=1           # Use hierarchical softmax.
)
similarity(model)

[('bath', 0.8305132985115051), ('frank', 0.8086374402046204), ('miss', 0.7982124090194702), ('mr', 0.7922332286834717), ('communication', 0.7894659638404846), ('widow', 0.7890956997871399), ('daughter', 0.7775357365608215), ('heir', 0.7652280926704407), ('colonel', 0.7645837664604187), ('maid', 0.76324063539505)]
0.7626877387282744
0.6429039334751658


In [38]:
model = word2vec.Word2Vec(
    sentences,
    workers=4,     # Number of threads to run in parallel (if your computer does parallel processing).
    min_count=10,  # Minimum word count threshold.
    window=8,      # Number of words around target word to consider.
    sg=1,          # Use CBOW because our corpus is small.
    sample=1e-3 ,  # Penalize frequent words.
    size=300,      # Word vector length.
    hs=1           # Use hierarchical softmax.
)
similarity(model)

[('dalrymple', 0.6871240139007568), ('exceedingly', 0.6256294846534729), ('middleton', 0.6045461893081665), ('welcome', 0.6003682613372803), ('carteret', 0.5953027606010437), ('mistress', 0.5885478854179382), ('arrival', 0.5882683992385864), ('discover', 0.5793486833572388), ('middletons', 0.5755021572113037), ('introduce', 0.5727115869522095)]
0.74402197818789
0.564997032670922


In [40]:
model = word2vec.Word2Vec(
    sentences,
    workers=4,     # Number of threads to run in parallel (if your computer does parallel processing).
    min_count=10,  # Minimum word count threshold.
    window=8,      # Number of words around target word to consider.
    sg=1,          # Use CBOW because our corpus is small.
    sample=1e-3 ,  # Penalize frequent words.
    size=300,      # Word vector length.
    hs=0           # Use hierarchical softmax.
)
similarity(model)

[('daughter', 0.8508806228637695), ('middleton', 0.8372458219528198), ('acquaintance', 0.8176541924476624), ('elizabeth', 0.8130645155906677), ('dalrymple', 0.809624433517456), ('kellynch', 0.799096941947937), ('shepherd', 0.7796478271484375), ('hall', 0.7763948440551758), ('bath', 0.7707130312919617), ('brandon', 0.7681570053100586)]
0.9940057828253424
0.8138476179832403
