In [6]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import gutenberg, stopwords
import gensim
from gensim.models import word2vec

In [2]:
def text_cleaner(text):
    text = re.sub(r'--', ' ', text)
    text = re.sub("[\[].*?[\]]", "", text)
    text = re.sub(r'Chapter \d+', '', text)
    text = ' '.join(text.split())
    return text[0:900000]

austen = ""
for novel in ['persuasion', 'emma', 'sense']:
    work = gutenberg.raw('austen-' + novel + '.txt')
    austen += work
    
austen_clean = text_cleaner(austen)

In [4]:
nlp = spacy.load('en')
austen_doc = nlp(austen_clean)

In [5]:
sentences = []
for sentence in austen_doc.sents:
    sentence = [
        token.lemma_.lower()
        for token in sentence
        if not token.is_stop
        and not token.is_punct
    ]
    sentences.append(sentence)
    
print(sentences[20])
print('We have {} sentences and {} tokens.'.format(len(sentences), len(austen_clean)))

['lady', 'russell', 'steady', 'age', 'character', 'extremely', 'provide', 'thought', 'second', 'marriage', 'need', 'apology', 'public', 'apt', 'unreasonably', 'discontent', 'woman', 'marry', 'sir', 'walter', 'continue', 'singleness', 'require', 'explanation']
We have 9299 sentences and 900000 tokens.


In [10]:
model = word2vec.Word2Vec(
    sentences,
    workers=4,     #number of threads to run in parallel
    min_count=10,  #minimum word count threshold
    window=6,      #number of words around the target to consider
    sg=0,          #use CBOW:0, skip-gram:1
    sample=1e-3,   #penalize frequent words
    size=300,      #word vector length
    hs=1           #use hierarchical softmax
)

print('Done!')

Done!


In [9]:
vocab = model.wv.vocab.keys()

print(model.wv.most_similar(positive=['lady', 'man'], negative=['woman']))

print('\n{}'.format(model.wv.similarity('mr', 'mrs')))

print('\n{}'.format(model.doesnt_match('breakfast marriage dinner lunch'.split())))

[('musgrove', 0.9526277780532837), ('benwick', 0.946715235710144), ('clay', 0.9406770467758179), ('goddard', 0.9396756291389465), ('wentworth', 0.9253109693527222), ('harville', 0.9202356338500977), ('colonel', 0.8757010698318481), ('hall', 0.8667272925376892), ('smith', 0.8595788478851318), ('weston', 0.8587863445281982)]

0.9287651777267456

marriage


  import sys


In [11]:
model = word2vec.Word2Vec(
    sentences,
    workers=4,     #number of threads to run in parallel
    min_count=50,  #minimum word count threshold
    window=4,      #number of words around the target to consider
    sg=1,          #use CBOW:0, skip-gram:1
    sample=1e-3,   #penalize frequent words
    size=300,      #word vector length
    hs=1           #use hierarchical softmax
)

print('Done!')

vocab = model.wv.vocab.keys()

print(model.wv.most_similar(positive=['lady', 'man'], negative=['woman']))

print('\n{}'.format(model.wv.similarity('mr', 'mrs')))

print('\n{}'.format(model.doesnt_match('breakfast marriage dinner lunch'.split())))

Done!
[('uppercross', 0.8354675769805908), ('anne', 0.8317722082138062), ('heart', 0.8266140222549438), ('course', 0.8171985149383545), ('live', 0.8138149380683899), ('kellynch', 0.8135673999786377), ('acquaintance', 0.7988577485084534), ('listen', 0.7795247435569763), ('call', 0.7785896062850952), ('mary', 0.7690882086753845)]

0.7493101358413696

dinner




Changing to skip-gram, decreasing the window, and upping the min count threshold made the model a lot worse than the initial. It also selected the wrong word from the sentence. Let's try going the other way with those parameters and switching back to CBOW.

In [13]:
model = word2vec.Word2Vec(
    sentences,
    workers=4,     #number of threads to run in parallel
    min_count=10,  #minimum word count threshold
    window=8,      #number of words around the target to consider
    sg=0,          #use CBOW:0, skip-gram:1
    sample=1e-3,   #penalize frequent words
    size=300,      #word vector length
    hs=1           #use hierarchical softmax
)

print('Done!')

vocab = model.wv.vocab.keys()

print(model.wv.most_similar(positive=['lady', 'man'], negative=['woman']))

print('\n{}'.format(model.wv.similarity('mr', 'mrs')))

print('\n{}'.format(model.doesnt_match('breakfast marriage dinner lunch'.split())))

Done!
[('benwick', 0.9414724707603455), ('musgrove', 0.9320449829101562), ('harville', 0.9121419191360474), ('charles', 0.9103752374649048), ('goddard', 0.9028724431991577), ('clay', 0.9002286195755005), ('wentworth', 0.8816044330596924), ('conscious', 0.8659206032752991), ('mary', 0.8458876013755798), ('hall', 0.8385883569717407)]

0.9557908177375793

dinner




Taking the window up to 8 and the min_count back to 10 shows better similarity with mr. and mrs., but it selected dinner instead of marriage from the list. Also, it chose the word 'mary' from the vocab but it should have chosen male names.

In [16]:
model = word2vec.Word2Vec(
    sentences,
    workers=4,     #number of threads to run in parallel
    min_count=10,  #minimum word count threshold
    window=8,      #number of words around the target to consider
    sg=0,          #use CBOW:0, skip-gram:1
    sample=1e-3,   #penalize frequent words
    size=600,      #word vector length
    hs=1           #use hierarchical softmax
)

print('Done!')

vocab = model.wv.vocab.keys()

print(model.wv.most_similar(positive=['lady', 'man'], negative=['woman']))

print('\n{}'.format(model.wv.similarity('mr', 'mrs')))

print('\n{}'.format(model.doesnt_match('breakfast marriage dinner lunch'.split())))

Done!
[('musgrove', 0.96905517578125), ('benwick', 0.9638471603393555), ('goddard', 0.9537417888641357), ('harville', 0.9398235082626343), ('wentworth', 0.9164236783981323), ('clay', 0.9051190614700317), ('colonel', 0.9001761674880981), ('cole', 0.8700426816940308), ('weston', 0.86616051197052), ('consult', 0.8582299947738647)]

0.958689272403717

marriage




Increasing the vector length seems to have improved everything. It selected marriage as the non-similar word, the similarity with mr and mrs was higher than the original model and the similar words that mach the analogy all work.

Drill 1: word2vec on 100B+ words

Due to previous issues with memory I am going to look at the Google News Model that is a web app.
https://rare-technologies.com/word2vec-tutorial/#bonus_app

Tried kid : messy :: adult:? and got back messier, chaotic, & unpleasant. Does that mean that kids get it from their parents?

when looking for the most similar words to vicarious, the model returned 'escapist fantasy'.

Trying to remove the word that did not belong from the following list: balloon party confetti shovel cake presents resulted in the word presents. I think in this case it had trouble with the word presents having a double meaning.

Last thing I tried was gibberish. I put blue red iuerb green purple black into the box to choose the phrase that doesn't exist. Everytime the model selected one of the colors instead of luerb. I put that gibberish into the box to find the most similar and got nothing back. Since the word2vec model is built with thresholds on how often a word is found, then it will likely give gibberish words or misspellings no vector value at all and therefore can't be compared to the rest of the words.