In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from collections import Counter
from nltk.corpus import gutenberg, stopwords

In [2]:
# Utility function to clean text.
def text_cleaner(text):
    
    # Visual inspection shows spaCy does not recognize the double dash '--'.
    # Better get rid of it now!
    text = re.sub(r'--',' ',text)
    
    # Get rid of headings in square brackets.
    text = re.sub("[\[].*?[\]]", "", text)
    
    # Get rid of chapter titles.
    text = re.sub(r'Chapter \d+','',text)
    text = re.sub(r'CHAPTER \d+', '', text)
    text = re.sub("\\n\\n.*?\\n\\n", '', text)
  
    # Get rid of extra whitespace.
    text = ' '.join(text.split())

    
    return text

## Data Cleaning & Setup

In [3]:
# load documents
ball = gutenberg.raw('chesterton-ball.txt')
brown = gutenberg.raw('chesterton-brown.txt')
thursday = gutenberg.raw('chesterton-thursday.txt')

# clean texts
ball = text_cleaner(ball)
brown = text_cleaner(brown)
thursday = text_cleaner(thursday)

# run spacy and analyze the documents
nlp = spacy.load('en')
ball_doc = nlp(ball)
brown_doc = nlp(brown)
thursday_doc = nlp(thursday)

In [4]:
# put sentences into a list
brown_sents = []
ball_sents = []
thursday_sents = []
for sentence in brown_doc.sents:
    brown_sents.append(sentence)
for sentence in ball_doc.sents:
    ball_sents.append(sentence)
for sentence in thursday_doc.sents:
    thursday_sents.append(sentence)

In [5]:
print('Length of sentences\n'+'-'*40+'\nBrown sentences: {}\nBall sentences: {}\nThursday sentences: {}'.format(len(brown_sents), len(ball_sents), len(thursday_sents)))

Length of sentences
----------------------------------------
Brown sentences: 3534
Ball sentences: 4272
Thursday sentences: 3053


In [6]:
# brown_sents = brown_sents[:2000]
# ball_sents = ball_sents[:2000]
# thursday_sents = thursday_sents[:2000]

In [7]:
# combine the sentences together
def appendSentences(doc):
    sentence_list = []
    for sentence in doc:
        sentence = [
            token.lemma_.lower()
            for token in sentence
            if not token.is_stop
            and not token.is_punct
        ]
        sentence_list.append(sentence)
    return sentence_list 

# get count of tokens
def getTokenCount(doc):
    count = 0
    for sentence in doc:
        for token in sentence:
            count += 1
    return count

all_sents = appendSentences(brown_sents) + appendSentences(ball_sents) + appendSentences(thursday_sents)
token_count = getTokenCount(all_sents)

print('We have {} sentences and {} tokens'.format(len(all_sents), token_count))

We have 10859 sentences and 99833 tokens


## CBOW

In [8]:
import gensim
from gensim.models import word2vec

model_cbow = word2vec.Word2Vec(
    all_sents,
    workers=4,     # Number of threads to run in parallel (if your computer does parallel processing).
    min_count=10,  # Minimum word count threshold.
    window=6,      # Number of words around target word to consider.
    sg=0,          # Use CBOW because our corpus is small.
    sample=1e-3 ,  # Penalize frequent words.
    size=300,      # Word vector length.
    hs=1           # Use hierarchical softmax.
)



In [9]:
# List of words in model.
vocab = model_cbow.wv.vocab.keys()

print(model_cbow.wv.most_similar(positive=['lady', 'man'], negative=['woman']))

# Similarity is calculated using the cosine, so again 1 is total
# similarity and 0 is no similarity.

# One of these things is not like the other...
print(model_cbow.wv.doesnt_match("breakfast marriage dinner lunch".split()))

[('business', 0.9922202229499817), ('fancy', 0.9893401861190796), ('vulgar', 0.9888213872909546), ('desperate', 0.9886668920516968), ('dubosc', 0.9886517524719238), ('buy', 0.9883180260658264), ('death', 0.9876230955123901), ('anarchist', 0.987596869468689), ('social', 0.9872456789016724), ('train', 0.9869893789291382)]
breakfast


## Skip-gram

In [10]:
model_sg = word2vec.Word2Vec(
    all_sents,
    workers=4,     # Number of threads to run in parallel (if your computer does parallel processing).
    min_count=10,  # Minimum word count threshold.
    window=6,      # Number of words around target word to consider.
    sg=1,          # skip-gram.
    sample=1e-3 ,  # Penalize frequent words.
    size=300,      # Word vector length.
    hs=1           # Use hierarchical softmax.
)

In [11]:
# List of words in model.
vocab = model_sg.wv.vocab.keys()

print(model_sg.wv.most_similar(positive=['lady', 'man'], negative=['woman']))

# Similarity is calculated using the cosine, so again 1 is total
# similarity and 0 is no similarity.

# One of these things is not like the other...
print(model_sg.wv.doesnt_match("breakfast marriage dinner lunch".split()))

[('madness', 0.8698895573616028), ('keeper', 0.8633471131324768), ('librarian', 0.8612260222434998), ('though', 0.8536661863327026), ('coincidence', 0.8517497181892395), ('rope', 0.851563572883606), ('stiff', 0.8512357473373413), ('deadly', 0.8511946797370911), ('memory', 0.8507880568504333), ('pure', 0.84935063123703)]
breakfast
