In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import gutenberg, stopwords

In [12]:
# Utility function to clean text.
def text_cleaner(text):
    
    # Visual inspection shows spaCy does not recognize the double dash '--'.
    # Better get rid of it now!
    text = re.sub(r'--',' ',text)
    
    # Get rid of headings in square brackets.
    text = re.sub("[\[].*?[\]]", "", text)
    
    # Get rid of chapter titles.
    text = re.sub(r'Chapter \d+','',text)
    
    # Get rid of extra whitespace.
    text = ' '.join(text.split())
    
    return text


# Import all the Austen in the Project Gutenberg corpus.
austen = ""
for novel in ['persuasion']:
    work = gutenberg.raw('austen-' + novel + '.txt')
    austen = austen + work

# Clean the data.
austen_clean = text_cleaner(austen)

In [13]:
# Parse the data. This can take some time.
nlp = spacy.load('en')
austen_doc = nlp(austen_clean)

In [14]:
sentences = []
for sentence in austen_doc.sents:
    sentence = [
        token.lemma_.lower()
        for token in sentence
        if not token.is_stop
        and not token.is_punct]
    sentences.append(sentence)


print(sentences[20])
print('We have {} sentences and {} tokens.'.format(len(sentences), len(austen_clean)))

['for', 'daughter', 'eld', 'give', 'thing', 'tempt']
We have 3649 sentences and 462818 tokens.


In [17]:
import gensim
from gensim.models import word2vec

model = word2vec.Word2Vec(
    sentences,
    workers=4,     # Number of threads to run in parallel (if your computer does parallel processing).
    min_count=10,  # Minimum word count threshold.
    window=6,      # Number of words around target word to consider.
    sg=0,          # Use CBOW because our corpus is small.
    sample=1e-3 ,  # Penalize frequent words.
    size=300,      # Word vector length.
    hs=1           # Use hierarchical softmax.
)

print('done!')

done!


In [31]:
# List of words in model.
vocab = len(model.wv.vocab.keys())
print (vocab)
print(model.wv.most_similar(positive=['lady']))

# Similarity is calculated using the cosine, so again 1 is total
# similarity and 0 is no similarity.
#print(model.wv.similarity('loud', 'aloud'))
print(model.wv.similarity('mr', 'mrs'))

# One of these things is not like the other...
print(model.doesnt_match("breakfast marriage dinner lunch".split()))

732
[('mr', 0.9990690350532532), ('hall', 0.998643696308136), ('croft', 0.9981932640075684), ('satisfaction', 0.9980280995368958), ('shepherd', 0.9980175495147705), ('then', 0.9976925849914551), ('divide', 0.9975411891937256), ('elliot', 0.9974880218505859), ('carriage', 0.9974134564399719), ('colonel', 0.9973748922348022)]
0.99778044


  if sys.path[0] == '':


dinner


# Drill 0


Take a few minutes to modify the hyperparameters of this model and see how its answers change. Can you wrangle any improvements?

In [49]:
import gensim
from gensim.models import word2vec

model = word2vec.Word2Vec(
    sentences,
    workers=4,     # Number of threads to run in parallel (if your computer does parallel processing).
    min_count=5,  # Minimum word count threshold.
    window=3,      # Number of words around target word to consider.
    sg=1,          # Use CBOW because our corpus is small.
    sample=1e-3 ,  # Penalize frequent words.
    size=300,      # Word vector length.
    hs=1           # Use hierarchical softmax.
)


In [50]:
# List of words in model.
vocab = model.wv.vocab.keys()
print(model.wv.most_similar(positive=['lady', 'man'], negative=['woman']))

# Similarity is calculated using the cosine, so again 1 is total
# similarity and 0 is no similarity.
#print(model.wv.similarity('loud', 'aloud'))
print(model.wv.similarity('mr', 'mrs'))

# One of these things is not like the other...
print(model.wv.doesnt_match("breakfast marriage dinner lunch".split()))

[('miss', 0.8669900894165039), ('bath', 0.8486307859420776), ('hall', 0.8432186245918274), ('wish', 0.8186935782432556), ('conscious', 0.8135144710540771), ('to', 0.8106386661529541), ('elizabeth', 0.8101241588592529), ('could', 0.8026816248893738), ('walter', 0.8013855218887329), ('like', 0.7993282079696655)]
0.7885449
dinner


# Conclusion:

As we observe , changing the value of min_counts from 10  to 5, window  from 6to 3 ad sg from 0 to 1 reduce the accuracy score signicantly.Larger windows tend to capture more topic/domain information, while Smaller windows tend to capture more about word itself.Finding odd one out between "breakfast marriage dinner lunch"  got result as "marriage" which is dissimilar to "breakfast", "lunch", and "dinner".

# Drill 1: Word2Vec on 100B+ words


In [None]:
import gensim
from gensim.models.keyedvectors import KeyedVectors
# Load Google's pre-trained Word2Vec model.
model = KeyedVectors.load_word2vec_format('https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz', binary=True, limit=20000)

Document was not able to load

In [None]:
# List of words in model.
print(model.wv.most_similar(positive=['lady', 'man'], negative=['woman']))