In [2]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import gutenberg, stopwords

## word2vec
- Most common unsupervised NN approach for NLP
- Shallow NN model for converting words to vectors using distributed representation
    - Each word represented by many neurons
    - Each neuron involved in representing many words
- Works by assigning vectors of random values to each word
    - For word W, looks at words near W in sentence
    - Shifts values in word vectors such that vectors for words near W are closer to W vector
    - Words not near W are shifted further away from W vector
    - Eventually results in words that often appear together having vectors near one another
    - Words that rarely/never appear together have vectors far away from one another
    - Similarity scores can then be computed for each word pair by taking cosine of the vectors
- Difference from Latent Semantic Analysis
    - LSA creates vector representations of sentences based on the words in them
    - word2vec creates representations of individual words based on the words around them
- Useful for parsing requests written by humans
    - Humans can communicate the same concept many different ways
    - Humans know silverware and utensils can refer to the same thing but computers do not
    - word2vec helps computers infer meaning by  looking at words with close vectors
    - Search engines: return best results for query and not just ones containing exact words used

## Generating vectors
- **Continuous Bag of Words (CBOW):** identity of a word is predicted using the words near it in a sentence
- **Skip-gram:** identities of words are predicted from the word they surround
    - Seems to work better for larger corpuses
- Example: 'Terry Gilliam is a better comedian than a director'
    - CBOW will try to predict 'comedian' using is a better than a director, vector for comedian pulled closer to other words
    - Skip-gram will try to predict is a better than a director using 'comedian', vectors for other words pulled closer to 'comedian'
- Each time a word is processed some vectors are moved further away
    - Negative sampling: each time a word is pulled toward neighbors, others are pushed away
    - Hierarchical softmax: every neighboring word is pulled closer or farther from a subset of words chose based on a tree of probabilities

## Similarity
- word2vec operates on the assumption that frequent proximity indicates similarity, but words can similar in various ways
- Can identify similarities between words that never occur near one another in the corpus
- Vectors can be used to convert analogies into mathematical expressions
    - king:queen :: man:woman
    - king + woman - man = queen
- Works best on very a large corpus (billions of words)
- Example only has 2 million words so results will not be great

In [6]:
#cleaner function
def text_cleaner(text):
    text = re.sub(r'--', ' ',text)
    text = re.sub('[\[].*?[\]]','',text)
    text = re.sub(r'Chapter \d+','', text)
    text = ' '.join(text.split())
    return text

#import all austen in gutenberg corpus
#austen = ''
#for novel in ['persuasion','emma','sense']:
#    work = gutenberg.raw('austen-' + novel + '.txt')
#    austen = austen + work
p = gutenberg.raw('austen-persuasion.txt')
e = gutenberg.raw('austen-emma.txt')
s = gutenberg.raw('austen-sense.txt')

p_clean = text_cleaner(p)
e_clean = text_cleaner(e)
s_clean = text_cleaner(s)

In [7]:
nlp = spacy.load('en')
p_doc = nlp(p_clean)
e_doc = nlp(e_clean)
s_doc = nlp(s_clean)

In [11]:
sentences = []
for sentence in p_doc.sents:
    sentence = [
        token.lemma_.lower()
        for token in sentence
        if not token.is_stop
        and not token.is_punct
    ]
    sentences.append(sentence)


for sentence in e_doc.sents:
    sentence = [
        token.lemma_.lower()
        for token in sentence
        if not token.is_stop
        and not token.is_punct
    ]
    sentences.append(sentence)
    
for sentence in s_doc.sents:
    sentence = [
        token.lemma_.lower()
        for token in sentence
        if not token.is_stop
        and not token.is_punct
    ]
    sentences.append(sentence)

print(sentences[20])
print('we have {} sentences and {} tokens.'.format(len(sentences), (len(p_clean)+len(e_clean)+len(s_clean))))

['for', 'daughter', 'eld', 'give', 'thing', 'tempt']
we have 17853 sentences and 2006270 tokens.


In [14]:
import gensim
from gensim.models import word2vec
import time

start_time = time.clock()
model = word2vec.Word2Vec(
    sentences,
    workers=2,
    min_count=10,
    window=6,
    sg=0,
    sample=1e-3,
    size=300,
    hs=1)

print(time.clock() - start_time)

2.944455000000005


In [15]:
vocab = model.wv.vocab.keys()

print(model.wv.most_similar(positive=['lady', 'man'], negative=['woman']))

print(model.wv.similarity('loud','aloud'))
print(model.wv.similarity('mr','mrs'))

print(model.doesnt_match('breakfast marriage dinner lunch'.split()))

[('people', 0.578013002872467), ('friend', 0.47928112745285034), ('introduction', 0.4748184084892273), ('daughter', 0.46245941519737244), ('monstrous', 0.4612240195274353), ('visit', 0.44612759351730347), ('anne', 0.43285226821899414), ('way', 0.4328392744064331), ('choice', 0.4274982213973999), ('stranger', 0.4229855239391327)]
0.72369635
0.10934523
marriage


  if np.issubdtype(vec.dtype, np.int):
  


## Drill 0
Play with word2vec hyperparameters and see if you can improve model performance

In [17]:
start_time = time.clock()
model = word2vec.Word2Vec(
    sentences,
    workers=2,
    min_count=1,
    window=1,
    sg=0,
    sample=1e-3,
    size=300,
    hs=1)

print(time.clock() - start_time)

vocab = model.wv.vocab.keys()

print(model.wv.most_similar(positive=['lady', 'man'], negative=['woman']))

print(model.wv.similarity('loud','aloud'))
print(model.wv.similarity('mr','mrs'))

print(model.doesnt_match('breakfast marriage dinner lunch'.split()))

3.520030999999989
[('disinclined', 0.8604533672332764), ('entry', 0.8412480354309082), ('inexperienc', 0.8180102109909058), ('snatch', 0.8139647245407104), ('swisserland', 0.8108318448066711), ('inforc', 0.8068943023681641), ('parson', 0.7885650396347046), ('proficient', 0.7775000333786011), ('enchanting', 0.7773404121398926), ('brood', 0.7767763137817383)]
0.6061417
0.5211655
marriage


  if np.issubdtype(vec.dtype, np.int):


## Drill 1
As we mentioned, word2vec really works best on a big corpus, but it can take half a day to clean such a corpus and run word2vec on it.  Fortunately, there are word2vec models available that have already been trained on _really_ big corpora. They are big files, but you can download a [pretrained model of your choice here](https://github.com/3Top/word2vec-api). At minimum, the ones built with word2vec (check the "Architecture" column) should load smoothly using an appropriately modified version of the code below, and you can play to your heart's content.

Because the models are so large, however, you may run into memory problems or crash the kernel. If you can't get a pretrained model to run locally, check out this [interactive web app of the Google News model](https://rare-technologies.com/word2vec-tutorial/#bonus_app) instead.

In [21]:
#seriously the code for this nlp unit has been a complete disaster and i'm done troubleshooting it
#model = gensim.models.KeyedVectors.load_word2vec_format ('./model/GoogleNews-vectors-negative300.bin', binary=True)