In [1]:
# import spacy and load a larger model

import spacy

# Load the large English pipeline. Every vocab lookup and similarity
# comparison later in this notebook goes through this `nlp` object.
# NOTE(review): presumably the 'en_core_web_lg' package has to be installed
# beforehand (e.g. `python -m spacy download en_core_web_lg`) - confirm.
nlp = spacy.load('en_core_web_lg')

In [2]:
# Run five space-separated words through the pipeline so that the
# resulting tokens can be compared with each other
tokens = nlp('kitty tiger water bowl dog')

In [3]:
# Print the similarity of every ordered pair of tokens. Both loops walk the
# same sequence, so self-pairs are included and each unordered pair shows up
# twice (once per ordering).
for token1 in tokens:
    for token2 in tokens:
        print(token1.text, token2.text, token1.similarity(token2))

kitty kitty 1.0
kitty tiger 0.4961798
kitty water 0.22237565
kitty bowl 0.28006953
kitty dog 0.6306644
tiger kitty 0.4961798
tiger tiger 1.0
tiger water 0.25212076
tiger bowl 0.30012825
tiger dog 0.43654656
water kitty 0.22237565
water tiger 0.25212076
water water 1.0
water bowl 0.43104872
water dog 0.30933863
bowl kitty 0.28006953
bowl tiger 0.30012825
bowl water 0.43104872
bowl bowl 1.0
bowl dog 0.26937348
dog kitty 0.6306644
dog tiger 0.43654656
dog water 0.30933863
dog bowl 0.26937348
dog dog 1.0


In [4]:
# import scipy's spatial module; its cosine distance is used below
# to compare the results of word-vector arithmetic

from scipy import spatial

In [6]:
# cosine similarity helper, written as a named function instead of a lambda
# bound to a name so that it carries a real __name__ and a docstring

def cosine_similarity(vec1, vec2):
    """Return the cosine similarity between two vectors.

    SciPy exposes cosine *distance*; the similarity is one minus that value.

    Args:
        vec1: First vector (1-D array-like of numbers).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        ``1 - spatial.distance.cosine(vec1, vec2)``: 1.0 for vectors pointing
        the same way, 0.0 for orthogonal vectors, -1.0 for opposite vectors.
    """
    return 1 - spatial.distance.cosine(vec1, vec2)

In [7]:
# Vector arithmetic on word vectors, following the pattern
#     new_vector = word1 - word2 + word3

# look up the three word vectors in the pipeline's vocab
dog, cat, kitty = (
    nlp.vocab[name].vector for name in ('dog', 'cat', 'kitty')
)

# dog - cat + kitty
new_vector = dog - cat + kitty

In [8]:
# Score vocab entries against new_vector, keeping only those that
# have a vector and are lowercase and alphabetic
computed_similarities = []

for word in nlp.vocab:
    # same three filters as before, checked in the same order
    if not (word.has_vector and word.is_lower and word.is_alpha):
        continue
    similarity = cosine_similarity(new_vector, word.vector)
    computed_similarities.append((word, similarity))

# most similar first
computed_similarities.sort(key=lambda item: item[1], reverse=True)

# print top 10
print([t[0].text for t in computed_similarities[:10]])

['dog', 'kitty', 'puppy', 'doggie', 'dogs', 'puppies', 'pet', 'pup', 'doggy', 'bunny']


In [10]:
def vector_math(a, b, c, top_n=10):
    """Return the vocab words closest to ``vector(a) - vector(b) + vector(c)``.

    The three words are looked up in the notebook-level ``nlp.vocab`` and
    their vectors combined. Every vocab entry that has a vector and is
    lowercase and alphabetic is then scored against the combined vector
    with the notebook-level ``cosine_similarity`` helper.

    Args:
        a: Word whose vector is the starting point.
        b: Word whose vector is subtracted.
        c: Word whose vector is added.
        top_n: Maximum number of words to return. Defaults to 10, the cutoff
            that used to be hard-coded. Values below zero are treated as 0.

    Returns:
        A list of at most ``top_n`` word strings, most similar first.
    """
    # make them vectors
    new_vector = nlp.vocab[a].vector - nlp.vocab[b].vector + nlp.vocab[c].vector

    # score every vocab entry that passes all three filters
    computed_similarities = [
        (word, cosine_similarity(new_vector, word.vector))
        for word in nlp.vocab
        if word.has_vector and word.is_lower and word.is_alpha
    ]

    # most similar first; a reversed sort is still stable, so ties keep
    # the order in which the vocab yielded them
    computed_similarities.sort(key=lambda item: item[1], reverse=True)

    # clamp so a negative top_n cannot turn into an "all but the last" slice
    return [word.text for word, _ in computed_similarities[:max(top_n, 0)]]

In [12]:
# words closest to wolf - cat + kitty
vector_math('wolf', 'cat', 'kitty')

['wolf',
 'wolves',
 'wolfs',
 'kitty',
 'werewolf',
 'bear',
 'panther',
 'fox',
 'grizzly',
 'coyote']

In [13]:
# Import SentimentIntensityAnalyzer and create an sid object
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# One shared analyzer instance; the polarity_scores calls further down
# all go through it.
# NOTE(review): presumably needs the VADER lexicon available locally
# (nltk.download('vader_lexicon')) - confirm for a fresh environment.
sid = SentimentIntensityAnalyzer()



In [23]:
# Sample review written as one continuous string (multiple sentences are ok);
# the cells that follow score and label it
review = 'This movie portrayed real people, and was based on actual events. It was heartbreaking.'

In [24]:
# raw scores for the sample review: a dict with 'neg', 'neu', 'pos' and 'compound'
sid.polarity_scores(review)

{'neg': 0.188, 'neu': 0.812, 'pos': 0.0, 'compound': -0.4588}

In [21]:
# write a function to take in a review and return a sentiment label

def review_rating(string, neutral_threshold=0.05):
    """Classify a review as 'Positive', 'Neutral' or 'Negative'.

    The text is scored with the notebook-level ``sid`` analyzer and the
    label is chosen from the 'compound' score alone.

    Neutral is a band around zero rather than the single value 0, because a
    compound score is almost never exactly 0. Scores strictly inside
    ``(-neutral_threshold, +neutral_threshold)`` count as neutral.

    Args:
        string: The review text to score.
        neutral_threshold: Non-negative half-width of the neutral band.
            The default of 0.05 is the cutoff conventionally used with
            VADER compound scores. Pass 0 to get the old behaviour, where
            only a compound score of exactly 0 is neutral.

    Returns:
        'Positive' if compound >= neutral_threshold (and above 0),
        'Negative' if compound <= -neutral_threshold (and below 0),
        otherwise 'Neutral'.
    """
    compound = sid.polarity_scores(string)['compound']

    # the extra sign checks keep an exact 0 neutral even when the threshold is 0
    if compound > 0 and compound >= neutral_threshold:
        return 'Positive'
    if compound < 0 and compound <= -neutral_threshold:
        return 'Negative'
    return 'Neutral'
    

In [25]:
# label the sample review from its compound score
review_rating(review)

'Negative'