### Setup

In [2]:
import numpy as np
import pandas as pd
import spacy

In [3]:
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.2.0/en_core_web_lg-3.2.0-py3-none-any.whl (777.4 MB)
[K     |████████████████████████████████| 777.4 MB 17 kB/s  eta 0:00:010     |█████████████████               | 414.6 MB 6.8 MB/s eta 0:00:54     |███████████████████████▉        | 579.1 MB 4.5 MB/s eta 0:00:45     |██████████████████████████▋     | 646.2 MB 7.7 MB/s eta 0:00:18
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.2.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


# SPACY

In [4]:
nlp = spacy.load('en_core_web_lg')

In [5]:
nlp(u'lion').vector.shape

(300,)

In [6]:
nlp(u'The quick brown fox jumped').vector.shape

(300,)

In [7]:
def print_sims(word1, word2):
  sim = f'{word1.similarity(word2):.2f}'
  print(f'[{sim}] {word1.text} ➡ {word2.text}')

def print_all_sims(tokens):
  for token1 in tokens:
    for token2 in tokens:
      print_sims(token1, token2)

In [8]:
# tokens = nlp(u'lion cat pet')

print_all_sims(nlp(u'lion cat pet'))
print_all_sims(nlp(u'like love hate'))

[1.00] lion ➡ lion
[0.53] lion ➡ cat
[0.40] lion ➡ pet
[0.53] cat ➡ lion
[1.00] cat ➡ cat
[0.75] cat ➡ pet
[0.40] pet ➡ lion
[0.75] pet ➡ cat
[1.00] pet ➡ pet
[1.00] like ➡ like
[0.66] like ➡ love
[0.66] like ➡ hate
[0.66] love ➡ like
[1.00] love ➡ love
[0.64] love ➡ hate
[0.66] hate ➡ like
[0.64] hate ➡ love
[1.00] hate ➡ hate


In [9]:
nlp.vocab.vectors.shape

(684830, 300)

In [10]:
tokens = nlp(u'dog cat nargle')
# print_all_sims(tokens)
for token in tokens:
  print(token.text, token.has_vector, token.vector_norm, token.is_oov)

dog True 7.0336733 False
cat True 6.6808186 False
nargle False 0.0 True


In [11]:
from scipy import spatial

cosine_similarity = lambda vec1,vec2: 1 - spatial.distance.cosine(vec1, vec2)

In [12]:
king = nlp.vocab['king'].vector
man = nlp.vocab['man'].vector
woman = nlp.vocab['woman'].vector

In [13]:
new_vector = king - man + woman
computed_similarities = []

for word in nlp.vocab:
  if word.has_vector:
    if word.is_lower:
      if word.is_alpha:
        similarity = cosine_similarity(new_vector, word.vector)
        computed_similarities.append((word, similarity))

In [14]:
computed_similarities = sorted(computed_similarities, key=lambda item:-item[1])

print([t[0].text for t in computed_similarities[:10]])

['king', 'woman', 'she', 'lion', 'who', 'fox', 'brown', 'when', 'dare', 'cat']


# VADER - Sentiment Analysis

In [15]:
import nltk
nltk.download("vader_lexicon")

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/bladnman/nltk_data...


True

In [16]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

In [17]:
a = "This is a good movie"

sid.polarity_scores(a)

{'neg': 0.0, 'neu': 0.508, 'pos': 0.492, 'compound': 0.4404}

In [18]:
sid.polarity_scores("This was the best, most awesome movie EVER MADE!!!")

{'neg': 0.0, 'neu': 0.425, 'pos': 0.575, 'compound': 0.8877}

In [19]:
sid.polarity_scores("This was the worst movie that has ever disgraced the screen.")

{'neg': 0.441, 'neu': 0.559, 'pos': 0.0, 'compound': -0.7964}

In [20]:
df = pd.read_csv(get_data_url("amazonreviews.tsv"), sep="\t")

NameError: name 'get_data_url' is not defined

In [None]:
df.head()

Unnamed: 0,label,review
0,pos,Stuning even for the non-gamer: This sound tra...
1,pos,The best soundtrack ever to anything.: I'm rea...
2,pos,Amazing!: This soundtrack is my favorite music...
3,pos,Excellent Soundtrack: I truly like this soundt...
4,pos,"Remember, Pull Your Jaw Off The Floor After He..."


In [None]:
df.label.value_counts()

Out[23]: neg    5097
pos    4903
Name: label, dtype: int64

In [None]:
df.dropna(inplace=True)

In [None]:
blanks = []
for i,lb,rv in df.itertuples():
  if type(rv) == str:
    if rv.isspace():
      blanks.append(i)
blanks
# if there were blanks we would
# df.drop(blanks, inplace=True)

Out[25]: []

In [None]:
df.iloc[0]['review']

Out[26]: 'Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^'

In [None]:
sid.polarity_scores(df.iloc[0]['review'])

Out[27]: {'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'compound': 0.9454}

In [None]:
df['scores'] = df['review'].apply(lambda review: sid.polarity_scores(review))
df.head()

Unnamed: 0,label,review,scores
0,pos,Stuning even for the non-gamer: This sound tra...,"{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'co..."
1,pos,The best soundtrack ever to anything.: I'm rea...,"{'neg': 0.018, 'neu': 0.837, 'pos': 0.145, 'co..."
2,pos,Amazing!: This soundtrack is my favorite music...,"{'neg': 0.04, 'neu': 0.692, 'pos': 0.268, 'com..."
3,pos,Excellent Soundtrack: I truly like this soundt...,"{'neg': 0.09, 'neu': 0.615, 'pos': 0.295, 'com..."
4,pos,"Remember, Pull Your Jaw Off The Floor After He...","{'neg': 0.0, 'neu': 0.746, 'pos': 0.254, 'comp..."


In [None]:
df['scores'] = df['scores'].apply(lambda d: d['compound'])
df.head()

Unnamed: 0,label,review,scores
0,pos,Stuning even for the non-gamer: This sound tra...,0.9454
1,pos,The best soundtrack ever to anything.: I'm rea...,0.8957
2,pos,Amazing!: This soundtrack is my favorite music...,0.9858
3,pos,Excellent Soundtrack: I truly like this soundt...,0.9814
4,pos,"Remember, Pull Your Jaw Off The Floor After He...",0.9781


In [None]:
df['comp_score'] = df['scores'].apply(lambda v: 'pos' if v >= 0 else 'neg')
df.head()

Unnamed: 0,label,review,scores,comp_score
0,pos,Stuning even for the non-gamer: This sound tra...,0.9454,pos
1,pos,The best soundtrack ever to anything.: I'm rea...,0.8957,pos
2,pos,Amazing!: This soundtrack is my favorite music...,0.9858,pos
3,pos,Excellent Soundtrack: I truly like this soundt...,0.9814,pos
4,pos,"Remember, Pull Your Jaw Off The Floor After He...",0.9781,pos


In [None]:
df[df['label'] != df['comp_score']]

Unnamed: 0,label,review,scores,comp_score
13,neg,Oh please: I guess you have to be a romance no...,0.9097,pos
19,neg,sizes recomended in the size chart are not rea...,0.4926,pos
20,neg,mens ultrasheer: This model may be ok for sede...,0.6318,pos
22,neg,Another Abysmal Digital Copy: Rather than scra...,0.9667,pos
28,neg,Oh dear: I was excited to find a book ostensib...,0.6486,pos
...,...,...,...,...
9982,neg,great movie massacred by tape quality: One of ...,0.8591,pos
9988,pos,Frightening movie with superb acting by Sir Ho...,-0.3506,neg
9992,neg,The Silence of the Dummies: This is overall a ...,0.8890,pos
9994,neg,"Sorry Jim: As a former realtor, Mr. Cole owes ...",0.9066,pos


In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
accuracy_score(df['label'], df['comp_score'])

Out[33]: 0.7097

In [None]:
print(classification_report(df['label'], df['comp_score']))

              precision    recall  f1-score   support

         neg       0.86      0.52      0.64      5097
         pos       0.64      0.91      0.75      4903

    accuracy                           0.71     10000
   macro avg       0.75      0.71      0.70     10000
weighted avg       0.75      0.71      0.70     10000



In [None]:
confusion_matrix(df['label'], df['comp_score'])

Out[35]: array([[2629, 2468],
       [ 435, 4468]])

# Movie Reivews Sentiment

In [None]:
df = pd.read_csv(get_data_url("moviereviews.tsv"), sep="\t")

In [None]:
df.head()

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...


In [None]:
df.dropna(inplace=True)

In [None]:
blanks = []
for i,lb,rv in df.itertuples():
  if type(rv) == str:
    if rv.isspace():
      blanks.append(i)
blanks

Out[39]: [57,
 71,
 147,
 151,
 283,
 307,
 313,
 323,
 343,
 351,
 427,
 501,
 633,
 675,
 815,
 851,
 977,
 1079,
 1299,
 1455,
 1493,
 1525,
 1531,
 1763,
 1851,
 1905,
 1993]

In [None]:
df.drop(blanks, inplace=True)
df['label'].value_counts()

Out[40]: neg    969
pos    969
Name: label, dtype: int64

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

In [None]:
df['scores'] = df['review'].apply(lambda review:sid.polarity_scores(review))

In [None]:
df['compound'] = df['scores'].apply(lambda d:d['compound'])
df

Unnamed: 0,label,review,scores,compound
0,neg,how do films like mouse hunt get into theatres...,"{'neg': 0.121, 'neu': 0.778, 'pos': 0.101, 'co...",-0.9125
1,neg,some talented actresses are blessed with a dem...,"{'neg': 0.12, 'neu': 0.775, 'pos': 0.105, 'com...",-0.8618
2,pos,this has been an extraordinary year for austra...,"{'neg': 0.068, 'neu': 0.781, 'pos': 0.15, 'com...",0.9951
3,pos,according to hollywood movies made in last few...,"{'neg': 0.071, 'neu': 0.782, 'pos': 0.147, 'co...",0.9972
4,neg,my first press screening of 1998 and already i...,"{'neg': 0.091, 'neu': 0.817, 'pos': 0.093, 'co...",-0.2484
...,...,...,...,...
1995,pos,"i like movies with albert brooks , and i reall...","{'neg': 0.073, 'neu': 0.763, 'pos': 0.164, 'co...",0.9991
1996,pos,it might surprise some to know that joel and e...,"{'neg': 0.238, 'neu': 0.688, 'pos': 0.074, 'co...",-0.9993
1997,pos,the verdict : spine-chilling drama from horror...,"{'neg': 0.15, 'neu': 0.702, 'pos': 0.147, 'com...",-0.5966
1998,pos,i want to correct what i wrote in a former ret...,"{'neg': 0.131, 'neu': 0.71, 'pos': 0.16, 'comp...",0.9387


In [None]:
df['comp_score'] = df['compound'].apply(lambda score:'pos' if score >= 0 else 'neg')

In [None]:
df

Unnamed: 0,label,review,scores,compound,comp_score
0,neg,how do films like mouse hunt get into theatres...,"{'neg': 0.121, 'neu': 0.778, 'pos': 0.101, 'co...",-0.9125,neg
1,neg,some talented actresses are blessed with a dem...,"{'neg': 0.12, 'neu': 0.775, 'pos': 0.105, 'com...",-0.8618,neg
2,pos,this has been an extraordinary year for austra...,"{'neg': 0.068, 'neu': 0.781, 'pos': 0.15, 'com...",0.9951,pos
3,pos,according to hollywood movies made in last few...,"{'neg': 0.071, 'neu': 0.782, 'pos': 0.147, 'co...",0.9972,pos
4,neg,my first press screening of 1998 and already i...,"{'neg': 0.091, 'neu': 0.817, 'pos': 0.093, 'co...",-0.2484,neg
...,...,...,...,...,...
1995,pos,"i like movies with albert brooks , and i reall...","{'neg': 0.073, 'neu': 0.763, 'pos': 0.164, 'co...",0.9991,pos
1996,pos,it might surprise some to know that joel and e...,"{'neg': 0.238, 'neu': 0.688, 'pos': 0.074, 'co...",-0.9993,neg
1997,pos,the verdict : spine-chilling drama from horror...,"{'neg': 0.15, 'neu': 0.702, 'pos': 0.147, 'com...",-0.5966,neg
1998,pos,i want to correct what i wrote in a former ret...,"{'neg': 0.131, 'neu': 0.71, 'pos': 0.16, 'comp...",0.9387,pos


In [None]:
print(accuracy_score(df['label'], df['comp_score']))
print(classification_report(df['label'], df['comp_score']))
print(confusion_matrix(df['label'], df['comp_score']))

0.6357069143446853
              precision    recall  f1-score   support

         neg       0.72      0.44      0.55       969
         pos       0.60      0.83      0.70       969

    accuracy                           0.64      1938
   macro avg       0.66      0.64      0.62      1938
weighted avg       0.66      0.64      0.62      1938

[[427 542]
 [164 805]]


# Assessment
[Courseware Notebook](https://github.com/bladnman/ml_courses/blob/main/natural_language_processing/course%20materials/04-Semantics-and-Sentiment-Analysis/03-Sentiment-Analysis-Assessment.ipynb)

## Task #1: Perform vector arithmetic on your own words
Write code that evaluates vector arithmetic on your own set of related words. The goal is to come as close to an expected word as possible. Please feel free to share success stories in the Q&A Forum for this section!

In [None]:
# Import spaCy and load the language library. Remember to use a larger model!
nlp = spacy.load('en_core_web_lg')

In [None]:
# Choose the words you wish to compare, and obtain their vectors
lion_vec = nlp(u'lion').vector
tiger_vec = nlp(u'tiger').vector
airplane_vec = nlp(u'airplane').vector

In [None]:
# Import spatial and define a cosine_similarity function
from scipy import spatial
cosine_similarity = lambda vec1,vec2: 1 - spatial.distance.cosine(vec1, vec2)

In [None]:
# Write an expression for vector arithmetic
# For example: new_vector = word1 - word2 + word3
new_vec = lion_vec - tiger_vec + airplane_vec

In [None]:
# List the top ten closest vectors in the vocabulary to the result of the expression above
def get_similarities(nlp, word_vec, limit=10):
  computed_similarities = []
  for word in nlp.vocab:
    if word.has_vector:
      if word.is_lower:
        if word.is_alpha:
          similarity = cosine_similarity(word_vec, word.vector)
          computed_similarities.append((word, similarity))
  computed_similarities = sorted(computed_similarities, key=lambda item:-item[1])
  return computed_similarities[:limit]

def vector_minus_plus(a,b,c):
  return a - b + c

def combination_similarities(nlp, word1, word2, word3):
  vec1 = nlp(u'' + word1).vector
  vec2 = nlp(u'' + word2).vector
  vec3 = nlp(u'' + word3).vector
  new_vec = vector_minus_plus(vec1, vec2, vec3)
  return get_similarities(nlp, new_vec)

def print_words(words):
  print([t[0].text for t in words])
  
print_words(get_similarities(nlp, new_vec))


['airplane', 'lion', 'wings', 'space', 'when', 'where', 'it', 'that', 'could', 'was']


**CHALLENGE: Write a function that takes in 3 strings, performs a-b+c arithmetic, and returns a top-ten result**

In [None]:
print_words(combination_similarities(nlp, 'car', 'automobile', 'airplane'))
print_words(combination_similarities(nlp, 'wolf', 'dog', 'cat'))


['airplane', 'car', 'wings', 'got', 'when', 'wheel', 'monkey', 'i', 'was', 'space']
['wolf', 'tiger', 'lion', 'cat', 'monkey', 'wings', 'i', 'cuz', 'dare', 'u']


## Task #2: Perform VADER Sentiment Analysis on your own review
Write code that returns a set of SentimentIntensityAnalyzer polarity scores based on your own written review.

In [None]:
# Import SentimentIntensityAnalyzer and create an sid object
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

In [None]:
# Write a review as one continuous string (multiple sentences are ok)
review = "I am not sure if I liked this product. It got stuck in the beginning and was unclear about how to fix things. Eventually it started working."

In [None]:
# Obtain the sid scores for your review
sid.polarity_scores(review)

Out[76]: {'neg': 0.283, 'neu': 0.717, 'pos': 0.0, 'compound': -0.7426}

**CHALLENGE: Write a function that takes in a review and returns a score of "Positive", "Negative" or "Neutral"**

In [None]:
def review_rating(string):
  polarity = sid.polarity_scores(string)
  if polarity['compound'] < 0: return 'Negative'
  if polarity['compound'] > 0: return 'Positive'
  return 'Neutral'

In [None]:
# Test the function on your review above:
review_rating(review)

Out[83]: 'Negative'