# Word Vectors

In [1]:
# Import spaCy and load the language library
import spacy
nlp = spacy.load('en_core_web_md')

In [4]:
nlp(u'dog').vector

array([-4.0176e-01,  3.7057e-01,  2.1281e-02, -3.4125e-01,  4.9538e-02,
        2.9440e-01, -1.7376e-01, -2.7982e-01,  6.7622e-02,  2.1693e+00,
       -6.2691e-01,  2.9106e-01, -6.7270e-01,  2.3319e-01, -3.4264e-01,
        1.8311e-01,  5.0226e-01,  1.0689e+00,  1.4698e-01, -4.5230e-01,
       -4.1827e-01, -1.5967e-01,  2.6748e-01, -4.8867e-01,  3.6462e-01,
       -4.3403e-02, -2.4474e-01, -4.1752e-01,  8.9088e-02, -2.5552e-01,
       -5.5695e-01,  1.2243e-01, -8.3526e-02,  5.5095e-01,  3.6410e-01,
        1.5361e-01,  5.5738e-01, -9.0702e-01, -4.9098e-02,  3.8580e-01,
        3.8000e-01,  1.4425e-01, -2.7221e-01, -3.7016e-01, -1.2904e-01,
       -1.5085e-01, -3.8076e-01,  4.9583e-02,  1.2755e-01, -8.2788e-02,
        1.4339e-01,  3.2537e-01,  2.7226e-01,  4.3632e-01, -3.1769e-01,
        7.9405e-01,  2.6529e-01,  1.0135e-01, -3.3279e-01,  4.3117e-01,
        1.6687e-01,  1.0729e-01,  8.9418e-02,  2.8635e-01,  4.0117e-01,
       -3.9222e-01,  4.5217e-01,  1.3521e-01, -2.8878e-01, -2.28

In [3]:
doc = nlp(u'The quick brown fox jumped over the lazy dogs.')

doc.vector

array([-1.96635887e-01, -2.32740352e-03, -5.36607020e-02, -6.10564947e-02,
       -4.08843048e-02,  1.45266443e-01, -1.08268000e-01, -6.27789786e-03,
        1.48455709e-01,  1.90697408e+00, -2.57692993e-01, -1.95818534e-03,
       -1.16141019e-02, -1.62858292e-01, -1.62938282e-01,  1.18210977e-02,
        5.12646027e-02,  1.00078702e+00, -2.01447997e-02, -2.54611671e-01,
       -1.28316596e-01, -1.97198763e-02, -2.89733019e-02, -1.94347113e-01,
        1.26644447e-01, -8.69869068e-02, -2.20812604e-01, -1.58452198e-01,
        9.86308008e-02, -1.79210991e-01, -1.55290633e-01,  1.95643142e-01,
        2.66436003e-02, -1.64984968e-02,  1.18824698e-01, -1.17830629e-03,
        4.99809943e-02, -4.23077159e-02, -3.86111848e-02, -7.47400150e-03,
        1.23448208e-01,  9.60620027e-03, -3.32463719e-02, -1.77848607e-01,
        1.19390726e-01,  1.87545009e-02, -1.84173390e-01,  6.91781715e-02,
        1.28520593e-01,  1.48827005e-02, -1.78013414e-01,  1.10003807e-01,
       -3.35464999e-02, -

### Identifying similar vectors

In [5]:
# Create a three-token Doc object:
tokens = nlp(u'lion cat pet')

# Iterate through token combinations:
for token1 in tokens:
    for token2 in tokens:
        print(token1.text, token2.text, token1.similarity(token2))

lion lion 1.0
lion cat 0.52654374
lion pet 0.39923766
cat lion 0.52654374
cat cat 1.0
cat pet 0.7505456
pet lion 0.39923766
pet cat 0.7505456
pet pet 1.0


In [8]:
# Vector norms
# It's sometimes helpful to aggregate 300 dimensions into a Euclidian (L2) norm, computed as the square root of the sum-of-squared-vectors. This is accessible as the .vector_norm token attribute. Other helpful attributes include .has_vector and .is_oov or out of vocabulary.

# For example, our 685k vector library may not have the word "kooble". To test this:

tokens = nlp(u'dog cat kooble')

for token in tokens:
    print(token.text, token.has_vector, token.vector_norm, token.is_oov)

# We see that "kooble" does not have a vector, so the vector_norm value is zero, and it identifies as out of vocabulary.

dog True 7.0336733 False
cat True 6.6808186 False
kooble False 0.0 True


### Vector arithmetic

We can calculate new vectors by adding & subtracting related vectors. Like - 

"king" - "man" + "woman" = "queen"

In [9]:
from scipy import spatial

cosine_similarity = lambda x, y: 1 - spatial.distance.cosine(x, y)

king = nlp.vocab['king'].vector
man = nlp.vocab['man'].vector
woman = nlp.vocab['woman'].vector

# Now we find the closest vector in the vocabulary to the result of "man" - "woman" + "queen"
new_vector = king - man + woman
computed_similarities = []

for word in nlp.vocab:
    # Ignore words without vectors and mixed-case words:
    if word.has_vector:
        if word.is_lower:
            if word.is_alpha:
                similarity = cosine_similarity(new_vector, word.vector)
                computed_similarities.append((word, similarity))

computed_similarities = sorted(computed_similarities, key=lambda item: -item[1])

print([w[0].text for w in computed_similarities[:10]])

['king', 'woman', 'she', 'lion', 'who', 'fox', 'brown', 'when', 'dare', 'cat']


# Sentiment Analysis

### Broad Steps:
-First, consider the text being analyzed. A model trained on paragraph-long movie reviews might not be effective on tweets. Make sure to use an appropriate model for the task at hand.

-Next, decide the type of analysis to perform. In the previous section on text classification we used a bag-of-words technique that considered only single tokens, or unigrams. Some rudimentary sentiment analysis models go one step further, and consider two-word combinations, or bigrams. In this section, we'd like to work with complete sentences, and for this we're going to import a trained NLTK lexicon called VADER.

### NLTK's VADER module

VADER is an NLTK module that provides sentiment scores based on words used ("completely" boosts a score, while "slightly" reduces it), on capitalization & punctuation ("GREAT!!!" is stronger than "great."), and negations (words like "isn't" and "doesn't" affect the outcome).

VADER's SentimentIntensityAnalyzer() takes in a string and returns a dictionary of scores in each of four categories:

* negative
* neutral
* positive
* compound (computed by normalizing the scores above)

In [10]:
import nltk
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [11]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()

In [14]:
a = 'This was a fine movie.'
sid.polarity_scores(a)

{'neg': 0.0, 'neu': 0.625, 'pos': 0.375, 'compound': 0.2023}

In [13]:
a = 'This was the best, most awesome movie EVERRR!!!'
sid.polarity_scores(a)

{'neg': 0.0, 'neu': 0.388, 'pos': 0.612, 'compound': 0.8877}

In [18]:
a = 'This was the worst movie of all times.'
sid.polarity_scores(a)

{'neg': 0.369, 'neu': 0.631, 'pos': 0.0, 'compound': -0.6249}

### Using VADER to analyze Amazon Reviews

For this we are going to apply SentimentIntensityAnalyzer to a dataset of 10,000 Amazon reviews. The documents are labeled as either "pos" or "neg". At the end we'll determine the accuracy of our sentiment analysis with VADER.

In [19]:
import numpy as np
import pandas as pd

df = pd.read_csv('amazonreviews.tsv', sep='\t')
df.head()

Unnamed: 0,label,review
0,pos,Stuning even for the non-gamer: This sound tra...
1,pos,The best soundtrack ever to anything.: I'm rea...
2,pos,Amazing!: This soundtrack is my favorite music...
3,pos,Excellent Soundtrack: I truly like this soundt...
4,pos,"Remember, Pull Your Jaw Off The Floor After He..."


In [20]:
df['label'].value_counts()

neg    5097
pos    4903
Name: label, dtype: int64

### Clean the data:

Let's check to see if any empty records exist in amazonreviews.tsv.

In [21]:
# REMOVE NaN VALUES AND EMPTY STRINGS:
df.dropna(inplace=True)

blanks = []  # start with an empty list

for i,lb,rv in df.itertuples():  # iterate over the DataFrame
    if type(rv)==str:            # avoid NaN values
        if rv.isspace():         # test 'review' for whitespace
            blanks.append(i)     # add matching index numbers to the list

df.drop(blanks, inplace=True)

In [22]:
df['label'].value_counts()

neg    5097
pos    4903
Name: label, dtype: int64

Let's run the first review through VADER

In [23]:
sid.polarity_scores(df.loc[0]['review'])

{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'compound': 0.9454}

In [24]:
df.loc[0]['label']

'pos'

Great! Our first review was labeled "positive", and earned a positive compound score.

Adding Scores and Labels to the DataFrame
In this next section we'll add columns to the original DataFrame to store polarity_score dictionaries, extracted compound scores, and new "pos/neg" labels derived from the compound score. We'll use this last column to perform an accuracy test.

In [25]:
df['scores'] = df['review'].apply(lambda review: sid.polarity_scores(review))
df['compound']  = df['scores'].apply(lambda score_dict: score_dict['compound'])
df['comp_score'] = df['compound'].apply(lambda c: 'pos' if c >=0 else 'neg')
df.head()

Unnamed: 0,label,review,scores,compound,comp_score
0,pos,Stuning even for the non-gamer: This sound tra...,"{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'co...",0.9454,pos
1,pos,The best soundtrack ever to anything.: I'm rea...,"{'neg': 0.018, 'neu': 0.837, 'pos': 0.145, 'co...",0.8957,pos
2,pos,Amazing!: This soundtrack is my favorite music...,"{'neg': 0.04, 'neu': 0.692, 'pos': 0.268, 'com...",0.9858,pos
3,pos,Excellent Soundtrack: I truly like this soundt...,"{'neg': 0.09, 'neu': 0.615, 'pos': 0.295, 'com...",0.9814,pos
4,pos,"Remember, Pull Your Jaw Off The Floor After He...","{'neg': 0.0, 'neu': 0.746, 'pos': 0.254, 'comp...",0.9781,pos


### Report on Accuracy
Finally, we'll use scikit-learn to determine how close VADER came to our original 10,000 labels.

In [26]:
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

In [27]:
accuracy_score(df['label'],df['comp_score'])

0.7091

In [28]:
print(classification_report(df['label'],df['comp_score']))

              precision    recall  f1-score   support

         neg       0.86      0.51      0.64      5097
         pos       0.64      0.91      0.75      4903

    accuracy                           0.71     10000
   macro avg       0.75      0.71      0.70     10000
weighted avg       0.75      0.71      0.70     10000



In [29]:
print(confusion_matrix(df['label'],df['comp_score']))

[[2622 2475]
 [ 434 4469]]


This tells us that VADER correctly identified an Amazon review as "positive" or "negative" roughly *71%* of the time.