# Semantics and Word Vectors

In [2]:
import spacy

In [3]:
nlp=spacy.load('en_core_web_md')

In [4]:
doc=nlp(u"lion")

300 dimensions

In [6]:
doc.vector.shape

(300,)

In [7]:
doc.vocab.vectors.shape

(20000, 300)

The vector for a Doc is the average of its token vectors.

In [8]:
doc=nlp(u"The quick brown fox jumps over the lazy dog.")
doc.vector.shape

(300,)

Similarity between each pair of tokens (based on the meaning captured by their word vectors)

In [9]:
doc=nlp(u"dog cat monkey")

In [11]:
# Print the similarity score for every ordered pair of tokens in `doc`
# (each token is also compared with itself).
for token1, token2 in ((left, right) for left in doc for right in doc):
    print(token1.text, token2.text, token1.similarity(token2))

dog dog 1.0
dog cat 0.8220816850662231
dog monkey 0.4771559536457062
cat dog 0.8220816850662231
cat cat 1.0
cat monkey 0.5929930210113525
monkey dog 0.4771559536457062
monkey cat 0.5929930210113525
monkey monkey 1.0


In [12]:
nlp(u"lion").similarity(nlp(u"dandelion"))

0.14681951826673326

In [13]:
doc=nlp(u"like love dislike hate")

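Similarity here means the words appear in similar contexts, not that they mean the same thing, so opposites such as "love" and "hate" can still score as fairly similar.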

In [14]:
# Print the similarity score for every ordered pair of tokens in `doc`
# (each token is also compared with itself).
for token1, token2 in ((left, right) for left in doc for right in doc):
    print(token1.text, token2.text, token1.similarity(token2))

like like 1.0
like love 0.5212638974189758
like dislike 0.5710194706916809
like hate 0.5065140724182129
love like 0.5212638974189758
love love 1.0
love dislike 0.5367781519889832
love hate 0.5708349943161011
dislike like 0.5710194706916809
dislike love 0.5367781519889832
dislike dislike 1.0
dislike hate 0.6406129002571106
hate like 0.5065140724182129
hate love 0.5708349943161011
hate dislike 0.6406129002571106
hate hate 1.0


In [17]:
# For each token, print its text, whether it has a word vector, the norm of
# that vector, and whether the token is out of vocabulary.
tokens=nlp(u"dog cat nowayinvocab")
for token in tokens:
    print(token.text,token.has_vector,token.vector_norm,token.is_oov)  # vector_norm: L2 norm of the token's vector; is_oov: out of vocabulary

dog True 75.254234 False
cat True 63.188496 False
nowayinvocab False 0.0 True


### vector arithmetic

In [18]:
# Look up the vocabulary vector for each of the four words in one pass.
king, man, woman, queen = (
    nlp.vocab[word].vector for word in ('king', 'man', 'woman', 'queen')
)

# Vector arithmetic: start from "king", remove "man", add "woman".
new_vector = king - man + woman

In [20]:
from scipy import spatial


def cosine_similarity(x, y):
    """Return the cosine similarity between two 1-D vectors.

    ``spatial.distance.cosine`` returns the cosine *distance*, so it is
    subtracted from 1 to turn it back into a similarity.
    """
    return 1 - spatial.distance.cosine(x, y)


# Backward-compatible alias: the following cells call the original,
# misspelled name.
consine_similarity = cosine_similarity

In [21]:
consine_similarity(new_vector,queen)

0.6178014278411865

In [23]:
consine_similarity(new_vector,king)

0.8489541411399841

In [24]:
consine_similarity(new_vector,man)

0.07003621011972427

In [25]:
consine_similarity(new_vector,woman)

0.30994713306427

# sentiment analysis with VADER

In [None]:
# VADER is a rule based system

In [26]:
import nltk

In [27]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/yangminyue/nltk_data...


True

In [28]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid=SentimentIntensityAnalyzer()

In [29]:
a='This was a great movie.'

sid.polarity_scores(a)

{'neg': 0.0, 'neu': 0.423, 'pos': 0.577, 'compound': 0.6249}

In [30]:
a='This was the greatest movie EVER MORE!!!'

sid.polarity_scores(a)

{'neg': 0.0, 'neu': 0.542, 'pos': 0.458, 'compound': 0.7249}

In [31]:
a='This was a horrible movie.Worst film ever made!'

sid.polarity_scores(a)

{'neg': 0.387, 'neu': 0.613, 'pos': 0.0, 'compound': -0.5848}

## Using Vader for amazon movie Review

In [32]:
import numpy as np
import pandas as pd

df=pd.read_csv('amazonreviews.tsv',sep='\t')

In [33]:
df.head()

Unnamed: 0,label,review
0,pos,Stuning even for the non-gamer: This sound tra...
1,pos,The best soundtrack ever to anything.: I'm rea...
2,pos,Amazing!: This soundtrack is my favorite music...
3,pos,Excellent Soundtrack: I truly like this soundt...
4,pos,"Remember, Pull Your Jaw Off The Floor After He..."


In [34]:
df['label'].value_counts()

neg    5097
pos    4903
Name: label, dtype: int64

In [35]:
df.isnull().sum()

label     0
review    0
dtype: int64

In [37]:
# Score the review stored in the row labelled 0 (single .loc lookup by row and column).
sid.polarity_scores(df.loc[0, 'review'])

{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'compound': 0.9454}

In [38]:
# Run the analyzer on every review and keep the full score dict per row.
df['scores'] = df['review'].apply(sid.polarity_scores)

In [39]:
df.head()

Unnamed: 0,label,review,scores
0,pos,Stuning even for the non-gamer: This sound tra...,"{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'co..."
1,pos,The best soundtrack ever to anything.: I'm rea...,"{'neg': 0.018, 'neu': 0.837, 'pos': 0.145, 'co..."
2,pos,Amazing!: This soundtrack is my favorite music...,"{'neg': 0.04, 'neu': 0.692, 'pos': 0.268, 'com..."
3,pos,Excellent Soundtrack: I truly like this soundt...,"{'neg': 0.09, 'neu': 0.615, 'pos': 0.295, 'com..."
4,pos,"Remember, Pull Your Jaw Off The Floor After He...","{'neg': 0.0, 'neu': 0.746, 'pos': 0.254, 'comp..."


In [40]:
# Pull the 'compound' entry out of each row's score dict.
df['compound'] = [score_dict['compound'] for score_dict in df['scores']]

In [41]:
df.head()

Unnamed: 0,label,review,scores,compound
0,pos,Stuning even for the non-gamer: This sound tra...,"{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'co...",0.9454
1,pos,The best soundtrack ever to anything.: I'm rea...,"{'neg': 0.018, 'neu': 0.837, 'pos': 0.145, 'co...",0.8957
2,pos,Amazing!: This soundtrack is my favorite music...,"{'neg': 0.04, 'neu': 0.692, 'pos': 0.268, 'com...",0.9858
3,pos,Excellent Soundtrack: I truly like this soundt...,"{'neg': 0.09, 'neu': 0.615, 'pos': 0.295, 'com...",0.9814
4,pos,"Remember, Pull Your Jaw Off The Floor After He...","{'neg': 0.0, 'neu': 0.746, 'pos': 0.254, 'comp...",0.9781


In [43]:
# Label a review 'pos' when its compound score is strictly positive, else 'neg'.
df['comp_score'] = df['compound'].map(
    lambda compound: 'pos' if compound > 0 else 'neg'
)

In [44]:
df.head()

Unnamed: 0,label,review,scores,compound,comp_score
0,pos,Stuning even for the non-gamer: This sound tra...,"{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'co...",0.9454,pos
1,pos,The best soundtrack ever to anything.: I'm rea...,"{'neg': 0.018, 'neu': 0.837, 'pos': 0.145, 'co...",0.8957,pos
2,pos,Amazing!: This soundtrack is my favorite music...,"{'neg': 0.04, 'neu': 0.692, 'pos': 0.268, 'com...",0.9858,pos
3,pos,Excellent Soundtrack: I truly like this soundt...,"{'neg': 0.09, 'neu': 0.615, 'pos': 0.295, 'com...",0.9814,pos
4,pos,"Remember, Pull Your Jaw Off The Floor After He...","{'neg': 0.0, 'neu': 0.746, 'pos': 0.254, 'comp...",0.9781,pos


check how accurate VADER is

In [45]:
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

In [46]:
accuracy_score(df['label'],df['comp_score'])

0.713

In [48]:
print(classification_report(df['label'],df['comp_score']))

              precision    recall  f1-score   support

         neg       0.85      0.53      0.65      5097
         pos       0.65      0.90      0.75      4903

    accuracy                           0.71     10000
   macro avg       0.75      0.72      0.70     10000
weighted avg       0.75      0.71      0.70     10000



In [49]:
print(confusion_matrix(df['label'],df['comp_score']))

[[2716 2381]
 [ 489 4414]]


# Sentiment Analysis Project with VADER

In [50]:
import numpy as np
import pandas as pd

df=pd.read_csv('moviereviews.tsv',sep='\t')

In [51]:
df.head()

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...


In [53]:
df.isnull().sum()

label      0
review    35
dtype: int64

In [54]:
df['label'].value_counts()

neg    1000
pos    1000
Name: label, dtype: int64

In [55]:
df.dropna(inplace=True)

In [56]:
df.isnull().sum()

label     0
review    0
dtype: int64

In [57]:
df['label'].value_counts()

neg    983
pos    982
Name: label, dtype: int64

In [58]:
# Collect the index labels of reviews that are whitespace-only strings, then
# drop those rows. Only the 'review' column is iterated, so the cell does not
# depend on how many columns df has; unpacking df.itertuples() into exactly
# three names fails once extra columns have been added.
blanks = [
    idx
    for idx, review in df['review'].items()
    if isinstance(review, str) and review.isspace()
]

df.drop(blanks, inplace=True)

In [59]:
df['label'].value_counts()

neg    969
pos    969
Name: label, dtype: int64

In [60]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid=SentimentIntensityAnalyzer()

In [61]:
# Score every review, pull out the compound value, and turn it into a
# 'pos'/'neg' label ('pos' only when the compound score is strictly positive).
df['scores'] = df['review'].apply(sid.polarity_scores)
df['compound'] = [score_dict['compound'] for score_dict in df['scores']]
df['comp_score'] = df['compound'].map(
    lambda compound: 'pos' if compound > 0 else 'neg'
)
df.head()

Unnamed: 0,label,review,scores,compound,comp_score
0,neg,how do films like mouse hunt get into theatres...,"{'neg': 0.121, 'neu': 0.778, 'pos': 0.101, 'co...",-0.9125,neg
1,neg,some talented actresses are blessed with a dem...,"{'neg': 0.12, 'neu': 0.775, 'pos': 0.105, 'com...",-0.8618,neg
2,pos,this has been an extraordinary year for austra...,"{'neg': 0.068, 'neu': 0.781, 'pos': 0.15, 'com...",0.9951,pos
3,pos,according to hollywood movies made in last few...,"{'neg': 0.071, 'neu': 0.782, 'pos': 0.147, 'co...",0.9972,pos
4,neg,my first press screening of 1998 and already i...,"{'neg': 0.091, 'neu': 0.817, 'pos': 0.093, 'co...",-0.2484,neg


In [62]:
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

In [63]:
accuracy_score(df['label'],df['comp_score'])

0.6357069143446853

In [64]:
print(classification_report(df['label'],df['comp_score']))

              precision    recall  f1-score   support

         neg       0.72      0.44      0.55       969
         pos       0.60      0.83      0.70       969

    accuracy                           0.64      1938
   macro avg       0.66      0.64      0.62      1938
weighted avg       0.66      0.64      0.62      1938



In [65]:
print(confusion_matrix(df['label'],df['comp_score']))

[[427 542]
 [164 805]]
