### Word Embeddings

In [1]:
import spacy

In [3]:
# Load the 'en_core_web_lg' spaCy pipeline; every vector and similarity cell
# below goes through this `nlp` object.
nlp = spacy.load('en_core_web_lg')

In [6]:
# Shape of the vector returned for the one-word text 'lion'.
nlp('lion').vector.shape

(300,)

In [9]:
# Print the similarity of every ordered pair of tokens, including each
# token paired with itself.
tokens = nlp('lion cat pet')

for left in tokens:
    for right in tokens:
        print(left.text, right.text, left.similarity(right))

lion lion 1.0
lion cat 0.52654374
lion pet 0.39923766
cat lion 0.52654374
cat cat 1.0
cat pet 0.7505456
pet lion 0.39923766
pet cat 0.7505456
pet pet 1.0


In [10]:
# Same pairwise comparison for three sentiment-related words.
tokens = nlp('like love hate')

for left in tokens:
    for right in tokens:
        print(left.text, right.text, left.similarity(right))

like like 1.0
like love 0.65790397
like hate 0.6574652
love like 0.65790397
love love 1.0
love hate 0.6393099
hate like 0.6574652
hate love 0.6393099
hate hate 1.0


In [15]:
# For each token show: whether it has a vector, the vector's norm, and
# whether it is flagged as out-of-vocabulary.
tokens = nlp('dog cat sekkito')

for tok in tokens:
    print(tok, tok.has_vector, tok.vector_norm, tok.is_oov)

dog True 7.0336733 False
cat True 6.6808186 False
sekkito False 0.0 True


In [17]:
from scipy import spatial

def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two 1-D vectors.

    ``spatial.distance.cosine`` gives the cosine *distance*, so it is
    subtracted from 1 to get the similarity.

    Parameters
    ----------
    vec1, vec2 : array-like
        The two vectors to compare; passed straight to SciPy.

    Returns
    -------
    float
        ``1 - spatial.distance.cosine(vec1, vec2)``.
    """
    return 1 - spatial.distance.cosine(vec1, vec2)

In [18]:
# Word-vector arithmetic: "king" - "man" + "woman". The result is expected
# to be similar to words such as "queen", "princess" or "highness".
king = nlp.vocab['king'].vector
man = nlp.vocab['man'].vector
woman = nlp.vocab['woman'].vector

new_vector = king - man + woman

# Score every vocabulary entry that has a vector and is lowercase and
# alphabetic against the combined vector.
computed_similarities = [
    (word, cosine_similarity(new_vector, word.vector))
    for word in nlp.vocab
    if word.has_vector and word.is_lower and word.is_alpha
]

# Highest similarity first. reverse=True keeps the sort stable, so entries
# with equal scores stay in vocabulary order, as with a negated key.
computed_similarities.sort(key=lambda item: item[1], reverse=True)

# Show the ten closest words.
print([t[0].text for t in computed_similarities[:10]])

In [29]:
# Word-vector arithmetic: "policeman" - "man" + "woman".
# NOTE: `police` holds the vector for 'policeman', not for 'police'.
police = nlp.vocab['policeman'].vector
man = nlp.vocab['man'].vector
woman = nlp.vocab['woman'].vector

new_vector = police - man + woman

# Score every vocabulary entry that has a vector and is lowercase and
# alphabetic against the combined vector.
computed_similarities = [
    (word, cosine_similarity(new_vector, word.vector))
    for word in nlp.vocab
    if word.has_vector and word.is_lower and word.is_alpha
]

# Highest similarity first. reverse=True keeps the sort stable, so entries
# with equal scores stay in vocabulary order, as with a negated key.
computed_similarities.sort(key=lambda item: item[1], reverse=True)

# Show the ten closest words.
print([t[0].text for t in computed_similarities[:10]])

['policeman', 'policewoman', 'woman', 'policemen', 'police', 'schoolteacher', 'assaulted', 'lady', 'officer', 'prostitute']


### VADER for Sentiment Analysis

In [30]:
import nltk

In [31]:
# Fetch the 'vader_lexicon' package into the local nltk_data directory.
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\AH\AppData\Roaming\nltk_data...


True

In [37]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [38]:
# Single analyzer instance reused by every polarity_scores call below.
sid = SentimentIntensityAnalyzer()

In [39]:
# Short sample sentence to score.
text = 'This is a good review'

In [41]:
# polarity_scores returns a dict with 'neg', 'neu', 'pos' and 'compound' entries.
sid.polarity_scores(text)

{'neg': 0.0, 'neu': 0.508, 'pos': 0.492, 'compound': 0.4404}

In [42]:
# Emphatic positive example (capitalised words and repeated exclamation marks).
text2 = "This was the best, most awesome movie EVER MADE!!!"
sid.polarity_scores(text2)

{'neg': 0.0, 'neu': 0.425, 'pos': 0.575, 'compound': 0.8877}

In [43]:
# Emphatic negative example.
text3 = "This was the WORST movie that has ever disgraced the screen"
sid.polarity_scores(text3)

{'neg': 0.465, 'neu': 0.535, 'pos': 0.0, 'compound': -0.8331}

In [44]:
import pandas as pd

In [46]:
# Read the tab-separated Amazon reviews file.
df = pd.read_csv('UPDATED_NLP_COURSE/TextFiles/amazonreviews.tsv', sep='\t')

In [47]:
# Remove rows containing missing values, modifying df in place.
df.dropna(inplace=True)

In [48]:
# Preview the first rows after dropping missing values.
df.head()

Unnamed: 0,label,review
0,pos,Stuning even for the non-gamer: This sound tra...
1,pos,The best soundtrack ever to anything.: I'm rea...
2,pos,Amazing!: This soundtrack is my favorite music...
3,pos,Excellent Soundtrack: I truly like this soundt...
4,pos,"Remember, Pull Your Jaw Off The Floor After He..."


In [51]:
# Collect the index labels of reviews that consist solely of whitespace.
# The 'review' column is read by name instead of unpacking df.itertuples()
# into exactly three values, so this cell still runs after extra columns
# have been added to df (for example when it is re-run later).
blanks = [
    index
    for index, review in df['review'].items()
    if isinstance(review, str) and review.isspace()
]
blanks

[]

In [52]:
# Full VADER score dict for each review.
df['scores'] = df['review'].apply(sid.polarity_scores)

# Pull the 'compound' value out of each score dict.
df['compound'] = df['scores'].apply(lambda score_dict: score_dict['compound'])

# Label a review 'pos' only when its compound score is strictly positive;
# a compound of exactly 0 is labelled 'neg'.
df['compound_score'] = df['compound'].apply(lambda value: 'pos' if value > 0 else 'neg')

In [59]:
# Preview the frame with the 'scores', 'compound' and 'compound_score' columns added.
df.head()

Unnamed: 0,label,review,scores,compound,compound_score
0,pos,Stuning even for the non-gamer: This sound tra...,"{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'co...",0.9454,pos
1,pos,The best soundtrack ever to anything.: I'm rea...,"{'neg': 0.018, 'neu': 0.837, 'pos': 0.145, 'co...",0.8957,pos
2,pos,Amazing!: This soundtrack is my favorite music...,"{'neg': 0.04, 'neu': 0.692, 'pos': 0.268, 'com...",0.9858,pos
3,pos,Excellent Soundtrack: I truly like this soundt...,"{'neg': 0.09, 'neu': 0.615, 'pos': 0.295, 'com...",0.9814,pos
4,pos,"Remember, Pull Your Jaw Off The Floor After He...","{'neg': 0.0, 'neu': 0.746, 'pos': 0.254, 'comp...",0.9781,pos


In [60]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [63]:
# Fraction of reviews whose VADER-derived label equals the true label
# (true labels first, predicted labels second).
accuracy_score(df['label'], df['compound_score'])

0.7122

In [65]:
# Per-class precision, recall and F1 of the VADER labels against the true labels.
print(classification_report(df['label'], df['compound_score']))

              precision    recall  f1-score   support

         neg       0.85      0.53      0.65      5097
         pos       0.65      0.90      0.75      4903

   micro avg       0.71      0.71      0.71     10000
   macro avg       0.75      0.72      0.70     10000
weighted avg       0.75      0.71      0.70     10000



In [66]:
# Rows are true labels and columns are VADER labels, both in the order 'neg', 'pos'.
print(confusion_matrix(df['label'], df['compound_score']))

[[2709 2388]
 [ 490 4413]]


### Sentiment Analysis Project

In [67]:
import numpy as np
import pandas as pd

In [68]:
# Read the tab-separated movie reviews file. This rebinds `df`, replacing
# the Amazon reviews frame used in the previous section.
df = pd.read_csv("UPDATED_NLP_COURSE/TextFiles/moviereviews.tsv", sep='\t')

In [69]:
# Remove every row that has at least one missing value, modifying df in place.
df.dropna(how='any', inplace=True)

In [70]:
# Collect the index labels of reviews that consist solely of whitespace.
# The 'review' column is read by name instead of unpacking df.itertuples()
# into exactly three values, so this cell still runs after extra columns
# have been added to df (for example when it is re-run later).
blanks = [
    index
    for index, review in df['review'].items()
    if isinstance(review, str) and review.isspace()
]
blanks

[57,
 71,
 147,
 151,
 283,
 307,
 313,
 323,
 343,
 351,
 427,
 501,
 633,
 675,
 815,
 851,
 977,
 1079,
 1299,
 1455,
 1493,
 1525,
 1531,
 1763,
 1851,
 1905,
 1993]

In [72]:
# Remove the whitespace-only reviews found above, modifying df in place.
df.drop(index=blanks, inplace=True)

In [76]:
# Check the row count and dtypes after removing missing and blank reviews.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1938 entries, 0 to 1999
Data columns (total 2 columns):
label     1938 non-null object
review    1938 non-null object
dtypes: object(2)
memory usage: 45.4+ KB


In [78]:
# Check the class balance of the remaining reviews.
df.label.value_counts()

pos    969
neg    969
Name: label, dtype: int64

In [74]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [75]:
# Analyzer instance used to score the movie reviews.
sid = SentimentIntensityAnalyzer()

In [79]:
# Full VADER score dict for each review.
df['scores'] = df['review'].apply(sid.polarity_scores)

# Pull the 'compound' value out of each score dict.
df['compound'] = df['scores'].apply(lambda score_dict: score_dict['compound'])

# Label a review 'pos' only when its compound score is strictly positive;
# a compound of exactly 0 is labelled 'neg'.
df['compound_score'] = df['compound'].apply(lambda value: 'pos' if value > 0 else 'neg')

In [80]:
# Preview the frame with the 'scores', 'compound' and 'compound_score' columns added.
df.head()

Unnamed: 0,label,review,scores,compound,compound_score
0,neg,how do films like mouse hunt get into theatres...,"{'neg': 0.121, 'neu': 0.778, 'pos': 0.101, 'co...",-0.9125,neg
1,neg,some talented actresses are blessed with a dem...,"{'neg': 0.12, 'neu': 0.775, 'pos': 0.105, 'com...",-0.8618,neg
2,pos,this has been an extraordinary year for austra...,"{'neg': 0.067, 'neu': 0.783, 'pos': 0.15, 'com...",0.9953,pos
3,pos,according to hollywood movies made in last few...,"{'neg': 0.069, 'neu': 0.786, 'pos': 0.145, 'co...",0.9972,pos
4,neg,my first press screening of 1998 and already i...,"{'neg': 0.09, 'neu': 0.822, 'pos': 0.088, 'com...",-0.7264,neg


In [81]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [82]:
# Fraction of reviews whose VADER-derived label equals the true label
# (true labels first, predicted labels second).
accuracy_score(df['label'], df['compound_score'])

0.6367389060887513

In [83]:
# Per-class precision, recall and F1 of the VADER labels against the true labels.
print(classification_report(df['label'], df['compound_score']))

              precision    recall  f1-score   support

         neg       0.72      0.44      0.55       969
         pos       0.60      0.83      0.70       969

   micro avg       0.64      0.64      0.64      1938
   macro avg       0.66      0.64      0.62      1938
weighted avg       0.66      0.64      0.62      1938



In [84]:
# Rows are true labels and columns are VADER labels, both in the order 'neg', 'pos'.
print(confusion_matrix(df['label'], df['compound_score']))

[[427 542]
 [162 807]]


In [86]:
# Reviews whose VADER-derived label disagrees with the true label,
# selected with one .loc call (row mask and column list together).
df.loc[df['label'] != df['compound_score'], ['label', 'review']]

Unnamed: 0,label,review
6,neg,"synopsis : melissa , a mentally-disturbed woma..."
7,neg,tim robbins and martin lawernce team up in thi...
10,neg,"upon first viewing of this movie , the phrases..."
11,pos,"with stars like sigourney weaver ( "" alien "" t..."
13,neg,"georges polti once wrote a paper called "" the ..."
15,neg,here's a rarity : a children's film that attem...
17,neg,writer/director lawrence kasdan had a hand in ...
19,neg,"one would think that david duchovny , star of ..."
21,pos,"seen august 8 , 1998 at 6 p . m . at rotterdam..."
26,neg,synopsis : cro-magnon ayla loses her mother to...
