## Sentiment Aware Word Embeddings

#### The goal of this module is to create sentiment aware word embeddings for a corpus using Word2Vec and an existing Emotional Lexicon.
### References:
#### https://www.mdpi.com/2076-3417/9/7/1334
#### http://saifmohammad.com/WebPages/lexicons.html

In [32]:
import json
import pandas as pd
import numpy as np
import nltk
from gensim import models
import contractions

import warnings
warnings.filterwarnings('ignore')

from bs4 import BeautifulSoup, Tag

### Getting the Dataset

In [33]:
import pickle

with open('../outputs/reviews_for_embeddings.pkl', 'rb') as file:
  rv = pickle.load(file)
df = pd.DataFrame(rv,columns=["reviewText"])
df

Unnamed: 0,reviewText
0,We got this GPS for my husband who is an (OTR)...
1,"I'm a professional OTR truck driver, and I bou..."
2,"Well, what can I say. I've had this unit in m..."
3,"Not going to write a long review, even thought..."
4,I've had mine for a year and here's what we go...
...,...
9995,The audio connector was very difficult to make...
9996,This is a solid well built unit that I have no...
9997,"Plantronics T20: 90% PERFECT, 10% DEADLYThis w..."
9998,Here's what's wrong with it:* People are const...


### Getting the Emotion Lexicon

In [36]:
edf = None
with open("../resources/NRC-Emotion-Intensity-Lexicon-v1/NRC-Emotion-Intensity-Lexicon-v1.txt", "r") as efile:
    edf = pd.read_csv(efile, sep="\t")
edf

Unnamed: 0,word,emotion,emotion-intensity-score
0,outraged,anger,0.964
1,brutality,anger,0.959
2,hatred,anger,0.953
3,hateful,anger,0.940
4,terrorize,anger,0.939
...,...,...,...
9894,fugitive,trust,0.141
9895,divorce,trust,0.133
9896,mistakes,trust,0.133
9897,bait,trust,0.133


### Word Embeddings from Dataset

In [37]:
from nltk.tokenize import sent_tokenize, word_tokenize
from itertools import chain
from collections import Counter

# reviews = df["summary"] + df["reviewText"] 
reviews = df["reviewText"]
sentences = list(chain(*reviews.apply(sent_tokenize)))
sentences = [list(map(str.lower, word_tokenize(s))) for s in sentences]

In [38]:
wv_model = models.Word2Vec(sentences=sentences, vector_size=300, window=11, min_count=1)

In [59]:
vocab = wv_model.wv.index_to_key
word_embeddings = {}
for word in vocab:
    word_embeddings[word] = wv_model.wv[word]

In [60]:
### Saving W2V model in an external file.
with open("../output/wordEmbeddings.pkl", "wb") as wvfile:
    pickle.dump(word_embeddings, file=wvfile)

In [61]:
### Demonstrating loading the word vectors back into memory.
with open("../output/wordEmbeddings.pkl", "rb") as wvfile:
    wv = pickle.load(wvfile)
wv["joy"]

array([ 0.02745958,  0.08550036,  0.07900085,  0.04433017,  0.02539896,
       -0.11310062, -0.01601092,  0.16247016, -0.00768178,  0.01936728,
        0.06455947, -0.08121633,  0.04351631, -0.00928702, -0.18439871,
       -0.12359925,  0.06831494, -0.04288586,  0.05628758,  0.07154419,
       -0.02353128,  0.04543516, -0.04048407, -0.03799186, -0.01870383,
       -0.04591846, -0.13031225,  0.01073328,  0.00336856, -0.10409253,
        0.02436649, -0.07125816,  0.12994334, -0.03170492,  0.03955619,
        0.03479991,  0.04915466, -0.00865234, -0.08672893, -0.00970373,
        0.04662003,  0.05660615, -0.05405471, -0.00351273, -0.02046751,
        0.0462717 ,  0.09312263,  0.00190855, -0.08558293, -0.00421814,
       -0.01625445, -0.00563715, -0.02278447,  0.02177822, -0.01663387,
        0.04934742, -0.01682947, -0.0608725 , -0.01517394, -0.00665053,
       -0.00611602, -0.0088266 , -0.0092489 ,  0.07563432, -0.02942354,
        0.00035669,  0.00117109,  0.08894952,  0.00667536, -0.00

### Comparing the Dataset and Lexicon Vocabulary

In [44]:
### Finding the vocabulary of the dataset
vocab = wv_model.wv.index_to_key

In [45]:
len(vocab)

39276

In [46]:
### Finding the Emolex vocabulary.
emotions = edf["word"]
words_emotions = list(chain(*emotions.apply(word_tokenize)))

# List of words with count higher that the threshold ie 1
words_emotions = [w.lower() for w in words_emotions]
word_counts = Counter(words_emotions)
words_emotions = [k for k,v in word_counts.items() if v >= 1]

words_emotions

['outraged',
 'brutality',
 'hatred',
 'hateful',
 'terrorize',
 'infuriated',
 'violently',
 'furious',
 'enraged',
 'furiously',
 'screwyou',
 'murderer',
 'fury',
 'execution',
 'angered',
 'savagery',
 'slaughtering',
 'veryangry',
 'assassinate',
 'fuckoff',
 'annihilation',
 'rage',
 'loathe',
 'damnation',
 'roadrage',
 'fucktard',
 'homicidal',
 'furor',
 'hostile',
 'annihilate',
 'murder',
 'raging',
 'sopissed',
 'pissed',
 'rape',
 'explosive',
 'obliterated',
 'vengeful',
 'ferocious',
 'infuriates',
 'killing',
 'combative',
 'gofuckyourself',
 'vengeance',
 'wrath',
 'torment',
 'vicious',
 'threatening',
 'massacre',
 'bloodthirsty',
 'abhorrent',
 'pissoff',
 'fighting',
 'annihilated',
 'attacking',
 'angriest',
 'bloodshed',
 'smite',
 'brawl',
 'assault',
 'assassination',
 'strangle',
 'explode',
 'malicious',
 'tirade',
 'hostility',
 'loathsome',
 'attack',
 'hell',
 'murderous',
 'malice',
 'terrorism',
 'beating',
 'pissingmeoff',
 'desecration',
 'outrage',
 '

In [47]:
common_words = [w for w in vocab if w in words_emotions]
common_words

['good',
 'like',
 'time',
 'cable',
 'lens',
 'long',
 'case',
 'recommend',
 'problem',
 'music',
 'battery',
 'found',
 'light',
 'home',
 'love',
 'money',
 'pretty',
 'fit',
 'excellent',
 'less',
 'down',
 'books',
 'perfect',
 'system',
 'bad',
 'happy',
 'simple',
 'full',
 'life',
 'store',
 'free',
 'network',
 'turn',
 'thought',
 'wear',
 'top',
 'fact',
 'deal',
 'mine',
 'pay',
 'running',
 'compact',
 'real',
 'install',
 'run',
 'manual',
 'experience',
 'expect',
 'finally',
 'wireless',
 'help',
 'cover',
 'monster',
 'fits',
 'level',
 'visor',
 'service',
 'expected',
 'break',
 'change',
 'warranty',
 'larger',
 'label',
 'clean',
 'kind',
 'start',
 'weight',
 'durable',
 'guess',
 'live',
 'travel',
 'pleased',
 'machine',
 'watch',
 'difficult',
 'strong',
 'comfort',
 'save',
 'disappointed',
 'super',
 'adjust',
 'mind',
 'true',
 'important',
 'instructions',
 'surge',
 'matter',
 'lost',
 'enjoy',
 'show',
 'reliable',
 'together',
 'bottom',
 'prefer',
 'mo

In [48]:
len(common_words)

2578

### Creating the Cosine Similarity Matrix of Dataset Vocab and Emotion Words

In [49]:
### For each word in the vocab, we find the similarity between that word and each emotion word. 
### We are only using the words common to both vocabs as they are relevant.

# Calculate the cosine similarity
def cosine_similarity_calc(vec_1,vec_2):
	sim = np.dot(vec_1,vec_2)/(np.linalg.norm(vec_1)*np.linalg.norm(vec_2))
	return sim

similarity_matrix = np.zeros(shape=(len(vocab),len(common_words)))
for ind, word in enumerate(vocab):
    for eind, emotion_word in enumerate(common_words):
        try:
         similarity_matrix[ind][eind] = cosine_similarity_calc(wv_model.wv[word], wv_model.wv[emotion_word])
        except KeyError:
            # print(word, emotion_word)
            pass


In [50]:
similarity_matrix.shape

(39276, 2578)

### Calculating Emotion Word Scores and Weights.

In [51]:
emotion_words_df = edf[edf["word"].isin(common_words)]
emotion_words_df

Unnamed: 0,word,emotion,emotion-intensity-score
5,infuriated,anger,0.938
8,enraged,anger,0.927
12,fury,anger,0.922
14,angered,anger,0.916
21,rage,anger,0.911
...,...,...,...
9888,shatter,trust,0.172
9889,unreliable,trust,0.172
9890,addict,trust,0.172
9896,mistakes,trust,0.133


In [52]:
### Ranking emotion words based on the intensity, then assigning a score = 1/rank ie index
emotion_words_df.sort_values(by="emotion-intensity-score", ascending=False, inplace=True)
emotion_words_df = emotion_words_df.reset_index(drop=True)
emotion_words_df


Unnamed: 0,word,emotion,emotion-intensity-score
0,happiest,joy,0.986
1,happiness,joy,0.984
2,torture,fear,0.984
3,bliss,joy,0.971
4,horrific,fear,0.969
...,...,...,...
4304,musical,anger,0.011
4305,lovely,sadness,0.009
4306,liquor,sadness,0.000
4307,tree,anger,0.000


### Creating the Sentiment Aware Word Embeddings

In [53]:
### For each word in the dataset vocabulary, find the two emotion word with the highest cosine similarity.
### Create the emotion-aware embeddings by multiplying the word vector of the emotion word with the weight.
### Overlay the wv of the original word and the emotion word.

emotion_aware_wv = {}
embedding_range = 2
for ind, word in enumerate(vocab):
        sim_scores = similarity_matrix[ind]
        ind_max = sorted(range(len(sim_scores)), key=lambda x: sim_scores[x])[:embedding_range]

        # Find the scores and weights of the emotion words based on the ranks in the emotion lexicon.
        eweights = []
        for eind in range(embedding_range):
                rank = emotion_words_df[emotion_words_df['word'] == common_words[ind_max[eind]]].index.values[0]
                score = 1/(rank +1)
                weight = similarity_matrix[ind][ind_max[eind]] * score
                eweights.append(weight)

        # Normalize the weights.
        norm_eweights = [w/sum(eweights) for w in eweights]
        
        # Created the emotion word vector as the normalized weighted sum of the vectors of the emotion words.
        ewv = np.zeros(300)
        for eind in range(embedding_range):
                emotion_word = common_words[ind_max[eind]]
                emotion_word_vector = wv_model.wv[emotion_word]
                ewv += [norm_eweights[eind] * x for x in emotion_word_vector]
        
        # Overlaying the emotion vector with the word vector of the original word.
        emotion_aware_wv[word] = wv_model.wv[word] + ewv

        

In [54]:
emotion_aware_wv["great"]

array([ 0.2176701 ,  0.62414142,  0.30286305, -0.58419892,  0.83006238,
       -0.76335966,  0.18487118,  2.21727284,  0.90564702,  0.53596918,
        1.35456467, -0.73633766, -0.89404412,  1.32236125, -1.07383049,
       -1.78690213,  0.27815304,  0.45496689,  0.34918614,  1.86030398,
        0.2945681 , -0.31027123,  0.41718246, -1.0158881 ,  0.08389386,
       -0.27827246, -0.88661807, -0.15258321,  0.59904536,  0.0296327 ,
       -0.9028317 , -1.4707096 , -0.27506155, -0.56707137,  0.17164829,
       -0.42496213,  0.23103879,  0.66374223, -1.38055027,  0.07703854,
       -0.38584159, -0.24675348, -0.32854959,  1.38792951, -0.5835358 ,
       -0.22781488,  1.40032482, -0.1776328 ,  0.17774465, -0.40224682,
       -0.78711729,  0.3154596 , -0.19536816, -0.38088837,  1.83634757,
        0.33264691, -0.31929214, -0.11997473, -0.03657924, -0.54895998,
       -0.15853285, -0.33809169, -0.5996392 ,  0.24255532, -1.55606733,
        0.2070467 ,  1.51430529,  1.09043041, -0.03458081, -1.42

In [55]:
# Saving the vocab and Emotion Aware Word Embeddings in an external file.
with open("../output/vocab.csv", "w") as vocabfile:
    for ind, word in enumerate(vocab):
        vocabfile.write("\t".join([str(ind), word]))
        vocabfile.write("\n")

with open("../output/emotionAwareEmbeddings.pkl", "wb") as ewvfile:
    pickle.dump(emotion_aware_wv, file=ewvfile)


In [56]:
### Demonstrating retrieving the vocab and emotion-aware wv.

with open("../output/emotionAwareEmbeddings.pkl", "rb") as ewvfile:
    emotion_aware_wv = pickle.load(ewvfile)
emotion_aware_wv["great"]

array([ 0.2176701 ,  0.62414142,  0.30286305, -0.58419892,  0.83006238,
       -0.76335966,  0.18487118,  2.21727284,  0.90564702,  0.53596918,
        1.35456467, -0.73633766, -0.89404412,  1.32236125, -1.07383049,
       -1.78690213,  0.27815304,  0.45496689,  0.34918614,  1.86030398,
        0.2945681 , -0.31027123,  0.41718246, -1.0158881 ,  0.08389386,
       -0.27827246, -0.88661807, -0.15258321,  0.59904536,  0.0296327 ,
       -0.9028317 , -1.4707096 , -0.27506155, -0.56707137,  0.17164829,
       -0.42496213,  0.23103879,  0.66374223, -1.38055027,  0.07703854,
       -0.38584159, -0.24675348, -0.32854959,  1.38792951, -0.5835358 ,
       -0.22781488,  1.40032482, -0.1776328 ,  0.17774465, -0.40224682,
       -0.78711729,  0.3154596 , -0.19536816, -0.38088837,  1.83634757,
        0.33264691, -0.31929214, -0.11997473, -0.03657924, -0.54895998,
       -0.15853285, -0.33809169, -0.5996392 ,  0.24255532, -1.55606733,
        0.2070467 ,  1.51430529,  1.09043041, -0.03458081, -1.42