In [46]:
import os

from gensim.models import Word2Vec
import numpy as np
import pandas as pd

from socialsent import lexicons
from socialsent import seeds
from socialsent.polarity_induction_methods import random_walk
from socialsent.evaluate_methods import binary_metrics
from socialsent.representations.representation_factory import create_representation

from LMLexicon import LMLexicon
from IMDB import IMDB

In [2]:
# Just helper functions
def most_sim(embedding, word):
    '''Returns the neighbours of `word` reported by `embedding.most_similar`.

    Each entry is a (neighbour, score) tuple where the score has been cast to
    a plain float and rounded to 3 decimal places.
    '''
    rounded_neighbours = []
    for neighbour, similarity in embedding.most_similar(word):
        rounded_neighbours.append((neighbour, round(float(similarity), 3)))
    return rounded_neighbours

def eval_words(embedding, not_included, lexicon):
    '''Returns the words of `embedding.iw` that can be used for evaluation.

    A word is kept when it is not in `not_included` and `lexicon.get(word, False)`
    is truthy, so words missing from the lexicon (or mapped to a falsy value
    such as 0) are dropped. The order of `embedding.iw` is preserved.
    '''
    usable_words = []
    for candidate in embedding.iw:
        if candidate in not_included:
            continue
        if lexicon.get(candidate, False):
            usable_words.append(candidate)
    return usable_words

def evaluate_lexicon(generated_lex, embedding, seeds, lexicon):
    '''Prints the ROC AUC and average precision of `generated_lex` against `lexicon`.

    Only the words returned by `eval_words` are scored, i.e. words in the
    embedding that are not in `seeds` and have a truthy entry in `lexicon`.
    Note that inside this function the `seeds` argument hides the imported
    `seeds` module.
    '''
    scored_words = eval_words(embedding, seeds, lexicon)
    roc_auc, avg_precision = binary_metrics(generated_lex, lexicon, scored_words)
    report_lines = (
        ("ROC AUC: {:0.2f}", roc_auc),
        ("Average precision score: {:0.2f}", avg_precision),
    )
    for template, value in report_lines:
        print(template.format(value))
    # Blank line so that consecutive reports are visually separated
    print("")

# Creating a lexicon

We are going to create a domain-specific lexicon using the method described in this [paper](http://aclweb.org/anthology/D16-1057). The code for the paper was released [here](https://github.com/williamleif/socialsent), and I have modified it so that it can be used with Python 3; the modified version can be found [here](https://github.com/apmoore1/socialsent). The method is based on word embeddings. Even though the paper states that Word2Vec models tend to perform poorly with this method, we are going to use them because they are easy to create.

Therefore the first job is to create the word embeddings that we are going to use throughout this notebook, which are:
1. Financial word embeddings, taken from this [paper](https://arxiv.org/pdf/1705.00571.pdf), which have been pre-trained on financial articles e.g. The Times.
2. Movie review word embeddings, which have been pre-trained; the method used to train and create them can be found in this [class](IMDB.py).

In [3]:
# Location of the pre-trained financial Word2Vec model, resolved against the
# current working directory. The path and the loaded model are kept under
# different names instead of reusing `fin_embeddings` for both a string and a model.
fin_model_path = os.path.abspath(os.path.join('word2vec_models', 'finance', 'all_fin_model_lower'))
fin_embeddings = Word2Vec.load(fin_model_path)

# NOTE(review): presumably `IMDB().vectors` holds the pre-trained movie review
# embeddings described above - confirm against IMDB.py.
movie_embeddings = IMDB().vectors

Now that we have done that, we should have a look at the two word embedding models so that we can see how they differ. Have a go at changing the word passed as the second argument to the *most_sim* function. Hopefully you will see a difference between the finance embedding model and the movie model.

In [42]:
# Nearest neighbours of a few probe words in the finance embeddings ...
fin_good = pd.Series(most_sim(fin_embeddings, 'good'))
fin_sains = pd.Series(most_sim(fin_embeddings, 'sainsbury'))
# ... and in the movie review embeddings
movie_good = pd.Series(most_sim(movie_embeddings, 'good'))
movie_film = pd.Series(most_sim(movie_embeddings, 'film'))

# Side-by-side table: one column per series, labelled through `keys`
together = pd.concat(
    [fin_good, fin_sains, movie_good, movie_film],
    axis=1,
    keys=['Fin good', 'Fin Sains', 'Movie good', 'Movie film'],
)
together

Unnamed: 0,Fin good,Fin Sains,Movie good,Movie film
0,"(great, 0.688)","(tesco, 0.673)","(decent, 0.751)","(movie, 0.936)"
1,"(solid, 0.643)","(asda, 0.562)","(great, 0.668)","(picture, 0.688)"
2,"(decent, 0.64)","(morrison, 0.516)","(fine, 0.646)","(flick, 0.659)"
3,"(encouraging, 0.622)","(morrisons, 0.508)","(solid, 0.64)","(sequel, 0.595)"
4,"(pleasing, 0.619)","(marston, 0.48)","(passable, 0.624)","(films, 0.561)"
5,"(healthy, 0.593)","(lowe, 0.479)","(nice, 0.623)","(entry, 0.553)"
6,"(strong, 0.591)","(macy, 0.479)","(terrific, 0.612)","(installment, 0.551)"
7,"(excellent, 0.583)","(waitrose, 0.478)","(bad, 0.611)","(pic, 0.538)"
8,"(terrible, 0.579)","(wendy, 0.464)","(well-done, 0.599)","(it, 0.53)"
9,"(terrific, 0.564)","(grocer, 0.447)","(fantastic, 0.585)","(story, 0.529)"


To create domain-specific lexicons from these word embeddings we have to give them some **seed sentiment** words from the domain we want them to be specialised in. In this case it is going to be finance. Therefore we are going to get 10 positive and 10 negative seed sentiment words, create a sentiment-specific lexicon, and evaluate it using the following metrics:
1. [ROC](https://en.wikipedia.org/wiki/Receiver_operating_characteristic)
2. [Precision and recall](https://en.wikipedia.org/wiki/Precision_and_recall)

A good description of the [difference](https://www.quora.com/What-is-the-difference-between-a-ROC-curve-and-a-precision-recall-curve-When-should-I-use-each) between the two metrics.

In [43]:
finance_lex = LMLexicon().lexicon

# Positive and negative seed words for the finance domain
fin_pos_seeds, fin_neg_seeds = seeds.finance_seeds()
# Vocabulary of interest: every word in the finance lexicon plus both seed lists
all_fin_seeds = set(finance_lex.keys()).union(fin_pos_seeds, fin_neg_seeds)

# Restricting the financial and movie word embeddings to that vocabulary
# to speed up processing
fin_fin_embeddings = create_representation('GENSIM', fin_embeddings, words=all_fin_seeds)
fin_movie_embeddings = create_representation('GENSIM', movie_embeddings, words=all_fin_seeds)

# Creating finance specific sentiment lexicons from the financial and movie word embeddings
fin_fin_polarities = random_walk(
    fin_fin_embeddings, fin_pos_seeds, fin_neg_seeds,
    beta=0.99, nn=10, sym=True, arccos=True,
)
fin_movie_polarities = random_walk(
    fin_movie_embeddings, fin_pos_seeds, fin_neg_seeds,
    beta=0.99, nn=10, sym=True, arccos=True,
)

#
# Evaluating against the finance lexicon (the seed words themselves are left
# out of the evaluation)
#
fin_seed_words = fin_pos_seeds + fin_neg_seeds
print('Evaluting the finance embeddings as finance sentiment lexicon')
evaluate_lexicon(fin_fin_polarities, fin_fin_embeddings, fin_seed_words, finance_lex)
print('Evaluting the movie embeddings as finance sentiment lexicon')
evaluate_lexicon(fin_movie_polarities, fin_movie_embeddings, fin_seed_words, finance_lex)



Evaluting the finance embeddings as finance sentiment lexicon
ROC AUC: 0.82
Average precision score: 0.47

Evaluting the movie embeddings as finance sentiment lexicon
ROC AUC: 0.75
Average precision score: 0.51



We are now going to use the movie and finance embeddings to create general sentiment lexicons and compare them to the General Inquirer, which is a general sentiment lexicon. We will see which embeddings are better for this task:

In [44]:
inquirer_lex = lexicons.load_lexicon("inquirer", remove_neutral=True)

# Positive and negative seed words for general sentiment
gen_pos_seeds, gen_neg_seeds = seeds.hist_seeds()
# Vocabulary of interest: every word in the inquirer lexicon plus both seed lists
all_gen_seeds = set(inquirer_lex.keys()).union(gen_pos_seeds, gen_neg_seeds)

# Restricting the financial and movie word embeddings to that vocabulary
# to speed up processing
gen_fin_embeddings = create_representation('GENSIM', fin_embeddings, words=all_gen_seeds)
gen_movie_embeddings = create_representation('GENSIM', movie_embeddings, words=all_gen_seeds)

# Creating general sentiment lexicons from the financial and movie word embeddings
gen_fin_polarities = random_walk(
    gen_fin_embeddings, gen_pos_seeds, gen_neg_seeds,
    beta=0.90, nn=20, sym=True, arccos=True,
)
gen_movie_polarities = random_walk(
    gen_movie_embeddings, gen_pos_seeds, gen_neg_seeds,
    beta=0.90, nn=20, sym=True, arccos=True,
)

#
# Evaluating against the general lexicon (the seed words themselves are left
# out of the evaluation)
#
gen_seed_words = gen_pos_seeds + gen_neg_seeds
print('Evaluting the finance embeddings as general sentiment lexicon')
evaluate_lexicon(gen_fin_polarities, gen_fin_embeddings, gen_seed_words, inquirer_lex)
print('Evaluting the movie embeddings as general sentiment lexicon')
evaluate_lexicon(gen_movie_polarities, gen_movie_embeddings, gen_seed_words, inquirer_lex)

Evaluting the finance embeddings as general sentiment lexicon
ROC AUC: 0.64
Average precision score: 0.64

Evaluting the movie embeddings as general sentiment lexicon
ROC AUC: 0.72
Average precision score: 0.72



$$p^{(t+1)} = \beta \textbf{T} p^{(t)} + (1 - \beta)s$$

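This equation controls how sentiment propagates through the word embeddings to define the polarity of words. Here $p^{(t)}$ holds the polarity scores after $t$ iterations, $\textbf{T}$ is the transition matrix built from the similarities between words in the word embeddings, and $s$ is the vector that marks the seed words. As you can see, the result depends on the value of $\beta$. The larger $\beta$ is, the larger the effect that the similarities between words, as defined by the word embeddings, have on the finished sentiment lexicon. The smaller $\beta$ is, the more of an effect the original sentiment seed words have, because they are weighted by $(1 - \beta)$.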

As you can see, the $ \beta $ value can be changed in the two cells above, in the calls to the random_walk function. See what happens to the evaluation metrics when you:
1. Change the value of $ \beta $.
2. Change the value of $ nn $, which defines the number of nearest neighbours of each word that the algorithm spreads the sentiment values to.

In [25]:
def polarity_dataframe(list_polarities, reverse=True):
    '''Ranks the words of several sentiment lexicons by polarity.

    :param list_polarities: list of mappings (e.g. dicts) of word -> sentiment value.
    :param reverse: True (default) orders each lexicon from the highest to the
                    lowest sentiment value, False from the lowest to the highest.
    :returns: pandas DataFrame with one integer-labelled column per lexicon (in
              the order given) containing that lexicon's words in ranked order.
              Shorter lexicons are padded with NaN. An empty DataFrame is
              returned when `list_polarities` is empty.
    '''
    ranked_columns = []
    for word_polaritys in list_polarities:
        # sorted() is stable, so words with equal sentiment values keep the
        # order in which the mapping yields them
        word_order = sorted(word_polaritys.items(), key=lambda item: item[1], reverse=reverse)
        ranked_columns.append(pd.Series([word for word, _ in word_order]))
    if not ranked_columns:
        # pd.concat raises a ValueError when it is given nothing to concatenate
        return pd.DataFrame()
    return pd.concat(ranked_columns, ignore_index=True, axis=1)

# Column labels, listed in the same order as the lexicons in `induced_lexicons`
columns = ['Fin Fin', 'Fin Movie', 'Gen Movie', 'Gen Fin']
induced_lexicons = [fin_fin_polarities, fin_movie_polarities,
                    gen_movie_polarities, gen_fin_polarities]

# reverse=True ranks from the highest sentiment value down, reverse=False from
# the lowest sentiment value up
compare_pos_polaritys = polarity_dataframe(induced_lexicons, reverse=True)
compare_neg_polaritys = polarity_dataframe(induced_lexicons, reverse=False)
compare_pos_polaritys.columns = columns
compare_neg_polaritys.columns = columns

compare_pos_polaritys and compare_neg_polaritys are the sentiment lexicons ranked starting from the most positive and from the most negative sentiment words respectively, i.e. the first word in each column of compare_pos_polaritys is the most positive word of that sentiment lexicon.

1. Compare the sentiment lexicons that have been created based on different word embeddings and seed values.
2. Change the value in the head function and see how much they change.

In [27]:
# Display the first 40 rows of the table ranked from the highest sentiment value down
compare_pos_polaritys.head(40)

Unnamed: 0,Fin Fin,Fin Movie,Gen Movie,Gen Fin
0,improving,excellent,excellent,fortunate
1,improved,successful,perfect,loved
2,excellent,improved,lovely,lovely
3,success,gains,delightful,happy
4,successful,improving,wonderful,love
5,beneficial,profit,marvelous,good
6,positive,beneficial,fantastic,excellent
7,encouraging,success,brilliant,pleasant
8,strong,positive,fabulous,perfect
9,improves,brilliant,sensational,enjoyable


In [28]:
# Display the first 40 rows of the table ranked from the lowest sentiment value up
compare_neg_polaritys.head(40)

Unnamed: 0,Fin Fin,Fin Movie,Gen Movie,Gen Fin
0,litigation,down,evil,unfortunate
1,damages,negligent,unhappy,unpleasant
2,negligent,litigation,disgusting,evil
3,failure,damages,unpleasant,unhappy
4,losses,loss,unfortunate,hated
5,wrong,losses,hate,bad
6,claims,wrong,absurd,poor
7,loss,volatile,ridiculous,hate
8,injunctions,failure,unbelievable,abrupt
9,penalties,negative,loveless,epidemic
