In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
import pygraphviz as pgv
import pydot
import difflib

In [3]:
# Load the lemma list (the first CSV column is used as the index), drop rows
# with missing values and renumber the remaining rows from 0.
words = (
    pd.read_csv('../../Scaricati/lemmas_unique_final.csv', index_col = 0)
    .dropna()
    .reset_index(drop = True)
)
words

FileNotFoundError: [Errno 2] No such file or directory: '../../Scaricati/lemmas_unique_final.csv'

## Load the sentiment dictionary

See the following page for a description of the Sentix sentiment dictionary and its columns:

http://valeriobasile.github.io/twita/sentix.html

In [3]:
# Load the Sentix lexicon (tab-separated, no header row).
# The column names are passed to read_csv directly so that the dtype mapping
# can refer to the real 'lemma' column: the previous key 'lemmas' matched no
# column (header = None yields integer labels), so the dtype was never applied.
sentix = pd.read_csv(
    'sentix/sentix',
    sep = "\t",
    header = None,
    names = ['lemma', 'POS', 'Wordnet synset ID', 'positive score',
             'negative score', 'polarity', 'intensity'],
    dtype = {'lemma': object},
)
# Discard incomplete entries.
sentix = sentix.dropna()
sentix

Unnamed: 0,lemma,POS,Wordnet synset ID,positive score,negative score,polarity,intensity
0,abile,a,1740,0.125,0.00,1.0,0.125
1,intelligente,a,1740,0.125,0.00,1.0,0.125
2,valente,a,1740,0.125,0.00,1.0,0.125
3,capace,a,1740,0.125,0.00,1.0,0.125
4,incapace,a,2098,0.000,0.75,-1.0,0.750
...,...,...,...,...,...,...,...
74604,imbronciarsi,v,2771020,0.000,0.25,-1.0,0.250
74605,rannuvolarsi,v,2771020,0.000,0.25,-1.0,0.250
74606,rasserenarsi,v,2771169,0.125,0.00,1.0,0.125
74607,rischiararsi,v,2771169,0.125,0.00,1.0,0.125


In [1]:
def get_polarity(pos, neg):
    """Convert a (positive, negative) score pair into a single polarity value.

    The pair is read as a point in the plane and theta is the angle it makes
    with the positive-score axis; the result is ``1 - 4 * theta / pi``.
    For non-negative scores this gives +1 when only the positive score is
    non-zero, -1 when only the negative score is non-zero, and values in
    between otherwise.

    Parameters
    ----------
    pos : float
        Positive score.
    neg : float
        Negative score.

    Returns
    -------
    float
        The polarity, or 0.0 when both scores are zero.
    """
    # No sentiment at all: the angle is undefined, report a neutral polarity.
    if pos == 0. and neg == 0.:
        return 0.

    if pos != 0.:
        theta = np.arctan(neg / pos)
    else:
        # Limit of arctan(neg / pos) as pos -> 0. Written as a constant instead
        # of np.arctan(np.Inf): the np.Inf alias was removed in NumPy 2.0.
        theta = np.pi / 2

    return 1. - 4. * theta / np.pi

# Columns returned by every lookup function below.
_SENTIMENT_COLUMNS = ['lemma', 'positive score', 'negative score', 'polarity', 'intensity']


def _summarise_matches(word, subset):
    """Collapse the sentix rows selected for ``word`` into at most one row.

    * 0 or 1 selected rows: the rows are returned as they are, restricted to
      the sentiment columns.
    * more than 1 row: the positive and negative scores are averaged, the
      polarity is computed with ``get_polarity`` and the intensity is the
      Euclidean norm of the two mean scores; the lemma is set to ``word``.

    The result is always a new DataFrame with a fresh 0-based index, so the
    single-row and the averaged case look the same to the caller.
    """
    if len(subset) <= 1:
        return subset.loc[:, _SENTIMENT_COLUMNS].reset_index(drop = True)

    pos_mean = subset.loc[:, 'positive score'].mean()
    neg_mean = subset.loc[:, 'negative score'].mean()

    return pd.DataFrame({
        'lemma': [word],
        'positive score': [pos_mean],
        'negative score': [neg_mean],
        'polarity': [get_polarity(pos_mean, neg_mean)],
        'intensity': [np.sqrt(pos_mean**2 + neg_mean**2)],
    })


def get_exact_match(word, sentix):
    """Look up the rows of ``sentix`` whose lemma is exactly ``word``.

    The comparison is literal: characters such as ``+``, ``(`` or ``.`` in
    ``word`` are not interpreted as a regular expression.

    Returns a DataFrame with the sentiment columns and zero rows (no match)
    or one row (single match, or the average of several matches).
    """
    lemmas = sentix.loc[:, 'lemma']
    return _summarise_matches(word, sentix[lemmas == word])


def get_match(word, sentix):
    """Look up the rows of ``sentix`` whose lemma starts with ``word``.

    The prefix test is literal (no regular-expression interpretation).
    Non-string lemmas never match.

    Returns a DataFrame with the sentiment columns and zero rows (no match)
    or one row (single match, or the average of several matches).
    """
    lemmas = sentix.loc[:, 'lemma']
    return _summarise_matches(word, sentix[lemmas.str.startswith(word, na = False)])


def get_contains(word, sentix):
    """Look up the rows of ``sentix`` whose lemma contains ``word``.

    The substring test is literal (no regular-expression interpretation).
    Non-string lemmas never match.

    Returns a DataFrame with the sentiment columns and zero rows (no match)
    or one row (single match, or the average of several matches).
    """
    lemmas = sentix.loc[:, 'lemma']
    return _summarise_matches(word, sentix[lemmas.str.contains(word, regex = False, na = False)])


def get_similar(word, sentix):
    """Look up the rows of ``sentix`` whose lemma is close to ``word``.

    Closeness is decided by ``difflib.get_close_matches`` with at most 8
    candidates and a similarity cutoff of 0.8.

    Returns a DataFrame with the sentiment columns and zero rows (no match)
    or one row (single match, or the average of several matches).
    """
    lemmas = sentix.loc[:, 'lemma']
    similar_words = difflib.get_close_matches(word, lemmas, n = 8, cutoff = 0.8)
    return _summarise_matches(word, sentix[lemmas.isin(similar_words)])

# Flatten a one-row sentiment DataFrame into a plain list of its values.
def denest_row(row):
    """Return ``[lemma, positive score, negative score, polarity, intensity]``
    taken from the first row of ``row``."""
    wanted = ('lemma', 'positive score', 'negative score', 'polarity', 'intensity')
    return [row.loc[:, column].values[0] for column in wanted]

def get_rows_data(word, sentix, use_similarity = False):
    """Compute the sentiment record of ``word`` from the ``sentix`` lexicon.

    The lookups are tried from the strictest to the loosest and the first one
    that finds something wins: exact match, then ``get_match``, then
    ``get_contains`` and, only when ``use_similarity`` is true, ``get_similar``.
    If every lookup comes back empty, an all-zero record is used.

    Parameters
    ----------
    word : str
        The lemma to score.
    sentix : pandas.DataFrame
        The sentiment lexicon, as expected by the lookup functions.
    use_similarity : bool, default False
        Whether to fall back on ``get_similar`` as a last resort.

    Returns
    -------
    list
        ``[word, positive score, negative score, polarity, intensity]`` where
        the polarity is always recomputed with ``get_polarity``.
    """
    lookups = [get_exact_match, get_match, get_contains]
    if use_similarity:
        lookups.append(get_similar)

    for lookup in lookups:
        row = lookup(word, sentix)
        if len(row) > 0:
            break
    else:
        # Nothing found by any lookup: fall back to a neutral record.
        row = pd.DataFrame({'lemma': [word], 'positive score' : [0.], 'negative score' : [0.],
                            'polarity' : [0.], 'intensity' : [0.]})

    # Work on a private, 0-indexed copy: a single-row match may still carry
    # its original sentix index label (so label 0 need not exist), and it may
    # be a slice of sentix that must not be written to.
    row = row.reset_index(drop = True).copy()

    #get back the original word
    row['lemma'] = word

    #compute the right polarity
    row['polarity'] = get_polarity(row['positive score'].iloc[0], row['negative score'].iloc[0])

    #denest it
    return denest_row(row)

The function `get_rows_data(word, sentix, use_similarity = False)` defined above works as follows:

+ **Output**: returns a list of the form `[lemma, positive score, negative score, polarity, intensity]`

### **What it does**: it looks for the word in the sentix dataframe.
* If exactly one lemma matches the word *exactly*, use that row.
* If several rows match, take the mean of their positive scores and the mean of their negative scores; the intensity is the $L_2$ norm of the two means, i.e. $\sqrt{\text{pos}^2 + \text{neg}^2}$.
* In every case the polarity is computed from the (mean) scores as $1 - 4\theta/\pi$ with $\theta = \arctan(\text{neg}/\text{pos})$, so it is $+1$ for a purely positive word, $-1$ for a purely negative one, and $0$ when both scores are zero.
* If **NO** lemma matches exactly, look for lemmas in the sentix dictionary that *match* the word, i.e. that start with it. The same reasoning applies: if only one row matches, use it; otherwise average as above.

* If there is still no match, look for lemmas that *contain* the word (to *contain* is a less strict relation than to *match*, which in turn is less strict than to *match exactly*).

* If no lemma contains the word and the `use_similarity` flag is set to true, take the most similar lemmas (via `difflib.get_close_matches`, at most 8 candidates with a similarity cutoff of 0.8) and use them to compute the sentiment.

* If even this yields nothing, or the flag was set to false, return a record of the form `(word, 0., 0., 0., 0.)`.



In [6]:
# Column names of the output table (underscored variants of the sentix names).
labels = [
    'lemma',
    'positive_score',
    'negative_score',
    'polarity',
    'intensity',
]

In [7]:
# Score every lemma in the first column of `words`, with the similarity
# fallback enabled, reporting progress every 50 words.
sentiment_list = []
for position, word in enumerate(words.iloc[:, 0], start = 1):
    if position % 50 == 1:
        print("word #", position, "/", len(words))
    sentiment_list.append(get_rows_data(word, sentix, True))

word # 1 / 40019
word # 51 / 40019
word # 101 / 40019
word # 151 / 40019
word # 201 / 40019
word # 251 / 40019
word # 301 / 40019
word # 351 / 40019
word # 401 / 40019
word # 451 / 40019
word # 501 / 40019
word # 551 / 40019
word # 601 / 40019
word # 651 / 40019
word # 701 / 40019
word # 751 / 40019
word # 801 / 40019
word # 851 / 40019
word # 901 / 40019
word # 951 / 40019
word # 1001 / 40019
word # 1051 / 40019
word # 1101 / 40019
word # 1151 / 40019
word # 1201 / 40019
word # 1251 / 40019
word # 1301 / 40019
word # 1351 / 40019
word # 1401 / 40019
word # 1451 / 40019
word # 1501 / 40019
word # 1551 / 40019
word # 1601 / 40019
word # 1651 / 40019
word # 1701 / 40019
word # 1751 / 40019
word # 1801 / 40019
word # 1851 / 40019
word # 1901 / 40019
word # 1951 / 40019
word # 2001 / 40019
word # 2051 / 40019
word # 2101 / 40019
word # 2151 / 40019
word # 2201 / 40019
word # 2251 / 40019
word # 2301 / 40019
word # 2351 / 40019
word # 2401 / 40019
word # 2451 / 40019
word # 2501 / 40019
wor

word # 20051 / 40019
word # 20101 / 40019
word # 20151 / 40019
word # 20201 / 40019
word # 20251 / 40019
word # 20301 / 40019
word # 20351 / 40019
word # 20401 / 40019
word # 20451 / 40019
word # 20501 / 40019
word # 20551 / 40019
word # 20601 / 40019
word # 20651 / 40019
word # 20701 / 40019
word # 20751 / 40019
word # 20801 / 40019
word # 20851 / 40019
word # 20901 / 40019
word # 20951 / 40019
word # 21001 / 40019
word # 21051 / 40019
word # 21101 / 40019
word # 21151 / 40019
word # 21201 / 40019
word # 21251 / 40019
word # 21301 / 40019
word # 21351 / 40019
word # 21401 / 40019
word # 21451 / 40019
word # 21501 / 40019
word # 21551 / 40019
word # 21601 / 40019
word # 21651 / 40019
word # 21701 / 40019
word # 21751 / 40019
word # 21801 / 40019
word # 21851 / 40019
word # 21901 / 40019
word # 21951 / 40019
word # 22001 / 40019
word # 22051 / 40019
word # 22101 / 40019
word # 22151 / 40019
word # 22201 / 40019
word # 22251 / 40019
word # 22301 / 40019
word # 22351 / 40019
word # 22401 

word # 39601 / 40019
word # 39651 / 40019
word # 39701 / 40019
word # 39751 / 40019
word # 39801 / 40019
word # 39851 / 40019
word # 39901 / 40019
word # 39951 / 40019
word # 40001 / 40019


In [8]:
# Assemble the per-word records into a table, one row per lemma.
sentiment_dataframe = pd.DataFrame.from_records(
    data = sentiment_list,
    columns = labels,
)
sentiment_dataframe

Unnamed: 0,lemma,positive_score,negative_score,polarity,intensity
0,diro,0.1250,0.28125,-1.0,0.307777
1,gente,0.4375,0.18750,1.0,0.475986
2,guerra,0.0625,0.12500,-1.0,0.139754
3,portato,0.1250,0.43750,-1.0,0.455007
4,casa,0.0650,0.17500,-1.0,0.186682
...,...,...,...,...,...
40014,#rip,0.0000,0.00000,0.0,0.000000
40015,#aspettandoprometeo,0.0000,0.00000,0.0,0.000000
40016,#ivreich,0.0000,0.00000,0.0,0.000000
40017,satanasso,0.0000,0.50000,-1.0,0.500000


In [9]:
# Persist the sentiment table next to the notebook, without the row index.
sentiment_dataframe.to_csv(
    path_or_buf = 'lemmas_sentiment.csv',
    index = False,
)