In [179]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
import pygraphviz as pgv
import pydot
import difflib

In [210]:
# Load the lemma list. The first CSV column is used as the row index, so the
# lemmas are left in the first remaining column (read later as words.iloc[:, 0]).
# NOTE(review): the relative path points outside this notebook's folder — confirm
# it resolves on the machine running the notebook.
words = pd.read_csv('../../Scaricati/lemmas_unique_final.csv', index_col = 0)
# Preview (up to) the first 20 rows.
words.head(20)

Unnamed: 0,0
0,diro
1,gente
2,guerra
3,portato
4,casa
5,rimpatriare
6,marcello
7,perfavore
8,patrio
9,patriota


## Upload Sentiment dictionary

For more details on the Sentix sentiment dictionary, see:

http://valeriobasile.github.io/twita/sentix.html

In [149]:
# Load the tab-separated Sentix lexicon. The file has no header row, so the
# column names are supplied at read time; this also lets the dtype mapping refer
# to the real 'lemma' column (a key that names no column is not applied).
sentix = pd.read_csv(
    'sentix/sentix',
    sep = "\t",
    header = None,
    names = ['lemma', 'POS', 'Wordnet synset ID' ,'positive score', 'negative score', 'polarity','intensity'],
    dtype = {'lemma': object},
)
# Discard every row that has a missing value in any column.
sentix = sentix.dropna()
sentix

Unnamed: 0,lemma,POS,Wordnet synset ID,positive score,negative score,polarity,intensity
0,abile,a,1740,0.125,0.00,1.0,0.125
1,intelligente,a,1740,0.125,0.00,1.0,0.125
2,valente,a,1740,0.125,0.00,1.0,0.125
3,capace,a,1740,0.125,0.00,1.0,0.125
4,incapace,a,2098,0.000,0.75,-1.0,0.750
...,...,...,...,...,...,...,...
74604,imbronciarsi,v,2771020,0.000,0.25,-1.0,0.250
74605,rannuvolarsi,v,2771020,0.000,0.25,-1.0,0.250
74606,rasserenarsi,v,2771169,0.125,0.00,1.0,0.125
74607,rischiararsi,v,2771169,0.125,0.00,1.0,0.125


In [230]:
def get_polarity(my_value):
    """Map a signed score onto a polarity label.

    Returns +1.0 for a strictly positive value, -1.0 for a strictly negative
    value and 0.0 for zero. A value that is neither greater than, less than
    nor equal to zero (e.g. NaN) yields None.
    """
    if my_value > 0.:
        return 1.
    if my_value < 0.:
        return -1.
    # Neither sign applied: zero is neutral, anything else (NaN) gives None.
    return 0. if my_value == 0. else None

def get_exact_match(word, sentix):
    """Look up the sentix rows whose lemma is exactly ``word``.

    Parameters
    ----------
    word : str
        Lemma to look up. It is compared literally, never interpreted as a
        regular expression.
    sentix : pandas.DataFrame
        Sentiment lexicon with at least the columns 'lemma', 'positive score',
        'negative score', 'polarity' and 'intensity'.

    Returns
    -------
    pandas.DataFrame
        Frame with the columns (lemma, positive score, negative score,
        polarity, intensity):
        * no matching row   -> empty frame;
        * one matching row  -> that lexicon row;
        * several rows      -> one row holding the mean positive and negative
          scores, the polarity of (mean positive - mean negative) and the
          Euclidean norm of the two means as intensity.
    """
    columns = ['lemma','positive score','negative score','polarity','intensity']

    # Plain equality instead of str.fullmatch(word): the lemma is data, so regex
    # metacharacters in it ('.', '+', '(' ...) must neither widen the match nor
    # raise re.error.
    subset = sentix[sentix['lemma'] == word]

    # Zero or one hit: the selected rows already are the answer.
    if len(subset) <= 1:
        return subset.loc[:, columns]

    # Several hits (several lexicon entries for the same lemma): aggregate.
    pos_mean = subset['positive score'].mean()
    neg_mean = subset['negative score'].mean()

    return pd.DataFrame({
        'lemma': [word],
        'positive score': [pos_mean],
        'negative score': [neg_mean],
        'polarity': [get_polarity(pos_mean - neg_mean)],
        'intensity': [np.sqrt(pos_mean**2 + neg_mean**2)],
    })
    
def get_match(word, sentix):
    """Look up the sentix rows whose lemma begins with ``word``.

    Parameters
    ----------
    word : str
        Prefix to look for. It is compared literally, never interpreted as a
        regular expression.
    sentix : pandas.DataFrame
        Sentiment lexicon with at least the columns 'lemma', 'positive score',
        'negative score', 'polarity' and 'intensity'.

    Returns
    -------
    pandas.DataFrame
        Frame with the columns (lemma, positive score, negative score,
        polarity, intensity):
        * no matching row   -> empty frame;
        * one matching row  -> that lexicon row;
        * several rows      -> one row labelled ``word`` holding the mean
          positive and negative scores, the polarity of their difference and
          the Euclidean norm of the two means as intensity.
    """
    columns = ['lemma','positive score','negative score','polarity','intensity']

    # str.match(word) anchors a *regex* at the start of the lemma; for a literal
    # word that is a prefix test, so use startswith. This keeps metacharacters
    # in the word from widening the match or raising re.error. Missing lemmas
    # count as "no match".
    subset = sentix[sentix['lemma'].str.startswith(word, na = False)]

    # Zero or one hit: the selected rows already are the answer.
    if len(subset) <= 1:
        return subset.loc[:, columns]

    # Several hits: aggregate them into a single row.
    pos_mean = subset['positive score'].mean()
    neg_mean = subset['negative score'].mean()

    return pd.DataFrame({
        'lemma': [word],
        'positive score': [pos_mean],
        'negative score': [neg_mean],
        'polarity': [get_polarity(pos_mean - neg_mean)],
        'intensity': [np.sqrt(pos_mean**2 + neg_mean**2)],
    })
    
def get_contains(word, sentix):
    """Look up the sentix rows whose lemma contains ``word`` as a substring.

    Parameters
    ----------
    word : str
        Substring to look for. It is compared literally, never interpreted as
        a regular expression.
    sentix : pandas.DataFrame
        Sentiment lexicon with at least the columns 'lemma', 'positive score',
        'negative score', 'polarity' and 'intensity'.

    Returns
    -------
    pandas.DataFrame
        Frame with the columns (lemma, positive score, negative score,
        polarity, intensity):
        * no matching row   -> empty frame;
        * one matching row  -> that lexicon row;
        * several rows      -> one row labelled ``word`` holding the mean
          positive and negative scores, the polarity of their difference and
          the Euclidean norm of the two means as intensity.
    """
    columns = ['lemma','positive score','negative score','polarity','intensity']

    # regex = False: the word is data, so metacharacters in it must neither
    # widen the match nor raise re.error. Missing lemmas count as "no match".
    subset = sentix[sentix['lemma'].str.contains(word, regex = False, na = False)]

    # Zero or one hit: the selected rows already are the answer.
    if len(subset) <= 1:
        return subset.loc[:, columns]

    # Several hits: aggregate them into a single row.
    pos_mean = subset['positive score'].mean()
    neg_mean = subset['negative score'].mean()

    return pd.DataFrame({
        'lemma': [word],
        'positive score': [pos_mean],
        'negative score': [neg_mean],
        'polarity': [get_polarity(pos_mean - neg_mean)],
        'intensity': [np.sqrt(pos_mean**2 + neg_mean**2)],
    })
    
def get_similar(word, sentix):
    """Look up the sentix rows whose lemma is spelled similarly to ``word``.

    Up to 8 close matches (difflib similarity cutoff 0.8) are taken from the
    'lemma' column, and every lexicon row carrying one of them is selected.

    Returns a frame with the columns (lemma, positive score, negative score,
    polarity, intensity): empty when nothing is similar, the lexicon row
    itself when exactly one row is selected, otherwise a single row labelled
    ``word`` with the mean positive/negative scores, the polarity of their
    difference and the Euclidean norm of the two means as intensity.
    """
    wanted = ['lemma','positive score','negative score','polarity','intensity']

    neighbours = difflib.get_close_matches(word, sentix.lemma, n = 8, cutoff = 0.8)
    hits = sentix[sentix['lemma'].isin(neighbours)]

    # Fewer than two selected rows: hand them back untouched.
    if len(hits) < 2:
        return hits.loc[:, wanted]

    mean_pos = hits['positive score'].mean()
    mean_neg = hits['negative score'].mean()

    aggregated = {
        'lemma': [word],
        'positive score': [mean_pos],
        'negative score': [mean_neg],
        'polarity': [get_polarity(mean_pos - mean_neg)],
        'intensity': [np.sqrt(mean_pos**2 + mean_neg**2)],
    }
    return pd.DataFrame(aggregated)


def get_rows_data(word, sentix, use_similarity = False):
    """Return the sentiment row for ``word``, trying progressively looser lookups.

    The lookups run in this order: exact match, match, contains and, only when
    ``use_similarity`` is true, spelling similarity. The first lookup that
    returns a non-empty frame wins. When every lookup comes back empty, a
    neutral row with all four scores set to 0. is returned instead. In all
    cases the 'lemma' column of the result is overwritten with ``word``.
    """
    lookups = [get_exact_match, get_match, get_contains]
    if use_similarity:
        lookups.append(get_similar)

    for lookup in lookups:
        row = lookup(word, sentix)
        if len(row) > 0:
            break
    else:
        # nothing matched at any level: give up and return a neutral row
        neutral = {'lemma': [word], 'positive score' : [0.],'negative score' : [0.],
                   'polarity' : [0.],'intensity' : [0.]}
        row = pd.DataFrame.from_dict(neutral)

    # report the queried word rather than the lexicon lemma that matched
    row['lemma'] = word
    return row

The function `get_rows_data(word, sentix, use_similarity = False)` defined above, which is called in the next cell, does the following:

+ **Output**: Returns a row of the kind (lemma, positive score, negative score, polarity, intensity)

### **What it does**: it looks for a matching word in the sentix dataframe. 
* If exactly one row matches the word *exactly*, return that row.
* If several rows match, average the positive scores and the negative scores over those rows; polarity is the sign of the difference (mean positive − mean negative) and intensity is the $L_2$ norm of the two means, i.e. $\sqrt{\text{pos}^2 + \text{neg}^2}$.
* If **NONE** matches exactly, look for lemmas in the sentix dictionary that *match* the word (i.e. begin with it). The same reasoning applies: a single match returns that row, several matches are averaged as above.

* If there is still no match, look for any lemma that *contains* the word (to *contain* is a less strict relation than to *match*, which in turn is less strict than to *match exactly*).

* If no lemma contains the word and the `use_similarity` flag is set to true, take the most similar words in the lemmas list and use them to compute the sentiment.

* If even this yields nothing, or the `use_similarity` flag is false, return a neutral row of the kind (word, 0., 0., 0., 0.).



In [None]:
# Score every lemma (similarity fallback enabled) and stack the one-row results.
# The rows are collected first and concatenated once: DataFrame.append copied the
# whole frame on every iteration and no longer exists in pandas >= 2.0.
# For a quick trial run, iterate over words.iloc[0:100,0] instead.
sentiment_rows = [get_rows_data(word, sentix, True) for word in words.iloc[:,0]]

# pd.concat refuses an empty list, so keep the original "empty frame" result
# when there are no lemmas to score.
sentiment_df = pd.concat(sentiment_rows, ignore_index = True) if sentiment_rows else pd.DataFrame()

In [None]:
# Save the per-lemma sentiment table; index = False leaves the row index out of the file.
sentiment_df.to_csv('lemmas_sentiment.csv', index = False)

In [None]:
# Read the file written above back in and display it
# (presumably a check that the export round-trips — "prova" is Italian for "trial").
prova = pd.read_csv('lemmas_sentiment.csv')
prova