# ITALIAN

## BERT base italian cased

In [None]:
import pickle
import textwrap
from collections import defaultdict

import pandas as pd
import torch
from torch import nn
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

# Maximum number of characters per chunk handed to the classifier.
width = 512

it_df_factor = pd.read_pickle('data/it_df_factor.pkl')
it_df_factor = it_df_factor.drop_duplicates(
    subset=['year', "text", "label", "region", "canton", "category", "language"],
    ignore_index=True,
)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("neuraly/bert-base-italian-cased-sentiment")
# Load the model (it is not moved to a GPU here; call .cuda() on it to do so)
model = AutoModelForSequenceClassification.from_pretrained("neuraly/bert-base-italian-cased-sentiment")
nlp = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer, max_length=512, truncation=True)

# Per-document results: majority label and the share of chunks per label.
sentiment = []
p = []
n = []
neu = []

for text in tqdm(it_df_factor["text"], position=0, leave=True):
    # Split the document on whitespace into chunks of at most `width`
    # characters (single words longer than that are kept whole).
    chunks = textwrap.wrap(text, width, break_long_words=False)

    if not chunks:
        # textwrap.wrap returns [] for empty / whitespace-only text; there is
        # nothing to classify, so record the document as neutral instead of
        # dividing by zero below.
        sentiment.append("neutral")
        p.append(0.0)
        n.append(0.0)
        neu.append(0.0)
        continue

    doc = nlp(chunks)

    # Pre-seed the three expected labels so ties in max() are resolved in the
    # order positive, negative, neutral.
    senti = defaultdict(int, {"positive": 0, "negative": 0, "neutral": 0})
    for sent in doc:
        senti[sent["label"]] += 1

    sentiment.append(max(senti, key=senti.get))

    n_chunks = len(doc)
    p.append(senti["positive"] / n_chunks)
    n.append(senti["negative"] / n_chunks)
    neu.append(senti["neutral"] / n_chunks)

it_df_factor["sentiment"] = sentiment
it_df_factor["positive"] = p
it_df_factor["negative"] = n
it_df_factor["neutral"] = neu

it_df_factor.to_pickle('data/it_df_sentiments_bert_whole_len_no_dup.pkl')

    


In [None]:
# Number of non-null 'id' values per predicted sentiment label.
it_df_factor.groupby('sentiment')['id'].count()

In [None]:
# Show every row whose sentiment label is "negative".
it_df_factor[it_df_factor["sentiment"] == "negative"]

## Sentita

In [None]:

import pickle
from collections import defaultdict

import pandas as pd
import spacy
from tqdm import tqdm

it_nlp = spacy.load("it_core_news_sm")
from sentita import calculate_polarity

it_df_factor = pd.read_pickle('data/it_df_factor.pkl')
it_df_factor = it_df_factor.drop_duplicates(
    subset=['year', "text", "label", "region", "canton", "category", "language"],
    ignore_index=True,
)

# Per-document results: majority label and the share of sentences per label.
p = []
n = []
neu = []
sentiment = []
# Raw scores of every scored sentence across all documents (used for plots).
pos_scores = []
neg_scores = []

for text in tqdm(it_df_factor["text"], position=0, leave=True):
    doc = it_nlp(text)
    # Iterating a spaCy Doc directly yields tokens; `doc.sents` yields the
    # sentence spans that the polarity model is meant to score.
    sentences = [sent.text for sent in doc.sents]

    if not sentences:
        # Empty text: nothing to score. Record it as neutral rather than
        # failing on max() of an empty dict / division by zero.
        sentiment.append("neutral")
        p.append(0.0)
        n.append(0.0)
        neu.append(0.0)
        continue

    results, polarities = calculate_polarity(sentences)
    # Each polarity entry is indexed as [positive score, negative score].
    pos = [pair[0] for pair in polarities]
    neg = [pair[1] for pair in polarities]
    pos_scores.extend(pos)
    neg_scores.extend(neg)

    senti = defaultdict(int)
    for pos_score, neg_score in zip(pos, neg):
        # NOTE(review): the cut-offs are asymmetric (0.6 vs 0.1) — confirm
        # they are still appropriate for sentence-level scores.
        if pos_score > neg_score and pos_score > 0.6:
            senti["positive"] += 1
        elif neg_score > pos_score and neg_score > 0.1:
            senti["negative"] += 1
        else:
            senti["neutral"] += 1

    sentiment.append(max(senti, key=senti.get))

    n_sentences = len(pos)
    p.append(senti["positive"] / n_sentences)
    n.append(senti["negative"] / n_sentences)
    neu.append(senti["neutral"] / n_sentences)

it_df_factor["sentiment"] = sentiment
it_df_factor["positive"] = p
it_df_factor["negative"] = n
it_df_factor["neutral"] = neu

it_df_factor.to_pickle('data/it_df_sentiments_sentita_whole_len_no_dup.pkl')

    


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Size and range of the collected positive scores. The length is printed
# explicitly: a bare expression in the middle of a cell is silently discarded.
print(len(pos_scores))
print(min(pos_scores))
print(max(pos_scores))

# Violin plot of all scores, plus a small inset axes restricted to
# the interval [-0.15, 0.15].
sns.violinplot(x=pos_scores)
ax2 = plt.axes([0.2, 0.6, .2, .2], facecolor='y')
sns.violinplot(x=pos_scores, ax=ax2)
ax2.set_title('zoom')
ax2.set_xlim([-0.15, 0.15])

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Size and range of the collected negative scores. The length is printed
# explicitly: a bare expression in the middle of a cell is silently discarded.
print(len(neg_scores))
print(min(neg_scores))
print(max(neg_scores))

# Violin plot of all scores, plus a small inset axes restricted to
# the interval [-0.15, 0.15].
sns.violinplot(x=neg_scores)
ax2 = plt.axes([0.2, 0.6, .2, .2], facecolor='y')
sns.violinplot(x=neg_scores, ax=ax2)
ax2.set_title('zoom')
ax2.set_xlim([-0.15, 0.15])

## NLTK with sentiment lexicon #1 (Ragusalex)


In [None]:
POS_WORDS_FILE = "./italian_sentiment/pos.words.txt"
NEG_WORDS_FILE = "./italian_sentiment/neg.words.txt"

# Maps each lexicon word to 'positive' or 'negative'. The negative list is
# read last, so a word present in both files ends up labelled 'negative'.
sentiment_lookup = {}
for lexicon_path, polarity in ((POS_WORDS_FILE, 'positive'), (NEG_WORDS_FILE, 'negative')):
    # `with` closes the file handle once the list has been read.
    with open(lexicon_path, 'r', encoding="latin1") as lexicon_file:
        for line in lexicon_file:
            word = line.strip()
            # Skip blank lines so the empty string never becomes a lexicon key.
            if word:
                sentiment_lookup[word] = polarity

In [None]:
# Print the last ten entries that were inserted into the lexicon lookup.
for word, polarity in list(sentiment_lookup.items())[-10:]:
    print(f"{word}: {polarity}")

In [None]:
import pickle
from collections import defaultdict

import pandas as pd
import spacy
from nltk import word_tokenize, sent_tokenize
from tqdm import tqdm

it_df_factor = pd.read_pickle('data/it_df_factor.pkl')
it_df_factor = it_df_factor.drop_duplicates(
    subset=['year', "text", "label", "region", "canton", "category", "language"],
    ignore_index=True,
)

# One entry per document in each of these lists.
tokens = []                # all tokens of the document
sentiment = []             # per-token label: positive / negative / neutral
sentiment_score_pos = []   # number of sentences with more positive than negative hits
sentiment_score_neg = []   # number of sentences with more negative than positive hits
sentiment_score_mix = []   # number of sentences with equally many hits of each polarity
sentiment_score = []       # aggregated score divided by the number of sentences

for text in tqdm(it_df_factor["text"], position=0, leave=True):
    sentilist = []
    tokensent = []
    senti = defaultdict(int)
    sentences = sent_tokenize(text, language="italian")

    for s in sentences:
        token = word_tokenize(s, language="italian")
        tokensent.extend(token)

        # Count lexicon hits of each polarity within this sentence.
        n_pos = 0
        n_neg = 0
        for tok in token:
            polarity = sentiment_lookup.get(tok.lower())
            if polarity is None:
                sentilist.append("neutral")
                continue
            sentilist.append(polarity)
            if polarity == "positive":
                n_pos += 1
            else:
                n_neg += 1

        if n_pos == 0 and n_neg == 0:
            senti["neutral"] += 1
        elif n_pos > n_neg:
            senti["positive"] += 1
        elif n_neg > n_pos:
            senti["negative"] += 1
        else:
            senti["mixed"] += 1

    tokens.append(tokensent)
    sentiment.append(sentilist)

    # NOTE(review): mixed sentences add +1 to the aggregate, exactly like
    # positive ones — confirm this is intended.
    aggregated = senti["mixed"] + senti["positive"] - senti["negative"]
    # A text without any sentence gets score 0.0 instead of a ZeroDivisionError.
    sentiscore = aggregated / len(sentences) if sentences else 0.0

    sentiment_score_neg.append(senti["negative"])
    sentiment_score_pos.append(senti["positive"])
    sentiment_score_mix.append(senti["mixed"])
    sentiment_score.append(sentiscore)

it_df_factor["lookup_sentiment"] = sentiment
it_df_factor["tokens"] = tokens
it_df_factor["sentiment_score_pos"] = sentiment_score_pos
it_df_factor["sentiment_score_neg"] = sentiment_score_neg
it_df_factor["sentiment_score_mixed"] = sentiment_score_mix
it_df_factor["sentiment_score"] = sentiment_score
it_df_factor.to_pickle('data/it_df_sentiments_lexicon1_no_dup.pkl')

print(it_df_factor.head())

In [None]:
# Distinct aggregated sentiment scores, in ascending order.
sorted(it_df_factor["sentiment_score"].unique())

In [None]:
# Reload the saved lexicon-1 results and keep a handle on the score column.
df = pd.read_pickle(
    'data/it_df_sentiments_lexicon1_no_dup.pkl'
)
sent_stats = df["sentiment_score"]

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Size and range of the document scores. The length is printed explicitly:
# a bare expression in the middle of a cell is silently discarded.
print(len(sent_stats))
print(min(sent_stats))
print(max(sent_stats))

# Violin plot of all scores, plus a small inset axes restricted to
# the interval [-0.025, 0.025].
sns.violinplot(x=sent_stats)
ax2 = plt.axes([0.2, 0.6, .2, .2], facecolor='y')
sns.violinplot(x=sent_stats, ax=ax2)
ax2.set_title('zoom')
ax2.set_xlim([-0.025, 0.025])


In [None]:
# Cut-offs on the aggregated score: at or above `threshold_pos` is positive,
# at or below `-threshold_neg` is negative, everything in between is neutral.
threshold_pos = 0.6
threshold_neg = 0.7

df_neg = sent_stats[sent_stats <= -threshold_neg]
df_pos = sent_stats[sent_stats >= threshold_pos]
print(len(df_neg))
print(len(df_pos))

# The negative test comes first, mirroring the order of the cut-off checks.
sentiments = [
    "negative" if s <= -threshold_neg
    else "positive" if s >= threshold_pos
    else "neutral"
    for s in sent_stats
]

# Store the labels and overwrite the pickle the scores were loaded from.
df["lex1_sentiment"] = sentiments
df.to_pickle('data/it_df_sentiments_lexicon1_no_dup.pkl')

In [None]:
import IPython
import markdown

# One list of markup fragments per document (documents without tokens are skipped).
sentences = []
for row_tokens, row_labels in tqdm(
    zip(it_df_factor["tokens"], it_df_factor["lookup_sentiment"]),
    total=len(it_df_factor),
    position=0,
    leave=True,
):
    visual = []
    for tok, label in zip(row_tokens, row_labels):
        if label == "negative":
            # crimson background marks negative lexicon hits
            visual.append('<code style="background:crimson;color:white"> ' + tok + ' </code>')
        elif label == "positive":
            # mediumseagreen background marks positive lexicon hits
            visual.append('<code style="background:mediumseagreen;color:white"> ' + tok + ' </code>')
        else:
            visual.append(" " + tok + " ")
    # Append right after building the fragments so the last document is
    # exported too (appending at the start of the next iteration dropped it).
    if visual:
        sentences.append(visual)


def Markdown(string):
    """Render a markdown string as HTML for display in this notebook."""
    return IPython.display.HTML(markdown.markdown(string))


# Convert every document to HTML; use display(Markdown(" ".join(s))) to show
# one inline in the notebook instead.
html_export = [markdown.markdown(" ".join(s)) for s in sentences]

# export to current working directory
with open("./it_sentiments.html", "w", encoding="UTF-8") as f:
    f.writelines(html_export)

In [None]:
# Distinct values of the negative-sentence count, in ascending order.
sorted(it_df_factor["sentiment_score_neg"].unique())

In [None]:
# Distinct values of the positive-sentence count, in ascending order.
sorted(it_df_factor["sentiment_score_pos"].unique())

In [None]:
# Now visualizing only extreme cases: rows with sentiment_score_neg >= 30
# or sentiment_score_pos >= 25.
import IPython
import markdown

# Chain reset_index instead of mutating the filtered frame in place.
extreme_df = it_df_factor[
    (it_df_factor["sentiment_score_neg"] >= 30) | (it_df_factor["sentiment_score_pos"] >= 25)
].reset_index(drop=True)

# Only the first ten extreme documents are rendered.
sample = extreme_df.head(10)

sentences = []
for row_tokens, row_labels in tqdm(
    zip(sample["tokens"], sample["lookup_sentiment"]),
    total=len(sample),
    position=0,
    leave=True,
):
    visual = []
    for tok, label in zip(row_tokens, row_labels):
        if label == "negative":
            # crimson background marks negative lexicon hits
            visual.append('<code style="background:crimson;color:white"> ' + tok + ' </code>')
        elif label == "positive":
            # mediumseagreen background marks positive lexicon hits
            visual.append('<code style="background:mediumseagreen;color:white"> ' + tok + ' </code>')
        else:
            visual.append(" " + tok + " ")
    # Append right after building the fragments so the last document is
    # shown too (appending at the start of the next iteration dropped it).
    if visual:
        sentences.append(visual)


def Markdown(string):
    """Render a markdown string as HTML for display in this notebook."""
    return IPython.display.HTML(markdown.markdown(string))


# for instance based inspection use row index and score
for s in sentences:
    # display it in this notebook
    display(Markdown(" ".join(s)))
    


## NLTK with sentiment lexicon #2 (Porculex)


In [None]:
POS_WORDS_FILE = "./italian_sentiment/posITA.txt"
NEG_WORDS_FILE = "./italian_sentiment/negITA.txt"

# Maps each lexicon word to 'positive' or 'negative'. The negative list is
# read last, so a word present in both files ends up labelled 'negative'.
sentiment_lookup = {}
for lexicon_path, polarity in ((POS_WORDS_FILE, 'positive'), (NEG_WORDS_FILE, 'negative')):
    # `with` closes the file handle once the list has been read.
    with open(lexicon_path, 'r', encoding="latin1") as lexicon_file:
        # The first 17 lines of each file are skipped (presumably a header —
        # verify against the lexicon files).
        for line in lexicon_file.readlines()[17:]:
            word = line.strip()
            # Skip blank lines so the empty string never becomes a lexicon key.
            if word:
                sentiment_lookup[word] = polarity

# Peek at the first ten entries.
for k, v in list(sentiment_lookup.items())[:10]:
    print(f"{k}: {v}")

In [None]:
import pickle
from collections import defaultdict

import pandas as pd
import spacy
from nltk import word_tokenize, sent_tokenize
from tqdm import tqdm

it_df_factor = pd.read_pickle('data/it_df_factor.pkl')
it_df_factor = it_df_factor.drop_duplicates(
    subset=['year', "text", "label", "region", "canton", "category", "language"],
    ignore_index=True,
)

# One entry per document in each of these lists.
tokens = []                # all tokens of the document
sentiment = []             # per-token label: positive / negative / neutral
sentiment_score_pos = []   # number of sentences with more positive than negative hits
sentiment_score_neg = []   # number of sentences with more negative than positive hits
sentiment_score_mix = []   # number of sentences with equally many hits of each polarity
sentiment_score = []       # aggregated score divided by the number of sentences

for text in tqdm(it_df_factor["text"], position=0, leave=True):
    sentilist = []
    tokensent = []
    senti = defaultdict(int)
    sentences = sent_tokenize(text, language="italian")

    for s in sentences:
        token = word_tokenize(s, language="italian")
        tokensent.extend(token)

        # Count lexicon hits of each polarity within this sentence.
        n_pos = 0
        n_neg = 0
        for tok in token:
            polarity = sentiment_lookup.get(tok.lower())
            if polarity is None:
                sentilist.append("neutral")
                continue
            sentilist.append(polarity)
            if polarity == "positive":
                n_pos += 1
            else:
                n_neg += 1

        if n_pos == 0 and n_neg == 0:
            senti["neutral"] += 1
        elif n_pos > n_neg:
            senti["positive"] += 1
        elif n_neg > n_pos:
            senti["negative"] += 1
        else:
            senti["mixed"] += 1

    tokens.append(tokensent)
    sentiment.append(sentilist)

    # NOTE(review): mixed sentences add +1 to the aggregate, exactly like
    # positive ones — confirm this is intended.
    aggregated = senti["mixed"] + senti["positive"] - senti["negative"]
    # A text without any sentence gets score 0.0 instead of a ZeroDivisionError.
    sentiscore = aggregated / len(sentences) if sentences else 0.0

    sentiment_score_neg.append(senti["negative"])
    sentiment_score_pos.append(senti["positive"])
    sentiment_score_mix.append(senti["mixed"])
    sentiment_score.append(sentiscore)

it_df_factor["lookup_sentiment"] = sentiment
it_df_factor["tokens"] = tokens
it_df_factor["sentiment_score_pos"] = sentiment_score_pos
it_df_factor["sentiment_score_neg"] = sentiment_score_neg
it_df_factor["sentiment_score_mixed"] = sentiment_score_mix
it_df_factor["sentiment_score"] = sentiment_score
it_df_factor.to_pickle('data/it_df_sentiments_lexicon2_no_dup.pkl')

print(it_df_factor.head())

In [None]:
# Distinct aggregated sentiment scores, in ascending order.
sorted(it_df_factor["sentiment_score"].unique())

In [None]:
# Reload the saved lexicon-2 results and keep a handle on the score column.
df = pd.read_pickle(
    'data/it_df_sentiments_lexicon2_no_dup.pkl'
)
sent_stats = df["sentiment_score"]

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Size and range of the document scores. The length is printed explicitly:
# a bare expression in the middle of a cell is silently discarded.
print(len(sent_stats))
print(min(sent_stats))
print(max(sent_stats))

# Violin plot of all scores, plus a small inset axes restricted to
# the interval [-0.025, 0.025].
sns.violinplot(x=sent_stats)
ax2 = plt.axes([0.2, 0.6, .2, .2], facecolor='y')
sns.violinplot(x=sent_stats, ax=ax2)
ax2.set_title('zoom')
ax2.set_xlim([-0.025, 0.025])


In [None]:
# Cut-offs on the aggregated score: at or above `threshold_pos` is positive,
# at or below `-threshold_neg` is negative, everything in between is neutral.
threshold_pos = 0.4
threshold_neg = 0.7

df_neg = sent_stats[sent_stats <= -threshold_neg]
df_pos = sent_stats[sent_stats >= threshold_pos]
print(len(df_neg))
print(len(df_pos))

# The negative test comes first, mirroring the order of the cut-off checks.
sentiments = [
    "negative" if s <= -threshold_neg
    else "positive" if s >= threshold_pos
    else "neutral"
    for s in sent_stats
]

# Store the labels and overwrite the pickle the scores were loaded from.
df["lex2_sentiment"] = sentiments
df.to_pickle('data/it_df_sentiments_lexicon2_no_dup.pkl')

In [None]:
# Distinct values of the negative-sentence count, in ascending order.
sorted(it_df_factor["sentiment_score_neg"].unique())

In [None]:
# Distinct values of the positive-sentence count, in ascending order.
sorted(it_df_factor["sentiment_score_pos"].unique())

In [None]:
import IPython
import markdown

# One list of markup fragments per document (documents without tokens are skipped).
sentences = []
for row_tokens, row_labels in tqdm(
    zip(it_df_factor["tokens"], it_df_factor["lookup_sentiment"]),
    total=len(it_df_factor),
    position=0,
    leave=True,
):
    visual = []
    for tok, label in zip(row_tokens, row_labels):
        if label == "negative":
            # crimson background marks negative lexicon hits
            visual.append('<code style="background:crimson;color:white"> ' + tok + ' </code>')
        elif label == "positive":
            # mediumseagreen background marks positive lexicon hits
            visual.append('<code style="background:mediumseagreen;color:white"> ' + tok + ' </code>')
        else:
            visual.append(" " + tok + " ")
    # Append right after building the fragments so the last document is
    # exported too (appending at the start of the next iteration dropped it).
    if visual:
        sentences.append(visual)


def Markdown(string):
    """Render a markdown string as HTML for display in this notebook."""
    return IPython.display.HTML(markdown.markdown(string))


# Convert every document to HTML; use display(Markdown(" ".join(s))) to show
# one inline in the notebook instead.
html_export = [markdown.markdown(" ".join(s)) for s in sentences]

# export to current working directory
with open("./it_sentiments2.html", "w", encoding="UTF-8") as f:
    f.writelines(html_export)

In [None]:
# Visualize only extreme cases: rows with sentiment_score_neg >= 30
# or sentiment_score_pos >= 20.
import IPython
import markdown

# Chain reset_index instead of mutating the filtered frame in place.
extreme_df = it_df_factor[
    (it_df_factor["sentiment_score_neg"] >= 30) | (it_df_factor["sentiment_score_pos"] >= 20)
].reset_index(drop=True)

# Only the first ten extreme documents are rendered.
sample = extreme_df.head(10)

sentences = []
for row_tokens, row_labels in tqdm(
    zip(sample["tokens"], sample["lookup_sentiment"]),
    total=len(sample),
    position=0,
    leave=True,
):
    visual = []
    for tok, label in zip(row_tokens, row_labels):
        if label == "negative":
            # crimson background marks negative lexicon hits
            visual.append('<code style="background:crimson;color:white"> ' + tok + ' </code>')
        elif label == "positive":
            # mediumseagreen background marks positive lexicon hits
            visual.append('<code style="background:mediumseagreen;color:white"> ' + tok + ' </code>')
        else:
            visual.append(" " + tok + " ")
    # Append right after building the fragments so the last document is
    # shown too (appending at the start of the next iteration dropped it).
    if visual:
        sentences.append(visual)


def Markdown(string):
    """Render a markdown string as HTML for display in this notebook."""
    return IPython.display.HTML(markdown.markdown(string))


# for instance based inspection use row index and score
for s in sentences:
    # display it in this notebook
    display(Markdown(" ".join(s)))

## Inter-Annotator Agreement


In [None]:
import pickle

import pandas as pd

# Reload the saved output of each of the four sentiment approaches.
bert_df = pd.read_pickle("data/it_df_sentiments_bert_whole_len_no_dup.pkl")
sentita_df = pd.read_pickle('data/it_df_sentiments_sentita_whole_len_no_dup.pkl')
lex1_df = pd.read_pickle('data/it_df_sentiments_lexicon1_no_dup.pkl')
lex2_df = pd.read_pickle('data/it_df_sentiments_lexicon2_no_dup.pkl')

# For every approach, print a heading followed by the number of non-null
# 'id' values per sentiment label.
for heading, frame, label_column in (
    ("bert:", bert_df, 'sentiment'),
    ("sentita:", sentita_df, 'sentiment'),
    ("lex1 (ragusa):", lex1_df, 'lex1_sentiment'),
    ("lex2 (porcu):", lex2_df, 'lex2_sentiment'),
):
    print(heading)
    print(frame.groupby(label_column).count()['id'])

In [None]:
from sklearn.metrics import cohen_kappa_score

# Label column of each approach, in the order the pairs are reported.
annotations = [
    ("BERT", bert_df["sentiment"]),
    ("Sentita", sentita_df["sentiment"]),
    ("Lex1", lex1_df["lex1_sentiment"]),
    ("Lex2", lex2_df["lex2_sentiment"]),
]

print("Pairwise Cohen Kappa:")
# Compare each approach with every approach listed after it.
for position, (name_a, labels_a) in enumerate(annotations):
    for name_b, labels_b in annotations[position + 1:]:
        kappa = cohen_kappa_score(labels_a, labels_b)
        print(f"{name_a} and {name_b}: {kappa!s}")


In [None]:
# Multi-coder kappa over all four approaches, via NLTK's AnnotationTask.
from nltk import agreement

s1 = bert_df["sentiment"].tolist()
s2 = sentita_df["sentiment"].tolist()
s3 = lex1_df["lex1_sentiment"].tolist()
s4 = lex2_df["lex2_sentiment"].tolist()

# One [coder, item, label] triple per (approach, document) pair; coders are
# numbered 1-4 and items are the row positions.
formatted_codes = [
    [coder, item, label]
    for coder, labels in enumerate((s1, s2, s3, s4), start=1)
    for item, label in enumerate(labels)
]
ratingtask = agreement.AnnotationTask(data=formatted_codes)
print('Fleiss\'s Kappa:', ratingtask.multi_kappa())