In [1]:
import re
import pandas as pd

import spacy
import nltk
from nltk.corpus import stopwords

from nrclex import NRCLex

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer
from textblob.sentiments import PatternAnalyzer

import eng_spacysentiment

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.nn.functional import softmax



In [2]:
#preprocessing
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

nlp = spacy.load("en_core_web_sm")

#vader
analyzer = SentimentIntensityAnalyzer()

#spacy
nlp2 = eng_spacysentiment.load()

#bert
MODEL_NAME = "nlptown/bert-base-multilingual-uncased-sentiment"  # Utile perché dovrebbe funzionare anche in italiano
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

#distilbert
MODEL_NAME2 = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer2 = AutoTokenizer.from_pretrained(MODEL_NAME2)
model2 = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME2)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/sylcherry/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
2025-02-27 10:14:53.874627: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-02-27 10:14:54.137528: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-02-27 10:14:54.139451: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-27 10:14:54.603246: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AV

In [3]:
def preprocess(text):
    text = text.lower()
    text = re.sub(r"[^\w\s]", "", text)
    text = re.sub(r"\d+", "", text)
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if token.text not in stop_words and not token.is_punct]
    return tokens

df = pd.read_csv('./csv_chunks_en_filtered.csv')

df['tokens'] = df['chunk'].apply(preprocess)

emotion_mapping = {
    'anger': 'anger',
    'anticipation': 'anticipation',
    'disgust': 'disgust',
    'fear': 'fear',
    'joy': 'joy',
    'sadness': 'sadness',
    'surprise': 'surprise',
    'trust': 'trust',
    'anticip': 'anticipation',
    'positive': 'positive',
    'negative': 'negative'
}

final_emotions = list(set(emotion_mapping.values()))

def sentiment_NCR(tokens):
    total_emotions = {emotion: 0 for emotion in final_emotions}
    emotion_count = 0
    
    for text in tokens:
        emotion = NRCLex(text)
        total_score = sum(emotion.affect_frequencies.values())
    
        if total_score > 0:
            normalized_emotions = {emotion_mapping.get(emotion_name, None): emotion_score / total_score
                                for emotion_name, emotion_score in emotion.affect_frequencies.items()
                                if emotion_mapping.get(emotion_name, None)}
            
            for emotion_name, normalized_score in normalized_emotions.items():
                total_emotions[emotion_name] += normalized_score
            
            emotion_count += 1

    average_emotions = {emotion_name: (score / emotion_count) if emotion_count > 0 else 0
                        for emotion_name, score in total_emotions.items()}
    
    total_sum = sum(average_emotions.values())
    if total_sum > 0:
        average_emotions = {key: round(value / total_sum, 3) for key, value in average_emotions.items()}
    
    return average_emotions

df[final_emotions] = df['tokens'].apply(sentiment_NCR).apply(pd.Series)

df['strongest_emotion'] = df[final_emotions].idxmax(axis=1)
df['tot_pos'] = df[['joy', 'trust', 'positive', 'surprise', 'anticipation']].sum(axis=1)
df['tot_neg'] = df[['sadness', 'disgust', 'fear', 'anger', 'negative']].sum(axis=1)

df

Unnamed: 0,ID_file,leg,date,class,language,surname,name,year_birth,gender,group,...,trust,sadness,negative,surprise,fear,positive,disgust,strongest_emotion,tot_pos,tot_neg
0,47914,XVI,06/11/2024,DEMANIO E PATRIMONIO REGIONALE,en,Testolin,Renzo,1968.0,M,UV,...,0.144,0.026,0.053,0.047,0.026,0.539,0.000,positive,0.895,0.105
1,47914,XVI,06/11/2024,DEMANIO E PATRIMONIO REGIONALE,en,Guichardaz,Erika,1976.0,F,PCP,...,0.250,0.000,0.000,0.000,0.000,0.750,0.000,positive,1.000,0.000
2,47915,XVI,06/11/2024,"CONSIGLIO REGIONALE, ORDINE PUBBLICO - FORZE A...",en,Testolin,Renzo,1968.0,M,UV,...,0.075,0.121,0.138,0.062,0.037,0.300,0.008,positive,0.687,0.312
3,47915,XVI,06/11/2024,"CONSIGLIO REGIONALE, ORDINE PUBBLICO - FORZE A...",en,Lucianaz,Diego,1963.0,M,RV,...,0.165,0.059,0.112,0.025,0.090,0.330,0.009,positive,0.717,0.283
4,48019,XVI,21/11/2024,"SPORT E TEMPO LIBERO, Infrastrutture ricreativ...",en,Lucianaz,Diego,1963.0,M,RV,...,0.125,0.042,0.167,0.000,0.167,0.375,0.000,positive,0.625,0.376
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
443,48041,XVI,09/12/2024,"CONSIGLIO REGIONALE, Attività consiliare d'aula",en,Testolin,Renzo,1968.0,M,UV,...,0.250,0.033,0.133,0.020,0.000,0.370,0.033,positive,0.780,0.219
444,48041,XVI,09/12/2024,"CONSIGLIO REGIONALE, Attività consiliare d'aula",en,Minelli,Chiara,1966.0,F,PCP,...,0.264,0.019,0.084,0.060,0.047,0.319,0.013,positive,0.811,0.190
445,48050,XVI,09/12/2024,BILANCIO E CONTABILITA' REGIONALE,en,Malacrinò,Antonino,1977.0,M,FP-PD,...,0.336,0.028,0.111,0.011,0.028,0.342,0.002,positive,0.826,0.175
446,48050,XVI,09/12/2024,BILANCIO E CONTABILITA' REGIONALE,en,Aggravi,Stefano,1985.0,M,RV,...,0.279,0.027,0.124,0.014,0.049,0.337,0.007,positive,0.775,0.224


In [None]:
def sentiment_VADER(testo):
    sentiment = analyzer.polarity_scores(testo)
    return sentiment['pos'], sentiment['neg'], sentiment['neu'], sentiment['compound']

df = pd.read_csv('csv_chunks_en_filtered.csv')

df[['pos', 'neg', 'neu', 'polarità']] = df['chunk'].apply(lambda x: pd.Series(sentiment_VADER(x)))

df

In [None]:
def sentiment_textblob_def(testo):
    blob = TextBlob(testo, analyzer=PatternAnalyzer())
    polarità = blob.sentiment.polarity 
    soggettività = blob.sentiment.subjectivity
    return polarità, soggettività

print('Done :)')

def sentiment_textblob_bayes(testo):
    blob = TextBlob(testo, analyzer=NaiveBayesAnalyzer())
    classificazione = blob.sentiment.classification  
    p_pos = blob.sentiment.p_pos 
    p_neg = blob.sentiment.p_neg
    return classificazione, p_pos, p_neg

df = pd.read_csv('csv_chunks_en_filtered.csv')

df[['polarità', 'soggettività']] = df['chunk'].apply(lambda x: pd.Series(sentiment_textblob_def(x)))
df[['classificazione', 'p_pos', 'p_neg']] = df['chunk'].apply(lambda x: pd.Series(sentiment_textblob_bayes(x)))

df

In [None]:
def sentiment_spacy(text):
    doc = nlp2(text)
    return doc.cats

df = pd.read_csv('csv_chunks_en_filtered.csv')

df[['positive','negative', 'neutral']] = df['chunk'].apply(sentiment_spacy).apply(pd.Series)

df

In [4]:
def sentiment_bert(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    probs = softmax(outputs.logits, dim=-1).squeeze().tolist()
    
    sentiment_scores = {
        "very_negative": probs[0],
        "negative": probs[1],
        "neutral": probs[2],
        "positive": probs[3],
        "very_positive": probs[4]
    }
    
    return sentiment_scores

df = pd.read_csv('csv_chunks_en_filtered.csv')

df = df.join(df['chunk'].apply(sentiment_bert).apply(pd.Series))

df2 = pd.read_csv('csv_chunks_it_filtered.csv')

df2 = df2.join(df2['chunk'].apply(sentiment_bert).apply(pd.Series))

In [5]:
df

Unnamed: 0,ID_file,leg,date,class,language,surname,name,year_birth,gender,group,position,length,chunk,very_negative,negative,neutral,positive,very_positive
0,47914,XVI,06/11/2024,DEMANIO E PATRIMONIO REGIONALE,en,Testolin,Renzo,1968.0,M,UV,2,1412,The Guichardaz advisor requests information on...,0.094237,0.155690,0.246998,0.361006,0.142068
1,47914,XVI,06/11/2024,DEMANIO E PATRIMONIO REGIONALE,en,Guichardaz,Erika,1976.0,F,PCP,4,882,Sarebbe interesting feeling colui che péroli i...,0.203844,0.416074,0.265736,0.082525,0.031821
2,47915,XVI,06/11/2024,"CONSIGLIO REGIONALE, ORDINE PUBBLICO - FORZE A...",en,Testolin,Renzo,1968.0,M,UV,2,866,The Lucianaz advisor asks to know the latest n...,0.056520,0.123463,0.296272,0.408376,0.115370
3,47915,XVI,06/11/2024,"CONSIGLIO REGIONALE, ORDINE PUBBLICO - FORZE A...",en,Lucianaz,Diego,1963.0,M,RV,4,1875,"Well, so the regional administration, the gove...",0.294534,0.337903,0.195414,0.124930,0.047219
4,48019,XVI,21/11/2024,"SPORT E TEMPO LIBERO, Infrastrutture ricreativ...",en,Lucianaz,Diego,1963.0,M,RV,2,609,"Yes, quickly, the questions look at the situat...",0.105710,0.205657,0.295763,0.260798,0.132072
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
443,48041,XVI,09/12/2024,"CONSIGLIO REGIONALE, Attività consiliare d'aula",en,Testolin,Renzo,1968.0,M,UV,4,1561,J'interviens seulement pour rendre hommage moi...,0.092328,0.186853,0.207979,0.265860,0.246980
444,48041,XVI,09/12/2024,"CONSIGLIO REGIONALE, Attività consiliare d'aula",en,Minelli,Chiara,1966.0,F,PCP,6,3754,I intervene on the communications of President...,0.226006,0.269512,0.203472,0.213193,0.087817
445,48050,XVI,09/12/2024,BILANCIO E CONTABILITA' REGIONALE,en,Malacrinò,Antonino,1977.0,M,FP-PD,2,18782,Good afternoon to all. The documents that we a...,0.026209,0.053132,0.151910,0.530848,0.237902
446,48050,XVI,09/12/2024,BILANCIO E CONTABILITA' REGIONALE,en,Aggravi,Stefano,1985.0,M,RV,4,16363,"Before moving on to the report, I think it is ...",0.087286,0.237192,0.266035,0.315003,0.094484


In [18]:
df_temp = df2[['ID_file', 'position', 'chunk', 'very_negative', 'negative', 'neutral', 'positive', 'very_positive']]
df_merged = df.merge(df_temp, on=['ID_file', 'position'], suffixes=('_en', '_it'))

correlation_matrix = {
    sentiment: df_merged[f"{sentiment}_en"].corr(df_merged[f"{sentiment}_it"])
    for sentiment in ['very_negative', 'negative', 'neutral', 'positive', 'very_positive']
}

print(correlation_matrix)

{'very_negative': 0.7743805329980478, 'negative': 0.715812397843305, 'neutral': 0.7184270036124332, 'positive': 0.7280968157387158, 'very_positive': 0.7670474809978999}


In [None]:
def sentiment_distilbert(text):
    inputs = tokenizer2(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model2(**inputs)
    probs = softmax(outputs.logits, dim=-1).squeeze().tolist()
    
    sentiment_scores = {
        "negative": probs[0],
        "positive": probs[1]
    }
    
    return sentiment_scores

df = pd.read_csv('csv_chunks_en_filtered.csv')

df = df.join(df['chunk'].apply(sentiment_distilbert).apply(pd.Series))

df

In [21]:
def sentiment_VADER_simple(testo):
    sentiment = analyzer.polarity_scores(testo)
    return 'positive' if sentiment['compound'] > 0 else 'negative'

def sentiment_textblob_def_simple(testo):
    blob = TextBlob(testo, analyzer=PatternAnalyzer())
    polarità = blob.sentiment.polarity 
    return 'positive' if polarità > 0 else 'negative'

def sentiment_textblob_bayes_simple(testo):
    blob = TextBlob(testo, analyzer=NaiveBayesAnalyzer())
    classificazione = blob.sentiment.classification  
    return 'positive' if classificazione == 'pos' else 'negative'

def sentiment_spacy_simple(text):
    doc = nlp2(text)
    sentiment = max(doc.cats, key=doc.cats.get)
    return sentiment

def sentiment_distilbert_simple(text):
    inputs = tokenizer2(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model2(**inputs)
    probs = softmax(outputs.logits, dim=-1).squeeze().tolist()
    
    sentiment_scores = {
        "negative": probs[0],
        "positive": probs[1]
    }
    
    return 'positive' if sentiment_scores['positive'] > sentiment_scores['negative'] else 'negative'

df = pd.read_csv('csv_chunks_en_filtered.csv')

df['VADER'] = df['chunk'].apply(sentiment_VADER_simple)
print('Done VADER :)')
df['TextBlob_def'] = df['chunk'].apply(sentiment_textblob_def_simple)
print('Done TextBlob_def :)')
df['TextBlob_bayes'] = df['chunk'].apply(sentiment_textblob_bayes_simple)
print('Done TextBlob_bayes :)')
df['Spacy'] = df['chunk'].apply(sentiment_spacy_simple)
print('Done Spacy :)')
df['DistilBERT'] = df['chunk'].apply(sentiment_distilbert_simple)
print('Done DistilBERT :)')

df

Done VADER :)
Done TextBlob_def :)
Done TextBlob_bayes :)
Done Spacy :)
Done DistilBERT :)


Unnamed: 0,ID_file,leg,class,language,surname,name,year_birth,gender,group,position,length,chunk,VADER,TextBlob_def,TextBlob_bayes,Spacy,DistilBERT
0,48019,XVI,"SPORT E TEMPO LIBERO, Infrastrutture ricreativ...",en,Lucianaz,Diego,1963,M,RV,2,609,"Yes, quickly, the questions look at the situat...",negative,positive,positive,negative,negative
1,48019,XVI,"SPORT E TEMPO LIBERO, Infrastrutture ricreativ...",en,Testolin,Renzo,1968,M,UV,4,3873,"First of all, I would like to thank the Counci...",positive,positive,positive,positive,negative
2,48019,XVI,"SPORT E TEMPO LIBERO, Infrastrutture ricreativ...",en,Lucianaz,Diego,1963,M,RV,6,557,"Thank you Mr President Testolin, today you sur...",positive,positive,positive,positive,positive
3,47950,XVI,"TRASPORTI E VIABILITÀ, Impianti a fune",en,Lavy,Erik,1995,M,LEGA VDA,2,5261,We have already addressed this issue in the la...,positive,positive,positive,positive,negative
4,47950,XVI,"TRASPORTI E VIABILITÀ, Impianti a fune",en,Bertschy,Luigi Giovanni,1965,M,UV,4,4884,"Thank you, colleague Lavy, also for how you ha...",positive,positive,positive,positive,negative
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
223,48017,XVI,CREDITO,en,Lavy,Erik,1995,M,LEGA VDA,6,2952,"President Testolin, I am quite shocked, also i...",positive,positive,positive,positive,negative
224,48017,XVI,CREDITO,en,Testolin,Renzo,1968,M,UV,8,103,Just to say that populism is allowed in reruns...,positive,negative,positive,positive,negative
225,48018,XVI,"ENTI LOCALI, Comuni",en,Brunod,Dennis,1978,M,RV,2,1801,"At the end of October 2024, we learned from pr...",positive,positive,positive,positive,negative
226,48018,XVI,"ENTI LOCALI, Comuni",en,Testolin,Renzo,1968,M,UV,4,2187,English: Allow me to start off with a bit of a...,positive,positive,positive,positive,negative


In [30]:
columns_to_check = ['VADER', 'TextBlob_def', 'TextBlob_bayes', 'Spacy', 'DistilBERT']

def majority_voting(df, model_columns):
    final_predictions = []
    
    # Per ogni riga del DataFrame (ogni testo)
    for index, row in df.iterrows():
        # Raccogliamo i voti dei modelli per il sentiment (positivo o negativo)
        votes = [row[model] for model in model_columns]
        
        # Votazione maggioritaria (se più di metà dei modelli predice 'positivo', predici 'positivo')
        if votes.count('positive') > votes.count('negative'):
            final_predictions.append('positive')
        else:
            final_predictions.append('negative')
    
    return final_predictions

def consistency_between_models(df, model_columns):
    # Conta il numero di casi in cui tutti i modelli sono d'accordo
    consistency_count = 0
    
    for index, row in df.iterrows():
        if len(set([row[model] for model in model_columns])) == 1:
            consistency_count += 1
    
    total_cases = len(df)
    consistency_percentage = (consistency_count / total_cases) * 100
    return consistency_percentage

# Calcoliamo la consistenza
consistency_percentage = consistency_between_models(df, columns_to_check)
df['Majority voting'] = majority_voting(df, columns_to_check)

print(f"Consistency between models: {consistency_percentage:.2f}%")

df_filtered = df[df[columns_to_check].nunique(axis=1) > 1]

df


Consistency between models: 30.26%


Unnamed: 0,ID_file,leg,class,language,surname,name,year_birth,gender,group,position,length,chunk,VADER,TextBlob_def,TextBlob_bayes,Spacy,DistilBERT,Majority voting
0,48019,XVI,"SPORT E TEMPO LIBERO, Infrastrutture ricreativ...",en,Lucianaz,Diego,1963,M,RV,2,609,"Yes, quickly, the questions look at the situat...",negative,positive,positive,negative,negative,negative
1,48019,XVI,"SPORT E TEMPO LIBERO, Infrastrutture ricreativ...",en,Testolin,Renzo,1968,M,UV,4,3873,"First of all, I would like to thank the Counci...",positive,positive,positive,positive,negative,positive
2,48019,XVI,"SPORT E TEMPO LIBERO, Infrastrutture ricreativ...",en,Lucianaz,Diego,1963,M,RV,6,557,"Thank you Mr President Testolin, today you sur...",positive,positive,positive,positive,positive,positive
3,47950,XVI,"TRASPORTI E VIABILITÀ, Impianti a fune",en,Lavy,Erik,1995,M,LEGA VDA,2,5261,We have already addressed this issue in the la...,positive,positive,positive,positive,negative,positive
4,47950,XVI,"TRASPORTI E VIABILITÀ, Impianti a fune",en,Bertschy,Luigi Giovanni,1965,M,UV,4,4884,"Thank you, colleague Lavy, also for how you ha...",positive,positive,positive,positive,negative,positive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
223,48017,XVI,CREDITO,en,Lavy,Erik,1995,M,LEGA VDA,6,2952,"President Testolin, I am quite shocked, also i...",positive,positive,positive,positive,negative,positive
224,48017,XVI,CREDITO,en,Testolin,Renzo,1968,M,UV,8,103,Just to say that populism is allowed in reruns...,positive,negative,positive,positive,negative,positive
225,48018,XVI,"ENTI LOCALI, Comuni",en,Brunod,Dennis,1978,M,RV,2,1801,"At the end of October 2024, we learned from pr...",positive,positive,positive,positive,negative,positive
226,48018,XVI,"ENTI LOCALI, Comuni",en,Testolin,Renzo,1968,M,UV,4,2187,English: Allow me to start off with a bit of a...,positive,positive,positive,positive,negative,positive
