In [1]:
import re
import pandas as pd

import spacy
import nltk
from nltk.corpus import stopwords

from nrclex import NRCLex

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.nn.functional import softmax

In [None]:
#preprocessing
nltk.download('stopwords')
stop_words_it = set(stopwords.words('italian'))
stop_words_fr = set(stopwords.words('french'))

nlp_it = spacy.load("it_core_news_md")
nlp_fr = spacy.load("fr_core_news_md")

#bert
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

MODEL_NAME = "nlptown/bert-base-multilingual-uncased-sentiment"  # Utile perché dovrebbe funzionare anche in italiano
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME).to(device)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/sylcherry/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
2025-03-06 10:09:00.037953: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-03-06 10:09:00.289830: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-03-06 10:09:00.291720: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-06 10:09:00.734538: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instruct

In [None]:
def preprocess(text, lang):
    if lang == 'fr':
        nlp = nlp_fr
        stop_words = stop_words_fr
    else:
        nlp = nlp_it
        stop_words = stop_words_it
    text = text.lower()
    text = re.sub(r"[^\w\s]", "", text)
    text = re.sub(r"\d+", "", text)
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if token.text not in stop_words and not token.is_punct]
    return tokens

emotion_mapping = {
    'anger': 'anger',
    'anticipation': 'anticipation',
    'disgust': 'disgust',
    'fear': 'fear',
    'joy': 'joy',
    'sadness': 'sadness',
    'surprise': 'surprise',
    'trust': 'trust',
    'anticip': 'anticipation',
    'positive': 'positive',
    'negative': 'negative'
}

final_emotions = list(set(emotion_mapping.values()))

def sentiment_NCR(tokens):
    total_emotions = {emotion: 0 for emotion in final_emotions}
    emotion_count = 0
    
    for text in tokens:
        emotion = NRCLex(text)
        total_score = sum(emotion.affect_frequencies.values())
    
        if total_score > 0:
            normalized_emotions = {emotion_mapping.get(emotion_name, None): emotion_score / total_score
                                for emotion_name, emotion_score in emotion.affect_frequencies.items()
                                if emotion_mapping.get(emotion_name, None)}
            
            for emotion_name, normalized_score in normalized_emotions.items():
                total_emotions[emotion_name] += normalized_score
            
            emotion_count += 1

    average_emotions = {emotion_name: (score / emotion_count) if emotion_count > 0 else 0
                        for emotion_name, score in total_emotions.items()}
    
    total_sum = sum(average_emotions.values())
    if total_sum > 0:
        average_emotions = {key: round(value / total_sum, 3) for key, value in average_emotions.items()}
    
    return average_emotions

def sentiment_bert(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    inputs = {key: value.to(device) for key, value in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    probs = softmax(outputs.logits, dim=-1).squeeze().tolist()

    sentiment_scores = {
        "very_negative_BERT": probs[0],
        "negative_BERT": probs[1],
        "neutral_BERT": probs[2],
        "positive_BERT": probs[3],
        "very_positive_BERT": probs[4]
    }

    return sentiment_scores

def chunk_text(text, max_words=512):

    words = text.split()

    chunks = []

    for i in range(0, len(words), max_words):
        chunk = words[i:i + max_words]
        chunk_text = ' '.join(chunk)
        chunks.append(chunk_text)

    return chunks

def sentiment_analysis_for_long_text(text):
    chunks = chunk_text(text)

    sentiment_results = []

    for chunk in chunks:
        sentiment_results.append(sentiment_bert(chunk))

    sentiment_df = pd.DataFrame(sentiment_results)

    aggregated_sentiment = sentiment_df.mean()

    return aggregated_sentiment

In [5]:
df = pd.read_csv('./csv_chunks_filtered.csv')

df

Unnamed: 0,ID_file,leg,date,class,obj_pos,language,ID_cons,year_birth,gender,group,position,length,chunk
0,47801,XVI,02/10/2024,"CONSIGLIO REGIONALE, Attività consiliare d'aula",1,it,201.0,1975.0,M,SA,2,3141,"""Prima le persone poi le cose"", con queste sem..."
1,47801,XVI,02/10/2024,"CONSIGLIO REGIONALE, Attività consiliare d'aula",1,it,77.0,1975.0,M,UV,4,914,Le groupe de l'Union Valdôtaine tient à transm...
2,47801,XVI,02/10/2024,"CONSIGLIO REGIONALE, Attività consiliare d'aula",1,it,224.0,1986.0,M,FP-PD,6,1290,"Intervengo anch'io per unirmi, sia a nome pers..."
3,47801,XVI,02/10/2024,"CONSIGLIO REGIONALE, Attività consiliare d'aula",1,it,206.0,1966.0,F,PCP,8,1054,I colleghi che mi hanno preceduta hanno già tr...
4,47801,XVI,02/10/2024,"CONSIGLIO REGIONALE, Attività consiliare d'aula",1,it,293.0,1968.0,M,UV,10,857,"Per associarci, anche come Governo regionale, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3144,47712,XVI,25/07/2024,"ASSISTENZA SANITARIA E OSPEDALIERA, ASSISTENZA...",24,it,198.0,1964.0,M,FI,2,3924,Abbiamo presentato quest'iniziativa per portar...
3145,47712,XVI,25/07/2024,"ASSISTENZA SANITARIA E OSPEDALIERA, ASSISTENZA...",24,it,201.0,1975.0,M,SA,4,392,"Il collega Marquis, nel suo ruolo di factotum ..."
3146,47713,XVI,25/07/2024,TURISMO E INDUSTRIA ALBERGHIERA,25,it,185.0,1983.0,M,LEGA VDA,2,3138,"""Modena Skipass"" è stata per quasi 30 anni la ..."
3147,47713,XVI,25/07/2024,TURISMO E INDUSTRIA ALBERGHIERA,25,it,152.0,1962.0,M,UV,4,3125,Ringrazio i colleghi della Lega che hanno volu...


In [None]:
df['tokens'] = df.apply(lambda row: preprocess(row['chunk'], row['language']), axis=1)

#df[final_emotions] = df['tokens'].apply(sentiment_NCR).apply(pd.Series)

#df['strongest_emotion'] = df[final_emotions].idxmax(axis=1)
#df['tot_pos'] = df[['joy', 'trust', 'positive', 'surprise', 'anticipation']].sum(axis=1)
#df['tot_neg'] = df[['sadness', 'disgust', 'fear', 'anger', 'negative']].sum(axis=1)

df[['very_negative_BERT', 'negative_BERT', 'neutral_BERT', 'positive_BERT', 'very_positive_BERT']] = df['chunk'].apply(sentiment_analysis_for_long_text).apply(pd.Series)

df.to_csv('./csv_chunks_sentiment.csv', index=False)

In [19]:
df.to_csv('./csv_chunks_sentiment.csv', index=False)

In [23]:
df = pd.read_csv('./csv_chunks_sentiment.csv')
df

Unnamed: 0,ID_file,leg,date,class,obj_pos,language,ID_cons,year_birth,gender,group,position,length,chunk,tokens,very_negative_BERT,negative_BERT,neutral_BERT,positive_BERT,very_positive_BERT
0,47801,XVI,02/10/2024,"CONSIGLIO REGIONALE, Attività consiliare d'aula",1,it,201.0,1975.0,M,SA,2,3141,"""Prima le persone poi le cose"", con queste sem...","['prima', 'le', 'persone', 'poi', 'le', 'cose'...",0.008774,0.017888,0.075363,0.387717,0.510258
1,47801,XVI,02/10/2024,"CONSIGLIO REGIONALE, Attività consiliare d'aula",1,it,77.0,1975.0,M,UV,4,914,Le groupe de l'Union Valdôtaine tient à transm...,"['le', 'groupe', 'de', 'lunion', 'valdôtaine',...",0.003549,0.008610,0.127961,0.458826,0.401054
2,47801,XVI,02/10/2024,"CONSIGLIO REGIONALE, Attività consiliare d'aula",1,it,224.0,1986.0,M,FP-PD,6,1290,"Intervengo anch'io per unirmi, sia a nome pers...","['intervengo', 'anchio', 'per', 'unirmi', 'sia...",0.009853,0.019888,0.075282,0.395171,0.499806
3,47801,XVI,02/10/2024,"CONSIGLIO REGIONALE, Attività consiliare d'aula",1,it,206.0,1966.0,F,PCP,8,1054,I colleghi che mi hanno preceduta hanno già tr...,"['colleghi', 'che', 'mi', 'hanno', 'preceduta'...",0.011296,0.030934,0.091448,0.370631,0.495692
4,47801,XVI,02/10/2024,"CONSIGLIO REGIONALE, Attività consiliare d'aula",1,it,293.0,1968.0,M,UV,10,857,"Per associarci, anche come Governo regionale, ...","['per', 'associarci', 'anche', 'come', 'govern...",0.004999,0.013256,0.101999,0.432239,0.447508
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1755,47712,XVI,25/07/2024,"ASSISTENZA SANITARIA E OSPEDALIERA, ASSISTENZA...",24,it,198.0,1964.0,M,FI,2,3924,Abbiamo presentato quest'iniziativa per portar...,"['abbiamo', 'presentato', 'questiniziativa', '...",0.077617,0.205141,0.263839,0.342937,0.110465
1756,47712,XVI,25/07/2024,"ASSISTENZA SANITARIA E OSPEDALIERA, ASSISTENZA...",24,it,201.0,1975.0,M,SA,4,392,"Il collega Marquis, nel suo ruolo di factotum ...","['il', 'collega', 'marquis', 'nel', 'suo', 'ru...",0.011394,0.045980,0.193305,0.437839,0.311481
1757,47713,XVI,25/07/2024,TURISMO E INDUSTRIA ALBERGHIERA,25,it,185.0,1983.0,M,LEGA VDA,2,3138,"""Modena Skipass"" è stata per quasi 30 anni la ...","['modena', 'skipass', 'è', 'stata', 'per', 'qu...",0.112292,0.290896,0.296555,0.220730,0.079527
1758,47713,XVI,25/07/2024,TURISMO E INDUSTRIA ALBERGHIERA,25,it,152.0,1962.0,M,UV,4,3125,Ringrazio i colleghi della Lega che hanno volu...,"['ringrazio', 'colleghi', 'della', 'lega', 'ch...",0.067683,0.169414,0.244999,0.343003,0.174901


In [25]:
df_F = df[df['gender']=='F']
df_M = df[df['gender']=='M']

num_col = ['very_negative_BERT', 'negative_BERT', 'neutral_BERT', 'positive_BERT', 'very_positive_BERT']

counts = df_F['ID_file'].value_counts()
counts2 = df_M['ID_file'].value_counts()
mean_F = counts.mean()
mean_M = counts2.mean()

mean = df_F['length'].mean()
mean2 = df_M['length'].mean()

mean_df = df[num_col].mean()
mean_df1 = df_F[num_col].mean()
mean_df2 = df_M[num_col].mean()

# Creazione del DataFrame riassuntivo
df_summary = pd.DataFrame({'df': mean_df, 'df1': mean_df1, 'df2': mean_df2})

# Visualizzazione del risultato
print(mean_F, mean_M, mean, mean2)
print(df_summary)

2.235294117647059 3.8626373626373627 3248.2017543859647 3440.686344238976
                          df       df1       df2
very_negative_BERT  0.125985  0.149686  0.119485
negative_BERT       0.241941  0.275840  0.234029
neutral_BERT        0.268351  0.273333  0.267720
positive_BERT       0.243007  0.205296  0.252521
very_positive_BERT  0.120716  0.095844  0.126245


In [None]:
import scipy.stats as stats

# Lista delle colonne degli score sentiment
sentiment_scores = ['very_negative_BERT', 'negative_BERT', 'neutral_BERT', 'positive_BERT', 'very_positive_BERT']

# Creazione del DataFrame per i risultati del t-test
t_test_results = {}

# Esegui il t-test per ogni score
for score in sentiment_scores:
    # Estrai i dati relativi al punteggio per df_F e df_M
    group_F = df_F[score]
    group_M = df_M[score]
    
    # Esegui il t-test per campioni indipendenti
    t_stat, p_value = stats.ttest_ind(group_F, group_M)  # `equal_var=False` se le varianze non sono uguali
    
    # Salva i risultati
    t_test_results[score] = {'t-statistic': t_stat, 'p-value': p_value}

# Visualizza i risultati
t_test_results_df = pd.DataFrame(t_test_results).T
t_test_results_df

Unnamed: 0,t-statistic,p-value
very_negative_BERT,4.683071,3.044463e-06
negative_BERT,5.889026,4.649461e-09
neutral_BERT,0.962143,0.3361109
positive_BERT,-6.291762,3.958968e-10
very_positive_BERT,-4.491268,7.543886e-06
