In [114]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from scipy.stats import entropy, shapiro, levene, ttest_ind, mannwhitneyu

In [74]:
# Load the three upstream artefacts: sentiment scores, text embeddings and topic metadata.
# The three reads are independent of each other.
sentiment_df = pd.read_csv('../../src/nlp/sentiment/sentiment_scores.csv')
text_embeddings = np.load('../../src/nlp/topic_modeling/user_embeddings.npy')
topic_df = pd.read_csv('../../src/nlp/topic_modeling/global_metadata.csv')

In [75]:
# Attach one embedding per row of topic_df (list() splits the array along its first axis).
# NOTE(review): assumes the rows of text_embeddings are in the same order as topic_df — confirm.
topic_df['embedding'] = list(text_embeddings)

# Join the topic-modelling rows with their sentiment scores and keep only
# the rows scored by the 'bertweet' model.
merged_df = (
    topic_df
    .merge(sentiment_df, how='inner', left_on='id_x', right_on='id')
    .loc[lambda joined: joined['model'] == 'bertweet']
)
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 98944 entries, 1 to 197887
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   author_x             98944 non-null  object 
 1   id_x                 98944 non-null  object 
 2   type                 98944 non-null  object 
 3   community_id         85993 non-null  float64
 4   community_type       98944 non-null  object 
 5   is_hub               98944 non-null  bool   
 6   is_bridge            98944 non-null  bool   
 7   topic                98944 non-null  object 
 8   topic_id             98944 non-null  int64  
 9   embedding            98944 non-null  object 
 10  author_y             98944 non-null  object 
 11  model                98944 non-null  object 
 12  neg_percentage       98944 non-null  float64
 13  neu_percentage       98944 non-null  float64
 14  pos_percentage       98944 non-null  float64
 15  predicted_sentiment  98944 non-null  obj

In [76]:
# Preview the first rows of the merged topic + sentiment table.
merged_df.head()

Unnamed: 0,author_x,id_x,type,community_id,community_type,is_hub,is_bridge,topic,topic_id,embedding,author_y,model,neg_percentage,neu_percentage,pos_percentage,predicted_sentiment,id
1,PsychLegalMind,t3_1lzrb15,post,0.0,Strong community,True,False,17_russia_ukraine_putin_nato,17,"[-0.015391379, 0.005025496, 0.03813486, 0.0105...",PsychLegalMind,bertweet,0.500282,0.491378,0.00834,negative,t3_1lzrb15
3,the_original_Retro,t3_1lys6tq,post,0.0,Strong community,True,False,56_epstein_files_list_release,56,"[0.006398821, -0.040616985, -0.045154493, -0.0...",the_original_Retro,bertweet,0.447602,0.545105,0.007293,neutral,t3_1lys6tq
5,Time_Minute_6036,t3_1lyzum1,post,0.0,Strong community,True,False,-1_israel_trump_right_men,-1,"[-0.053852115, -0.02925583, 0.08934749, -0.011...",Time_Minute_6036,bertweet,0.824866,0.170586,0.004548,negative,t3_1lyzum1
7,najumobi,t3_1lyd2ym,post,0.0,Strong community,True,False,27_tariffs_trade_tariff_products,27,"[-0.035550196, 0.0013031296, 0.08044617, -0.03...",najumobi,bertweet,0.062563,0.631832,0.305604,neutral,t3_1lyd2ym
9,Awesomeuser90,t3_1lx5svi,post,0.0,Strong community,True,False,-1_israel_trump_right_men,-1,"[0.0059204116, -0.020216445, 0.063375555, -0.0...",Awesomeuser90,bertweet,0.02246,0.96508,0.012461,neutral,t3_1lx5svi


### Evaluating topic entropy

In [77]:
# Computing topic entropy for each community
def compute_topic_entropy(df):
    """Compute the normalized Shannon entropy of topics for each community.

    Rows are grouped by ``(community_id, community_type)``. For each group the
    topic frequencies are turned into a base-2 Shannon entropy, which is then
    divided by ``log2(n_topics)`` so the score lies in ``[0, 1]``
    (0 = a single topic, 1 = all observed topics equally frequent).

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns ``community_id``, ``community_type`` and
        ``topic``. Rows with a missing grouping key are dropped by pandas'
        default ``groupby`` behaviour.

    Returns
    -------
    pd.DataFrame
        One row per group with the columns ``community_id``,
        ``community_type``, ``n_texts``, ``n_topics`` and ``entropy_norm``.
        The columns are present even when ``df`` yields no groups, so that
        downstream merges on ``community_id`` keep working.
    """
    columns = ["community_id", "community_type", "n_texts", "n_topics", "entropy_norm"]

    results = []
    for (comm_id, comm_type), group in df.groupby(['community_id', 'community_type']):
        counts = group['topic'].value_counts().values
        n = len(counts)

        # log2(1) == 0, so the normalization is undefined for a single topic;
        # such a community is perfectly homogeneous and gets entropy 0.
        if n > 1:
            H_norm = entropy(counts, base=2) / np.log2(n)
        else:
            H_norm = 0.0

        results.append({
            "community_id": comm_id,
            "community_type": comm_type,
            "n_texts": len(group),
            "n_topics": n,
            "entropy_norm": H_norm
        })

    # Passing `columns` fixes the schema even when `results` is empty.
    return pd.DataFrame(results, columns=columns)

In [78]:
# Per-community normalized topic entropy, computed on the merged table.
topic_entropy_df = compute_topic_entropy(merged_df)
topic_entropy_df

Unnamed: 0,community_id,community_type,n_texts,n_topics,entropy_norm
0,0.0,Strong community,25042,615,0.491160
1,1.0,Strong community,23144,642,0.586895
2,2.0,Strong community,3917,255,0.506603
3,3.0,Weak community,1277,172,0.503045
4,4.0,Weak community,1731,262,0.526512
...,...,...,...,...,...
73,73.0,Weak community,195,58,0.636004
74,74.0,Weak community,215,42,0.619946
75,75.0,Weak community,156,33,0.567305
76,76.0,Strong community,177,45,0.694833


In [79]:
# compute sentiment polarization for communities
def compute_sentiment_polarization(df):
    """Compute a sentiment polarization score for each community.

    Rows are grouped by ``(community_id, community_type)``. For each group the
    mean negative / neutral / positive scores are computed, and the largest of
    the three means is rescaled with ``(max - 1/3) / (1 - 1/3)``.

    NOTE(review): the rescaling maps to ``[0, 1]`` only if the three scores of
    a row are proportions that sum to 1 (the sample rows look like that) —
    confirm against the sentiment pipeline.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns ``community_id``, ``community_type``,
        ``neg_percentage``, ``neu_percentage`` and ``pos_percentage``.

    Returns
    -------
    pd.DataFrame
        One row per group with the columns ``community_id``,
        ``community_type``, ``mean_neg``, ``mean_neu``, ``mean_pos`` and
        ``polarization``. The columns are present even when ``df`` yields no
        groups, so that downstream merges on ``community_id`` keep working.
    """
    columns = ["community_id", "community_type", "mean_neg", "mean_neu", "mean_pos", "polarization"]

    results = []
    for (comm_id, comm_type), group in df.groupby(['community_id', 'community_type']):
        mean_neg = group['neg_percentage'].mean()
        mean_neu = group['neu_percentage'].mean()
        mean_pos = group['pos_percentage'].mean()

        # Share of the dominant sentiment class within the community.
        max_val = np.array([mean_neg, mean_neu, mean_pos]).max()

        # 1/3 is the dominant share of a perfectly balanced three-class mix.
        polarization = (max_val - 1/3) / (1 - 1/3)

        results.append({
            "community_id": comm_id,
            "community_type": comm_type,
            "mean_neg": mean_neg,
            "mean_neu": mean_neu,
            "mean_pos": mean_pos,
            "polarization": polarization
        })

    # Passing `columns` fixes the schema even when `results` is empty.
    return pd.DataFrame(results, columns=columns)


In [80]:
# Per-community sentiment polarization, computed on the merged table.
sentiment_polarization_df = compute_sentiment_polarization(merged_df)
sentiment_polarization_df.head()

Unnamed: 0,community_id,community_type,mean_neg,mean_neu,mean_pos,polarization
0,0.0,Strong community,0.465068,0.472255,0.062677,0.208382
1,1.0,Strong community,0.455507,0.489531,0.054962,0.234296
2,2.0,Strong community,0.257927,0.646905,0.095168,0.470357
3,3.0,Weak community,0.572473,0.387418,0.040109,0.35871
4,4.0,Weak community,0.472263,0.465071,0.062666,0.208394


In [81]:
# Observed range of the polarization score across communities, as (max, min).
sentiment_polarization_df['polarization'].max(), sentiment_polarization_df['polarization'].min()

(0.5401772854905896, 0.19645016630913345)

In [86]:
# Combine the per-community topic entropy and sentiment polarization tables.
# The right-hand frame is `sentiment_polarization_df` (built above); the name
# previously used here was never defined in this notebook and only resolved
# through leftover kernel state.
# Both frames carry `community_type`, so the merge suffixes them; the left copy
# is kept under the original name and both suffixed copies are dropped.
result_df = (
    topic_entropy_df
    .merge(sentiment_polarization_df, on='community_id', how='left', suffixes=('_x', '_y'))
    .assign(community_type=lambda merged: merged['community_type_x'])
    .drop(columns=['community_type_x', 'community_type_y'])
)
result_df

Unnamed: 0,community_id,n_texts,n_topics,entropy_norm,mean_neg,mean_neu,mean_pos,polarization,community_type
0,0.0,25042,615,0.491160,0.465068,0.472255,0.062677,0.208382,Strong community
1,1.0,23144,642,0.586895,0.455507,0.489531,0.054962,0.234296,Strong community
2,2.0,3917,255,0.506603,0.257927,0.646905,0.095168,0.470357,Strong community
3,3.0,1277,172,0.503045,0.572473,0.387418,0.040109,0.358710,Weak community
4,4.0,1731,262,0.526512,0.472263,0.465071,0.062666,0.208394,Weak community
...,...,...,...,...,...,...,...,...,...
73,73.0,195,58,0.636004,0.441780,0.481443,0.076777,0.222164,Weak community
74,74.0,215,42,0.619946,0.424372,0.482537,0.093091,0.223806,Weak community
75,75.0,156,33,0.567305,0.486992,0.442818,0.070190,0.230488,Weak community
76,76.0,177,45,0.694833,0.398624,0.498962,0.102415,0.248443,Strong community


In [109]:
# Split each per-community metric into the strong-community and weak-community samples.
entropy_strong = result_df.loc[result_df['community_type'] == 'Strong community', 'entropy_norm']
entropy_weak = result_df.loc[result_df['community_type'] == 'Weak community', 'entropy_norm']
polarity_strong = result_df.loc[result_df['community_type'] == 'Strong community', 'polarization']
polarity_weak = result_df.loc[result_df['community_type'] == 'Weak community', 'polarization']

In [111]:
# Verifying the data distribution: Shapiro-Wilk normality test on each sample
# (null hypothesis: the sample comes from a normal distribution).
for label, sample in [
    ("Entropy Strong", entropy_strong),
    ("Entropy Weak", entropy_weak),
    ("Polarization Strong", polarity_strong),
    ("Polarization Weak", polarity_weak),
]:
    stat, p = shapiro(sample)
    print(f"{label} - stat={stat:.3f}, p={p:.3f}")

Entropy Strong - stat=0.958, p=0.691
Entropy Weak - stat=0.991, p=0.930
Polarization Strong - stat=0.763, p=0.002
Polarization Weak - stat=0.861, p=0.000


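Based on the Shapiro-Wilk test, we cannot reject normality for the entropy distributions (p > 0.05 in both groups), so we treat them as normal; the polarization distributions are not normal (p < 0.05 in both groups).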

In [113]:
# Levene's test for equality of variances between strong and weak communities
# (null hypothesis: the two groups have equal variances).
stat, p = levene(entropy_strong, entropy_weak)
print(f"Levene test Entropy - stat={stat:.3f}, p={p:.3f}")

# Same check for the polarization scores.
stat, p = levene(polarity_strong, polarity_weak)
print(f"Levene test Polarization - stat={stat:.3f}, p={p:.3f}")

Levene test Entropy - stat=1.693, p=0.197
Levene test Polarization - stat=0.002, p=0.962


Based on Levene's test, the variances of the strong and weak groups are similar for both metrics (p > 0.05).

#### Entropy -> independent t-test, Polarization -> Mann-Whitney U test

In [118]:
# Entropy: independent two-sample t-test with pooled variance (equal_var=True),
# following the normality and equal-variance checks above.
stat, p = ttest_ind(entropy_strong, entropy_weak, equal_var=True)
print(f"t-test Entropy - stat={stat:.3f}, p={p:.3f}")

# Polarization: two-sided Mann-Whitney U test, a rank-based alternative used
# because the polarization samples failed the normality check.
stat, p = mannwhitneyu(polarity_strong, polarity_weak, alternative='two-sided')
print(f"Mann-Whitney U test Polarization - stat={stat:.3f}, p={p:.3f}")

t-test Entropy - stat=0.385, p=0.702
Mann-Whitney U test Polarization - stat=329.000, p=0.123


t-test Entropy:
p = 0.702 > 0.05 → nessuna differenza significativa tra strong e weak community per l’entropia dei topic.

Mann-Whitney U test Polarization:
p = 0.123 > 0.05 → nessuna differenza significativa tra strong e weak community per la polarizzazione del sentiment.

Conclusione preliminare sulla QR1:
Le community più coese (strong) non mostrano né una maggiore omogeneità tematica (entropy) né una polarizzazione sentimentale significativamente diversa rispetto alle community meno coese (weak), almeno nei dati e nelle metriche analizzati finora.
