### Emotion analysis and statistical tests

In [None]:
import pandas as pd
import numpy as np
import sys

import matplotlib.pyplot as plt
# from bertopic import BERTopic
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)
from scipy import stats

In [4]:
# Download and place dataset under the dataset/ folder

news = pd.read_csv('dataset/news.csv')
news_tweets = pd.read_csv('dataset/news_tweets.csv')
user_resp = pd.read_csv('dataset/user_resp.csv')

In [6]:
news.shape, news_tweets.shape, user_resp.shape

((35886, 32), (24584, 49), (4039608, 65))

In [7]:
news_tweets_subset = news_tweets[['conversation_id', 'topics', 'subtopic', 'trump_pos_new', 'trump_neg_new', 'trump_neu_new', 'biden_pos_new', 'biden_neg_new', 'biden_neu_new', 'Trump_flag', 'Biden_flag', 'Trump_Biden_flag']]

In [8]:
# user_resp = user_resp.merge(news_tweets_subset, on = 'conversation_id')
user_resp = user_resp.rename(columns={"Trump_flag": "trump_flag", "Biden_flag": "biden_flag"})

In [9]:
user_resp.columns

Index(['Unnamed: 0.2', 'Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1',
       'tweet_id', 'conversation_id', 'author_id_x', 'created_at_x', 'geo_x',
       'lang_x', 'like_count_x', 'quote_count_x', 'reply_count_x',
       'retweet_count_x', 'source_x', 'text_x', 'anger', 'joy', 'optimism',
       'sadness', 'tweet_id_y', 'author_id_y', 'created_at_y', 'geo_y',
       'lang_y', 'like_count_y', 'quote_count_y', 'reply_count_y',
       'retweet_count_y', 'source_y', 'text_y', 'publication', 'topics_x',
       'topic_labels', 'theme', 'vad_pos_senti', 'vad_neu_senti',
       'vad_neg_senti', 'trump_neg', 'trump_neu', 'trump_pos', 'biden_neg',
       'biden_neu', 'biden_pos', 'trump_flag', 'biden_flag', 'tweet_freq_y',
       'neg_senti', 'neu_senti', 'pos_senti', 'topics_y', 'subtopic',
       'topic_ids', 'subtopic.1', 'trump_pos_new', 'trump_neg_new',
       'trump_neu_new', 'biden_pos_new', 'biden_neg_new', 'biden_neu_new',
       'trump_flag', 'biden_flag', 'Trump_Biden_flag', 'topic_

In [10]:
R_user_resp = user_resp[((user_resp['publication'] == 'Breitbart News') | (user_resp['publication'] == 'Fox News'))]
L_user_resp = user_resp[((user_resp['publication'] == 'CNN') | (user_resp['publication'] == 'The Washington Post'))]
C_user_resp = user_resp[((user_resp['publication'] == 'Business Insider') | (user_resp['publication'] ==  'USA Today'))]

In [11]:
R_news_tweets = news_tweets[((news_tweets['publication'] == 'Breitbart News') | (news_tweets['publication'] == 'Fox News'))]
L_news_tweets = news_tweets[((news_tweets['publication'] == 'CNN') | (news_tweets['publication'] == 'The Washington Post'))]
C_news_tweets = news_tweets[((news_tweets['publication'] == 'Business Insider') | (news_tweets['publication'] ==  'USA Today'))]

In [12]:
senti = ['neg', 'neu', 'pos']
emotions = ['anger', 'sadness', 'joy', 'optimism']

In [13]:
from pingouin import kruskal

def get_KW_stat(emot1, emot2):
    
    df1 = pd.DataFrame(emot1.values, columns = ['senti'])
    df1['type_flag'] = 1

    df2 = pd.DataFrame(emot2.values, columns = ['senti'])
    df2['type_flag'] = 2
    
    temp_df = pd.concat((df1,df2), axis = 0)
    kwTest = kruskal(temp_df, dv='senti', between='type_flag')
    H = kwTest.H[0]
    p = kwTest['p-unc'][0]
    dof = kwTest['ddof1'][0]
    n = temp_df.shape[0]
    esq = H * (n + 1)/(n**2 - 1)
    return H, p, dof, esq


def get_pos_neg_mentions(df, entity):
    
    entity_mentions = df[df[entity + '_flag'] == True]
    pos_mean = entity_mentions[entity + '_pos'].mean()
    neu_mean = entity_mentions[entity + '_neu'].mean()
    neg_mean = entity_mentions[entity + '_neg'].mean()
    pos_entity_mentions = entity_mentions[((entity_mentions[entity + '_pos_new'] > pos_mean))]
    neg_entity_mentions = entity_mentions[((entity_mentions[entity + '_neg_new'] > neg_mean))]
    
    return pos_entity_mentions, neg_entity_mentions

def get_significance_stats(dist1, dist2, pub, entity, emotion):

    emot1_shapiro = stats.shapiro(dist1)
    emot2_shapiro = stats.shapiro(dist2)
    emot1_shapiro = (emot1_shapiro.statistic, emot1_shapiro.pvalue)
    emot2_shapiro = (emot2_shapiro.statistic, emot2_shapiro.pvalue)
    H, p, dof, es = get_KW_stat(dist1, dist2)
    # t, p , dof = welch_ttest(emot1, emot2)
    res = [entity, pub, emotion, dist1.shape[0], dist2.shape[0], emot1_shapiro[0], emot1_shapiro[1], emot2_shapiro[0], emot2_shapiro[1], p, H, dof, es]
    # print(res)
    return res

def get_mean_emotions(df):
    
    return df['anger'].mean(), df['joy'].mean(), df['optimism'].mean(), df['sadness'].mean()

def get_mean_emotions_per_sentiment(df, entity):
    
    entity_mentions = df[df[entity + '_flag'] == True]
    pos_mean = entity_mentions[entity + '_pos_new'].mean()
    neu_mean = entity_mentions[entity + '_neu_new'].mean()
    neg_mean = entity_mentions[entity + '_neg_new'].mean()
    pos_entity_mentions = entity_mentions[((entity_mentions[entity + '_pos_new'] > pos_mean * 1.5))]
    neg_entity_mentions = entity_mentions[((entity_mentions[entity + '_neg_new'] > neg_mean * 1.5))]
    
#     print(entity, pos_entity_mentions.shape, pos_entity_mentions_.shape)
#     print(entity, neg_entity_mentions.shape, neg_entity_mentions_.shape)
    
    pos_tweet_emot = get_mean_emotions(pos_entity_mentions)
    neg_tweet_emot = get_mean_emotions(neg_entity_mentions)
    
    return pos_entity_mentions, neg_entity_mentions, pos_tweet_emot, neg_tweet_emot

  return warn(


### News headlines statistical test

In [14]:
news.columns

Index(['Unnamed: 0.3', 'Unnamed: 0.2', 'Unnamed: 0', 'Unnamed: 0.1', 'id',
       'author', 'date', 'title', 'publication', 'trump_neg', 'trump_neu',
       'trump_pos', 'biden_neg', 'biden_neu', 'biden_pos', 'trump_flag',
       'biden_flag', 'vad_neg', 'vad_neu', 'vad_pos', 'vad_comp', 'text',
       'processed_text', 'topic_ids', 'topics', 'subtopic', 'trump_pos_new',
       'trump_neg_new', 'trump_neu_new', 'biden_pos_new', 'biden_neg_new',
       'biden_neu_new'],
      dtype='object')

In [15]:
R_news = news[((news['publication'] == 'Breitbart News') | (news['publication'] == 'Fox News'))]
L_news = news[((news['publication'] == 'CNN') | (news['publication'] == 'The Washington Post'))]
C_news = news[((news['publication'] == 'Business Insider') | (news['publication'] ==  'USA Today'))]

In [16]:
R_news.shape, L_news.shape, C_news.shape

((15704, 32), (11163, 32), (9019, 32))

In [17]:
# user_resp[user_resp.topics == 'fact-check'][['tweet_id', 'conversation_id']].astype(int)

In [18]:
cols = ['trump_pos_new', 'trump_neg_new', 'trump_neu_new', 'biden_pos_new', 'biden_neg_new', 'biden_neu_new']

In [19]:
R_news[cols].describe()

Unnamed: 0,trump_pos_new,trump_neg_new,trump_neu_new,biden_pos_new,biden_neg_new,biden_neu_new
count,10888.0,10888.0,10888.0,6692.0,6692.0,6692.0
mean,0.161783,0.503286,0.334931,0.177913,0.409237,0.41285
std,0.259269,0.392599,0.328652,0.255087,0.37287,0.326753
min,0.001367,0.004249,0.004203,0.001306,0.004645,0.00455
25%,0.007307,0.082021,0.04543,0.014483,0.063505,0.094535
50%,0.027474,0.485243,0.187865,0.046851,0.248991,0.346453
75%,0.173959,0.936599,0.628287,0.228202,0.832834,0.731991
max,0.974883,0.994074,0.980883,0.975993,0.992216,0.977706


In [20]:
news['trump_flag'] = news.title.str.contains('Trump')
news['biden_flag'] = news.title.str.contains('Biden')

### Comparing sentiment distributions (toward an entity) across L and R news outlets 

In [21]:
# emotions = ['anger', 'joy', 'optimism', 'sadness']
sstat = []
for ent in ['biden', 'trump']:
    
    # l_df_pos, l_df_neg, _, _ = get_mean_emotions_per_sentiment(L_user_resp[L_user_resp[ent + '_flag'] == True], ent)
    # r_df_pos, r_df_neg, _, _ = get_mean_emotions_per_sentiment(R_user_resp[R_user_resp[ent + '_flag'] == True], ent)

    for senti in ['neg', 'pos']:
        
        # n = min(l_df_neg.shape[0], r_df_neg.shape[0])
        dist1 = L_news_tweets[L_news_tweets[ent.title() + '_flag'] == True][ent + '_' + senti + '_new'].dropna()
        dist2 = R_news_tweets[R_news_tweets[ent.title() + '_flag'] == True][ent + '_' + senti + '_new'].dropna()
        
#         n = min(dist1.shape[0], dist2.shape[0])
        
#         dist1 = dist1.sample(n)
#         dist2 = dist2.sample(n)
        
        # print(stats.shapiro(dist1), stats.shapiro(dist2))
        # sys.exit()

        res = get_significance_stats(dist1, dist2, ent, senti, 'None')
        # print(res)
        sstat.append(res)
        # sys.exit()
    # print()
    
stats_df = pd.DataFrame(sstat, columns = ['senti', 'Entity', 'emotion', 'Sample size 1', 'sample size 2', 'shapiro_t_dist1', 'shapiro_p_dist1', 'shapiro_t_dist2', 'shapiro_p_dist2', 'KW-p', 'KW-H', 'KW-dof', 'ES'])
stats_df = stats_df.round(3)
stats_df



Unnamed: 0,senti,Entity,emotion,Sample size 1,sample size 2,shapiro_t_dist1,shapiro_p_dist1,shapiro_t_dist2,shapiro_p_dist2,KW-p,KW-H,KW-dof,ES
0,neg,biden,,3687,1933,0.723,0.0,0.836,0.0,0.0,437.627,1,0.078
1,pos,biden,,3687,1933,0.864,0.0,0.72,0.0,0.0,508.343,1,0.09
2,neg,trump,,10645,2656,0.811,0.0,0.792,0.0,0.0,482.047,1,0.036
3,pos,trump,,10645,2656,0.618,0.0,0.703,0.0,0.0,164.748,1,0.012


In [22]:
stats_df.to_csv('results/statistical_test_L_vs_R_news_tweets.csv')

In [23]:
news.topics.unique()

array(['covid', 'social_media', 'election', 'other', 'economy',
       'internatinal', 'court', 'democrats', 'healthcare', 'climate',
       'republican', 'security', 'immigration', 'floyd', 'capitol',
       'hunter_biden', 'proud boys', 'first_lady', 'abortion',
       'pres_debate'], dtype=object)

### Comparing sentiment distributions separately for different topics

In [25]:
# emotions = ['anger', 'joy', 'optimism', 'sadness']
topics = set(L_news_tweets.topics).intersection(set(R_news_tweets.topics))
sstat = []
for ent in ['biden', 'trump']:
    
    # l_df_pos, l_df_neg, _, _ = get_mean_emotions_per_sentiment(L_user_resp[L_user_resp[ent + '_flag'] == True], ent)
    # r_df_pos, r_df_neg, _, _ = get_mean_emotions_per_sentiment(R_user_resp[R_user_resp[ent + '_flag'] == True], ent)
    
    for top in topics:
        
        dataL = L_news_tweets[L_news_tweets.topics == top]
        dataR = R_news_tweets[R_news_tweets.topics == top]

        for senti in ['neg', 'pos']:

            # n = min(l_df_neg.shape[0], r_df_neg.shape[0])
            dist1 = dataL[dataL[ent.title() + '_flag'] == True][ent + '_' + senti + '_new'].dropna()
            dist2 = dataR[dataR[ent.title() + '_flag'] == True][ent + '_' + senti + '_new'].dropna()
            
#             n = min(dist1.shape[0], dist2.shape[0])
        
#             dist1 = dist1.sample(n)
#             dist2 = dist2.sample(n)
            # print(dist1.shape, dist2.shape)

            # print(stats.shapiro(dist1), stats.shapiro(dist2))
            # sys.exit()
            if(dist1.shape[0] > 10 and dist2.shape[0] > 10):

                res = get_significance_stats(dist1, dist2, ent, senti, top)
                # print(res)
                sstat.append(res)
                
            # else:
            #     print(top, senti, dist1.shape, dist2.shape)
            # sys.exit()
        # print()
    
stats_df = pd.DataFrame(sstat, columns = ['senti', 'Entity', 'Topic', 'Sample size 1', 'sample size 2', 'shapiro_t_dist1', 'shapiro_p_dist1', 'shapiro_t_dist2', 'shapiro_p_dist2', 'KW-p', 'KW-H', 'KW-dof', 'ES'])
stats_df = stats_df.round(3)
stats_df

Unnamed: 0,senti,Entity,Topic,Sample size 1,sample size 2,shapiro_t_dist1,shapiro_p_dist1,shapiro_t_dist2,shapiro_p_dist2,KW-p,KW-H,KW-dof,ES
0,neg,biden,immigration,11,19,0.828,0.022,0.859,0.009,0.561,0.338,1,0.012
1,pos,biden,immigration,11,19,0.73,0.001,0.769,0.0,0.591,0.289,1,0.01
2,neg,biden,republican,117,60,0.724,0.0,0.861,0.0,0.0,21.608,1,0.123
3,pos,biden,republican,117,60,0.843,0.0,0.515,0.0,0.0,24.095,1,0.137
4,neg,biden,conspiracy_theory,105,28,0.837,0.0,0.798,0.0,0.014,6.005,1,0.045
5,pos,biden,conspiracy_theory,105,28,0.654,0.0,0.62,0.0,0.096,2.778,1,0.021
6,neg,biden,democrats,511,294,0.634,0.0,0.836,0.0,0.0,123.307,1,0.153
7,pos,biden,democrats,511,294,0.883,0.0,0.758,0.0,0.0,96.729,1,0.12
8,neg,biden,economy,103,60,0.743,0.0,0.868,0.0,0.002,9.169,1,0.057
9,pos,biden,economy,103,60,0.893,0.0,0.743,0.0,0.0,19.247,1,0.119


In [515]:
# dist1 = R_news[R_news.topics == 'security']['trump_pos_new'].sample(200)
# dist2 = R_news[R_news.topics == 'security']['trump_pos_new'].sample(200)

# get_significance_stats(dist1, dist2, 'Trump', senti, top)

In [516]:
 # L_news[L_news.topics == 'security']['trump_pos_new'].sample(200)

In [26]:
stats_df[['senti', 'Entity', 'Topic', 'KW-p', 'KW-H', 'ES']].to_csv('results/statistical_test_L_vs_R_news_topic_wise.csv')

In [27]:
stats_df.sort_values(by=['ES'], ascending=False) #.iloc[0:25].Entity.value_counts()

Unnamed: 0,senti,Entity,Topic,Sample size 1,sample size 2,shapiro_t_dist1,shapiro_p_dist1,shapiro_t_dist2,shapiro_p_dist2,KW-p,KW-H,KW-dof,ES
23,pos,biden,newyork,29,48,0.871,0.002,0.653,0.0,0.0,15.13,1,0.199
20,neg,biden,impeachment,30,22,0.681,0.0,0.816,0.001,0.005,8.031,1,0.157
22,neg,biden,newyork,29,48,0.734,0.0,0.804,0.0,0.001,11.89,1,0.156
6,neg,biden,democrats,511,294,0.634,0.0,0.836,0.0,0.0,123.307,1,0.153
16,neg,biden,blm,100,90,0.748,0.0,0.861,0.0,0.0,27.728,1,0.147
21,pos,biden,impeachment,30,22,0.861,0.001,0.726,0.0,0.007,7.313,1,0.143
3,pos,biden,republican,117,60,0.843,0.0,0.515,0.0,0.0,24.095,1,0.137
2,neg,biden,republican,117,60,0.724,0.0,0.861,0.0,0.0,21.608,1,0.123
7,pos,biden,democrats,511,294,0.883,0.0,0.758,0.0,0.0,96.729,1,0.12
9,pos,biden,economy,103,60,0.893,0.0,0.743,0.0,0.0,19.247,1,0.119


### Comparing sentiment toward Trump vs Biden for each news outlet

In [28]:
sstat = []
for senti in ['neg', 'pos']:

    for n, grp in news.groupby('publication'):

        # print(n, grp.shape)
        dist1 = grp[grp['trump_flag'] == True]['trump_' + senti + '_new']
        dist2 = grp[grp['biden_flag'] == True]['biden_' + senti + '_new']
        
        res = get_significance_stats(dist1, dist2, n, senti, 'None')
            # print(res)
        sstat.append(res)
        
stats_df = pd.DataFrame(sstat, columns = ['senti', 'Pub', 'emotion', 'Sample size 1', 'sample size 2', 'shapiro_t_dist1', 'shapiro_p_dist2', 'shapiro_t_dist1', 'shapiro_p_dist2', 'KW-p', 'KW-H', 'KW-dof', 'ES'])
stats_df = stats_df.round(3)
stats_df



Unnamed: 0,senti,Pub,emotion,Sample size 1,sample size 2,shapiro_t_dist1,shapiro_p_dist2,shapiro_t_dist1.1,shapiro_p_dist2.1,KW-p,KW-H,KW-dof,ES
0,neg,Breitbart News,,5154,2972,0.814,0.0,0.825,0.0,0.0,144.198,1,0.018
1,neg,Business Insider,,4074,1206,0.808,0.0,0.775,0.0,0.0,689.017,1,0.131
2,neg,CNN,,5397,1643,0.826,0.0,0.671,0.0,0.0,1099.357,1,0.156
3,neg,Fox News,,5734,3720,0.827,0.0,0.819,0.0,0.0,88.246,1,0.009
4,neg,The Washington Post,,3896,1198,0.82,0.0,0.751,0.0,0.0,826.089,1,0.162
5,neg,USA Today,,3395,1352,0.826,0.0,0.692,0.0,0.0,423.079,1,0.089
6,pos,Breitbart News,,5154,2972,0.657,0.0,0.712,0.0,0.0,115.301,1,0.014
7,pos,Business Insider,,4074,1206,0.591,0.0,0.832,0.0,0.0,652.408,1,0.124
8,pos,CNN,,5397,1643,0.544,0.0,0.784,0.0,0.0,842.183,1,0.12
9,pos,Fox News,,5734,3720,0.647,0.0,0.691,0.0,0.0,78.014,1,0.008


### User response Statistical test

### Fav vs non-fav

In [34]:
user_resp.shape, user_resp.columns

((4039608, 65),
 Index(['Unnamed: 0.2', 'Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1',
        'tweet_id', 'conversation_id', 'author_id_x', 'created_at_x', 'geo_x',
        'lang_x', 'like_count_x', 'quote_count_x', 'reply_count_x',
        'retweet_count_x', 'source_x', 'text_x', 'anger', 'joy', 'optimism',
        'sadness', 'tweet_id_y', 'author_id_y', 'created_at_y', 'geo_y',
        'lang_y', 'like_count_y', 'quote_count_y', 'reply_count_y',
        'retweet_count_y', 'source_y', 'text_y', 'publication', 'topics_x',
        'topic_labels', 'theme', 'vad_pos_senti', 'vad_neu_senti',
        'vad_neg_senti', 'trump_neg', 'trump_neu', 'trump_pos', 'biden_neg',
        'biden_neu', 'biden_pos', 'trump_flag', 'biden_flag', 'tweet_freq_y',
        'neg_senti', 'neu_senti', 'pos_senti', 'topics_y', 'subtopic',
        'topic_ids', 'subtopic.1', 'trump_pos_new', 'trump_neg_new',
        'trump_neu_new', 'biden_pos_new', 'biden_neg_new', 'biden_neu_new',
        'trump_flag', 'biden_flag

In [36]:
def get_pos_and_neg_mentions(df, pub, ent):

    pos_mentions = df[((df['publication'] == pub) & (df[ent + '_pos'] > df[ent + '_pos'].mean()))]
    neg_mentions = df[((df['publication'] == pub) & (df[ent + '_neg'] > df[ent + '_neg'].mean()))]
    
    return pos_mentions, neg_mentions

In [37]:
T_pos_men_CNN, T_neg_men_CNN = get_pos_and_neg_mentions(user_resp, 'CNN', 'trump')
B_pos_men_CNN, B_neg_men_CNN = get_pos_and_neg_mentions(user_resp, 'CNN', 'biden')

T_pos_men_TWP, T_neg_men_TWP = get_pos_and_neg_mentions(user_resp, 'The Washington Post', 'trump')
B_pos_men_TWP, B_neg_men_TWP = get_pos_and_neg_mentions(user_resp, 'The Washington Post', 'biden')

T_pos_men_UST, T_neg_men_UST = get_pos_and_neg_mentions(user_resp, 'USA Today', 'trump')
B_pos_men_UST, B_neg_men_UST = get_pos_and_neg_mentions(user_resp, 'USA Today', 'biden')

T_pos_men_BI, T_neg_men_BI = get_pos_and_neg_mentions(user_resp, 'Business Insider', 'trump')
B_pos_men_BI, B_neg_men_BI = get_pos_and_neg_mentions(user_resp, 'Business Insider', 'biden')

T_pos_men_FN, T_neg_men_FN = get_pos_and_neg_mentions(user_resp, 'Fox News', 'trump')
B_pos_men_FN, B_neg_men_FN = get_pos_and_neg_mentions(user_resp, 'Fox News', 'biden')

T_pos_men_BN, T_neg_men_BN = get_pos_and_neg_mentions(user_resp, 'Breitbart News', 'trump')
B_pos_men_BN, B_neg_men_BN = get_pos_and_neg_mentions(user_resp, 'Breitbart News', 'biden')

In [38]:
T_pos_men_FN.shape, T_neg_men_FN.shape, B_pos_men_FN.shape, B_neg_men_FN.shape

((44991, 65), (181601, 65), (35279, 65), (49884, 65))

In [39]:
T_pos_men_BN.shape, T_neg_men_BN.shape, B_pos_men_BN.shape, B_neg_men_BN.shape

((61965, 65), (187229, 65), (32916, 65), (48338, 65))

In [40]:
R_fav = pd.concat((T_pos_men_FN, T_pos_men_BN, B_neg_men_FN, B_neg_men_BN), axis = 0)
R_unfav = pd.concat((T_neg_men_FN, T_neg_men_BN, B_pos_men_FN, B_pos_men_BN), axis = 0)
print(R_fav.shape, R_unfav.shape)

L_fav = pd.concat((B_pos_men_CNN, B_pos_men_TWP, T_neg_men_CNN, T_neg_men_TWP), axis = 0)
L_unfav = pd.concat((T_pos_men_CNN, T_pos_men_TWP, B_neg_men_CNN, B_neg_men_TWP,), axis = 0)
L_fav.shape, L_unfav.shape

(205178, 65) (437025, 65)


((1416643, 65), (479598, 65))

In [41]:
def get_pos_neg_mentions(df, entity):
    
    entity_mentions = df[df[entity.title() + '_flag'] == True]
    pos_mean = entity_mentions[entity + '_pos'].mean()
    neu_mean = entity_mentions[entity + '_neu'].mean()
    neg_mean = entity_mentions[entity + '_neg'].mean()
    pos_entity_mentions = entity_mentions[((entity_mentions[entity + '_pos'] > pos_mean))]
    neg_entity_mentions = entity_mentions[((entity_mentions[entity + '_neg'] > neg_mean))]
    
    return pos_entity_mentions, neg_entity_mentions

def get_normalized_dist(dist):
    
    return (dist - dist.min()) / (dist.max() - dist.min())

def get_normalized_emotion_scores(df):
    
    df['anger_norm'] = get_normalized_dist(df.anger)
    df['optimism_norm'] = get_normalized_dist(df.optimism)
    df['sadness_norm'] = get_normalized_dist(df.sadness)
    df['joy_norm'] = get_normalized_dist(df.joy)
    
    return df

# def get_normalized_emotion_scores(df):
    
#     from sklearn.preprocessing import MinMaxScaler
    
#     scaler = MinMaxScaler()

#     df_ = df[['anger', 'optimism', 'sadness', 'joy']]
#     normalized_data = scaler.fit_transform(df_)
#     df_ = pd.DataFrame(normalized_data, columns = ['anger_norm', 'optimism_norm', 'sadness_norm', 'joy_norm'])
#     df = pd.concat((df, df_), axis = 1)

#     return df

In [42]:
L_user_resp = get_normalized_emotion_scores(L_user_resp)
R_user_resp = get_normalized_emotion_scores(R_user_resp)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['anger_norm'] = get_normalized_dist(df.anger)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['optimism_norm'] = get_normalized_dist(df.optimism)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sadness_norm'] = get_normalized_dist(df.sadness)
A value is trying to be set on a copy of a slice

In [43]:
# # df_L_T = L_user_resp[L_user_resp.Trump_flag == True]

# L_unfav1, L_fav1 = get_pos_neg_mentions(L_user_resp[L_user_resp.Trump_flag == True], 'trump')
# L_fav2, L_unfav2 = get_pos_neg_mentions(L_user_resp[L_user_resp.Biden_flag == True], 'biden')

# L_fav = pd.concat((L_fav1, L_fav2), axis = 0)
# L_unfav = pd.concat((L_unfav1, L_unfav2), axis = 0)

# # df_L_T = L_user_resp[L_user_resp.Trump_flag == True]

# R_fav1, R_unfav1 = get_pos_neg_mentions(R_user_resp[R_user_resp.Trump_flag == True], 'trump')
# R_unfav2, R_fav2 = get_pos_neg_mentions(R_user_resp[R_user_resp.Biden_flag == True], 'biden')

# R_fav = pd.concat((R_fav1, R_fav2), axis = 0)
# R_unfav = pd.concat((R_unfav1, R_unfav2), axis = 0)

In [44]:
# L_unfav.trump_pos.mean(), L_fav.trump_pos.mean(), L_unfav.biden_pos.mean(), L_fav.biden_pos.mean()
# R_unfav.trump_pos.mean(), R_fav.trump_pos.mean(), R_unfav.biden_pos.mean(), R_fav.biden_pos.mean()

L_fav.shape, L_unfav.shape, R_fav.shape, R_unfav.shape

((1416643, 65), (479598, 65), (205178, 65), (437025, 65))

In [45]:
emotions = ['anger', 'joy', 'optimism', 'sadness']
L_fav[L_fav.anger.isna() == True][emotions].head()

Unnamed: 0,anger,joy,optimism,sadness


In [46]:
emotions = ['anger_norm', 'joy_norm', 'optimism_norm', 'sadness_norm']
L_fav[L_fav.anger_norm.isna() == True][emotions].head()

AttributeError: 'DataFrame' object has no attribute 'anger_norm'

In [753]:
R_fav.columns

Index(['Unnamed: 0.6', 'Unnamed: 0.5', 'Unnamed: 0.4', 'Unnamed: 0',
       'Unnamed: 0.3', 'Unnamed: 0.2', 'Unnamed: 0.1', 'Unnamed: 0.1.1',
       'Unnamed: 0.1.1.1', 'tweet_id', 'conversation_id', 'author_id',
       'created_at', 'geo', 'lang', 'like_count', 'quote_count', 'reply_count',
       'retweet_count', 'source', 'text', 'display_name', 'topics',
       'topic_labels', 'theme', 'pos_senti', 'neu_senti', 'neg_senti',
       'publication', 'date', 'week', 'polIncl', 'hashtag', 'clean_text',
       'proc_text', 'Segmented#', 'processed_text', 'topic_ids', 'subtopic',
       'trump_pos_new', 'trump_neg_new', 'trump_neu_new', 'biden_pos_new',
       'biden_neg_new', 'biden_neu_new', 'Trump_flag', 'Biden_flag',
       'Trump_Biden_flag', 'topic_id'],
      dtype='object')

### Overall Fav-vs-UnFav

In [710]:
# def get_significance_stats(dist1, dist2, pub, entity, emotion):
    
#     print(dist1.shape, dist2.shape)

#     emot1_shapiro = stats.shapiro(dist1)
#     emot2_shapiro = stats.shapiro(dist2)
#     emot1_shapiro = (emot1_shapiro.statistic, emot1_shapiro.pvalue)
#     emot2_shapiro = (emot2_shapiro.statistic, emot2_shapiro.pvalue)
#     H, p, dof, es = get_KW_stat(dist1, dist2)
#     # t, p , dof = welch_ttest(emot1, emot2)
#     res = [entity, pub, emotion, dist1.shape[0], dist2.shape[0], emot1_shapiro[0], emot1_shapiro[1], emot2_shapiro[0], emot2_shapiro[1], p, H, dof, es]
#     # print(res)
#     return res

In [47]:
emotions = ['anger', 'joy', 'optimism', 'sadness']
# emotions = ['anger_norm', 'joy_norm', 'optimism_norm', 'sadness_norm']
sstat = []

for emot in emotions:
    
    for i, ur in enumerate([(R_fav, R_unfav),(L_fav, L_unfav)]):

        data1 = ur[0]
        data2 = ur[1]
        # print(data1.shape, data2.shape)
        dist1 = data1[emot].dropna()
        dist2 = data2[emot].dropna()
        
        # print(dist1.shape, dist2.shape)
        
        if(dist1.shape[0] > 10 and dist2.shape[0] > 10):
            
            if(i%2 == 0):
                res = get_significance_stats(dist1, dist2, 'R', None, emot)
            else:
                res = get_significance_stats(dist1, dist2, 'L', None, emot)
            # print(res)
            sstat.append(res)

        
stats_df = pd.DataFrame(sstat, columns = ['Topic', 'Entity', 'emotion', 'Sample size 1', 'sample size 2', 'shapiro_t_dist1', 'shapiro_p_dist2', 'shapiro_t_dist1', 'shapiro_p_dist2', 'KW-p', 'KW-H', 'KW-dof', 'ES'])
stats_df = stats_df.round(5)
stats_df



Unnamed: 0,Topic,Entity,emotion,Sample size 1,sample size 2,shapiro_t_dist1,shapiro_p_dist2,shapiro_t_dist1.1,shapiro_p_dist2.1,KW-p,KW-H,KW-dof,ES
0,,R,anger,205178,437025,0.81665,0.0,0.79013,0.0,0.0,2546.07512,1,0.00396
1,,L,anger,1416643,479598,0.80555,0.0,0.81617,0.0,0.0,205.83055,1,0.00011
2,,R,joy,205178,437025,0.54758,0.0,0.5001,0.0,0.0,2791.83923,1,0.00435
3,,L,joy,1416643,479598,0.50758,0.0,0.51033,0.0,0.0,243.24842,1,0.00013
4,,R,optimism,205178,437025,0.61213,0.0,0.57999,0.0,0.0,1372.35711,1,0.00214
5,,L,optimism,1416643,479598,0.61381,0.0,0.6134,0.0,0.25376,1.3025,1,0.0
6,,R,sadness,205178,437025,0.61389,0.0,0.59011,0.0,0.0,1116.2638,1,0.00174
7,,L,sadness,1416643,479598,0.60837,0.0,0.62587,0.0,0.0,645.84242,1,0.00034


In [48]:
emotions = ['anger', 'joy', 'optimism', 'sadness']
# emotions = ['anger_norm', 'joy_norm', 'optimism_norm', 'sadness_norm']
sstat = []
topics = set(L_user_resp.topics).intersection(set(R_user_resp.topics))

for emot in emotions:
    
    for top in topics:

        for i, ur in enumerate([(R_fav, R_unfav),(L_fav, L_unfav)]):
        
            data1 = ur[0]
            data2 = ur[1]
            dist1 = data1[data1.topics == top][emot].dropna()
            dist2 = data2[data2.topics == top][emot].dropna()

            if(dist1.shape[0] > 10 and dist2.shape[0] > 10):

                if(i%2 == 0):
                    res = get_significance_stats(dist1, dist2, 'R', top, emot)
                else:
                    res = get_significance_stats(dist1, dist2, 'L', top, emot)
                # print(res)
                sstat.append(res)

        
stats_df = pd.DataFrame(sstat, columns = ['Topic', 'Entity', 'emotion', 'Sample size 1', 'sample size 2', 'shapiro_t_dist1', 'shapiro_p_dist2', 'shapiro_t_dist1', 'shapiro_p_dist2', 'KW-p', 'KW-H', 'KW-dof', 'ES'])
stats_df = stats_df.round(3)
# stats_df

AttributeError: 'DataFrame' object has no attribute 'topics'

In [776]:
stats_df[stats_df['Entity'] == 'R']

Unnamed: 0,Topic,Entity,emotion,Sample size 1,sample size 2,shapiro_t_dist1,shapiro_p_dist2,shapiro_t_dist1.1,shapiro_p_dist2.1,KW-p,KW-H,KW-dof,ES
0,capitol,R,anger,626,3205,0.705,0.0,0.729,0.0,0.04,4.225,1,0.001
2,republican,R,anger,653,157,0.822,0.0,0.817,0.0,0.603,0.271,1,0.0
4,economy,R,anger,2692,310,0.808,0.0,0.856,0.0,0.005,8.034,1,0.003
6,fact-check,R,anger,1388,129,0.747,0.0,0.83,0.0,0.054,3.727,1,0.002
8,security,R,anger,174,2255,0.829,0.0,0.776,0.0,0.001,11.622,1,0.005
10,others,R,anger,21597,16996,0.807,0.0,0.819,0.0,0.0,35.613,1,0.001
12,media,R,anger,187,4040,0.803,0.0,0.838,0.0,0.003,8.812,1,0.002
14,democarts,R,anger,3239,16115,0.81,0.0,0.807,0.0,0.038,4.32,1,0.0
16,america,R,anger,1147,18,0.819,0.0,0.844,0.007,0.331,0.947,1,0.001
18,election,R,anger,7625,23774,0.79,0.0,0.803,0.0,0.0,13.674,1,0.0


In [51]:
emotions = ['anger', 'joy', 'optimism', 'sadness']
topics = set(L_user_resp.topics).intersection(set(R_user_resp.topics))
sstat = []
for emot in emotions:
    
    for ur in [L_user_resp, R_user_resp]:

        dataFav = ur[ur[ent.title() + '_flag'] == True]
        dataR = R_user_resp[R_user_resp[ent.title() + '_flag'] == True]

    # for top in topics:

        dist1 = dataL[dataL.topics == top][emot].dropna()
        dist2 = dataR[dataR.topics == top][emot].dropna()

        if(dist1.shape[0] > 10 and dist2.shape[0] > 10):

            res = get_significance_stats(dist1, dist2, ent, top, emot)
            # print(res)
            sstat.append(res)

        
stats_df = pd.DataFrame(sstat, columns = ['Topic', 'Entity', 'emotion', 'Sample size 1', 'sample size 2', 'shapiro_t_dist1', 'shapiro_p_dist2', 'shapiro_t_dist1', 'shapiro_p_dist2', 'KW-p', 'KW-H', 'KW-dof', 'ES'])
stats_df = stats_df.round(3)
stats_df

AttributeError: 'DataFrame' object has no attribute 'topics'

In [None]:
emotions = ['anger', 'joy', 'optimism', 'sadness']
topics = set(L_user_resp.topics).intersection(set(R_user_resp.topics))
sstat = []
for emot in emotions:

    dataFav = L_user_resp[L_user_resp[ent.title() + '_flag'] == True]
    dataR = R_user_resp[R_user_resp[ent.title() + '_flag'] == True]

    for top in topics:

        dist1 = dataL[dataL.topics == top][emot].dropna()
        dist2 = dataR[dataR.topics == top][emot].dropna()

        if(dist1.shape[0] > 10 and dist2.shape[0] > 10):

            res = get_significance_stats(dist1, dist2, ent, top, emot)
            # print(res)
            sstat.append(res)

        
stats_df = pd.DataFrame(sstat, columns = ['Topic', 'Entity', 'emotion', 'Sample size 1', 'sample size 2', 'shapiro_t_dist1', 'shapiro_p_dist2', 'shapiro_t_dist1', 'shapiro_p_dist2', 'KW-p', 'KW-H', 'KW-dof', 'ES'])
stats_df = stats_df.round(3)
stats_df

### Left vs Right (emotion distributions) based on topics

In [587]:
emotions = ['anger', 'joy', 'optimism', 'sadness']
topics = set(L_user_resp.topics).intersection(set(R_user_resp.topics))
sstat = []
for emot in emotions:

    for top in topics:

        dist1 = L_user_resp[L_user_resp.topics == top][emot].dropna()
        dist2 = R_user_resp[R_user_resp.topics == top][emot].dropna()

        # print(dist2.shape, dist1.shape)

        if(dist1.shape[0] > 10 and dist2.shape[0] > 10):

            res = get_significance_stats(dist1, dist2, ent, top, emot)
            # print(res)
            sstat.append(res)

        
stats_df = pd.DataFrame(sstat, columns = ['Topic', 'Entity', 'emotion', 'Sample size 1', 'sample size 2', 'shapiro_t_dist1', 'shapiro_p_dist2', 'shapiro_t_dist1', 'shapiro_p_dist2', 'KW-p', 'KW-H', 'KW-dof', 'ES'])
stats_df = stats_df.round(3)
stats_df

Unnamed: 0,Topic,Entity,emotion,Sample size 1,sample size 2,shapiro_t_dist1,shapiro_p_dist2,shapiro_t_dist1.1,shapiro_p_dist2.1,KW-p,KW-H,KW-dof,ES
0,pres_debate,trump,anger,10034,5324,0.789,0.0,0.783,0.0,0.004,8.099,1,0.001
1,economy,trump,anger,8096,4207,0.833,0.0,0.815,0.0,0.0,22.668,1,0.002
2,others,trump,anger,150168,45012,0.829,0.0,0.809,0.0,0.0,464.044,1,0.002
3,fact-check,trump,anger,14001,1460,0.816,0.0,0.755,0.0,0.0,36.659,1,0.002
4,blm,trump,anger,14838,5523,0.783,0.0,0.802,0.0,0.0,24.183,1,0.001
5,international,trump,anger,8611,719,0.822,0.0,0.842,0.0,0.0,13.821,1,0.001
6,supreme_court,trump,anger,25720,5330,0.828,0.0,0.794,0.0,0.0,171.917,1,0.006
7,media,trump,anger,16445,7158,0.788,0.0,0.827,0.0,0.0,136.719,1,0.006
8,democarts,trump,anger,27540,21555,0.827,0.0,0.812,0.0,0.0,33.533,1,0.001
9,america,trump,anger,6516,1728,0.821,0.0,0.833,0.0,0.931,0.008,1,0.0


In [347]:
def get_pos_neg_mentions(df, entity):
    
    entity_mentions = df[df[entity.title() + '_flag'] == True]
    pos_mean = entity_mentions[entity + '_pos'].mean()
    neu_mean = entity_mentions[entity + '_neu'].mean()
    neg_mean = entity_mentions[entity + '_neg'].mean()
    pos_entity_mentions = entity_mentions[((entity_mentions[entity + '_pos'] > pos_mean))]
    neg_entity_mentions = entity_mentions[((entity_mentions[entity + '_neg'] > neg_mean))]
    
    return pos_entity_mentions, neg_entity_mentions

In [348]:
T_pos_mentions_R,  T_neg_mentions_R = get_pos_neg_mentions(R_user_resp, "trump")
B_pos_mentions_R,  B_neg_mentions_R = get_pos_neg_mentions(R_user_resp, "biden")

T_pos_mentions_L,  T_neg_mentions_L = get_pos_neg_mentions(L_user_resp, "trump")
B_pos_mentions_L,  B_neg_mentions_L = get_pos_neg_mentions(L_user_resp, "biden")

In [351]:
# T_pos_mentions_R.shape, T_neg_mentions_R.shape, B_pos_mentions_R.shape, B_neg_mentions_R.shape, T_pos_mentions_L.shape, T_neg_mentions_L.shape, B_pos_mentions_L.shape, B_neg_mentions_L.shape

In [371]:
def get_mean_emotions_per_topic_for_pos_neg_entity_mentions(df1, df2, cols):
    
    return pd.concat((df1.groupby('topics').mean()[cols].rename(columns = {'anger': 'anger_P', 'joy': 'joy_P', 'optimism': 'optimism_P', 'sadness': 'sadness_P'}), 
           df2.groupby('topics').mean()[cols].rename(columns = {'anger': 'anger_N', 'joy': 'joy_N', 'optimism': 'optimism_N', 'sadness': 'sadness_N'})), axis = 1)

R_fav = pd.concat((T_pos_mentions_R, B_neg_mentions_R), axis = 0)
R_unfav = pd.concat((T_neg_mentions_R, B_pos_mentions_R), axis = 0)
R_fav.shape, R_unfav.shape

L_fav = pd.concat((T_neg_mentions_L, B_pos_mentions_L), axis = 0)
L_unfav = pd.concat((T_pos_mentions_L, B_neg_mentions_L), axis = 0)
L_fav.shape, L_unfav.shape

# get_mean_emotions_per_topic_for_pos_neg_entity_mentions(L_fav, L_unfav, cols).to_csv('results/user_resp_analysis/L_fav_vs_unfav_topic_wise.csv')

((320428, 43), (169575, 43))

In [409]:
import warnings
warnings.filterwarnings('ignore')

### Comparing emotion distributions in user response between positive and negative mentions of an entity 

In [397]:
def get_pos_neg_mentions(df, entity):
    
    entity_mentions = df[df[entity.title() + '_flag'] == True]
    pos_mean = entity_mentions[entity + '_pos'].mean()
    neu_mean = entity_mentions[entity + '_neu'].mean()
    neg_mean = entity_mentions[entity + '_neg'].mean()
    pos_entity_mentions = entity_mentions[((entity_mentions[entity + '_pos'] > pos_mean))]
    neg_entity_mentions = entity_mentions[((entity_mentions[entity + '_neg'] > neg_mean))]
    
    return pos_entity_mentions, neg_entity_mentions

In [398]:
T_pos_mentions_R,  T_neg_mentions_R = get_pos_neg_mentions(R_user_resp, "trump")
B_pos_mentions_R,  B_neg_mentions_R = get_pos_neg_mentions(R_user_resp, "biden")

T_pos_mentions_L,  T_neg_mentions_L = get_pos_neg_mentions(L_user_resp, "trump")
B_pos_mentions_L,  B_neg_mentions_L = get_pos_neg_mentions(L_user_resp, "biden")

In [440]:
emotions = ['anger', 'joy', 'optimism', 'sadness']
topics = set(L_user_resp.topics).intersection(set(R_user_resp.topics))
sstat = []

def get_stats_df(df1, df2, ent):
    
    for emot in emotions:

        # for top in topics:

        dist1 = T_pos_mentions_R[emot].dropna()
        dist2 = T_neg_mentions_R[emot].dropna()
            
            # print(set(df1.topics))
            
            # sys.exit()

            # if(top in set(df1.topics) and top in set(df2.topics)):

            # dist1 = df1[df1.topics == top][emot].dropna()
            # dist2 = df2[df2.topics == top][emot].dropna()

        if(dist1.shape[0] > 10 and dist2.shape[0] > 10):

            res = get_significance_stats(dist1, dist2, ent, top, emot)
            # print(res)
            sstat.append(res)
    
    return sstat
        
df1 = B_pos_mentions_L
df2 = B_neg_mentions_L

sstat = get_stats_df(df1, df2, ent = 'Biden')
stats_df = pd.DataFrame(sstat, columns = ['Topic', 'Entity', 'emotion', 'Sample size 1', 'sample size 2', 'shapiro_t_dist1', 'shapiro_p_dist2', 'shapiro_t_dist1', 'shapiro_p_dist2', 'KW-p', 'KW-H', 'KW-dof', 'ES'])
stats_df = stats_df.round(3)

In [441]:
stats_df.sort_values(by='ES', ascending=False)

Unnamed: 0,Topic,Entity,emotion,Sample size 1,sample size 2,shapiro_t_dist1,shapiro_p_dist2,shapiro_t_dist1.1,shapiro_p_dist2.1,KW-p,KW-H,KW-dof,ES
2,pres_debate,Biden,optimism,21919,57444,0.623,0.0,0.588,0.0,0.0,252.625,1,0.003
0,pres_debate,Biden,anger,21919,57444,0.807,0.0,0.789,0.0,0.0,159.892,1,0.002
3,pres_debate,Biden,sadness,21919,57444,0.607,0.0,0.586,0.0,0.0,105.184,1,0.001
1,pres_debate,Biden,joy,21919,57444,0.508,0.0,0.504,0.0,0.001,11.206,1,0.0
