### Statistical tests and user response analysis

In [None]:
import pandas as pd
import numpy as np
import sys

from scipy import stats

import warnings
warnings.filterwarnings('ignore')

In [3]:
# Download and place the dataset under dataset folder

news = pd.read_csv('dataset/news.csv')
news_tweets = pd.read_csv('dataset/news_tweets.csv')
user_resp = pd.read_csv('dataset/user_resp.csv')

In [5]:
news.shape, news_tweets.shape, user_resp.shape

((35886, 32), (24584, 49), (4039629, 61))

In [42]:
R_user_resp.columns

Index(['Unnamed: 0', 'tweet_id', 'conversation_id', 'author_id_x',
       'created_at_x', 'geo_x', 'lang_x', 'like_count_x', 'quote_count_x',
       'reply_count_x', 'retweet_count_x', 'source_x', 'text_x', 'anger',
       'joy', 'optimism', 'sadness', 'tweet_id_y', 'author_id_y',
       'created_at_y', 'geo_y', 'lang_y', 'like_count_y', 'quote_count_y',
       'reply_count_y', 'retweet_count_y', 'source_y', 'text_y', 'publication',
       'topics_x', 'topic_labels', 'theme', 'vad_pos_senti', 'vad_neu_senti',
       'vad_neg_senti', 'trump_neg', 'trump_neu', 'trump_pos', 'biden_neg',
       'biden_neu', 'biden_pos', 'trump_flag', 'biden_flag', 'tweet_freq_y',
       'neg_senti', 'neu_senti', 'pos_senti', 'topics_y', 'subtopic',
       'topic_ids', 'subtopic.1', 'trump_pos_new', 'trump_neg_new',
       'trump_neu_new', 'biden_pos_new', 'biden_neg_new', 'biden_neu_new',
       'Trump_flag', 'Biden_flag', 'Trump_Biden_flag', 'topic_id'],
      dtype='object')

## Mean emotions across topics

In [6]:
R_user_resp = user_resp[((user_resp['publication'] == 'Breitbart News') | (user_resp['publication'] == 'Fox News'))]
L_user_resp = user_resp[((user_resp['publication'] == 'CNN') | (user_resp['publication'] == 'The Washington Post'))]
C_user_resp = user_resp[((user_resp['publication'] == 'Business Insider') | (user_resp['publication'] ==  'USA Today'))]

In [37]:
# round(C_user_resp.optimism.astype(float).mean(),3)

0.121

In [64]:
def get_mean_emotions_per_topic(df, emotion):
    
    scores = []
    for top, grp in df.groupby('topics_y'):

        # print(top, grp[emotion].astype(float).mean())
        scores.append((top, grp[emotion].astype(float).mean()))
        
    return pd.DataFrame(scores, columns = ['topic', emotion + '_score'])
        
R_user_resp_top_scores = get_mean_emotions_per_topic(R_user_resp, 'anger').merge(get_mean_emotions_per_topic(R_user_resp, 'joy'), on = 'topic').merge(get_mean_emotions_per_topic(R_user_resp, 'sadness'), on = 'topic').merge(get_mean_emotions_per_topic(R_user_resp, 'optimism'), on = 'topic')
C_user_resp_top_scores = get_mean_emotions_per_topic(C_user_resp, 'anger').merge(get_mean_emotions_per_topic(C_user_resp, 'joy'), on = 'topic').merge(get_mean_emotions_per_topic(C_user_resp, 'sadness'), on = 'topic').merge(get_mean_emotions_per_topic(C_user_resp, 'optimism'), on = 'topic')
L_user_resp_top_scores = get_mean_emotions_per_topic(L_user_resp, 'anger').merge(get_mean_emotions_per_topic(L_user_resp, 'joy'), on = 'topic').merge(get_mean_emotions_per_topic(L_user_resp, 'sadness'), on = 'topic').merge(get_mean_emotions_per_topic(L_user_resp, 'optimism'), on = 'topic')

In [69]:
R_user_resp_top_scores.shape, C_user_resp_top_scores.shape, L_user_resp_top_scores.shape

((20, 5), (20, 5), (20, 5))

In [399]:
i = pd.DataFrame(news.publication.value_counts()).rename(columns = {'publication': 'headlines'})
j = pd.DataFrame(news_tweets.publication.value_counts()).rename(columns = {'publication': 'tweets'})
k = pd.DataFrame(user_resp.publication.value_counts()).rename(columns = {'publication': 'responses'})

In [404]:
pd.concat((i,j,k), axis = 1).dropna().to_csv('results/freq.csv')

In [3]:
# user_resp = pd.read_parquet('results_old/user_resp_newslant_with_senti.parquet')

In [6]:
# news_tweets = pd.read_csv('results/news_tweets.csv')

In [337]:
final_user_resp.shape

(4039614, 60)

In [326]:
news_tweets[news_tweets.conversation_id != news_tweets.tweet_id][['tweet_id', 'conversation_id', 'author_id',
       'created_at', 'source', 'text', 'display_name', 'topics',
       'topic_labels', 'theme', 'pos_senti', 'neu_senti', 'neg_senti',
       'publication', 'polIncl']]

Unnamed: 0,tweet_id,conversation_id,author_id,created_at,source,text,display_name,topics,topic_labels,theme,pos_senti,neu_senti,neg_senti,publication,polIncl
817,1341439885253722115,1341437360513400832,759251,2020-12-22 17:46:06+00:00,TweetDeck,- February 4 -\n \nHouse Speaker Nancy Pelosi ...,CNN,democrats,other,other,0.057924,0.645182,0.296894,CNN,L
3006,1298658967611215874,1298582772932648960,759251,2020-08-26 16:30:00+00:00,TweetDeck,The sudden change in federal guidelines on cor...,CNN,national,covid_cases,covid,0.041912,0.704824,0.253265,CNN,L
3241,1295426129063563267,1295425955951980550,759251,2020-08-17 18:23:51+00:00,TweetDeck,"Eva Longoria, known for her role on Desperate ...",CNN,republican,other,other,0.296574,0.694021,0.009405,CNN,L
4343,1274010446287253507,1274008544258514945,759251,2020-06-19 16:05:35+00:00,TweetDeck,"""Since the police killing of George Floyd, Whi...",CNN,covid,other,other,0.026063,0.418564,0.555374,CNN,L
5184,1253463110305763328,1253462041425334273,759251,2020-04-23 23:17:48+00:00,TweetDeck,The measure passed the Senate earlier this wee...,CNN,other,economy_stimulus,economy,0.783940,0.214403,0.001657,CNN,L
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24310,1243920161264058369,1243920159674380288,15754281,2020-03-28 15:17:32+00:00,Twitter Web App,Congress passed the stimulus bill and Presiden...,USA TODAY,economy,economy_stimulus,economy,0.523542,0.473258,0.003200,USA Today,C
24340,1242097758774996992,1242097755323076608,15754281,2020-03-23 14:35:57+00:00,Twitter Web App,President Donald Trump announced new actions t...,USA TODAY,newyork,other,other,0.278815,0.687245,0.033939,USA Today,C
24344,1241746776148914176,1241746455972495361,15754281,2020-03-22 15:21:16+00:00,Twitter Web App,President Donald Trump offered assistance to N...,USA TODAY,covid,other,other,0.046696,0.547910,0.405394,USA Today,C
24426,1238063517955047424,1238062675273252864,15754281,2020-03-12 11:25:19+00:00,Twitter Web App,Minutes after addressing the nation about the ...,USA TODAY,covid,international_gen,international,0.038680,0.640357,0.320963,USA Today,C


In [335]:
len(set(news_tweets.conversation_id).intersection(set(final_user_resp.conversation_id)))

19815

In [336]:
len(set(final_user_resp.conversation_id) - set(news_tweets.conversation_id))

0

In [None]:
news_tweets.publication

In [71]:
user_resp.columns

Index(['Unnamed: 0', 'tweet_id', 'conversation_id', 'author_id_x',
       'created_at_x', 'geo_x', 'lang_x', 'like_count_x', 'quote_count_x',
       'reply_count_x', 'retweet_count_x', 'source_x', 'text_x', 'anger',
       'joy', 'optimism', 'sadness', 'tweet_id_y', 'author_id_y',
       'created_at_y', 'geo_y', 'lang_y', 'like_count_y', 'quote_count_y',
       'reply_count_y', 'retweet_count_y', 'source_y', 'text_y', 'publication',
       'topics_x', 'topic_labels', 'theme', 'vad_pos_senti', 'vad_neu_senti',
       'vad_neg_senti', 'trump_neg', 'trump_neu', 'trump_pos', 'biden_neg',
       'biden_neu', 'biden_pos', 'trump_flag', 'biden_flag', 'tweet_freq_y',
       'neg_senti', 'neu_senti', 'pos_senti', 'topics_y', 'subtopic',
       'topic_ids', 'subtopic.1', 'trump_pos_new', 'trump_neg_new',
       'trump_neu_new', 'biden_pos_new', 'biden_neg_new', 'biden_neu_new',
       'Trump_flag', 'Biden_flag', 'Trump_Biden_flag', 'topic_id'],
      dtype='object')

In [78]:
f_user_resp.shape, user_resp.shape

((4027699, 61), (4039629, 61))

In [76]:
f_user_resp = user_resp[user_resp.conversation_id.isin(news_tweets.conversation_id.unique())]

In [35]:
f_user_resp_ = f_user_resp.merge(news_tweets[['conversation_id', 'topics', 'subtopic', 'topic_ids', 'subtopic', 'trump_pos_new', 'trump_neg_new', 'trump_neu_new', 'biden_pos_new', 
                                              'biden_neg_new', 'biden_neu_new', 'Trump_flag', 'Biden_flag', 'Trump_Biden_flag', 'topic_id']], on = 'conversation_id')

In [None]:
final_user_resp = f_user_resp_.drop_duplicates()

In [164]:
final_user_resp[((final_user_resp.Trump_flag) & (final_user_resp.Biden_flag))].shape

(522232, 60)

In [81]:
# final_user_resp.to_csv('results/user_resp.csv')

In [70]:
R_user_resp = final_user_resp[((final_user_resp['publication'] == 'Breitbart News') | (final_user_resp['publication'] == 'Fox News'))]
L_user_resp = final_user_resp[((final_user_resp['publication'] == 'CNN') | (final_user_resp['publication'] == 'The Washington Post'))]
C_user_resp = final_user_resp[((final_user_resp['publication'] == 'Business Insider') | (final_user_resp['publication'] ==  'USA Today'))]

NameError: name 'final_user_resp' is not defined

In [72]:
R_user_resp.shape, L_user_resp.shape, C_user_resp.shape

((1123244, 61), (2755256, 61), (161108, 61))

In [107]:
user_resp.joy.astype(float).mean()

ValueError: could not convert string to float: 'Twitter Media Studio'

In [73]:
R_user_resp.shape[0]/R_user_resp.conversation_id.nunique(), L_user_resp.shape[0]/L_user_resp.conversation_id.nunique(), C_user_resp.shape[0]/C_user_resp.conversation_id.nunique()

(322.30817790530847, 243.31119745672908, 32.17012779552716)

In [103]:
def get_pos_and_neg_mentions(df, pub, ent):

    pos_mentions = df[((df['publication'] == pub) & (df[ent + '_pos'] > df[ent + '_pos'].mean()))]
    neg_mentions = df[((df['publication'] == pub) & (df[ent + '_neg'] > df[ent + '_neg'].mean()))]
    
    return pos_mentions, neg_mentions

T_pos_men_CNN, T_neg_men_CNN = get_pos_and_neg_mentions(user_resp, 'CNN', 'trump')
B_pos_men_CNN, B_neg_men_CNN = get_pos_and_neg_mentions(user_resp, 'CNN', 'biden')

T_pos_men_TWP, T_neg_men_TWP = get_pos_and_neg_mentions(user_resp, 'The Washington Post', 'trump')
B_pos_men_TWP, B_neg_men_TWP = get_pos_and_neg_mentions(user_resp, 'The Washington Post', 'biden')

T_pos_men_UST, T_neg_men_UST = get_pos_and_neg_mentions(user_resp, 'USA Today', 'trump')
B_pos_men_UST, B_neg_men_UST = get_pos_and_neg_mentions(user_resp, 'USA Today', 'biden')

T_pos_men_BI, T_neg_men_BI = get_pos_and_neg_mentions(user_resp, 'Business Insider', 'trump')
B_pos_men_BI, B_neg_men_BI = get_pos_and_neg_mentions(user_resp, 'Business Insider', 'biden')

T_pos_men_FN, T_neg_men_FN = get_pos_and_neg_mentions(user_resp, 'Fox News', 'trump')
B_pos_men_FN, B_neg_men_FN = get_pos_and_neg_mentions(user_resp, 'Fox News', 'biden')

T_pos_men_BN, T_neg_men_BN = get_pos_and_neg_mentions(user_resp, 'Breitbart News', 'trump')
B_pos_men_BN, B_neg_men_BN = get_pos_and_neg_mentions(user_resp, 'Breitbart News', 'biden')


TypeError: unsupported operand type(s) for +: 'float' and 'str'

In [98]:
def get_pos_neg_mentions(df, entity):
    
    entity_mentions = df[df[entity.title() + '_flag'] == True]
    pos_mean = entity_mentions[entity + '_pos'].mean()
    neu_mean = entity_mentions[entity + '_neu'].mean()
    neg_mean = entity_mentions[entity + '_neg'].mean()
    pos_entity_mentions = entity_mentions[((entity_mentions[entity + '_pos'] > pos_mean))]
    neg_entity_mentions = entity_mentions[((entity_mentions[entity + '_neg'] > neg_mean))]
    
    return pos_entity_mentions, neg_entity_mentions

T_pos_mentions_R,  T_neg_mentions_R = get_pos_neg_mentions(R_user_resp, "trump")
B_pos_mentions_R,  B_neg_mentions_R = get_pos_neg_mentions(R_user_resp, "biden")

T_pos_mentions_L,  T_neg_mentions_L = get_pos_neg_mentions(L_user_resp, "trump")
B_pos_mentions_L,  B_neg_mentions_L = get_pos_neg_mentions(L_user_resp, "biden")

In [99]:
T_pos_mentions_R.columns, T_pos_mentions_R.shape

(Index(['tweet_id', 'conversation_id', 'author_id_x', 'created_at_x', 'geo_x',
        'lang_x', 'like_count_x', 'quote_count_x', 'reply_count_x',
        'retweet_count_x', 'source_x', 'text_x', 'anger', 'joy', 'optimism',
        'sadness', 'tweet_id_y', 'author_id_y', 'created_at_y', 'geo_y',
        'lang_y', 'like_count_y', 'quote_count_y', 'reply_count_y',
        'retweet_count_y', 'source_y', 'text_y', 'publication', 'topics_x',
        'topic_labels', 'theme', 'vad_pos_senti', 'vad_neu_senti',
        'vad_neg_senti', 'trump_neg', 'trump_neu', 'trump_pos', 'biden_neg',
        'biden_neu', 'biden_pos', 'trump_flag', 'biden_flag', 'tweet_freq_y',
        'neg_senti', 'neu_senti', 'pos_senti', 'topics_y', 'subtopic',
        'topic_ids', 'subtopic', 'trump_pos_new', 'trump_neg_new',
        'trump_neu_new', 'biden_pos_new', 'biden_neg_new', 'biden_neu_new',
        'Trump_flag', 'Biden_flag', 'Trump_Biden_flag', 'topic_id'],
       dtype='object'),
 (106956, 60))

In [100]:
R_fav = pd.concat((T_pos_men_FN, T_pos_men_BN, B_neg_men_FN, B_neg_men_BN), axis = 0)
R_unfav = pd.concat((T_neg_men_FN, T_neg_men_BN, B_pos_men_FN, B_pos_men_BN), axis = 0)
print(R_fav.shape, R_unfav.shape)

L_fav = pd.concat((B_pos_men_CNN, B_pos_men_TWP, T_neg_men_CNN, T_neg_men_TWP), axis = 0)
L_unfav = pd.concat((T_pos_men_CNN, T_pos_men_TWP, B_neg_men_CNN, B_neg_men_TWP,), axis = 0)
L_fav.shape, L_unfav.shape

(205178, 60) (437025, 60)


((1416644, 60), (479598, 60))

In [173]:
def get_mean_emotions_for_pos_neg_entity_mentions(df1, df2, ent, cols):
    
    return pd.concat((pd.DataFrame(df1[cols].mean(), columns = [ent + '_Fav']), df2[cols].mean()), axis = 1).rename(columns={0: ent + "_unFav"})

_df1 = get_mean_emotions_for_pos_neg_entity_mentions(R_fav, R_unfav, ent = 'Right', cols = ['anger', 'joy', 'optimism', 'sadness'])
_df2 = get_mean_emotions_for_pos_neg_entity_mentions(L_fav, L_unfav, ent = 'Left', cols = ['anger', 'joy', 'optimism', 'sadness'])
# _df3 = get_mean_emotions_for_pos_neg_entity_mentions(T_pos_mentions_L, T_neg_mentions_L, ent = 'Trump_L', cols = ['anger', 'joy', 'optimism', 'sadness'])
# _df4 = get_mean_emotions_for_pos_neg_entity_mentions(B_pos_mentions_L, B_neg_mentions_L, ent = 'Biden_L', cols = ['anger', 'joy', 'optimism', 'sadness'])
comb_df = pd.concat((_df1, _df2), axis = 1)
comb_df.round(3)

Unnamed: 0,Right_Fav,Right_unFav,Left_Fav,Left_unFav
anger,0.638,0.678,0.66,0.653
joy,0.128,0.108,0.109,0.11
optimism,0.113,0.1,0.114,0.114
sadness,0.121,0.114,0.116,0.124


In [168]:
df = pd.DataFrame()
cols = ['anger', 'joy', 'optimism', 'sadness']
df = pd.concat((df, L_user_resp[cols].describe().loc['mean']), axis = 1)
df = pd.concat((df, C_user_resp[cols].describe().loc['mean']), axis = 1)
df = pd.concat((df, R_user_resp[cols].describe().loc['mean']), axis = 1)
# df.T.to_csv('results/user_resp_analysis/mean_emotions.csv')
df.T.round(4)

Unnamed: 0,anger,joy,optimism,sadness
mean,0.6466,0.1136,0.117,0.1228
mean,0.6234,0.1277,0.1205,0.1284
mean,0.6578,0.1173,0.1067,0.1182


In [169]:
df.T.round(4).to_csv('mean_user_emotions_L_vs_C_vs_R.csv')

### Mean emotions across topics

In [317]:
pd.concat((R_fav.groupby('topics_y').mean()[cols].rename(columns = {'anger': 'anger_R-Fav', 'joy': 'joy_R-Fav', 'optimism': 'optimism_R-Fav', 'sadness': 'sadness_R-Fav'}),
           R_unfav.groupby('topics_y').mean()[cols].rename(columns = {'anger': 'anger_R-unfav', 'joy': 'joy_R-unfav', 'optimism': 'optimism_R-unfav', 'sadness': 'sadness_R-unfav'}),
           L_fav.groupby('topics_y').mean()[cols].rename(columns = {'anger': 'anger_L-fav', 'joy': 'joy_L-fav', 'optimism': 'optimism_L-fav', 'sadness': 'sadness_L-fav'}),
           L_unfav.groupby('topics_y').mean()[cols].rename(columns = {'anger': 'anger_L-unfav', 'joy': 'joy_L-unfav', 'optimism': 'optimism_L-unfav', 'sadness': 'sadness_L-unfav'})),
           axis = 1).to_csv('results/user_resp_analysis/mean_emotions_fav_vs_unfav_topic_wise.csv')

In [103]:
pd.concat((B_pos_mentions_L.groupby('topics_y').mean()[cols].rename(columns = {'anger': 'anger_B-L-pos', 'joy': 'joy_B-L-pos', 'optimism': 'optimism_B-L-pos', 'sadness': 'sadness_B-L-pos'}),
           B_neg_mentions_L.groupby('topics_y').mean()[cols].rename(columns = {'anger': 'anger_B-L-neg', 'joy': 'joy_B-L-neg', 'optimism': 'optimism_B-L-neg', 'sadness': 'sadness_B-L-neg'}),
           B_pos_mentions_R.groupby('topics_y').mean()[cols].rename(columns = {'anger': 'anger_B-R-pos', 'joy': 'joy_B-R-pos', 'optimism': 'optimism_B-R-pos', 'sadness': 'sadness_B-R-pos'}),
           B_neg_mentions_R.groupby('topics_y').mean()[cols].rename(columns = {'anger': 'anger_B-R-neg', 'joy': 'joy_B-R-neg', 'optimism': 'optimism_B-R-neg', 'sadness': 'sadness_B-R-neg'}),
           T_pos_mentions_L.groupby('topics_y').mean()[cols].rename(columns = {'anger': 'anger_T-L-pos', 'joy': 'joy_T-L-pos', 'optimism': 'optimism_T-L-pos', 'sadness': 'sadness_T-L-pos'}),
           T_neg_mentions_L.groupby('topics_y').mean()[cols].rename(columns = {'anger': 'anger_T-L-neg', 'joy': 'joy_T-L-neg', 'optimism': 'optimism_T-L-neg', 'sadness': 'sadness_T-L-neg'}),
           T_pos_mentions_R.groupby('topics_y').mean()[cols].rename(columns = {'anger': 'anger_T-R-pos', 'joy': 'joy_T-R-pos', 'optimism': 'optimism_T-R-pos', 'sadness': 'sadness_T-R-pos'}),
           T_neg_mentions_R.groupby('topics_y').mean()[cols].rename(columns = {'anger': 'anger_T-R-neg', 'joy': 'joy_T-R-neg', 'optimism': 'optimism_T-R-neg', 'sadness': 'sadness_T-R-neg'})), 
           axis = 1)
# .to_csv('results/user_resp_analysis/mean_emotions_pos_neg_men_topic_wise.csv')

  pd.concat((B_pos_mentions_L.groupby('topics_y').mean()[cols].rename(columns = {'anger': 'anger_B-L-pos', 'joy': 'joy_B-L-pos', 'optimism': 'optimism_B-L-pos', 'sadness': 'sadness_B-L-pos'}),
  B_neg_mentions_L.groupby('topics_y').mean()[cols].rename(columns = {'anger': 'anger_B-L-neg', 'joy': 'joy_B-L-neg', 'optimism': 'optimism_B-L-neg', 'sadness': 'sadness_B-L-neg'}),
  B_pos_mentions_R.groupby('topics_y').mean()[cols].rename(columns = {'anger': 'anger_B-R-pos', 'joy': 'joy_B-R-pos', 'optimism': 'optimism_B-R-pos', 'sadness': 'sadness_B-R-pos'}),
  B_neg_mentions_R.groupby('topics_y').mean()[cols].rename(columns = {'anger': 'anger_B-R-neg', 'joy': 'joy_B-R-neg', 'optimism': 'optimism_B-R-neg', 'sadness': 'sadness_B-R-neg'}),
  T_pos_mentions_L.groupby('topics_y').mean()[cols].rename(columns = {'anger': 'anger_T-L-pos', 'joy': 'joy_T-L-pos', 'optimism': 'optimism_T-L-pos', 'sadness': 'sadness_T-L-pos'}),
  T_neg_mentions_L.groupby('topics_y').mean()[cols].rename(columns = {'anger': 

Unnamed: 0_level_0,anger_B-L-pos,joy_B-L-pos,optimism_B-L-pos,sadness_B-L-pos,anger_B-L-neg,joy_B-L-neg,optimism_B-L-neg,sadness_B-L-neg,anger_B-R-pos,joy_B-R-pos,...,optimism_T-L-neg,sadness_T-L-neg,anger_T-R-pos,joy_T-R-pos,optimism_T-R-pos,sadness_T-R-pos,anger_T-R-neg,joy_T-R-neg,optimism_T-R-neg,sadness_T-R-neg
topics_y,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
american,0.572845,0.138541,0.160241,0.128375,0.667878,0.110874,0.105092,0.116146,0.661463,0.110169,...,0.111761,0.112522,0.620627,0.120231,0.121879,0.137266,0.624649,0.126807,0.123912,0.12463
blm,0.579087,0.158605,0.14514,0.117176,0.69412,0.09422,0.110756,0.100916,0.659782,0.122919,...,0.103685,0.103671,0.615916,0.154792,0.11945,0.10984,0.704446,0.096798,0.08915,0.109602
capitol,0.586964,0.166072,0.120783,0.126265,0.734283,0.075341,0.091979,0.098387,,,...,0.09182,0.110064,0.605422,0.162759,0.121102,0.110738,0.739081,0.080269,0.090338,0.090308
conspiracy_theory,0.608126,0.124327,0.146543,0.120998,0.66782,0.09651,0.121371,0.114287,0.69245,0.09795,...,0.105238,0.11075,0.600405,0.165178,0.124636,0.109765,0.677274,0.116615,0.08988,0.116217
covid,0.668985,0.097466,0.120282,0.113274,0.676493,0.093333,0.112775,0.117402,0.626145,0.103955,...,0.116766,0.133298,0.628326,0.11413,0.134893,0.122648,0.680533,0.097458,0.097074,0.124935
democrats,0.564016,0.159799,0.151673,0.124517,0.678457,0.096392,0.115092,0.110053,0.59981,0.144452,...,0.118061,0.10992,0.581264,0.156821,0.116616,0.145292,0.688234,0.105585,0.089748,0.116437
economy,0.559688,0.121202,0.168449,0.150655,0.595959,0.104114,0.146714,0.153202,0.563389,0.109823,...,0.118238,0.124975,0.630824,0.103843,0.143332,0.121985,0.677831,0.100641,0.111576,0.109939
election,0.589343,0.149846,0.133216,0.127602,0.682073,0.095428,0.108562,0.113934,0.652939,0.119549,...,0.112566,0.115095,0.588712,0.151629,0.124918,0.13474,0.672243,0.110605,0.103594,0.113558
healthcare,0.600408,0.108349,0.152968,0.138269,0.715222,0.058111,0.036611,0.190056,0.485486,0.178,...,0.127307,0.124647,0.564991,0.140604,0.144217,0.150057,,,,
immigration,0.67881,0.094436,0.122615,0.104201,0.675384,0.113392,0.116855,0.094378,0.68125,0.0485,...,0.114589,0.126451,0.709984,0.090625,0.10197,0.097431,0.661075,0.086545,0.117612,0.134746


In [104]:
from pingouin import kruskal

def get_KW_stat(emot1, emot2):
    
    df1 = pd.DataFrame(emot1.values, columns = ['senti'])
    df1['type_flag'] = 1

    df2 = pd.DataFrame(emot2.values, columns = ['senti'])
    df2['type_flag'] = 2
    
    temp_df = pd.concat((df1,df2), axis = 0)
    kwTest = kruskal(temp_df, dv='senti', between='type_flag')
    H = kwTest.H[0]
    p = kwTest['p-unc'][0]
    dof = kwTest['ddof1'][0]
    n = temp_df.shape[0]
    esq = H * (n + 1)/(n**2 - 1)
    return H, p, dof, esq


def get_pos_neg_mentions(df, entity):
    
    entity_mentions = df[df[entity + '_flag'] == True]
    pos_mean = entity_mentions[entity + '_pos'].mean()
    neu_mean = entity_mentions[entity + '_neu'].mean()
    neg_mean = entity_mentions[entity + '_neg'].mean()
    pos_entity_mentions = entity_mentions[((entity_mentions[entity + '_pos_new'] > pos_mean))]
    neg_entity_mentions = entity_mentions[((entity_mentions[entity + '_neg_new'] > neg_mean))]
    
    return pos_entity_mentions, neg_entity_mentions

def get_significance_stats(dist1, dist2, pub, entity, emotion):

    emot1_shapiro = stats.shapiro(dist1)
    emot2_shapiro = stats.shapiro(dist2)
    emot1_shapiro = (emot1_shapiro.statistic, emot1_shapiro.pvalue)
    emot2_shapiro = (emot2_shapiro.statistic, emot2_shapiro.pvalue)
    H, p, dof, es = get_KW_stat(dist1, dist2)
    # t, p , dof = welch_ttest(emot1, emot2)
    res = [entity, pub, emotion, dist1.shape[0], dist2.shape[0], emot1_shapiro[0], emot1_shapiro[1], emot2_shapiro[0], emot2_shapiro[1], p, H, dof, es]
    # print(res)
    return res

def get_mean_emotions(df):
    
    return df['anger'].mean(), df['joy'].mean(), df['optimism'].mean(), df['sadness'].mean()

def get_mean_emotions_per_sentiment(df, entity):
    
    entity_mentions = df[df[entity + '_flag'] == True]
    pos_mean = entity_mentions[entity + '_pos_new'].mean()
    neu_mean = entity_mentions[entity + '_neu_new'].mean()
    neg_mean = entity_mentions[entity + '_neg_new'].mean()
    pos_entity_mentions = entity_mentions[((entity_mentions[entity + '_pos_new'] > pos_mean * 1.5))]
    neg_entity_mentions = entity_mentions[((entity_mentions[entity + '_neg_new'] > neg_mean * 1.5))]
    
#     print(entity, pos_entity_mentions.shape, pos_entity_mentions_.shape)
#     print(entity, neg_entity_mentions.shape, neg_entity_mentions_.shape)
    
    pos_tweet_emot = get_mean_emotions(pos_entity_mentions)
    neg_tweet_emot = get_mean_emotions(neg_entity_mentions)
    
    return pos_entity_mentions, neg_entity_mentions, pos_tweet_emot, neg_tweet_emot

  return warn(


### Left-vs-right (Statistical Test)

In [110]:
emotions = ['anger', 'joy', 'optimism', 'sadness']
sstat = []

def get_mean_emotions(df):
    
    return df['anger'].mean(), df['joy'].mean(), df['optimism'].mean(), df['sadness'].mean()


for ent in ['biden', 'trump']:
    # temp_df = user_resp[user_resp['publication'] == pub]
    # for ent in ['biden', 'trump']:
    
    l_df = L_user_resp[L_user_resp[ent.title() + '_flag'] == True]
    r_df = R_user_resp[R_user_resp[ent.title() + '_flag'] == True]


    for emot in emotions:
    # for emot in ['neg_senti', 'neu_senti', 'pos_senti']:

        dist1 = l_df[emot].dropna()
        dist2 = r_df[emot].dropna()
        if(len(dist1) > 10 and len(dist2) > 10):
            res = get_significance_stats(dist1, dist2, ent, None, emot)
        else:
            res = None
        # print(res)
        sstat.append(res)
        # sys.exit()
    # print()
    
stats_df = pd.DataFrame(sstat, columns = ['senti', 'Entity', 'emotion', 'Sample size 1', 'sample size 2', 'shapiro_t_dist1', 'shapiro_p_dist2', 'shapiro_t_dist1', 'shapiro_p_dist2', 'KW-p', 'KW-H', 'KW-dof', 'ES'])
stats_df = stats_df.round(3)
stats_df



Unnamed: 0,senti,Entity,emotion,Sample size 1,sample size 2,shapiro_t_dist1,shapiro_p_dist2,shapiro_t_dist1.1,shapiro_p_dist2.1,KW-p,KW-H,KW-dof,ES
0,,biden,anger,759484,563272,0.827,0.0,0.818,0.0,0.0,1695.649,1,0.001
1,,biden,joy,759484,563272,0.544,0.0,0.529,0.0,0.0,55.862,1,0.0
2,,biden,optimism,759484,563272,0.646,0.0,0.607,0.0,0.0,4504.172,1,0.003
3,,biden,sadness,759484,563272,0.622,0.0,0.623,0.0,0.107,2.6,1,0.0
4,,trump,anger,2327235,729943,0.824,0.0,0.802,0.0,0.0,750.516,1,0.0
5,,trump,joy,2327235,729943,0.512,0.0,0.522,0.0,0.0,429.422,1,0.0
6,,trump,optimism,2327235,729943,0.614,0.0,0.596,0.0,0.0,3326.504,1,0.001
7,,trump,sadness,2327235,729943,0.622,0.0,0.595,0.0,0.0,3411.851,1,0.001


### Fav-vs-unFav Statistical Test

In [170]:
emotions = ['anger', 'joy', 'optimism', 'sadness']
# emotions = ['anger_norm', 'joy_norm', 'optimism_norm', 'sadness_norm']
sstat = []

for emot in emotions:
    
    for i, ur in enumerate([(R_fav, R_unfav),(L_fav, L_unfav)]):

        data1 = ur[0]
        data2 = ur[1]
        # print(data1.shape, data2.shape)
        dist1 = data1[emot].dropna()
        dist2 = data2[emot].dropna()
        
        # print(dist1.shape, dist2.shape)
        
        if(dist1.shape[0] > 10 and dist2.shape[0] > 10):
            
            if(i%2 == 0):
                res = get_significance_stats(dist1, dist2, 'R', None, emot)
            else:
                res = get_significance_stats(dist1, dist2, 'L', None, emot)
            # print(res)
            sstat.append(res)

        
stats_df = pd.DataFrame(sstat, columns = ['Topic', 'Entity', 'emotion', 'Sample size 1', 'sample size 2', 'shapiro_t_dist1', 'shapiro_p_dist2', 'shapiro_t_dist1', 'shapiro_p_dist2', 'KW-p', 'KW-H', 'KW-dof', 'ES'])
stats_df = stats_df.round(4)
stats_df

Unnamed: 0,Topic,Entity,emotion,Sample size 1,sample size 2,shapiro_t_dist1,shapiro_p_dist2,shapiro_t_dist1.1,shapiro_p_dist2.1,KW-p,KW-H,KW-dof,ES
0,,R,anger,205178,437025,0.8166,0.0,0.7901,0.0,0.0,2546.0751,1,0.004
1,,L,anger,1416644,479598,0.8055,0.0,0.8162,0.0,0.0,205.8426,1,0.0001
2,,R,joy,205178,437025,0.5476,0.0,0.5001,0.0,0.0,2791.8392,1,0.0043
3,,L,joy,1416644,479598,0.5076,0.0,0.5103,0.0,0.0,243.2702,1,0.0001
4,,R,optimism,205178,437025,0.6121,0.0,0.58,0.0,0.0,1372.3571,1,0.0021
5,,L,optimism,1416644,479598,0.6138,0.0,0.6134,0.0,0.2538,1.3021,1,0.0
6,,R,sadness,205178,437025,0.6139,0.0,0.5901,0.0,0.0,1116.2638,1,0.0017
7,,L,sadness,1416644,479598,0.6084,0.0,0.6259,0.0,0.0,645.8605,1,0.0003


In [171]:
stats_df.to_csv('results/user_resp_analysis/fav_vs_unfav_statistical_test.csv')

### Fav-vs-unFav statistical test (per topic)

In [125]:
emotions = ['anger', 'joy', 'optimism', 'sadness']
# emotions = ['anger_norm', 'joy_norm', 'optimism_norm', 'sadness_norm']
sstat = []
topics = set(L_user_resp.topics_y).intersection(set(R_user_resp.topics_y))

for emot in emotions:
    
    for top in topics:

        for i, ur in enumerate([(R_fav, R_unfav),(L_fav, L_unfav)]):
        
            data1 = ur[0]
            data2 = ur[1]
            dist1 = data1[data1.topics_y == top][emot].dropna()
            dist2 = data2[data2.topics_y == top][emot].dropna()

            if(dist1.shape[0] > 10 and dist2.shape[0] > 10):

                if(i%2 == 0):
                    res = get_significance_stats(dist1, dist2, 'R', top, emot)
                else:
                    res = get_significance_stats(dist1, dist2, 'L', top, emot)
                # print(res)
                sstat.append(res)

        
stats_df = pd.DataFrame(sstat, columns = ['Topic', 'Entity', 'emotion', 'Sample size 1', 'sample size 2', 'shapiro_t_dist1', 'shapiro_p_dist2', 'shapiro_t_dist1', 'shapiro_p_dist2', 'KW-p', 'KW-H', 'KW-dof', 'ES'])
stats_df = stats_df.round(3)
stats_df

Unnamed: 0,Topic,Entity,emotion,Sample size 1,sample size 2,shapiro_t_dist1,shapiro_p_dist2,shapiro_t_dist1.1,shapiro_p_dist2.1,KW-p,KW-H,KW-dof,ES
0,republican,R,anger,4975,16544,0.835,0.0,0.787,0.0,0.000,165.646,1,0.008
1,republican,L,anger,72618,28370,0.804,0.0,0.825,0.0,0.000,152.190,1,0.002
2,conspiracy_theory,R,anger,1169,4020,0.811,0.0,0.796,0.0,0.131,2.279,1,0.000
3,conspiracy_theory,L,anger,63162,14195,0.796,0.0,0.806,0.0,0.000,33.176,1,0.000
4,impeachment,R,anger,1896,7980,0.817,0.0,0.766,0.0,0.000,49.355,1,0.005
...,...,...,...,...,...,...,...,...,...,...,...,...,...
155,economy,L,sadness,28646,10533,0.658,0.0,0.678,0.0,0.000,60.250,1,0.002
156,newyork,R,sadness,7503,6051,0.605,0.0,0.583,0.0,0.000,23.441,1,0.002
157,newyork,L,sadness,17918,1886,0.600,0.0,0.634,0.0,0.001,11.954,1,0.001
158,blm,R,sadness,30627,23911,0.583,0.0,0.570,0.0,0.000,134.755,1,0.002


In [127]:
stats_df[stats_df['ES'] > 0.001]

Unnamed: 0,Topic,Entity,emotion,Sample size 1,sample size 2,shapiro_t_dist1,shapiro_p_dist2,shapiro_t_dist1.1,shapiro_p_dist2.1,KW-p,KW-H,KW-dof,ES
0,republican,R,anger,4975,16544,0.835,0.0,0.787,0.0,0.0,165.646,1,0.008
1,republican,L,anger,72618,28370,0.804,0.0,0.825,0.0,0.0,152.190,1,0.002
4,impeachment,R,anger,1896,7980,0.817,0.0,0.766,0.0,0.0,49.355,1,0.005
7,white_house,L,anger,31658,23824,0.805,0.0,0.825,0.0,0.0,180.332,1,0.003
8,media,R,anger,8659,56525,0.802,0.0,0.766,0.0,0.0,117.244,1,0.002
...,...,...,...,...,...,...,...,...,...,...,...,...,...
153,international,L,sadness,30447,6331,0.582,0.0,0.608,0.0,0.0,161.566,1,0.004
154,economy,R,sadness,8134,6309,0.630,0.0,0.609,0.0,0.0,28.389,1,0.002
155,economy,L,sadness,28646,10533,0.658,0.0,0.678,0.0,0.0,60.250,1,0.002
156,newyork,R,sadness,7503,6051,0.605,0.0,0.583,0.0,0.0,23.441,1,0.002


### Left-vs-Right statistical test

In [120]:
emotions = ['anger', 'joy', 'optimism', 'sadness']
topics = set(L_user_resp.topics_y).intersection(set(R_user_resp.topics_y))
sstat = []
for emot in emotions:

    for top in topics:

        dist1 = L_user_resp[L_user_resp.topics_y == top][emot].dropna()
        dist2 = R_user_resp[R_user_resp.topics_y == top][emot].dropna()

        # print(dist2.shape, dist1.shape)

        if(dist1.shape[0] > 10 and dist2.shape[0] > 10):

            res = get_significance_stats(dist1, dist2, ent, top, emot)
            # print(res)
            sstat.append(res)

        
stats_df = pd.DataFrame(sstat, columns = ['Topic', 'Entity', 'emotion', 'Sample size 1', 'sample size 2', 'shapiro_t_dist1', 'shapiro_p_dist2', 'shapiro_t_dist1', 'shapiro_p_dist2', 'KW-p', 'KW-H', 'KW-dof', 'ES'])
stats_df = stats_df.round(3)
stats_df



Unnamed: 0,Topic,Entity,emotion,Sample size 1,sample size 2,shapiro_t_dist1,shapiro_p_dist2,shapiro_t_dist1.1,shapiro_p_dist2.1,KW-p,KW-H,KW-dof,ES
0,republican,trump,anger,137697,35987,0.812,0.0,0.810,0.0,0.002,9.892,1,0.000
1,conspiracy_theory,trump,anger,87615,7647,0.801,0.0,0.811,0.0,0.066,3.392,1,0.000
2,impeachment,trump,anger,34701,15662,0.794,0.0,0.774,0.0,0.000,48.792,1,0.001
3,white_house,trump,anger,89694,19477,0.812,0.0,0.826,0.0,0.000,49.144,1,0.000
4,media,trump,anger,76285,98696,0.794,0.0,0.785,0.0,0.000,156.704,1,0.001
...,...,...,...,...,...,...,...,...,...,...,...,...,...
75,covid,trump,sadness,390394,104714,0.659,0.0,0.639,0.0,0.000,559.006,1,0.001
76,international,trump,sadness,40841,26828,0.595,0.0,0.614,0.0,0.000,132.835,1,0.002
77,economy,trump,sadness,68121,31288,0.663,0.0,0.638,0.0,0.000,100.380,1,0.001
78,newyork,trump,sadness,48812,18102,0.604,0.0,0.601,0.0,0.592,0.287,1,0.000


In [122]:
stats_df[stats_df.ES > 0.001]

Unnamed: 0,Topic,Entity,emotion,Sample size 1,sample size 2,shapiro_t_dist1,shapiro_p_dist2,shapiro_t_dist1.1,shapiro_p_dist2.1,KW-p,KW-H,KW-dof,ES
5,supreme_court,trump,anger,73759,40069,0.816,0.0,0.801,0.0,0.0,227.774,1,0.002
6,democrats,trump,anger,134668,86167,0.83,0.0,0.814,0.0,0.0,657.652,1,0.003
9,healthcare,trump,anger,16999,1130,0.818,0.0,0.864,0.0,0.0,81.331,1,0.004
18,newyork,trump,anger,48812,18102,0.815,0.0,0.799,0.0,0.0,115.21,1,0.002
29,healthcare,trump,joy,16999,1130,0.473,0.0,0.532,0.0,0.0,85.404,1,0.005
36,international,trump,joy,40841,26828,0.503,0.0,0.516,0.0,0.0,173.963,1,0.003
39,blm,trump,joy,100079,71107,0.505,0.0,0.547,0.0,0.0,1317.369,1,0.008
46,democrats,trump,optimism,134668,86167,0.652,0.0,0.596,0.0,0.0,1597.422,1,0.007
48,national,trump,optimism,108056,33121,0.629,0.0,0.598,0.0,0.0,338.783,1,0.002
49,healthcare,trump,optimism,16999,1130,0.618,0.0,0.684,0.0,0.0,52.838,1,0.003


# User engagement

In [340]:
R_news_tweets = news_tweets[((news_tweets['publication'] == 'Breitbart News') | (news_tweets['publication'] == 'Fox News'))]
L_news_tweets = news_tweets[((news_tweets['publication'] == 'CNN') | (news_tweets['publication'] == 'The Washington Post'))]
C_news_tweets = news_tweets[((news_tweets['publication'] == 'Business Insider') | (news_tweets['publication'] ==  'USA Today'))]

In [341]:
R_news_tweets.shape, L_news_tweets.shape, C_news_tweets.shape

((4115, 49), (13107, 49), (7362, 49))

In [344]:
R_user_resp.tweet_id.nunique()/R_news_tweets.conversation_id.nunique(), L_user_resp.tweet_id.nunique()/L_news_tweets.conversation_id.nunique(), C_user_resp.tweet_id.nunique()/C_news_tweets.conversation_id.nunique()

(272.9635479951397, 210.21286335545892, 21.883863080684595)

In [363]:
R_user_resp.tweet_id.nunique(), L_user_resp.tweet_id.nunique(), C_user_resp.tweet_id.nunique()

(1123245, 2755260, 161109)

In [354]:
topics = list(set(news_tweets.topics))

In [367]:
topics

['republican',
 'conspiracy_theory',
 'white_house',
 'impeachment',
 'media',
 'supreme_court',
 'democrats',
 'election',
 'national',
 'healthcare',
 'immigration',
 'capitol',
 'none',
 'american',
 'other',
 'covid',
 'international',
 'economy',
 'newyork',
 'blm']

In [369]:
cols = ['topics', 'like_count', 'quote_count', 'reply_count', 'retweet_count', 'anger', 'joy', 'optimism', 'sadness', 'trump_pos', 'trump_neg', 'trump_neu', 'biden_pos', 'biden_neg', 'biden_neu']

def get_resp_freq_per_topic(user_resp_df, news_tweets_df, topics):

    data = []
    for top in topics:
        
        resp_df = user_resp_df[user_resp_df.topics_y == top]
        ntweet_df = news_tweets_df[news_tweets_df.topics == top]
        
        num_resp = resp_df.tweet_id.nunique()/ntweet_df.conversation_id.nunique()

        # print(resp_df.tweet_id.nunique(), ntweet_df.conversation_id.nunique(), resp_df.tweet_id.nunique()/ntweet_df.conversation_id.nunique())
        data.append((top, num_resp))
    df_resp_pt = pd.DataFrame(data, columns = ['topics', 'resp_per_tweet'])
    
    return df_resp_pt
    
df_resp_pt_R = get_resp_freq_per_topic(R_user_resp, R_news_tweets, topics) 
df_resp_pt_L = get_resp_freq_per_topic(L_user_resp, L_news_tweets, topics) 
df_resp_pt_C = get_resp_freq_per_topic(C_user_resp, C_news_tweets, topics) 

In [371]:
# df_resp_pt_R, df_resp_pt_L, df_resp_pt_C

In [372]:
df_resp_pt_L.shape, df_resp_pt_R.shape, df_resp_pt_C.shape

((20, 2), (20, 2), (20, 2))

In [296]:
# df_resp_pt_L

In [295]:
# df_resp_pt_R

In [294]:
# df_resp_pt_C

In [375]:
df_resp_pt_L.merge(df_resp_pt_C, on = 'topics').merge(df_resp_pt_R, on = 'topics').round()
# to_csv('results/user_resp_analysis/user_engagement.csv')

Unnamed: 0,topics,resp_per_tweet_x,resp_per_tweet_y,resp_per_tweet
0,republican,233.0,18.0,191.0
1,conspiracy_theory,182.0,67.0,130.0
2,white_house,260.0,25.0,209.0
3,impeachment,259.0,20.0,296.0
4,media,238.0,22.0,338.0
5,supreme_court,177.0,21.0,313.0
6,democrats,189.0,24.0,201.0
7,election,210.0,24.0,314.0
8,national,147.0,18.0,162.0
9,healthcare,156.0,11.0,71.0


In [99]:
user_resp = user_resp.dropna(subset = ['trump_pos', 'trump_neg', 'trump_neu', 'biden_pos', 'biden_neg', 'biden_neu'])

In [101]:
user_resp.to_csv('results/user_resp.csv')

## User response to sentimental vs neutral 

In [102]:
senti_tweet_resp = user_resp[(((user_resp['trump_neg'] > 0.25) & (user_resp['trump_neu'] < 0.25)) | ((user_resp['trump_pos'] > 0.25)  & (user_resp['trump_neu'] < 0.25)) | ((user_resp['biden_neg'] > 0.25)  & (user_resp['biden_neu'] < 0.25)) | ((user_resp['biden_pos'] > 0.25)  & (user_resp['biden_neu'] < 0.25)))]
neu_tweet_resp = user_resp[((user_resp['trump_neu'] > 0.50) | (user_resp['biden_neu'] > 0.50))]
neu_tweet_resp.shape, senti_tweet_resp.shape

TypeError: '>' not supported between instances of 'str' and 'float'

In [182]:
senti = ['neg', 'neu', 'pos']
emotions = ['anger', 'sadness', 'joy', 'optimism']
sstat = []
for s in emotions:
    # dist1 = senti_tweet_resp[s + '_senti'].dropna()
    # dist2 = neu_tweet_resp[s + '_senti'].dropna()
    
    dist1 = senti_tweet_resp[s].dropna()
    dist2 = neu_tweet_resp[s].dropna()
    
    res = get_significance_stats(dist1, dist2, None, s, None)
    sstat.append(res)

stats_df = pd.DataFrame(sstat, columns = ['senti', 'pub', 'emotion', 'Sample size 1', 'sample size 2', 'shapiro_t_dist1', 'shapiro_p_dist2', 'shapiro_t_dist1', 'shapiro_p_dist2', 'KW-p', 'KW-H', 'KW-dof', 'ES'])
stats_df.round(4)

Unnamed: 0,senti,pub,emotion,Sample size 1,sample size 2,shapiro_t_dist1,shapiro_p_dist2,shapiro_t_dist1.1,shapiro_p_dist2.1,KW-p,KW-H,KW-dof,ES
0,anger,,,2478865,1609861,0.8205,0.0,0.822,0.0,0.0,5514.3216,1,0.0013
1,sadness,,,2478865,1609861,0.6116,0.0,0.6288,0.0,0.0,2831.1415,1,0.0007
2,joy,,,2478865,1609861,0.5131,0.0,0.5378,0.0,0.0,4796.0497,1,0.0012
3,optimism,,,2478865,1609861,0.6105,0.0,0.6246,0.0,0.0,1368.4183,1,0.0003


In [183]:
senti_tweet_resp.tweet_id.nunique()/senti_tweet_resp.conversation_id.nunique()

207.61013400335008

In [None]:
neu_tweet_resp.tweet_id.nunique()/neu_tweet_resp.conversation_id.nunique()

203.08578276775577

In [190]:
R_senti_user_resp = senti_tweet_resp[((senti_tweet_resp['publication'] == 'Breitbart News') | (senti_tweet_resp['publication'] == 'Fox News'))]
L_senti_user_resp = senti_tweet_resp[((senti_tweet_resp['publication'] == 'CNN') | (senti_tweet_resp['publication'] == 'The Washington Post'))]
C_senti_user_resp = senti_tweet_resp[((senti_tweet_resp['publication'] == 'Business Insider') | (senti_tweet_resp['publication'] ==  'USA Today'))]

R_neu_user_resp = neu_tweet_resp[((neu_tweet_resp['publication'] == 'Breitbart News') | (neu_tweet_resp['publication'] == 'Fox News'))]
L_neu_user_resp = neu_tweet_resp[((neu_tweet_resp['publication'] == 'CNN') | (neu_tweet_resp['publication'] == 'The Washington Post'))]
C_neu_user_resp = neu_tweet_resp[((neu_tweet_resp['publication'] == 'Business Insider') | (neu_tweet_resp['publication'] ==  'USA Today'))]

In [192]:
R_senti_user_resp.tweet_id.nunique()/R_senti_user_resp.conversation_id.nunique(), L_senti_user_resp.tweet_id.nunique()/L_senti_user_resp.conversation_id.nunique(), C_senti_user_resp.tweet_id.nunique()/C_senti_user_resp.conversation_id.nunique()

(331.7762237762238, 245.86871780593066, 33.13250517598344)

In [193]:
R_neu_user_resp.tweet_id.nunique()/R_neu_user_resp.conversation_id.nunique(), L_neu_user_resp.tweet_id.nunique()/L_neu_user_resp.conversation_id.nunique(), C_neu_user_resp.tweet_id.nunique()/C_neu_user_resp.conversation_id.nunique()

(327.43946731234865, 241.2586956521739, 32.843091334894616)

### Selective-Vs-Cross Exposure

In [206]:
author_freqL = pd.DataFrame(L_user_resp.author_id_x.value_counts())
author_freqL['author_ids'] = author_freqL.index
author_freqL = author_freqL.rename(columns = {'author_id_x': 'freq_L'})
author_freqL = author_freqL.rename(columns = {'author_ids': 'author_id'})

In [209]:
author_freq = pd.DataFrame()

In [216]:
author_freq = author_freq.merge(author_freqC, on = 'author_id', how = 'outer')

In [229]:
freq_R_authors = author_freq[author_freq['freq_R'] > 20].author_id
freq_L_authors = author_freq[author_freq['freq_L'] > 20].author_id

In [235]:
freq_authors = list(set(freq_R_authors).intersection(set(freq_L_authors)))

In [245]:
author_freq['freq_C'] = author_freq['freq_C'].fillna(0)

In [246]:
author_freq['total'] = author_freq.freq_R + author_freq.freq_C + author_freq.freq_L

In [250]:
author_freq['prop_C'] = author_freq.freq_C/author_freq['total']

In [252]:
author_freq[author_freq['prop_L'] > 0.4]

(569968, 8)

In [281]:
cross_exposed = author_freq[((author_freq['total'] > 10) & (author_freq['prop_L'] > 0.4) & (author_freq['prop_R'] > 0.4))]
cross_exposed.shape

(3484, 8)

In [280]:
sel_exposed = author_freq[((author_freq['total'] > 10) & ((author_freq['prop_L'] == 1.0) | (author_freq['prop_R'] == 1.0)))]
sel_exposed.shape

(30090, 8)

In [286]:
cols = ['anger', 'joy', 'optimism', 'sadness']

In [290]:
R_user_resp[R_user_resp.author_id_x.isin(cross_exposed.author_id)][cols].describe(), L_user_resp[L_user_resp.author_id_x.isin(cross_exposed.author_id)][cols].describe()

(              anger           joy      optimism       sadness
 count  53356.000000  53356.000000  53356.000000  53356.000000
 mean       0.661459      0.114265      0.109385      0.114896
 std        0.336352      0.234273      0.173305      0.173010
 min        0.005000      0.002000      0.003000      0.004000
 25%        0.373000      0.008000      0.018000      0.022000
 50%        0.834000      0.013000      0.036000      0.048000
 75%        0.941000      0.056000      0.104000      0.119000
 max        0.986000      0.973000      0.945000      0.985000,
               anger           joy      optimism       sadness
 count  53954.000000  53954.000000  53954.000000  53954.000000
 mean       0.660838      0.108831      0.113445      0.116884
 std        0.336121      0.228164      0.178190      0.174417
 min        0.004000      0.003000      0.003000      0.004000
 25%        0.368000      0.007000      0.018000      0.022000
 50%        0.833000      0.013000      0.037000      

In [291]:
R_user_resp[R_user_resp.author_id_x.isin(sel_exposed.author_id)][cols].describe(), L_user_resp[L_user_resp.author_id_x.isin(sel_exposed.author_id)][cols].describe()

(               anger            joy       optimism        sadness
 count  169017.000000  169017.000000  169017.000000  169017.000000
 mean        0.667304       0.108896       0.107215       0.116584
 std         0.337270       0.228240       0.173914       0.181249
 min         0.004000       0.002000       0.003000       0.004000
 25%         0.377000       0.007000       0.016000       0.021000
 50%         0.845000       0.013000       0.034000       0.046000
 75%         0.944000       0.053000       0.097000       0.117000
 max         0.986000       0.973000       0.952000       0.986000,
                anger            joy       optimism        sadness
 count  663722.000000  663722.000000  663722.000000  663722.000000
 mean        0.658972       0.103957       0.115808       0.121263
 std         0.336476       0.221190       0.180771       0.181863
 min         0.004000       0.002000       0.003000       0.003000
 25%         0.363000       0.007000       0.018000       0.0

In [292]:
user_resp[user_resp.author_id_x.isin(cross_exposed.author_id)][cols].mean(), user_resp[user_resp.author_id_x.isin(cross_exposed.author_id)][cols].mean()

(anger       0.659733
 joy         0.111418
 optimism    0.112502
 sadness     0.116349
 dtype: float64,
 anger       0.659733
 joy         0.111418
 optimism    0.112502
 sadness     0.116349
 dtype: float64)

In [293]:
user_resp[user_resp.author_id_x.isin(sel_exposed.author_id)][cols].mean(), user_resp[user_resp.author_id_x.isin(sel_exposed.author_id)][cols].mean()

(anger       0.660253
 joy         0.105279
 optimism    0.113815
 sadness     0.120652
 dtype: float64,
 anger       0.660253
 joy         0.105279
 optimism    0.113815
 sadness     0.120652
 dtype: float64)

In [316]:
emotions = ['anger', 'joy', 'optimism', 'sadness']
sstat = []

# for ent in ['biden', 'trump']:
for emot in emotions:
    # temp_df = user_resp[user_resp['publication'] == pub]
    # for ent in ['biden', 'trump']:
    
    dist1 = L_user_resp[L_user_resp.author_id_x.isin(freq_users.author_id)][emot]
    dist2 = L_user_resp[L_user_resp.author_id_x.isin(less_freq_users.author_id)][emot]
    
    if(len(dist1) > 10 and len(dist2) > 10):
        res = get_significance_stats(dist1, dist2, 'Sel-Exp', 'all-vs-all', emot)
    else:
        res = None
    # print(res)
    sstat.append(res)
    # sys.exit()
# print()

stats_df = pd.DataFrame(sstat, columns = ['senti', 'Entity', 'emotion', 'Sample size 1', 'sample size 2', 'shapiro_t_dist1', 'shapiro_p_dist2', 'shapiro_t_dist1', 'shapiro_p_dist2', 'KW-p', 'KW-H', 'KW-dof', 'ES'])
stats_df = stats_df.round(4)
stats_df

Unnamed: 0,senti,Entity,emotion,Sample size 1,sample size 2,shapiro_t_dist1,shapiro_p_dist2,shapiro_t_dist1.1,shapiro_p_dist2.1,KW-p,KW-H,KW-dof,ES
0,all-vs-all,Sel-Exp,anger,1647687,269649,0.8121,0.0,0.815,0.0,0.0,145.9499,1,0.0001
1,all-vs-all,Sel-Exp,joy,1647687,269649,0.5017,0.0,0.527,0.0,0.0,254.8815,1,0.0001
2,all-vs-all,Sel-Exp,optimism,1647687,269649,0.6183,0.0,0.6214,0.0,0.0,86.2055,1,0.0
3,all-vs-all,Sel-Exp,sadness,1647687,269649,0.6206,0.0,0.6199,0.0,0.0,72.2389,1,0.0


### Freq-vs-nonFreq

In [313]:
freq_users = author_freq[author_freq['total'] > 10]
freq_users.shape

(71273, 8)

In [314]:
less_freq_users = author_freq[((author_freq['total'] > 5) & (author_freq['total'] < 10))]
non_freq_users = author_freq[author_freq['total'] == 1]
less_freq_users.shape, non_freq_users.shape                                                               

((57457, 8), (416823, 8))