In [213]:
import pandas as pd
import numpy as np
import sys

import matplotlib.pyplot as plt
# from bertopic import BERTopic
pd.set_option('display.max_colwidth', None)
from scipy import stats

In [214]:
user_resp = pd.read_csv('results/user_response.csv')
# user_resp_ = pd.read_parquet('results_old/user_resp_newslant_with_senti.parquet')

In [215]:
user_resp.shape

(765796, 39)

In [216]:
user_resp.columns

Index(['Unnamed: 0.4', 'Unnamed: 0.3', 'Unnamed: 0.2', 'Unnamed: 0.1',
       'Unnamed: 0', 'tweet_id', 'conversation_id', 'author_id', 'created_at',
       'geo', 'lang', 'like_count', 'quote_count', 'reply_count',
       'retweet_count', 'source', 'text', 'anger', 'joy', 'optimism',
       'sadness', 'publication', 'topic_labels', 'theme', 'pos_senti',
       'neu_senti', 'neg_senti', 'date', 'week', 'flag', 'Trump_flag',
       'Biden_flag', 'Trump_Biden_flag', 'trump_pos', 'trump_neg', 'trump_neu',
       'biden_pos', 'biden_neg', 'biden_neu'],
      dtype='object')

In [217]:
user_resp.publication.value_counts()

CNN                    332227
The Washington Post    219282
Fox News               119202
Breitbart News          67333
USA Today               17859
Business Insider         9893
Name: publication, dtype: int64

In [218]:
# user_resp.created_at_y.head()

In [221]:
user_resp['publish_date'] = pd.to_datetime(user_resp['created_at'])
user_resp['day'] = user_resp.publish_date.dt.day
user_resp['month'] = user_resp.publish_date.dt.month
user_resp['year'] = user_resp.publish_date.dt.year
user_resp['timestamp'] = user_resp.publish_date.dt.time

In [222]:
user_resp.month.value_counts()

11    139823
10    131076
8      82445
1      74396
9      59548
4      56850
5      54893
6      46757
12     44802
7      42093
3      29777
2       3336
Name: month, dtype: int64

In [223]:
user_resp_2020 = user_resp[user_resp.year == 2020]
user_resp_fil_months = user_resp_2020[user_resp_2020.month.isin([5,6,7,8,9,10,11])]
final_user_resp_df = user_resp_fil_months[((user_resp_fil_months.month.isin([6,7,8,9,10])) | 
                      ((user_resp_fil_months.month == 5) & (user_resp_fil_months.day >= 3)) | 
                      ((user_resp_fil_months.month == 11) & (user_resp_fil_months.day < 3)))]

In [224]:
# final_user_resp_df.to_parquet('dataset/user_resp.parquet')

In [226]:
senti = ['neg', 'neu', 'pos']
emotions = ['anger', 'sadness', 'joy', 'optimism']

In [227]:
from pingouin import kruskal

def get_KW_stat(emot1, emot2):
    
    df1 = pd.DataFrame(emot1.values, columns = ['senti'])
    df1['type_flag'] = 1

    df2 = pd.DataFrame(emot2.values, columns = ['senti'])
    df2['type_flag'] = 2
    
    temp_df = pd.concat((df1,df2), axis = 0)
    kwTest = kruskal(temp_df, dv='senti', between='type_flag')
    H = kwTest.H[0]
    p = kwTest['p-unc'][0]
    dof = kwTest['ddof1'][0]
    n = temp_df.shape[0]
    esq = H * (n + 1)/(n**2 - 1)
    return H, p, dof, esq

def get_significance_stats(dist1, dist2, pub, entity, emotion):

    emot1_shapiro = stats.shapiro(dist1)
    emot2_shapiro = stats.shapiro(dist2)
    emot1_shapiro = (emot1_shapiro.statistic, emot1_shapiro.pvalue)
    emot2_shapiro = (emot2_shapiro.statistic, emot2_shapiro.pvalue)
    H, p, dof, es = get_KW_stat(dist1, dist2)
    # t, p , dof = welch_ttest(emot1, emot2)
    res = [entity, pub, emotion, dist1.shape[0], dist2.shape[0], emot1_shapiro[0], emot1_shapiro[1], emot2_shapiro[0], emot2_shapiro[1], p, H, dof, es]
    # print(res)
    return res

In [228]:
def get_pos_neg_mentions(df, entity):
    
    entity_mentions = df[df[entity + '_flag'] == True]
    pos_mean = entity_mentions[entity + '_pos'].mean()
    neu_mean = entity_mentions[entity + '_neu'].mean()
    neg_mean = entity_mentions[entity + '_neg'].mean()
    pos_entity_mentions = entity_mentions[((entity_mentions[entity + '_pos'] > pos_mean))]
    neg_entity_mentions = entity_mentions[((entity_mentions[entity + '_neg'] > neg_mean))]
    
    return pos_entity_mentions, neg_entity_mentions

In [229]:
R_user_resp = user_resp[((user_resp['publication'] == 'Breitbart News') | (user_resp['publication'] == 'Fox News'))]
L_user_resp = user_resp[((user_resp['publication'] == 'CNN') | (user_resp['publication'] == 'The Washington Post'))]
C_user_resp = user_resp[((user_resp['publication'] == 'Business Insider') | (user_resp['publication'] ==  'USA Today'))]

### Regular vs non-regular users

#### Do regular users have any different sentiments than less regular ones?

In [230]:
all_user_resp_freq = pd.DataFrame(user_resp.author_id.value_counts())
all_user_resp_freq['author_ids'] = all_user_resp_freq.index
all_user_resp_freq = all_user_resp_freq.rename(columns = {'author_id': 'freq'})
all_user_resp_freq.index = [x for x in range(all_user_resp_freq.shape[0])]

In [231]:
regular_users = all_user_resp_freq[all_user_resp_freq['freq'] > 50]
non_regular_users = all_user_resp_freq[all_user_resp_freq['freq'] < 5]

In [232]:
regular_users.shape, non_regular_users.shape

((717, 2), (222691, 2))

In [233]:
user_resp_reg_users_L = L_user_resp[L_user_resp['author_id'].isin(regular_users['author_ids'])]
user_resp_reg_users_R = R_user_resp[R_user_resp['author_id'].isin(regular_users['author_ids'])]

user_resp_non_reg_users_L = L_user_resp[L_user_resp['author_id'].isin(non_regular_users['author_ids'])]
user_resp_non_reg_users_R = R_user_resp[R_user_resp['author_id'].isin(non_regular_users['author_ids'])]

In [234]:
user_resp_reg_users_L.shape, user_resp_reg_users_R.shape, user_resp_non_reg_users_L.shape, user_resp_non_reg_users_R.shape

((50058, 44), (11308, 44), (253095, 44), (97121, 44))

In [235]:
mean_senti_rnr_df = pd.DataFrame([['Reg', 'L'] + list(user_resp_reg_users_L[['anger', 'joy', 'optimism', 'sadness']].mean().values),  
                                 ['Non-Reg', 'L'] + list(user_resp_non_reg_users_L[['anger', 'joy', 'optimism', 'sadness']].mean().values),
                                  ['Reg', 'R'] + list(user_resp_reg_users_R[['anger', 'joy', 'optimism', 'sadness']].mean().values),
                                 ['Non-Reg', 'R'] + list(user_resp_non_reg_users_R[['anger', 'joy', 'optimism', 'sadness']].mean().values)],
        columns = ['user_type', 'slant', 'anger', 'joy', 'optimism', 'sadness'])

mean_senti_rnr_df

Unnamed: 0,user_type,slant,anger,joy,optimism,sadness
0,Reg,L,0.659792,0.101805,0.120605,0.117798
1,Non-Reg,L,0.61947,0.131023,0.122812,0.126696
2,Reg,R,0.645973,0.102876,0.126033,0.12512
3,Non-Reg,R,0.657755,0.123776,0.103032,0.11544


In [236]:
# mean_senti_rnr_df.round(3).to_csv('results/mean_senti_reg_non_reg.csv')

#### Statistical Test

In [237]:
senti = ['neg', 'neu', 'pos']
emotions = ['anger', 'sadness', 'joy', 'optimism']
sstat = []
for s in emotions:
    # dist1 = user_resp_non_reg_users[s + '_senti'].dropna()
    # dist2 = user_resp_reg_users[s + '_senti'].dropna()
    
    dist1 = user_resp_non_reg_users_R[s].dropna()
    dist2 = user_resp_non_reg_users_L[s].dropna()
    
    res = get_significance_stats(dist1, dist2, None, s, None)
    sstat.append(res)
    



In [238]:
stats_df = pd.DataFrame(sstat, columns = ['senti', 'pub', 'emotion', 'Sample size 1', 'sample size 2', 'shapiro_t_dist1', 'shapiro_p_dist2', 'shapiro_t_dist1', 'shapiro_p_dist2', 'KW-p', 'KW-H', 'KW-dof', 'ES'])
stats_df.round(3)

Unnamed: 0,senti,pub,emotion,Sample size 1,sample size 2,shapiro_t_dist1,shapiro_p_dist2,shapiro_t_dist1.1,shapiro_p_dist2.1,KW-p,KW-H,KW-dof,ES
0,anger,,,97121,253095,0.803,0.0,0.824,0.0,0.0,958.854,1,0.003
1,sadness,,,97121,253095,0.604,0.0,0.623,0.0,0.0,433.119,1,0.001
2,joy,,,97121,253095,0.537,0.0,0.557,0.0,0.0,115.797,1,0.0
3,optimism,,,97121,253095,0.591,0.0,0.632,0.0,0.0,1187.908,1,0.003


In [None]:
# stats_df[['senti', 'KW-p', 'ES']].round(3).to_csv('results/statistical_test_user_resp_reg_vs_non_reg.csv')

#### Emotion Mean

In [None]:
senti = ['neg', 'neu', 'pos']
emotions = ['anger', 'sadness', 'joy', 'optimism']
sstat = []
for s in emotions:
    # dist1 = user_resp_non_reg_users[s + '_senti'].dropna()
    # dist2 = user_resp_reg_users[s + '_senti'].dropna()
    
    mean1 = user_resp_non_reg_users[s].dropna().mean()
    mean2 = user_resp_reg_users[s].dropna().mean()
    
    # res = get_significance_stats(dist1, dist2, None, s, None)
    sstat.append([s, mean1, mean2])
    

In [None]:
sstat = pd.DataFrame(sstat, columns = ['pub', 'mean_non_reg', 'mean_reg'])
sstat = sstat.round(3)
sstat
# sstat.to_csv('results/mean_user_resp_reg_vs_non_reg.csv')

In [None]:
# x = sstat[['pub', 'mean_non_reg']].rename(columns = {'mean_non_reg': 'em_score'})
# x['user'] = 'non_reg'
# y = sstat[['pub', 'mean_reg']].rename(columns = {'mean_reg': 'em_score'})
# y['user'] = 'reg'
# new_df = pd.concat((x, y),axis = 0)
# new_df

### Selective Vs Cross Exposure

In [145]:
user_resp.author_id

Index(['Unnamed: 0.4', 'Unnamed: 0.3', 'Unnamed: 0.2', 'Unnamed: 0.1',
       'Unnamed: 0', 'tweet_id', 'conversation_id', 'author_id', 'created_at',
       'geo', 'lang', 'like_count', 'quote_count', 'reply_count',
       'retweet_count', 'source', 'text', 'anger', 'joy', 'optimism',
       'sadness', 'publication', 'topic_labels', 'theme', 'pos_senti',
       'neu_senti', 'neg_senti', 'date', 'week', 'flag', 'Trump_flag',
       'Biden_flag', 'Trump_Biden_flag', 'trump_pos', 'trump_neg', 'trump_neu',
       'biden_pos', 'biden_neg', 'biden_neu'],
      dtype='object')

In [171]:
author_freqR = pd.DataFrame(R_user_resp.author_id.value_counts())
author_freqR['author_ids'] = author_freqR.index
author_freqR = author_freqR.rename(columns = {'author_id': 'freq_R'})
author_freqR = author_freqR.rename(columns = {'author_ids': 'author_id'})

In [173]:
author_freq = author_freq.merge(author_freqC, on = 'author_id')

In [177]:
author_freq['total'] = author_freq.freq_R + author_freq.freq_L + author_freq.freq_C

In [182]:
author_freq['per_C'] = (author_freq.freq_C * 100)/author_freq.total

In [205]:
author_freq[((author_freq.total > 20) & (author_freq.per_C == 0))]

Unnamed: 0,freq_R,author_id,freq_L,freq_C,total,per_L,per_R,per_C


In [197]:
cross_exposed_authors = author_freq[author_freq.total > 20][(((author_freq.per_L > 20) & (author_freq.per_R > 20)))]['author_id']

  author_freq[author_freq.total > 20][(((author_freq.per_L > 20) & (author_freq.per_R > 20)))]


Unnamed: 0,freq_R,author_id,freq_L,freq_C,total,per_L,per_R,per_C
1,101,2.194839e+09,29,1,131,22.137405,77.099237,0.763359
5,50,1.283365e+18,67,39,156,42.948718,32.051282,25.000000
6,40,1.143332e+18,47,2,89,52.808989,44.943820,2.247191
7,37,1.406664e+07,29,2,68,42.647059,54.411765,2.941176
8,37,9.477016e+17,54,3,94,57.446809,39.361702,3.191489
...,...,...,...,...,...,...,...,...
159,6,1.528568e+08,15,1,22,68.181818,27.272727,4.545455
169,6,7.157012e+17,14,1,21,66.666667,28.571429,4.761905
173,6,1.102796e+18,13,4,23,56.521739,26.086957,17.391304
202,5,1.518477e+09,17,1,23,73.913043,21.739130,4.347826


In [142]:
# user_freq = pd.read_csv('../../backup/code/NewSlant/dataset/user_tweet_freq.csv')

In [None]:
# common = set(regular_users['author_id']).intersection(set(user_freq['author_id']))
# len(common)

In [None]:
# publishers = ['CNN', 'The Washington Post', 'Business Insider', 'USA Today', 'Fox News', 'Breitbart News']
# temp_df = pd.DataFrame(columns = ['author_id'] + publishers)
# for user in regular_users['author_id']:
#     k = user_resp[user_resp['author_id_x'] == user].publication.value_counts()
#     c = ['author_id'] + list(k.index)
#     f = pd.DataFrame([[user] + list(k)], columns = c)
#     temp_df = pd.concat((temp_df, f), axis = 0)
#     # print(temp_df)
    
# temp_df = temp_df.fillna(0)

# incl_count = []
# for i, row in temp_df.iterrows():
#     a = row['author_id']
#     L = int(row['CNN'] + row['The Washington Post'])
#     C = int(row['Business Insider'] + row['USA Today'])
#     R = int(row['Fox News'] + row['Breitbart News'])
#     incl_count.append([a,L,C,R])
# # len(incl_count)

# incl_count_df = pd.DataFrame(incl_count, columns = ['author_id', 'L', 'C', 'R'])
# user_details = pd.merge(temp_df, incl_count_df, on = 'author_id')

# s = user_details[['L', 'C', 'R']].sum(axis = 1)
# k = user_details[['L','C', 'R']].div(s, axis=0).rename(columns = {'L':'L_f', 'C': 'C_f', 'R':'R_f'})

# user_details = pd.concat((user_details, k), axis = 1)

# s = user_details[['L', 'C', 'R']].sum(axis = 1)
# user_details['tweet_count'] = s

# user_details.shape
# user_details.to_csv('dataset/user_details.csv')

In [143]:
user_details = pd.read_csv('../../backup/code/NewSlant/dataset/user_details.csv')

In [None]:
sel_exposed_authors = user_details[((user_details['L_f'] == 1) | (user_details['R_f'] == 1))]
# center_exposed_authors = fil_user_det[fil_user_det.C_f > 0.5]
# cross_exposed_authors = user_details[(((user_details['L_f'] > 0.25) & (user_details['R_f'] > 0.25)) | (user_details.C_f > 0.5))]
cross_exposed_authors = user_details[(((user_details['L_f'] > 0.40) & (user_details['R_f'] > 0.40)))]

In [None]:
sel_exposed_authors = user_details[~(((user_details.L_f > 0.25) & (user_details.R_f > 0.25)) | (user_details.C_f > 0.5))]
# center_exposed_authors = fil_user_det[fil_user_det.C_f > 0.5]
cross_exposed_authors = user_details[(((user_details.L_f > 0.25) & (user_details.R_f > 0.25)) | (user_details.C_f > 0.5))]

In [None]:
sel_exposed_authors_L = user_details[user_details['L_f'] == 1]
sel_exposed_authors_R = user_details[user_details['R_f'] == 1]
# center_exposed_authors = fil_user_det[fil_user_det.C_f > 0.5]
cross_exposed_authors = user_details[(((user_details.L_f > 0.25) & (user_details.R_f > 0.25)) | (user_details.C_f > 0.5))]
sel_exposed_authors_L.shape, sel_exposed_authors_R.shape

sel_user_resp = user_resp[user_resp['author_id_x'].isin(sel_exposed_authors_L.author_id)]
cross_user_resp = user_resp[user_resp['author_id_x'].isin(sel_exposed_authors_R.author_id)]

In [None]:
sel_exposed_authors.shape, cross_exposed_authors.shape

In [None]:
sel_user_resp = user_resp[user_resp['author_id_x'].isin(sel_exposed_authors.author_id)]
cross_user_resp = user_resp[user_resp['author_id_x'].isin(cross_exposed_authors.author_id)]

In [None]:
sel_user_resp.shape, cross_user_resp.shape

In [None]:
senti = ['neg', 'neu', 'pos']
emotions = ['anger', 'sadness', 'joy', 'optimism']
sstat = []
for s in emotions:
    # dist1 = sel_user_resp[s + '_senti'].dropna()
    # dist2 = cross_user_resp[s + '_senti'].dropna()
    
    dist1 = sel_user_resp[s].dropna()
    dist2 = cross_user_resp[s].dropna()
    res = get_significance_stats(dist1, dist2, None, s, None)
    sstat.append(res)
    

In [None]:
stats_df = pd.DataFrame(sstat, columns = ['senti', 'pub', 'emotion', 'Sample size 1', 'sample size 2', 'shapiro_t_dist1', 'shapiro_p_dist2', 'shapiro_t_dist1', 'shapiro_p_dist2', 'KW-p', 'KW-H', 'KW-dof', 'ES'])
stats_df.round(4)

In [None]:
# sel_user_resp.columns

In [None]:
sel_L_exp_authors = sel_exposed_authors[sel_exposed_authors['L_f'] > 0.80].author_id
sel_R_exp_authors = sel_exposed_authors[sel_exposed_authors['R_f'] > 0.80].author_id
# cross_exposed_authors = user_details[(((user_details['L_f'] > 0.25) & (user_details['R_f'] > 0.25)))].author_id

In [None]:
sel_L_exp = user_resp[user_resp.author_id_x.isin(sel_L_exp_authors)]
sel_R_exp = user_resp[user_resp.author_id_x.isin(sel_R_exp_authors)]
# cross_user_resp = user_resp[user_resp['author_id_x'].isin(cross_exposed_authors)]

In [None]:
sel_L_exp.shape, sel_R_exp.shape, cross_user_resp.shape

In [None]:
sel_user_resp[['neg_senti', 'neu_senti', 'pos_senti']].describe()

In [None]:
cross_user_resp[['neg_senti', 'neu_senti', 'pos_senti']].describe()

### Compare user response to coverage of same entity across L and R based on selective exposure (statistical tests)

In [None]:
sstat, means = [], []
for pub, x in zip(['L_C', 'C_R', 'R_L'], [(sel_L_exp, cross_user_resp),(cross_user_resp, sel_R_exp), (sel_R_exp, sel_L_exp)]):
    for ent in ['biden', 'trump']:
        df1_pos, df1_neg = get_pos_neg_mentions(x[0], ent)
        df2_pos, df2_neg = get_pos_neg_mentions(x[1], ent)
        
        for i, s in enumerate(emotions): 
            for men in ['Mpos', 'Mneg']:
                if(men == 'Mneg'):
                    df1, df2 = df1_neg, df2_neg
                else:
                    df1, df2 = df1_pos, df2_pos

                dist1 = df1[s].dropna()
                dist2 = df2[s].dropna()
                res = get_significance_stats(dist1, dist2, pub, men, s)
                # print(res)
                # sys.exit()
                sstat.append(res + [ent])

sstat_df = pd.DataFrame(sstat, columns = ['mention', 'entity', 'senti', 'Sample size 1', 'sample size 2', 'shapiro_t_dist1', 'shapiro_p_dist2', 'shapiro_t_dist1', 'shapiro_p_dist2', 'KW-p', 'KW-H', 'KW-dof', 'ES', 'Entity'])
sstat_df = sstat_df.round(2)
sstat_df

### Mean emotion scores based on positive and negative mentions (SE_L vs SE_R vs Cross)

In [None]:
sstat, means = [], []
for pub, x in zip(['SE_L', 'CE', 'SE_R'], [sel_L_exp, cross_user_resp, sel_R_exp]):
    for ent in ['biden', 'trump']:
        df1_pos, df1_neg = get_pos_neg_mentions(x, ent)
        # df2_pos, df2_neg = get_pos_neg_mentions(x[1], ent)
        
        for i, s in enumerate(emotions): 
            means.append([pub, ent, 'Mpos', s, df1_pos[s].mean()])
            means.append([pub, ent, 'Mneg', s, df1_neg[s].mean()])

mean_df = pd.DataFrame(means, columns = ['pub', 'entity', 'mention', 'senti', 'senti_score'])
# mean_df = mean_df.round(2)
# mean_df

#### Statistical tests without looking at positive or negative mentions

In [None]:
sstat, means = [], []
for pub, x in zip(['L_C', 'C_R', 'R_L'], [(sel_L_exp, cross_user_resp),(cross_user_resp, sel_R_exp), (sel_R_exp, sel_L_exp)]):
    for ent in ['biden', 'trump']:
        # df1_pos, df1_neg = get_pos_neg_mentions(x[0], ent)
        # df2_pos, df2_neg = get_pos_neg_mentions(x[1], ent)
        df1 = x[0][x[0][ent + '_flag'] == True]
        df2 = x[1][x[1][ent + '_flag'] == True]
        
        for i, s in enumerate(senti): 

            dist1 = df1[s + '_senti'].dropna()
            dist2 = df2[s  + '_senti'].dropna()
            res = get_significance_stats(dist1, dist2, pub, men, s)
            # print(res)
            # sys.exit()
            sstat.append(res + [ent])
            
sstat_df = pd.DataFrame(sstat, columns = ['mention', 'entity', 'senti', 'Sample size 1', 'sample size 2', 'shapiro_t_dist1', 'shapiro_p_dist2', 'shapiro_t_dist1', 'shapiro_p_dist2', 'KW-p', 'KW-H', 'KW-dof', 'ES', 'Entity'])
sstat_df = sstat_df.round(3)
sstat_df

### Comparing same entity within each news publisher (based on L or R)

In [None]:
sstat, means = [], []
for pub, x in zip(['SE_L', 'SE_R', 'Cross'], [sel_L_exp, sel_R_exp, cross_user_resp]):

    b_pos_x, b_neg_x = get_pos_neg_mentions(x, 'biden')
    t_pos_x, t_neg_x = get_pos_neg_mentions(x, 'trump')

    for i, s in enumerate(senti):        
        for men in ['Mpos', 'Mneg']:
            if(men == 'Mneg'):
                b_df, t_df = b_neg_x, t_neg_x
            else:
                b_df, t_df = b_pos_x, t_pos_x

            means.append([pub, 'biden', men, s, b_df[s + '_senti'].mean()])
            means.append([pub, 'trump', men, s, t_df[s + '_senti'].mean()])

            # means.append([pub, 'biden', 'Mneg', s, b_neg_x[s + '_senti'].mean()])
            # means.append([pub, 'trump', 'Mneg', s, t_neg_x[s + '_senti'].mean()])

            dist1 = b_df[s + '_senti'].dropna()
            dist2 = t_df[s  + '_senti'].dropna()
            res = get_significance_stats(dist1, dist2, pub, men, s)
            # print(res)
            # sys.exit()
            sstat.append(res)
            
sstat_df = pd.DataFrame(sstat, columns = ['mention', 'entity', 'senti', 'Sample size 1', 'sample size 2', 'shapiro_t_dist1', 'shapiro_p_dist2', 'shapiro_t_dist1', 'shapiro_p_dist2', 'KW-p', 'KW-H', 'KW-dof', 'ES'])
sstat_df = sstat_df.round(3)

mean_df = pd.DataFrame(means, columns = ['pub', 'entity', 'mention', 'senti', 'senti_score'])
mean_df = mean_df.round(3)
mean_df

In [None]:
sstat_df

### Visualization

In [None]:
import pylab as plt
import seaborn as sns

fig = plt.figure(figsize=(30, 15))
fig.tight_layout(pad=10.0)
j = 2
i = 0
for s in emotions:
    for mention in ['Mpos', 'Mneg']:
        # df_ = get_df_for_plot(temp_df, pub)
        df_ = mean_df[((mean_df['mention'] == mention) & (mean_df['senti'] == s))]
        # print(df_)
        # leaning = media_incl[media_incl['Media_Outlet'] == pub]['Leaning'].values[0]
        ax = fig.add_subplot(3, 6, i+1)
        i+=1
        plt.title(mention + "-" + s)
        plt.xticks(rotation = 90)
        # print(pub, df_.shape)
        sns.barplot(x='pub', y='senti_score', hue='entity', data=df_, ax = ax)
        ax.set(xlabel=None)
        ax.get_legend().remove()

#### Save Plots

In [None]:
import pylab as plt
import seaborn as sns

# fig = plt.figure(figsize=(30, 15))
# fig.tight_layout(pad=10.0)
j = 2
i = 0
for emotion in emotions:
    for men in ['Mpos', 'Mneg']:
        # df_ = get_df_for_plot(temp_df, pub)
        df_ = mean_df[((mean_df['senti'] == emotion) & (mean_df['mention'] == men))]
        # print(df_)
        # leaning = media_incl[media_incl['Media_Outlet'] == pub]['Leaning'].values[0]
        fig = plt.figure(figsize=(10, 5))
        ax = fig.add_subplot()
        # ax = fig.add_subplot(3, 6, i+1)
        i+=1
        if(men == 'Mpos'):
            plt.title("Response to Positive metions (" + emotion + ")", fontsize = 25)
        else:
            plt.title("Response to Negative mentions (" + emotion + ")", fontsize = 25)
        # plt.xticks(rotation = 90)
        # print(pub, df_.shape)
        sns.barplot(x='pub', y='senti_score', hue='entity', data=df_, ax = ax)
        ax.set(xlabel=None)
        plt.ylabel('Emotion Score', fontsize=25)
        plt.xticks(fontsize = 25)
        plt.yticks(fontsize = 25)
        plt.legend(bbox_to_anchor=(1.1, 0.6), loc='upper left', borderaxespad=0, fontsize=25)
        ax.get_legend().remove()
        plt.savefig('figures/user_resp_SE/' + emotion + "-" + men + '.png', bbox_inches='tight')

### User response to Sentimental Vs Neutral tweets

In [None]:
user_resp.trump_neu.describe()

In [None]:
senti_tweet_resp = user_resp[(((user_resp['trump_neg'] > 0.25) & (user_resp['trump_neu'] < 0.25)) | ((user_resp['trump_pos'] > 0.25)  & (user_resp['trump_neu'] < 0.25)) | ((user_resp['biden_neg'] > 0.25)  & (user_resp['biden_neu'] < 0.25)) | ((user_resp['biden_pos'] > 0.25)  & (user_resp['biden_neu'] < 0.25)))]
neu_tweet_resp = user_resp[((user_resp['trump_neu'] > 0.50) | (user_resp['biden_neu'] > 0.50))]
neu_tweet_resp.shape, senti_tweet_resp.shape

### Sentimental Vs neutral

In [None]:
senti = ['neg', 'neu', 'pos']
emotions = ['anger', 'sadness', 'joy', 'optimism']
sstat = []
for s in emotions:
    # dist1 = senti_tweet_resp[s + '_senti'].dropna()
    # dist2 = neu_tweet_resp[s + '_senti'].dropna()
    
    dist1 = senti_tweet_resp[s].dropna()
    dist2 = neu_tweet_resp[s].dropna()
    res = get_significance_stats(dist1, dist2, None, s, None)
    sstat.append(res)

stats_df = pd.DataFrame(sstat, columns = ['senti', 'pub', 'emotion', 'Sample size 1', 'sample size 2', 'shapiro_t_dist1', 'shapiro_p_dist2', 'shapiro_t_dist1', 'shapiro_p_dist2', 'KW-p', 'KW-H', 'KW-dof', 'ES'])
stats_df.round(4)

In [None]:
# stats_df.round(4).to_csv('results/statistical_test_user_resp_senti_vs_neutral.csv')

In [None]:
senti_tweet_resp.tweet_id.nunique()/senti_tweet_resp.conversation_id.nunique()

In [None]:
neu_tweet_resp.tweet_id.nunique()/neu_tweet_resp.conversation_id.nunique()

In [None]:
k1 = pd.DataFrame([['Sentimental'] + list(senti_tweet_resp[emotions].mean())], columns = ['News Tweets', emotions[0], emotions[1], emotions[2], emotions[3]])
k2 = pd.DataFrame([['Neutral'] + list(neu_tweet_resp[emotions].mean())], columns = ['News Tweets', emotions[0], emotions[1], emotions[2], emotions[3]])

In [None]:
pd.concat((k1, k2), axis = 0).round(2).to_csv('results/user_resp_emot_means_senti_vs_neutral_tweets.csv')

In [None]:
t_pos_df, t_neg_df = get_pos_neg_mentions(user_resp, 'trump')
b_pos_df, b_neg_df = get_pos_neg_mentions(user_resp, 'biden')

In [None]:
t_pos_df.shape, t_neg_df.shape, b_pos_df.shape, b_neg_df.shape

In [None]:
pos_df = pd.concat((t_pos_df, b_pos_df), axis = 0)
neg_df = pd.concat((t_neg_df, b_neg_df), axis = 0)
pos_df.shape, neg_df.shape

In [None]:
neu_df = user_resp[((user_resp['trump_neu'] > 0.8) | (user_resp['biden_neu'])) > 0.8]
neu_df.shape

In [None]:
senti = ['neg', 'neu', 'pos']
emotions = ['anger', 'sadness', 'joy', 'optimism']
sstat = []
for s in emotions:
#     dist1 = pos_df[s + '_senti'].dropna().sample(1000, random_state = 1)
#     dist2 = neg_df[s + '_senti'].dropna().sample(1000, random_state = 2)
    
    dist1 = pos_df[s].dropna()
    dist2 = neg_df[s].dropna()
    res = get_significance_stats(dist1, dist2, None, s, None)
    sstat.append(res)

stats_df = pd.DataFrame(sstat, columns = ['senti', 'pub', 'emotion', 'Sample size 1', 'sample size 2', 'shapiro_t_dist1', 'shapiro_p_dist2', 'shapiro_t_dist1', 'shapiro_p_dist2', 'KW-p', 'KW-H', 'KW-dof', 'ES'])
stats_df.round(3)

In [None]:
neu_df.tweet_id.nunique()/neu_df.conversation_id.nunique()

In [None]:
pos_df.tweet_id.nunique()/pos_df.conversation_id.nunique()

In [None]:
neg_df.tweet_id.nunique()/neg_df.conversation_id.nunique()

In [None]:
senti_tweet_resp.theme.value_counts()/senti_tweet_resp.shape[0], neu_tweet_resp.theme.value_counts()/neu_tweet_resp.shape[0]

In [None]:
user_resp.columns

In [None]:
pd.set_option('display.max_columns', None)
user_resp.head()

### Analyze likes, replies and retweets

In [None]:
pos_df[['like_count_y', 'quote_count_y', 'reply_count_y', 'retweet_count_y']].mean()

In [None]:
neg_df[['like_count_y', 'quote_count_y', 'reply_count_y', 'retweet_count_y']].mean()

In [None]:
neu_df[['like_count_y', 'quote_count_y', 'reply_count_y', 'retweet_count_y']].mean()

In [None]:
user_resp[['anger', 'joy', 'optimism', 'sadness']].mean()