In [None]:
%load_ext autoreload
%autoreload 2
from utils import *
import os
import glob
from tqdm import tqdm
from datetime import datetime
import pandas as pd
from scipy.stats import spearmanr
import matplotlib.lines as mlines
import matplotlib.ticker as ticker


In [None]:
# To run the analysis on a combined dimension (as in Appendix D of the paper),
# set the following variable to the extra dimension you want to analyze; otherwise set it to None.
COMBINED_DIMENSION = None#'risk'

In [None]:
# Display order of the leaning and reliability classes.
sorted_bias_groups = ['Left', 'Left-Center', 'Least Biased', 'Right-Center', 'Right']
sorted_reliability_groups = ["Non-misinformation", "Misinformation"]
GENRATE_ALL_FIGURES = True

# RGB colour of each leaning class, listed in the same order as sorted_bias_groups.
bias_color_dict = dict(zip(
    sorted_bias_groups,
    [
        (0.1, 0.1, 0.7),     # Left: dark blue, distinct from the lighter blue
        (0.2, 0.4, 0.8),     # Left-Center: lighter blue
        (0.75, 0.75, 0.75),  # Least Biased: medium gray, contrasts with blues and reds
        (0.95, 0.55, 0.55),  # Right-Center: soft red
        (0.8, 0.1, 0.1),     # Right: deep red, distinct from the softer red
    ],
))

# RGB colour of each reliability class, listed in the same order as sorted_reliability_groups.
reliability_color_dict = dict(zip(
    sorted_reliability_groups,
    [
        (0.4, 0.8, 0.4),  # Non-misinformation: light green
        (0.5, 0, 0.25),   # Misinformation: maroon
    ],
))



In [None]:
# Each publisher's tweets are stored as "<twitter_account>.pk" in this directory.
# Only ".pk" entries are considered, so stray files (hidden files, checkpoints, ...)
# do not turn into bogus account names that a later cell would try to unpickle.
all_potential_twitter_accounts = sorted(
    os.path.splitext(file_name)[0]
    for file_name in os.listdir("./data/sorted_tweets_pickles_1_13")
    if file_name.endswith(".pk")
)
len(all_potential_twitter_accounts)

In [None]:
# The publisher/tweet dataframes are cached in ./data/publisher_and_tweets_df.pk to save time.
# Set this flag to True to delete the cache and force the dataframes to be rebuilt.
# The cached publisher columns are computed from COMBINED_DIMENSION and target_liwc_columns,
# so the cache has to be deleted whenever either of them changes.
delete_the_saved_dataframes = True
saved_dataframes_path = "./data/publisher_and_tweets_df.pk"
# Pure-Python removal: portable, and silent when the cache does not exist yet.
if delete_the_saved_dataframes and os.path.exists(saved_dataframes_path):
    os.remove(saved_dataframes_path)

In [None]:
# LIWC scores per tweet. The "Filename" column is renamed to tweet_id, which is the
# key later used to merge these scores with the tweets.
tweets_liwc_results_file = "data/tweet_level_1_13_texts/LIWC_22_combined.csv"
tweets_liwc_results_df = pd.read_csv(tweets_liwc_results_file).rename(columns={"Filename": "tweet_id"})
# all column names are lower-cased
tweets_liwc_results_df.columns = [column_name.lower() for column_name in tweets_liwc_results_df.columns]
tweets_liwc_results_df

In [None]:
# LIWC features to analyze. The first (longer) list is immediately overridden by the
# second one; swap or comment out the lines to change the selection.
target_liwc_columns = ["wc","wps","analytic", "cogproc", "socbehav", "risk", "reward","authentic",  "curiosity", "perception","clout"]
target_liwc_columns = ["analytic", "clout","perception","risk"]
#target_liwc_columns = ["analytic"]

# Median of each selected feature over all tweets in the LIWC file; a tweet is later
# considered "high" in a feature when its score is above this median.
target_liwc_medians = {
    feature_name: tweets_liwc_results_df[feature_name].median()
    for feature_name in target_liwc_columns
}
target_liwc_medians

In [None]:
# Keep only the selected LIWC features plus the tweet_id merge key.
liwc_columns_to_keep = target_liwc_columns + ["tweet_id"]
tweets_liwc_results_df = tweets_liwc_results_df[liwc_columns_to_keep]

In [None]:
# Optional diagnostic: one CDF/CCDF plot per selected LIWC feature (off by default).
PLOT_CDF_CCDFS = False
for tc in (target_liwc_columns if PLOT_CDF_CCDFS else []):
    plot_cdf_ccdf(
        tweets_liwc_results_df[tc],
        xlabel=f"{tc}",
        ylabel="CDF",
        file_name=f"{tc}_cdf_ccdf",
    )

In [None]:
tweets_liwc_results_df

In [None]:
# Build the tweet-level dataframe (tweets_df) and the publisher-level dataframe
# (publishers_df), or load both from the pickle cache when it exists.
# NOTE: the cache is looked up by path only. The publisher columns computed below depend
# on COMBINED_DIMENSION and target_liwc_columns, so delete the cache after changing them.
drop_tweet_text_column = True
if not os.path.exists("./data/publisher_and_tweets_df.pk"):
    column_names = ["tweet_id", "tweet_creation_date", "tweet_text", "like_count", "retweet_count",
                    "quote_count", "reply_count", "bookmark_count", "impression_count", "followers_count"]
    tweets_df_list = []
    for next_twitter_account in tqdm(all_potential_twitter_accounts):
        next_twitter_account_list_of_tweets = un_pickelize("./data/sorted_tweets_pickles_1_13/" + next_twitter_account + ".pk")
        next_twitter_account_df = pd.DataFrame(next_twitter_account_list_of_tweets, columns=column_names)
        if drop_tweet_text_column:
            next_twitter_account_df = next_twitter_account_df.drop(columns=["tweet_text"])
        # tag every tweet with the account it was loaded for
        next_twitter_account_df['twitter_account'] = next_twitter_account
        tweets_df_list.append(next_twitter_account_df)

    tweets_df = pd.concat(tweets_df_list)
    del tweets_df_list

    tweets_df["total_enagement"] = tweets_df["like_count"] + tweets_df["retweet_count"] + tweets_df["quote_count"] + tweets_df["reply_count"]
    # Limit to tweets with views. .copy() makes the filtered frame independent, so the
    # column assignment below is not a write onto a slice of the unfiltered frame.
    tweets_df = tweets_df[tweets_df['impression_count'] > 0].copy()
    tweets_df["engagement_rate"] = tweets_df["total_enagement"] / tweets_df["impression_count"]
    print("Number of tweets with engagement rate > 1 is:",tweets_df[tweets_df["engagement_rate"]>1].shape)
    # NOTE: the strict inequality also drops tweets whose engagement rate is exactly 1.
    tweets_df = tweets_df[tweets_df['engagement_rate'] < 1].copy()

    # Inner join with the LIWC scores: tweets without a LIWC row are dropped.
    tweets_df['tweet_id'] = pd.to_numeric(tweets_df['tweet_id'], errors='coerce')
    tweets_df = tweets_df.merge(tweets_liwc_results_df, on='tweet_id', how='inner')

    COMPUTE_PUBLISHERS_DF = True
    if COMPUTE_PUBLISHERS_DF:
        # Publisher metadata; only accounts with more than 10000 followers are kept.
        publishers_df = pd.read_csv("../Twitter_td/df_final_web.csv")
        publishers_df = publishers_df[['Bias', 'Reliability', 'twitter_account', 'followers_count']]
        publishers_df = publishers_df[publishers_df["followers_count"] > 10000]
        # Renamed so it does not clash with the per-tweet followers_count column.
        publishers_df = publishers_df.rename(columns={'followers_count': 'followers_count_march'})
        tweets_df = tweets_df.merge(publishers_df, on='twitter_account', how='inner')

        # One row per publisher, aggregated over its remaining tweets.
        publishers_df = tweets_df.groupby('twitter_account').agg({'Bias': 'first', 'Reliability': 'first',
                                                                'engagement_rate': 'median',
                                                                'like_count': 'sum',
                                                                'retweet_count': 'sum',
                                                                'quote_count': 'sum',
                                                                'reply_count': 'sum',
                                                                'tweet_id': 'count',
                                                                'followers_count': 'mean', 'followers_count_march': "first",
                                                                'total_enagement': 'sum', 'impression_count': 'sum'}).reset_index()
        # Rename by label rather than by position, so the mapping cannot drift if the
        # aggregation dictionary above is edited.
        publishers_df = publishers_df.rename(columns={
            'tweet_id': 'number_of_tweets',
            'followers_count': 'followers_count_mean',
            'followers_count_march': 'followers_count',
        })
        publishers_df['followers_count_mean'] = publishers_df['followers_count_mean'].astype(int)

        for liwc_feature in target_liwc_columns:
            # A tweet is "high" when it is above the global median of the feature and,
            # if a combined dimension is configured, above that dimension's median too.
            high_mask = tweets_df[liwc_feature] > target_liwc_medians[liwc_feature]
            if COMBINED_DIMENSION is not None:
                high_mask = high_mask & (tweets_df[COMBINED_DIMENSION] > target_liwc_medians[COMBINED_DIMENSION])
            high_tweets_by_account = tweets_df[high_mask].groupby('twitter_account')

            # Percentage of each publisher's tweets that are "high" in this feature.
            temp = high_tweets_by_account.agg({'tweet_id': 'count'})
            temp.columns = [f'high_{liwc_feature}_tweets']
            # Outer join so publishers without any high tweet are kept (with NaN).
            publishers_df = publishers_df.merge(temp, on='twitter_account', how='outer')
            # Assign the result back: fillna(inplace=True) on a column selection is a
            # chained assignment and does not update the frame under Copy-on-Write.
            publishers_df[f'high_{liwc_feature}_tweets'] = publishers_df[f'high_{liwc_feature}_tweets'].fillna(0)
            publishers_df[f'high_{liwc_feature}_percentage'] = publishers_df[f'high_{liwc_feature}_tweets']*100 / publishers_df['number_of_tweets']
            assert(publishers_df[f'high_{liwc_feature}_tweets'].min() >= 0)
            # the raw count was only needed to compute the percentage
            publishers_df = publishers_df.drop(columns=[f'high_{liwc_feature}_tweets'])

            # Median engagement rate of each publisher's "high" tweets
            # (NaN for publishers without any such tweet).
            temp = high_tweets_by_account.agg({'engagement_rate': 'median'})
            temp.columns = [f'high_{liwc_feature}_engagement_rate']
            publishers_df = publishers_df.merge(temp, on='twitter_account', how='outer')

        pickelize((publishers_df,tweets_df), "./data/publisher_and_tweets_df.pk")
else:
    publishers_df,tweets_df = un_pickelize("./data/publisher_and_tweets_df.pk")

In [None]:
# For every selected LIWC feature add a 0/1 column high_<feature> to tweets_df.
# The flag is 1 when the tweet is above the median of the feature and, when
# COMBINED_DIMENSION is set, also above the median of that dimension.
for liwc_feature in target_liwc_columns:
    is_high = tweets_df[liwc_feature] > target_liwc_medians[liwc_feature]
    if COMBINED_DIMENSION is not None:
        is_high = is_high & (tweets_df[COMBINED_DIMENSION] > target_liwc_medians[COMBINED_DIMENSION])
    tweets_df[f'high_{liwc_feature}'] = is_high.astype(int)

In [None]:
tweets_df.shape

In [None]:
tweets_df

In [None]:
import pandas as pd
import numpy as np
from scipy import stats

# Assuming tweets_df, publishers_df, target_liwc_columns, and target_liwc_medians are already defined

# Disabled alternative implementation of the per-publisher "percentile shift" metric.
# NOT_NEEDED is True, so nothing inside the `if` runs.
# NOTE(review): the body reads `temp`, which is not assigned anywhere above this cell in
# the visible notebook (it is assigned in a later cell), so enabling this block on a
# fresh kernel would raise NameError — confirm before re-enabling.
NOT_NEEDED = True
if not NOT_NEEDED:
    unique_twitter_accounts = publishers_df["twitter_account"].unique()
    # engagement-rate Series of every account, keyed by account name
    engagement_rates = {account: temp[temp["twitter_account"] == account]["engagement_rate_tweets"] for account in unique_twitter_accounts}

    # Percentiles at which the high-feature distribution is compared with the overall one
    candidate_percentiles = [10, 20, 30, 40, 50, 60, 70, 80, 90]

    # For one publisher and one LIWC feature: average, over candidate_percentiles, of
    # (percentile rank within all of the publisher's tweets of the p-th percentile of its
    # high-feature tweets) minus p. Returns NaN when the publisher has no high-feature tweet.
    def calculate_shift_average(publisher, liwc_feature):
        original_engagement_rates = engagement_rates[publisher]
        high_analytic_engagement_rates = temp[(temp["twitter_account"] == publisher) & (temp[liwc_feature] > target_liwc_medians[liwc_feature])]["engagement_rate_tweets"]
        if high_analytic_engagement_rates.shape[0] == 0:
            return np.nan
        
        # NOTE(review): this relies on the builtin `sum`; the visible notebook rebinds the
        # name `sum` to a number in a later cell, which would break this call afterwards.
        sum_percentiles = sum(
            stats.percentileofscore(original_engagement_rates, np.percentile(high_analytic_engagement_rates, per)) - per
            for per in candidate_percentiles
        )
        return sum_percentiles / len(candidate_percentiles)

    # One all_percentiles_shift_average_<feature> column per selected LIWC feature
    for liwc_feature in target_liwc_columns:
        publishers_df["all_percentiles_shift_average_" + liwc_feature] = publishers_df["twitter_account"].apply(lambda x: calculate_shift_average(x, liwc_feature))

In [None]:
tweets_df.shape

In [None]:
# Per-publisher "percentile shift" of high-feature tweets.
# For each publisher, take the p-th percentile of the engagement rate of its tweets
# that are above the global median of the feature, look up the percentile rank of
# that value among ALL of the publisher's tweets, and subtract p. The result is
# averaged over candidate_percentiles and stored in
# publishers_df["all_percentiles_shift_average_<feature>"]
# (NaN for publishers without any high-feature tweet).
# Only the first entry of target_liwc_columns is processed, and COMBINED_DIMENSION
# is not taken into account here.
COMPUTE_AVG = False
if not COMPUTE_AVG:
    # tweets joined with their publisher row; shared column names get a suffix
    temp = pd.merge(tweets_df, publishers_df, on='twitter_account', suffixes=('_tweets', '_publishers'))
    # Only the median is compared; use [10, 20, ..., 90] to average over more percentiles.
    candidate_percentiles = [50]
    for liwc_feature in target_liwc_columns[:1]:
        shift_column = "all_percentiles_shift_average_" + liwc_feature
        publishers_df[shift_column] = np.nan
        # Group once instead of re-filtering the whole frame for every publisher.
        for publisher, publisher_tweets in tqdm(temp.groupby("twitter_account")):
            original_engagement_rates = publisher_tweets["engagement_rate_tweets"]
            high_analytic_engagement_rates = publisher_tweets.loc[
                publisher_tweets[liwc_feature] > target_liwc_medians[liwc_feature],
                "engagement_rate_tweets",
            ]
            # publishers without high-feature tweets keep NaN
            if high_analytic_engagement_rates.shape[0] == 0:
                continue
            # Accumulator deliberately NOT named `sum`, which would shadow the builtin
            # for every cell executed afterwards.
            shift_total = 0
            for candidate_per in candidate_percentiles:
                # value of this percentile among the high-feature tweets
                percentile = np.percentile(high_analytic_engagement_rates, candidate_per)
                # percentile rank of that value among all tweets of the publisher
                percentile_of_score = stats.percentileofscore(original_engagement_rates, percentile)
                shift_total += (percentile_of_score - candidate_per)
            average_shift = shift_total / len(candidate_percentiles)
            publishers_df.loc[publishers_df["twitter_account"] == publisher, shift_column] = average_shift
    

In [None]:
# Number of tweets with a missing score, per selected LIWC feature.
for liwc_feature in target_liwc_columns:
    missing_count = tweets_df[liwc_feature].isna().sum()
    print(f"{liwc_feature} nan count:", missing_count)

In [None]:
import numpy as np
import pandas as pd

# Assuming publishers_df and tweets_df are already defined

# Placeholder columns, added before the merge so merged_df carries them as well.
for target_liwc in target_liwc_columns:
    publishers_df[f"high_{target_liwc}_engagement_rate_median_percentile"] = -1000

# Tweets joined with their publisher row; column names present in both frames get
# the suffixes _tweets / _publishers.
merged_df = pd.merge(tweets_df, publishers_df, on='twitter_account', suffixes=('_tweets', '_publishers'))

for target_liwc in target_liwc_columns:
    def percentile_calc(group):
        """Percentile rank (minus 50) of the publisher's high-feature median engagement
        rate within the engagement rates of all of the publisher's tweets.

        Returns NaN when the publisher has no high-feature median. Without this guard
        np.searchsorted places NaN after every value, which reports a shift of +50.
        """
        sentiment_median = group[f"high_{target_liwc}_engagement_rate"].iloc[0]
        if pd.isna(sentiment_median):
            return np.nan
        values = np.sort(group['engagement_rate_tweets'].values)
        return (np.searchsorted(values, sentiment_median) * 100 / len(group))-50

    def effect_calc(group):
        """Mann-Whitney U comparison of the engagement rates of the publisher's tweets at
        or above vs. below the publisher's own median of the current feature.

        Note that the split uses the per-publisher median with >=, unlike the global
        median with > used for the high_<feature> columns.
        """
        # Use the feature of the current iteration; reading the global `liwc_feature`
        # here would pick up whatever value an earlier cell's loop left behind.
        values = group[target_liwc].values
        values = values[~np.isnan(values)]
        this_median = np.median(values)

        high_values = group[group[target_liwc] >= this_median]['engagement_rate_tweets'].values
        low_values = group[group[target_liwc] < this_median]['engagement_rate_tweets'].values
        # drop the nan values
        high_values = high_values[~np.isnan(high_values)]
        low_values = low_values[~np.isnan(low_values)]
        # NOTE(review): `mannwhitneyu` is not imported in the visible cells; presumably it
        # comes from `from utils import *` — confirm.
        return mannwhitneyu(high_values, low_values)

    # Apply both functions per publisher and map the results back to publishers_df
    # (aligned on twitter_account through the index).
    tweets_by_account = merged_df.groupby('twitter_account')
    percentile_series = tweets_by_account.apply(percentile_calc)
    effect_series = tweets_by_account.apply(effect_calc)
    publishers_df = publishers_df.set_index('twitter_account')
    publishers_df[f"high_{target_liwc}_engagement_rate_median_percentile"] = percentile_series
    publishers_df[f"high_{target_liwc}_engagement_rate_effect"] = effect_series
    publishers_df = publishers_df.reset_index()

    # Same percentile shift for each interaction type (disabled).
    # NOTE(review): this needs per-interaction columns such as
    # high_<feature>_like_engagement_rate and like_engagement_rate, whose computation is
    # commented out in the dataframe-building cell — confirm before enabling.
    COMPUTE_FOR_SUB_INTERACTIONS = False
    if COMPUTE_FOR_SUB_INTERACTIONS:
        for next_column in ["like", "retweet", "reply","quote"]:
            # Separate name so the function above is not redefined inside the loop.
            def sub_percentile_calc(group):
                sentiment_median = group[f"high_{target_liwc}_{next_column}_engagement_rate"].iloc[0]
                if pd.isna(sentiment_median):
                    return np.nan
                values = np.sort(group[f'{next_column}_engagement_rate_tweets'].values)
                return (np.searchsorted(values, sentiment_median) * 100 / len(group))-50

            percentile_series = merged_df.groupby('twitter_account').apply(sub_percentile_calc)
            publishers_df = publishers_df.set_index('twitter_account')
            publishers_df[f"high_{target_liwc}_{next_column}_engagement_rate_median_percentile"] = percentile_series
            publishers_df = publishers_df.reset_index()

# Motivation Example

In [None]:
# Relabel the "Reliability" column: Reliable -> "Non-misinformation", Unreliable -> "Misinformation".
publishers_df['Reliability'] = publishers_df['Reliability'].replace(
    {'Reliable': "Non-misinformation", 'Unreliable': "Misinformation"}
)

In [None]:
# The ten misinformation publishers with the most followers.
is_misinformation = publishers_df['Reliability'] == 'Misinformation'
misinformation_publishers = publishers_df[is_misinformation]
columns_to_show = ['twitter_account', 'followers_count','Bias']
misinformation_publishers.sort_values(by=['followers_count'], ascending=False)[columns_to_show].head(10)

In [None]:
publishers_df.columns

In [None]:
# Spot check for the nytimes account and the analytic feature:
# 1) the stored median-percentile shift of the publisher
tem = publishers_df[publishers_df['twitter_account'] == "nytimes"]
print(tem["high_analytic_engagement_rate_median_percentile"])
# 2) median engagement rate over all of its tweets
nytimes_tweets = tweets_df[(tweets_df['twitter_account'] == "nytimes")]
print(nytimes_tweets['engagement_rate'].median())
# 3) median engagement rate over its tweets above the global analytic median
temp2 = nytimes_tweets[(nytimes_tweets['analytic'] > target_liwc_medians['analytic'])]
print(temp2['engagement_rate'].median())

In [None]:
tweets_df.shape

In [None]:
# Motivating figure: empirical CDF of the engagement rate for the accounts nytimes and
# foxnews, once over all tweets and once over the tweets above the global median of
# target_liwc. The figure is saved to ./figs/cdfs/cdf_motivating_<target_liwc>.pdf.
accounts = ["nytimes","foxnews"]
values = []
names = []
target_liwc = "analytic"
for account in accounts:
    # all tweets of the account
    values.append(tweets_df[tweets_df['twitter_account'] == account]['engagement_rate'].values)
    names.append(f"@{account}")
    # only the tweets above the global median of target_liwc; the "_high_" marker in the
    # name is what the plotting loop below uses to pick the dashed style
    values.append(tweets_df[(tweets_df['twitter_account'] == account) & (tweets_df[target_liwc]> target_liwc_medians[target_liwc] )]['engagement_rate'].values)
    names.append(f"@{account}" + f"_high_{target_liwc}")
# values/names now hold, per account, the "all tweets" entry followed by the "high" entry
import matplotlib.pyplot as plt

plt.rcParams['font.family'] = 'Helvetica'
plt.rcParams['font.sans-serif'] = 'Helvetica'
plt.rcParams['pdf.fonttype'] = 42
#plt.rcParams.update({'font.size': 24})
plt.figure(figsize=(10, 5))

# First colour of bias_color_dict for both nytimes curves, last colour for both foxnews curves
bias_cclors = list(bias_color_dict.values())
colors = [bias_cclors[0],bias_cclors[0],bias_cclors[-1],bias_cclors[-1]]
i = 0
for rates, name in zip(values, names):
    # Empirical CDF: the k-th smallest rate gets the value k / n
    sorted_rates = np.sort(rates)
    cdf = np.arange(1, len(sorted_rates) + 1) / len(sorted_rates)

    if "_high_" not in name:
        # solid line for all tweets of the account
        plt.plot(sorted_rates, cdf, label=name +" (all tweets)", color=colors[i], linewidth=3)
        # debug output: final CDF value, smallest rate, largest rate and the ten largest rates
        print(cdf[-1])
        print(sorted_rates[0],sorted_rates[-1],sorted_rates[-10:])
        i += 1
    else:
        # dashed line for the high-feature tweets; the label keeps only the text before the
        # first "_" of the name (the "@account" part for these two accounts)
        plt.plot(sorted_rates, cdf, label=name.split("_")[0]+f" ({target_liwc}al tweets)", color=colors[i], linewidth=3, linestyle='--')
        i+=1

# (disabled) horizontal reference line at CDF = 0.5
#plt.axhline(y=0.5, color='black',linestyle='--' ,alpha=0.8, linewidth=2)

plt.xlabel("Engagement Rate")

#plt.xscale("log")
# hard-coded x range for this figure
plt.xlim(0.0003,0.0035)
plt.ylabel("CDF")
plt.legend(fontsize=21,loc='lower right')
plt.grid(True)
# NOTE(review): this font-size update runs after the axis labels and the legend above
# were created — confirm it has the intended effect on this figure.
plt.rcParams.update({'font.size': 22})

# limit y to 0 to 1
plt.ylim(0,1.00)

# Thin out the x ticks: keep every second tick and drop the tick at zero
ax = plt.gca()
ticks = ax.get_xticks()
# drop one out of two among the ticks
ticks = ticks[::2]
ticks = ticks[ticks != 0]  # Remove zero from the list of ticks
ax.set_xticks(ticks)

# Arrow with hard-coded coordinates: from (0.0012, 0.5) to (0.0012, 0.55)
plt.annotate('', xy=(0.0012, 0.55), xytext=(0.0012, 0.5), arrowprops=dict(facecolor='black', shrink=0.01))

plt.savefig(f"./figs/cdfs/cdf_motivating_{target_liwc}.pdf", dpi=300,  bbox_inches='tight')
plt.show()

In [None]:
# Shape (rows, columns) of the tweets of nytimes and of foxnews.
for handle in ("nytimes", "foxnews"):
    print(tweets_df[tweets_df['twitter_account'] == handle].shape)

In [None]:
# Follower count stored in publishers_df for nytimes and for foxnews.
for handle in ("nytimes", "foxnews"):
    print(publishers_df[publishers_df['twitter_account'] == handle]['followers_count'].values)

In [None]:
# Median engagement rate of the nytimes tweets above the global median of target_liwc.
is_nytimes = tweets_df['twitter_account'] == "nytimes"
is_high_feature = tweets_df[target_liwc] > target_liwc_medians[target_liwc]
print(tweets_df[is_nytimes & is_high_feature]['engagement_rate'].median())

In [None]:
# Median engagement rate over all nytimes tweets.
nytimes_engagement_rates = tweets_df[tweets_df['twitter_account'] == "nytimes"]['engagement_rate']
print(nytimes_engagement_rates.median())

In [None]:
# Misinformation publishers whose analytic median-percentile shift is below 1,
# showing the ten with the most followers.
is_misinformation = publishers_df['Reliability'] == "Misinformation"
has_shift_below_one = publishers_df["high_analytic_engagement_rate_median_percentile"] < 1
tem = publishers_df[is_misinformation & has_shift_below_one]
columns_to_show = ['twitter_account', 'followers_count','Bias']
tem.sort_values(by=['followers_count'], ascending=False)[columns_to_show].head(10)

In [None]:
# Largest number of tweets of a single account, and the first account reaching it.
twitter_account_counts = tweets_df['twitter_account'].value_counts()
largest_tweet_count = max(twitter_account_counts)
accounts_with_largest_count = twitter_account_counts[twitter_account_counts == largest_tweet_count]
print("Largest number of tweets: ", largest_tweet_count)
print("Twitter account with the largest number of tweets: ", accounts_with_largest_count.index[0])

# Table 1

In [None]:
# Table 1 source: totals per (Bias, Reliability) group of publishers.
summed_columns = ['followers_count', 'number_of_tweets', 'like_count', 'retweet_count',
                  'quote_count', 'reply_count', 'total_enagement', 'impression_count']
group_aggregations = {column_name: 'sum' for column_name in summed_columns}
# counting any per-publisher column gives the number of publishers (outlets) in the group
group_aggregations['followers_count_mean'] = 'count'

grouped = (
    publishers_df
    .groupby(['Bias', 'Reliability'])
    .agg(group_aggregations)
    .reset_index()
    .rename(columns={'followers_count_mean': 'outlets'})
)

# pooled engagement rate of the group: total interactions over total impressions
grouped['engagement_rate'] = grouped['total_enagement'] / grouped['impression_count']

# Order the rows by the predefined reliability and leaning orders.
grouped['Bias'] = pd.Categorical(grouped['Bias'], sorted_bias_groups)
grouped['Reliability'] = pd.Categorical(grouped['Reliability'], sorted_reliability_groups)
grouped = grouped.sort_values(by=['Reliability', 'Bias'])

In [None]:
grouped

In [None]:
publishers_df.shape

In [None]:
# Intensity Calculation
import pyperclip
def calculate_intensity(value, max_value, min_value):
    """Map ``value`` to an integer intensity relative to the range [min_value, max_value].

    This single definition replaces two consecutive definitions of the same name,
    of which only the second was ever in effect.

    Parameters:
    value (float): The value to convert.
    max_value (float): Upper end of the range.
    min_value (float): Lower end of the range.

    Returns:
    int: ``int(100 * position)``, where ``position`` is the relative position of
    ``value`` between ``min_value`` and ``max_value``. For ``value > 1`` the position
    is measured on a logarithmic scale (all three numbers are first clamped to at
    least 1); otherwise it is measured on a linear scale. 0 is returned when the
    range is empty.
    """
    if value > 1:
        # Clamp to 1 to avoid the logarithm of zero or of negative values
        value, max_value, min_value = max(1, value), max(1, max_value), max(1, min_value)
        log_value, log_max, log_min = np.log(value), np.log(max_value), np.log(min_value)
        if log_max == log_min:
            return 0
        return int(100 * (log_value - log_min) / (log_max - log_min))
    # linear scale for values of at most 1
    if max_value == min_value:
        return 0
    return int(100 * (value - min_value) / (max_value - min_value))
    


# Build the LaTeX source of the dataset-statistics table from `grouped`:
# one row per leaning class, one row per reliability class and a total row.

# Columns of `grouped` shown in the table, in display order.
table_stat_columns = ['outlets', 'followers_count', 'number_of_tweets', 'like_count','retweet_count','reply_count','quote_count', 'total_enagement','impression_count','engagement_rate']


def _latex_stat_cells(group_rows):
    """Return the ' & value' cells of one table row for the given rows of `grouped`."""
    cells = ""
    for col in table_stat_columns:
        if col == 'outlets':
            cells += f" & {group_rows[col].sum()}"
        elif col == "engagement_rate":
            # pooled rate of the class: total interactions over total impressions
            pooled_rate = group_rows["total_enagement"].sum()/group_rows["impression_count"].sum()
            cells += f" & {y_fmt(pooled_rate, decimals=3)}"
        else:
            cells += f" & {y_fmt(group_rows[col].sum())}"
    return cells


# Table preamble and header row
latex_table = """
\\begin{table*}[htbp]
  \\centering
    \\caption{Statistics of the dataset categorized by political leaning and reliability type.}
  \\begin{tabular}{l|l|rrrrrrrrrrr}
    \\toprule
    & Class & outlets & Followers & Tweets & Likes& Retweets& Replies& Quotes& Interactions & Impressions& Eng. rate \\\\
    \\midrule
"""

# Leaning classes
latex_table += "    \\multirow{5}{*}{\\rotatebox[origin=c]{90}{Leaning}}"
for bias in sorted_bias_groups:
    bias_data = grouped[grouped['Bias'] == bias]
    latex_table += f" & {bias}"
    latex_table += _latex_stat_cells(bias_data)
    latex_table += " \\\\\n"

# Reliability classes (with shortened names)
latex_table += "    \\midrule\n"
latex_table += "    \\multirow{2}{*}{\\rotatebox[origin=c]{90}{Rel.}}"
for reliability in sorted_reliability_groups:
    reliability_data = grouped[grouped['Reliability'] == reliability]
    reliability_renamed = reliability.replace("Misinformation","Misinfo.").replace("Non-misinformation","Non-misinfo.")
    latex_table += f" & {reliability_renamed}"
    latex_table += _latex_stat_cells(reliability_data)
    latex_table += " \\\\\n"

# Total row. The summed engagement_rate entry is not used; the rate is recomputed
# from the total interactions and impressions.
total_row = grouped[table_stat_columns].sum()
latex_table += "    \\hline\n"
latex_table += "    \\multicolumn{2}{l|}{Total}"
for col in total_row.index:
    if col == 'outlets':
        latex_table += f" & {int(total_row[col])}"
    elif col == "engagement_rate":
        latex_table += f"& {y_fmt(total_row['total_enagement']/total_row['impression_count'], decimals=3)}"
    else:
        latex_table += f" & {y_fmt(total_row[col])}"
latex_table += " \\\\\n"

latex_table += "\\bottomrule\n"
latex_table += "\\end{tabular}\n"
latex_table += "\\label{tab:dataset_statistics}\n"
latex_table += "\\end{table*}\n"

# Show the table first, so it is available even when no clipboard is present
# (e.g. on a headless machine), then try to copy it to the clipboard.
print(latex_table)
try:
    pyperclip.copy(latex_table)
except pyperclip.PyperclipException as clipboard_error:
    print(f"Could not copy the table to the clipboard: {clipboard_error}")


In [None]:
# Mean and median of the engagement rate over all tweets.
tweet_engagement_rates = tweets_df['engagement_rate']
print("Mean of the engagement rate: ", np.mean(tweet_engagement_rates))
print("Median of the engagement rate: ", np.median(tweet_engagement_rates))

In [None]:
publishers_df.shape

In [None]:
#inport empirical_cdf
from scipy.stats import spearmanr
import matplotlib.ticker as ticker
from matplotlib import pyplot as plt

In [None]:
def plot_box_plots(group_values,groups_names,group_colors=None,y_log_scale=True,save_file_name=None,ylabel=None,show_whiskers=True,add_horizontal_line_at=None,show_means=True,rotate_x_ticks=None,varname=None,legened_loc='lower center'):
    """Draw one box plot per group and save the figure as a PDF.

    The figure is written to ./figs/box_plots/box_plot_<save_file_name>.pdf.

    Parameters:
    group_values (sequence of array-like): One collection of values per box.
    groups_names (list of str): Tick label of each box; "Non-misinformation" and
        "Misinformation" are shortened to "Non-misinfo." and "Misinfo.".
    group_colors (sequence, optional): Face colour of each box.
    y_log_scale (bool): Use a logarithmic y axis.
    save_file_name (str, optional): Suffix of the output file name.
    ylabel (str, optional): Label of the y axis; "Engagement Rate" when None.
    show_whiskers (bool): When True, whiskers span the 20th-80th percentiles and
        outliers are hidden; when False, whiskers are set to the 25th-75th
        percentiles and the points outside them are drawn.
    add_horizontal_line_at (float, optional): y position of a dashed reference line.
    show_means (bool): Mark the mean of each box and add it to the legend.
    rotate_x_ticks (float, optional): Rotation of the x tick labels in degrees.
    varname: Unused.
    legened_loc (str): Legend location.

    Returns:
    None
    """
    from matplotlib import pyplot as plt

    # shorten the reliability class names used as tick labels
    groups_names = [x.replace("Non-misinformation","Non-misinfo.").replace("Misinformation","Misinfo.") for x in groups_names]

    plt.rcParams['font.family'] = 'Helvetica'
    plt.rcParams['font.sans-serif'] = 'Helvetica'
    plt.rcParams['pdf.fonttype'] = 42
    plt.rcParams.update({'font.size': 24})

    # x position of every box; an extra gap is inserted after the 5th and after the 7th box
    box_plot_positions = []
    group_spacing = .8
    width_of_box = 0.5
    latest_position = 0
    for i in range(len(group_values)):
        box_plot_positions.append(latest_position)
        latest_position += group_spacing
        if i == 4 or i == 6:
            latest_position += group_spacing-.2
    print(box_plot_positions)

    fig, ax = plt.subplots(figsize=(10, 5))

    if show_whiskers == False:
        # just show the interquartile range
        box_plots = ax.boxplot(group_values, widths=width_of_box, positions=box_plot_positions, patch_artist=True, showfliers=True, notch=False, meanline=False, showmeans=show_means, whis=[25, 75])
    else:
        box_plots = ax.boxplot(group_values, widths=width_of_box, positions=box_plot_positions, patch_artist=True, showfliers=False, whis=[20, 80], notch=False, meanline=False, showmeans=show_means)

    # apply the colors to the box plots
    if group_colors is not None:
        for patch, color in zip(box_plots['boxes'], group_colors):
            patch.set_facecolor(color)
    highlight_color = "aquamarine"

    for median in box_plots['medians']:
        median.set_color(highlight_color)
        median.set_markerfacecolor(highlight_color)
        median.set_linewidth(3)
        median.set_markersize(8)
    if show_means:
        for mean in box_plots['means']:
            mean.set_color(highlight_color)
            mean.set_marker('o')
            mean.set_markerfacecolor(highlight_color)
            mean.set_markersize(8)
    ax.set_xticklabels(groups_names)

    # Legend handles. The mean handle is created outside the loop over the mean
    # markers, so it also exists when there are no boxes.
    median_patch = mlines.Line2D([], [], color=highlight_color, linestyle='-', markersize=10, label='Median')
    if show_means:
        mean_patch = mlines.Line2D([], [], color=highlight_color, marker='o', linestyle='None', markersize=8, label='Mean')

    # scalar tick labels with a math-text exponent on the y axis
    ax.yaxis.set_major_formatter(ticker.ScalarFormatter(useMathText=True, useOffset=False))
    ax.yaxis.offsetText.set_fontsize(16)  # Adjust the font size of the exponent
    ax.yaxis.get_offset_text().set_position((0.1, 0.15))  # Adjust the position of the exponent

    if show_means:
        ax.legend(handles=[mean_patch, median_patch], labels=['Mean', 'Median'],fontsize='20',loc=legened_loc,ncol=2)
    else:
        ax.legend(handles=[median_patch], labels=['Median'],fontsize='20',loc=legened_loc)
    print("legend loc: ", legened_loc)
    if add_horizontal_line_at is not None:
        ax.axhline(y=add_horizontal_line_at, color='black', linestyle='--',linewidth=2)
    if ylabel is None:
        ax.set_ylabel("Engagement Rate")
    else:
        ax.set_ylabel(ylabel)
        ax.yaxis.label.set_size(24)
    # rotate the x axis labels
    if rotate_x_ticks is not None:
        plt.xticks(rotation=rotate_x_ticks, ha='right')
    plt.xticks(fontsize=24)
    # move the x tick labels away from the axis
    ax.tick_params(axis='x', which='major', pad=15)
    if y_log_scale:
        plt.yscale("log")
    # longer minor tick marks on the y axis
    ax.tick_params(axis='y', which='minor', length=5)
    plt.savefig(f"./figs/box_plots/box_plot_{save_file_name}.pdf", dpi=300,  bbox_inches='tight')

In [None]:
from scipy.stats import wilcoxon

def wilcoxon_signed_rank_test(data, constant=0):
    """
    Run two one-sided Wilcoxon signed-rank tests of a sample against a constant.

    Parameters:
    data (array-like): The distribution of numbers (sample data).
    constant (float): The constant value to compare the distribution against.

    Returns:
    p_value_less (float): p-value of the test on ``data - constant`` with
        ``alternative='less'``.
    p_value_greater (float): p-value of the test on ``data - constant`` with
        ``alternative='greater'``.
    median (float): ``np.median(data)`` -- the median of the raw sample,
        not of the differences.
    """
    # The signed-rank test compares against zero, so shift the sample by the constant first.
    differences = np.asarray(data) - constant

    # Only the p-values are reported; the test statistics are deliberately discarded.
    _, p_value_less = wilcoxon(differences, alternative='less')
    _, p_value_greater = wilcoxon(differences, alternative='greater')

    return p_value_less, p_value_greater, np.median(data)

In [None]:
# Box plots of the per-publisher `high_<feature>_engagement_rate_median_percentile`
# column: one figure per LIWC feature, with one box per bias group, one per
# reliability group, and a final box for all publishers together.
median_shift_labels = sorted_bias_groups + sorted_reliability_groups + ["All"]
median_shift_colors = (
    list(bias_color_dict.values()) + list(reliability_color_dict.values()) + ["white"]
)
# Legend corner per feature; any feature not listed here uses 'lower right'.
median_shift_legend_locs = {"analytic": "lower left", "perception": "upper right"}
# (publisher column, group value) pairs, in the same order as the labels above.
median_shift_groups = (
    [("Bias", bias_name) for bias_name in sorted_bias_groups]
    + [("Reliability", reliability_name) for reliability_name in sorted_reliability_groups]
)

for column in target_liwc_columns:
    print(column)
    legened_loc = median_shift_legend_locs.get(column, "lower right")
    column1 = f"high_{column}_engagement_rate_median_percentile"
    # The output file name carries COMBINED_DIMENSION whenever one is set.
    if COMBINED_DIMENSION is None:
        save_file_name = f"{column}_engagement_rate_median_percentile"
    else:
        save_file_name = f"{column}_{COMBINED_DIMENSION}_engagement_rate_median_percentile"

    # One array of non-NaN values per group, followed by one for all publishers.
    median_percentile_values = []
    for group_column, group_value in median_shift_groups:
        temp = publishers_df[publishers_df[group_column] == group_value]
        a = temp[column1].values
        median_percentile_values.append(a[~np.isnan(a)])
    a = publishers_df[column1].values
    a = a[~np.isnan(a)]
    median_percentile_values.append(a)

    plot_box_plots(
        median_percentile_values,
        median_shift_labels,
        y_log_scale=False,
        group_colors=median_shift_colors,
        save_file_name=save_file_name,
        ylabel="Median Percentiles Shift",
        add_horizontal_line_at=0,
        show_means=False,
        show_whiskers=True,
        rotate_x_ticks=45,
        varname=column,
        legened_loc=legened_loc,
    )

    # Print each group's median next to its label.
    for group, group_values in zip(median_shift_labels, median_percentile_values):
        print(group, np.median(group_values))

    print("------------------------")

    # Optional significance checks, left disabled as in the original cell:
    #for group, group_values in zip(median_shift_labels, median_percentile_values):
    #    print(group, wilcoxon_signed_rank_test(group_values, constant=0))
    #print("------------------------")
    #run_kruskal_wallis(median_percentile_values, median_shift_labels)
    
    
    

In [None]:
# Box plot of the per-publisher `all_percentiles_shift_average_<feature>` column
# for the first LIWC feature only: one box per bias group, one per reliability
# group, and a final box for all publishers together.
avg_shift_labels = sorted_bias_groups + sorted_reliability_groups + ["All"]
avg_shift_colors = (
    list(bias_color_dict.values()) + list(reliability_color_dict.values()) + ["white"]
)
# Legend corner per feature; any feature not listed here uses 'lower right'.
avg_shift_legend_locs = {"analytic": "lower left", "perception": "upper right"}
# (publisher column, group value) pairs, in the same order as the labels above.
avg_shift_groups = (
    [("Bias", bias_name) for bias_name in sorted_bias_groups]
    + [("Reliability", reliability_name) for reliability_name in sorted_reliability_groups]
)

for column in target_liwc_columns[:1]:
    print(column)
    legened_loc = avg_shift_legend_locs.get(column, "lower right")
    column1 = f"all_percentiles_shift_average_{column}"

    # One array of non-NaN values per group, followed by one for all publishers.
    # (The list keeps its historical name even though it holds average shifts here.)
    median_percentile_values = []
    for group_column, group_value in avg_shift_groups:
        temp = publishers_df[publishers_df[group_column] == group_value]
        a = temp[column1].values
        median_percentile_values.append(a[~np.isnan(a)])
    a = publishers_df[column1].values
    a = a[~np.isnan(a)]
    median_percentile_values.append(a)

    plot_box_plots(
        median_percentile_values,
        avg_shift_labels,
        y_log_scale=False,
        group_colors=avg_shift_colors,
        save_file_name=f"{column}_engagement_rate_percentile_deviations_mean",
        ylabel="Average Percentiles Shift",
        add_horizontal_line_at=0,
        show_means=False,
        show_whiskers=True,
        rotate_x_ticks=45,
        varname=column,
        legened_loc=legened_loc,
    )

    # Print each group's median next to its label.
    for group, group_values in zip(avg_shift_labels, median_percentile_values):
        print(group, np.median(group_values))

    print("------------------------")

    # Optional significance checks, left disabled as in the original cell:
    #for group, group_values in zip(avg_shift_labels, median_percentile_values):
    #    print(group, wilcoxon_signed_rank_test(group_values, constant=0))
    #print("------------------------")
    #run_kruskal_wallis(median_percentile_values, avg_shift_labels)
    
    
    

# Average Percentiles shift

In [None]:
# Box plot of the per-publisher `all_percentiles_shift_average_<feature>` column
# for the first LIWC feature only, using plot_box_plots' default legend location.
# NOTE(review): this passes the same save_file_name as the preceding cell, so
# running both presumably leaves only this cell's figure on disk -- confirm
# whether both cells are still needed.
default_legend_labels = sorted_bias_groups + sorted_reliability_groups + ["All"]
default_legend_colors = (
    list(bias_color_dict.values()) + list(reliability_color_dict.values()) + ["white"]
)
# (publisher column, group value) pairs, in the same order as the labels above.
default_legend_groups = (
    [("Bias", bias_name) for bias_name in sorted_bias_groups]
    + [("Reliability", reliability_name) for reliability_name in sorted_reliability_groups]
)

for column in target_liwc_columns[:1]:
    print(column)
    column1 = f"all_percentiles_shift_average_{column}"

    # One array of non-NaN values per group, followed by one for all publishers.
    # `temp`, `column1` and `median_percentile_values` keep their names because
    # the inspection cells further down read them from the kernel.
    median_percentile_values = []
    for group_column, group_value in default_legend_groups:
        temp = publishers_df[publishers_df[group_column] == group_value]
        a = temp[column1].values
        median_percentile_values.append(a[~np.isnan(a)])
    a = publishers_df[column1].values
    a = a[~np.isnan(a)]
    median_percentile_values.append(a)

    plot_box_plots(
        median_percentile_values,
        default_legend_labels,
        y_log_scale=False,
        group_colors=default_legend_colors,
        save_file_name=f"{column}_engagement_rate_percentile_deviations_mean",
        ylabel="Average Percentiles Shift",
        add_horizontal_line_at=0,
        show_means=False,
        show_whiskers=True,
        rotate_x_ticks=45,
        varname=column,
    )

    # Print each group's median next to its label.
    for group, group_values in zip(default_legend_labels, median_percentile_values):
        print(group, np.median(group_values))

    print("------------------------")

    # Optional significance checks, left disabled as in the original cell:
    #for group, group_values in zip(default_legend_labels, median_percentile_values):
    #    print(group, wilcoxon_signed_rank_test(group_values, constant=0))
    #print("------------------------")
    #run_kruskal_wallis(median_percentile_values, default_legend_labels)
    
    
    

In [None]:
# Spot check: median of the first group's values (index 0 is the first entry of
# sorted_bias_groups). Relies on `median_percentile_values` left in the kernel
# by whichever plotting cell ran last.
np.median(median_percentile_values[0])

In [None]:
# Spot check: is the analysed column present in the last group subset?
# Relies on `column1` and `temp` left in the kernel by a previously run cell.
column1 in temp.columns

In [None]:
# Display the feature names that the plotting cells in this section iterate over.
target_liwc_columns

In [None]:
# Prevalence of "high <feature>" tweets per publisher, compared across bias and
# reliability groups: one box plot per LIWC feature, the group medians, and a
# Kruskal-Wallis test over the same groups.
cdf_labels = sorted_bias_groups + sorted_reliability_groups
prevalence_colors = list(bias_color_dict.values()) + list(reliability_color_dict.values())
# Legend location per feature; any feature not listed here uses 'lower center'.
prevalence_legend_locs = {
    "analytic": "lower left",
    "perception": "lower left",
    "clout": "upper center",
}
# (publisher column, group value) pairs, in the same order as cdf_labels.
prevalence_groups = (
    [("Bias", bias_name) for bias_name in sorted_bias_groups]
    + [("Reliability", reliability_name) for reliability_name in sorted_reliability_groups]
)

for liwc_feature in target_liwc_columns:

    print(liwc_feature)
    percentage_column = f"high_{liwc_feature}_percentage"

    percentage_values = []
    for group_column, group_value in prevalence_groups:
        temp = publishers_df[publishers_df[group_column] == group_value]
        # Rows holding -1000 are excluded (the original comment calls these "the infinities").
        temp = temp[temp[percentage_column] != -1000]
        # Drop NaN as well: NaN passes the `!= -1000` filter and a single NaN
        # would turn the group's np.median below into NaN.
        temp = temp[temp[percentage_column].notna()]
        percentage_values.append(temp[percentage_column].values)

    legened_loc = prevalence_legend_locs.get(liwc_feature, "lower center")

    plot_box_plots(
        percentage_values,
        cdf_labels,
        y_log_scale=False,
        group_colors=prevalence_colors,
        save_file_name=f"high_{liwc_feature}_percentage",
        ylabel="Percentage",
        show_means=True,
        show_whiskers=True,
        rotate_x_ticks=45,
        varname=liwc_feature,
        legened_loc=legened_loc,
    )

    # Print each group's median next to its label.
    for group, group_values in zip(cdf_labels, percentage_values):
        print(group, np.median(group_values))

    print("------------------------")

    run_kruskal_wallis(percentage_values, cdf_labels)