In [105]:
# --- Run configuration -------------------------------------------------------
# SCHOOL:         school identifier; its lower-cased form prefixes the file names.
# method:         scoring strategy handed to score_computer_generator
#                 ("mean_avg", "mean_diff" or "diff_avg").
# sentiment_type: which upstream dataset flavour to read.
SCHOOL = "AU"
method = "mean_diff"
sentiment_type = "granularity"

# Summarizer datasets are read from data/3; every other sentiment type is read
# from data/2/nltk_sia.
if sentiment_type == "summarizer":
    sentiment_path_portion = "3"
else:
    sentiment_path_portion = "2/nltk_sia"

# Input dataset for this school / sentiment type.
data_path = f"../bias_processing/data/{sentiment_path_portion}/{SCHOOL.lower()}_dataset_{sentiment_type}.poli.csv"
# Destination for the computed bias row (one file per school and method).
output_path = f"../bias_processing/data/4/{SCHOOL.lower()}_bias_{method}.poli.csv"

In [106]:
from statistics import mean

def score_computer_generator(method=method):
    """Build a scoring function bound to ``method``.

    The returned callable takes two iterables of numbers (article-level and
    sentence-level sentiment scores) and reduces them to a single value:

    * ``"mean_avg"``  - average of the two means.
    * ``"mean_diff"`` - article mean minus sentence mean.
    * ``"diff_avg"``  - mean of the pairwise differences (article - sentence);
      pairs are formed with ``zip``, so extra items in the longer input are
      ignored.

    Any other ``method`` makes the returned callable raise ``ValueError`` when
    it is invoked (not when this factory is called). ``statistics.mean`` raises
    ``StatisticsError`` if it is given no data points.
    """
    def score_computer(article_sentiment_scores, sentence_sentiment_scores):
        # Pairwise strategy: difference first, then a single mean.
        if method == "diff_avg":
            pairwise = [
                article_score - sentence_score
                for article_score, sentence_score in zip(
                    article_sentiment_scores, sentence_sentiment_scores
                )
            ]
            return mean(pairwise)

        # Anything that is not one of the two mean-based strategies is invalid.
        if method != "mean_avg" and method != "mean_diff":
            raise ValueError("Put in a valid method")

        # Both remaining strategies start from the per-series means.
        article_mean = mean(article_sentiment_scores)
        sentence_mean = mean(sentence_sentiment_scores)
        if method == "mean_avg":
            return (article_mean + sentence_mean) / 2
        return article_mean - sentence_mean

    return score_computer


# Scorer used by the bias computation below, bound to the configured method.
scorer = score_computer_generator(method=method)

In [107]:
"""
Given a csv from one of the Sentiment_Calculator notebooks, calculate bias by doing the following:
Group by topic, and consider that 1 series
Compute variance between series of sentiment for entire article granularity and sentence-granularity for each of the labels (Positive, Negative, Neutral) leaving you with 3 values
Average these values (add functionality to take max as well) and assume this to be 'bias'
Store the bias for a given school as a csv with the columns of School_Name, Israel_Bias, Palestine_Bias, India_Bias, China_Bias
Save a new csv with this entry, or load in a csv of past schools with these columns and add this row to the bottom
"""

"\nGiven a csv from one of the Sentiment_Calculator notebooks, calculate bias by doing the following:\nGroup by topic, and consider that 1 series\nCompute variance between series of sentiment for entire article granularity and sentence-granularity for each of the labels (Positive, Negative, Neutral) leaving you with 3 values\nAverage these values (add functionality to take max as well) and assume this to be 'bias'\nStore the bias for a given school as a csv with the columns of School_Name, Israel_Bias, Palestine_Bias, India_Bias, China_Bias\nSave a new csv with this entry, or load in a csv of past schools with these columns and add this row to the bottom\n"

In [108]:
"""
In this version, the bias is calculated as the average of the variances of the sentiment scores 
within each granularity (article and sentence).
"""
import pandas as pd
import os

# Read the data from the CSV file
df = pd.read_csv(data_path)

# Define the keywords and sentiments to be processed
keywords = ['Democrat', 'Repulican', 'conservative', 'liberal', 'club']
sentiments = ['pos', 'neg', 'neu']

# Initialize a dictionary to store the bias for each keyword and sentiment
result_dict = {f'{keyword}_{sentiment}_Bias': [] for keyword in keywords for sentiment in sentiments}

# Loop over each keyword and sentiment
for keyword in keywords:
    for sentiment in sentiments:
        # Extract the sentiment scores for the keyword from the dataframe
        article_sentiment_scores = df.loc[df['keyword'] == keyword, f'article_{sentiment}']
        sentence_sentiment_scores = df.loc[df['keyword'] == keyword, f'sentence_{sentiment}']

        result_dict[f'{keyword}_{sentiment}_Bias'] = scorer(article_sentiment_scores, sentence_sentiment_scores)

# Add the school name to the results dictionary
result_dict['School_Name'] = SCHOOL

# Convert the results dictionary to a DataFrame
result_df = pd.DataFrame(result_dict, index=[0])

# If the output file already exists, load the existing data and append the new data only if it's not duplicate
if os.path.exists(output_path):
    existing_df = pd.read_csv(output_path)
    if not existing_df.equals(result_df):
        result_df.to_csv(output_path, mode='a', header=False, index=False)
# If it doesn't exist, create a new file
else:
    result_df.to_csv(output_path, index=False)

StatisticsError: mean requires at least one data point

In [None]:
# """
# In this version, the bias is calculated as the average of the variances of the sentiment scores
# within each granularity (article, paragraph, and sentence).
# """

# import pandas as pd
# import numpy as np

# # Define the name of the school and the file paths
# SCHOOL = "McGill"
# data_path = f"bias_processing/data/2/nltk_sia/{SCHOOL.lower()}_dataset_granularity.csv"
# output_path = f"bias_processing/data/4/{SCHOOL.lower()}_bias_granularity_v1.csv"

# # Read the data from the CSV file
# df = pd.read_csv(data_path)

# # Define the keywords and sentiments to be processed
# keywords = ['Israel', 'Palestine', 'India', 'China']
# sentiments = ['pos', 'neg', 'neu']

# # Initialize a dictionary to store the average variance for each keyword and sentiment
# result_dict = {}

# # Loop over each keyword
# for keyword in keywords:
#     # Filter the dataframe by the keyword
#     keyword_df = df[df['keyword'] == keyword]
#     # Loop over each sentiment
#     for sentiment in sentiments:
#         # Initialize a list to store the variance for each granularity
#         variances = []
#         # Loop over each granularity
#         for granularity in ['article', 'paragraph', 'sentence']:
#             # Calculate the variance of the sentiment scores for the granularity and append it to the list
#             variances.append(keyword_df[f'{granularity}_{sentiment}'].var())
#         # Calculate the mean of the variances and store it in the results dictionary
#         result_dict[f'{keyword}_Bias_{sentiment}'] = np.mean(variances)

# # Convert the results dictionary to a DataFrame
# result_df = pd.DataFrame.from_dict(result_dict, orient='index', columns=['Average Variance'])

# # Transpose the DataFrame to match the desired output
# result_df = result_df.transpose()

# # Add the school name to the DataFrame
# result_df['School_Name'] = SCHOOL

# # Write the DataFrame to the CSV file
# result_df.to_csv(output_path)

In [None]:
# """
# In this version, the bias is calculated as the average of the variances of the sentiment scores 
# within each granularity (Full Article, Article Summary, and Paragraph Summary).
# """

# import pandas as pd
# import numpy as np

# # Define the name of the school for which the data will be processed
# SCHOOL = "McGill"

# # Define the file paths for the input and output CSV files
# data_path = f"bias_processing/data/3/{SCHOOL.lower()}_dataset_summarizer.csv"
# output_path = f"bias_processing/data/4/{SCHOOL.lower()}_bias.summarizer_v1.csv"

# # Load the data from the CSV file
# df = pd.read_csv(data_path)

# # Calculate the variance of 'Full Article', 'Article Summary', and 'Paragraph Summary' 
# # for each 'Type' and 'School' across all dates. Reset the index so that 
# # 'School' and 'Type' become columns instead of indices.
# df_var = df.groupby(['School', 'Type']).agg(
#     full_article_var=pd.NamedAgg(column='Full Article', aggfunc=np.var),
#     article_summary_var=pd.NamedAgg(column='Article Summary', aggfunc=np.var),
#     paragraph_summary_var=pd.NamedAgg(column='Paragraph Summary', aggfunc=np.var)
# ).reset_index()

# # Reshape the df_var DataFrame to long format: each variance column becomes a row,
# # with the column name stored in 'Granularity' and its value in 'Variance'
# df_pivot = df_var.melt(id_vars=['School', 'Type'], var_name='Granularity', value_name='Variance')

# # Group the reshaped DataFrame by 'School', 'Type' and 'Granularity', then calculate 
# # the mean of 'Variance' for each group. This gives the average variance (bias) for 
# # each sentiment type for the specified school. Reset the index so that 
# # 'School', 'Type' and 'Granularity' become columns instead of indices.
# df_avg = df_pivot.groupby(['School', 'Type', 'Granularity']).agg({'Variance': 'mean'}).reset_index()

# # Rename the columns to reflect their corresponding bias and replace the 'School' 
# # column name with 'School_Name'
# df_avg.columns = [f'{i}_Bias' if i != 'School' else i for i in df_avg.columns]
# df_avg = df_avg.rename(columns={'School': 'School_Name'}).replace({'School_Name': {SCHOOL: SCHOOL}})

# # Save the resulting DataFrame with the average variance (bias) as a new CSV file
# df_avg.to_csv(output_path, index=False)

In [None]:
# import pandas as pd
# import numpy as np

# output_path = f"bias_processing/data/4/{SCHOOL.lower()}_bias.summarizer_v2.csv"

# # Reshape the DataFrame to long format, turning the granularity columns into 
# # individual rows and their values as the 'Score'
# df_melt = df.melt(id_vars=['Date', 'School', 'Keyword', 'Type'], 
#                   var_name='Granularity', 
#                   value_name='Score')

# # Group the reshaped DataFrame by 'School', 'Type' and 'Granularity', then calculate 
# # the variance ('np.var') of 'Score' for each group. Reset the index so that 
# # 'School', 'Type' and 'Granularity' become columns instead of indices.
# df_var = df_melt.groupby(['School', 'Type', 'Granularity'])\
#                 .agg({'Score': np.var})\
#                 .reset_index()

# # Group the variance DataFrame by 'Granularity', calculate the mean of the 'Score' 
# # for each group (which will give us the average variance for each granularity), 
# # and reset the index.
# df_avg_var = df_var.groupby('Granularity')\
#                    .agg({'Score': 'mean'})\
#                    .reset_index()

# # Rename the columns of the average variance DataFrame to more accurately 
# # reflect what they represent: 'Granularity' and 'Average_Variance'
# df_avg_var.columns = ['Granularity', 'Average_Variance']

# # Print the final DataFrame, which shows the average variance for each granularity
# print(df_avg_var)

# df_avg_var.to_csv(output_path, index=False)
