# Labelling experiment results

## Data preperation

In [1]:
import os
import pandas as pd
import pandas as pd
from scipy.stats import spearmanr
import numpy as np

data_folder = 'data/article_ratings/'
pd.set_option('display.max_rows', 5)

df_total = pd.DataFrame()
for file in os.listdir(data_folder):
    df = pd.read_csv(data_folder + file)
    df["name"] = file.split("_")[0]
    print(f"Number of reviewed articles by {file.split('_')[0]}:    {df.shape[0]}")
    df_total = df_total.append(df)
    


df_art = pd.read_csv("data/articles_topics.csv", index_col = 0)



df_total.rename(columns={"sentiment": "manual_sentiment"}, inplace=True)


df_art = df_art.merge(df_total[["paragraphs", "manual_sentiment", "outcome"]], on="paragraphs", how="left")

df_art["manual_sentiment"] = df_art.manual_sentiment.map({1: "POSITIVE", -1: "NEGATIVE", 0: "NEUTRAL", np.nan: np.nan})

Number of reviewed articles by Ahmed:    28
Number of reviewed articles by Ale:    28
Number of reviewed articles by Artemis:    28
Number of reviewed articles by Bart:    28
Number of reviewed articles by David:    28
Number of reviewed articles by Sophie:    28


  df_total = df_total.append(df)
  df_total = df_total.append(df)
  df_total = df_total.append(df)
  df_total = df_total.append(df)
  df_total = df_total.append(df)
  df_total = df_total.append(df)


# Results

### Manual sentiment vs. manual outcome

The p-value shows if there is a association between the two variables. a value lower than 0.05 shows that we reject the $H_0$ hypothesis

In [16]:

df = df_art[df_art["manual_sentiment"].notna()]

# Calculate the observed statistic (Spearman's rank correlation)
observed_statistic, _ = spearmanr(df['manual_sentiment'], df['outcome'])

# Specify the number of permutations (e.g., 1000)
num_permutations = 1000

# Create an array to store the permuted statistics
permuted_statistics = np.empty(num_permutations)

# Permutation test
for i in range(num_permutations):
    shuffled_outcome = np.random.permutation(df['outcome'])
    permuted_statistic, _ = spearmanr(df['manual_sentiment'], shuffled_outcome)
    permuted_statistics[i] = permuted_statistic

# Calculate the p-value by comparing the observed statistic to the permuted statistics
p_value = (np.abs(permuted_statistics) >= np.abs(observed_statistic)).mean()

print("Observed Spearman's rank correlation:", observed_statistic)
print("Permutation p-value:", p_value)


Observed Spearman's rank correlation: 0.7128900699037357
Permutation p-value: 0.0


### Manual sentiment vs. sentiment from RoBERTa model

In [17]:

df = df_art[df_art["manual_sentiment"].notna()]

observed_statistic, _ = spearmanr(df['manual_sentiment'], df['sentiment_roberta'])

# Specify the number of permutations (e.g., 1000)
num_permutations = 1000

# Create an array to store the permuted statistics
permuted_statistics = np.empty(num_permutations)

# Permutation test
for i in range(num_permutations):
    shuffled_outcome = np.random.permutation(df['sentiment_roberta'])
    permuted_statistic, _ = spearmanr(df['manual_sentiment'], shuffled_outcome)
    permuted_statistics[i] = permuted_statistic

# Calculate the p-value by comparing the observed statistic to the permuted statistics
p_value = (np.abs(permuted_statistics) >= np.abs(observed_statistic)).mean()

print("Observed Spearman's rank correlation:", observed_statistic)
print("Permutation p-value:", p_value)

Observed Spearman's rank correlation: 0.4033412964650235
Permutation p-value: 0.0


### Sentiment RoBERTa model vs. Manual outcome

In [18]:

df = df_art[df_art["outcome"].notna()]

# Create a contingency table
observed_statistic, _ = spearmanr(df['sentiment_roberta'], df['outcome'])

# Specify the number of permutations (e.g., 1000)
num_permutations = 1000

# Create an array to store the permuted statistics
permuted_statistics = np.empty(num_permutations)

# Permutation test
for i in range(num_permutations):
    shuffled_outcome = np.random.permutation(df['outcome'])
    permuted_statistic, _ = spearmanr(df['sentiment_roberta'], shuffled_outcome)
    permuted_statistics[i] = permuted_statistic

# Calculate the p-value by comparing the observed statistic to the permuted statistics
p_value = (np.abs(permuted_statistics) >= np.abs(observed_statistic)).mean()

print("Observed Spearman's rank correlation:", observed_statistic)
print("Permutation p-value:", p_value)

Observed Spearman's rank correlation: 0.4144275961878863
Permutation p-value: 0.0
