In [79]:
## prequisites
#%pip install pandas
#%pip install numpy
#%pip install scipy
#%pip install sklearn

## libraries
from IPython.display import display, HTML
import os
import os.path
import random
import re
import pandas as pd
import numpy as np
from scipy.stats import kendalltau, mannwhitneyu
from scipy.stats.mstats import chisquare
from sklearn.metrics import cohen_kappa_score

## project structure
DATA_DIR = "/data/projects/capturingBias/research/framing/data/"  # change to "./" for current directory
CROWD_RESULTS = DATA_DIR + "120CSexperimentCrowdResults.csv"
EXPERT_RESULTS = DATA_DIR + "ExpertAnnotationsEpisodicVSThematic.tsv"
CROWD_FILTERS = DATA_DIR + "crowd_data_filtered_worker_ip_and_gender_and_type_and_title.csv"

## load files
crowd_results = pd.read_csv(CROWD_RESULTS, delimiter=';')
expert_results = pd.read_csv(EXPERT_RESULTS, delimiter='\t')
crowd_filters = pd.read_csv(CROWD_FILTERS)

## filter crowd
good_raters = np.unique(crowd_filters['_worker_id'].values)
crowd_results_filtered = crowd_results[crowd_results['_worker_id'].isin(good_raters)]

In [80]:
annotators = pd.unique(expert_results['annotator'])

def interexperts(label, level):
    expert_corr = {'annotator' : annotators}
    expert_corr.update({annotator: list() for annotator in annotators})
    expert_dist = {'annotator' : annotators}
    expert_dist.update({annotator: list() for annotator in annotators})
    for annotator_a in annotators:
        data_a = expert_results[expert_results['annotator'] == annotator_a].copy()
        data_a.sort_values(by='display_id', inplace=True)
        data_a = data_a[label].values
        for annotator_b in annotators:
            if annotator_a == annotator_b:
                expert_corr[annotator_a].append(np.nan)
                expert_dist[annotator_a].append(np.nan)
                continue

            data_b = expert_results[expert_results['annotator'] == annotator_b].copy()
            data_b.sort_values(by='display_id', inplace=True)
            data_b = data_b[label].values

            if level == "ordinal":
                expert_corr[annotator_a].append(kendalltau(data_a, data_b)[0])
                expert_dist[annotator_a].append(mannwhitneyu(data_a, data_b)[1])
            elif level == "nominal":
                expert_corr[annotator_a].append(cohen_kappa_score(data_a, data_b))
                data_a_freq = [np.count_nonzero(data_a == 'Episodic'), 
                               np.count_nonzero(data_a == 'Thematic')]
                data_b_freq = [np.count_nonzero(data_b == 'Episodic'), 
                               np.count_nonzero(data_b == 'Thematic')]
                expert_dist[annotator_a].append(chisquare(data_a_freq, data_b_freq)[1])

    print("="*10+" Correlation "+10*"=")
    corr = pd.DataFrame(expert_corr)
    display(corr)

    print("="*10+" H0: same distribution "+10*"=")
    dist = pd.DataFrame(expert_dist)
    display(dist)

In [81]:
interexperts('Dominant Frame', level='nominal')



Unnamed: 0,annotator,Mykola,Honorata,Antoaneta
0,Mykola,,0.279503,0.34012
1,Honorata,0.279503,,0.617964
2,Antoaneta,0.34012,0.617964,




Unnamed: 0,annotator,Mykola,Honorata,Antoaneta
0,Mykola,,1.0,0.179562
1,Honorata,1.0,,0.179562
2,Antoaneta,0.188898,0.188898,


In [82]:
interexperts('Framing score (1: Thematic, 7: Episodic)', level='ordinal')



Unnamed: 0,annotator,Mykola,Honorata,Antoaneta
0,Mykola,,0.308426,0.345582
1,Honorata,0.308426,,0.672277
2,Antoaneta,0.345582,0.672277,




Unnamed: 0,annotator,Mykola,Honorata,Antoaneta
0,Mykola,,0.165965,0.328057
1,Honorata,0.165965,,0.080297
2,Antoaneta,0.328057,0.080297,
