In [1]:
import os
import pandas as pd
import statsmodels.stats.inter_rater as irr

In [2]:
repo_dir = "/Users/ameliachu/repos/nlu-reddit-toxicity-dataset"
data_dir = f"{repo_dir}/data"

In [3]:
interr_dir  = f"{data_dir}/interrater-reliability/"

In [4]:
interr_fnames = [f for f in os.listdir(interr_dir) if f.startswith("interrater2")]

In [5]:
labels = ['toxicity', 'severe_toxicity', 'identity_attack', 'insult', 'profanity', 'threat']

selected_columns = ['example_index'] + labels # example_id in master_data

In [6]:
list_of_interr_data = []
for fname in interr_fnames:
    rater_data = pd.read_csv(f"{interr_dir}{fname}")[selected_columns]
    rater_data['rater_id'] = fname.split("-")[1]
    list_of_interr_data.append(rater_data)

In [7]:
interrater_data = pd.concat(list_of_interr_data)

rater_ids = list(interrater_data['rater_id'].unique())
print(rater_ids )

['ac4119', 'yp2201', 'yj2369', 'gm2858']


In [8]:
interrater_data = interrater_data.melt(id_vars=['rater_id','example_index'], value_vars=labels)

interrater_data = pd.pivot_table(interrater_data, index=['example_index', 'variable'], 
               columns = 'rater_id',
               values='value').reset_index()

In [9]:
interrater_array = interrater_data[rater_ids].to_numpy()
interrater_fleiss_kappa_data, (n_rows, n_cat) =  irr.aggregate_raters(interrater_array)

fleiss_kappa_values = {'overall': irr.fleiss_kappa(interrater_fleiss_kappa_data)}

In [16]:
for label in labels:
    interrater_data_by_label = interrater_data[interrater_data['variable'] == label]
    interrater_array = interrater_data_by_label[rater_ids].to_numpy()
    interrater_fleiss_kappa_data, n_info =  irr.aggregate_raters(interrater_array)
    fleiss_kappa_values[label]= irr.fleiss_kappa(interrater_fleiss_kappa_data)

  kappa = (p_mean - p_mean_exp) / (1- p_mean_exp)


In [17]:
fleiss_kappa_values

{'overall': 0.6999395039322442,
 'toxicity': 0.4666666666666667,
 'severe_toxicity': 0.11111111111111072,
 'identity_attack': -0.09090909090908979,
 'insult': 0.8222222222222222,
 'profanity': -0.09090909090908979,
 'threat': nan}

In [15]:
irr.aggregate_raters(interrater_data[interrater_data['variable'] == label][rater_ids].to_numpy())

(array([[4],
        [4],
        [4],
        [4],
        [4],
        [4]]),
 array([0]))

In [None]:
irr.aggregate_raters(interrater_array)
irr.fleiss_kappa

In [None]:
interrater_fleiss_kappa_data, (n_rows, n_cat) =  irr.aggregate_raters(interrater_array)

In [None]:
# https://www.statsmodels.org/dev/generated/statsmodels.stats.inter_rater.fleiss_kappa.html
irr.fleiss_kappa(interrater_fleiss_kappa_data)