In [99]:
import pandas as pd
import os
import random
from scipy.stats import pearsonr
from itertools import combinations

In [88]:
# Location of the local checkout of the repository.
# Set the NLU_REDDIT_TOXICITY_REPO environment variable to point at your own
# clone; when it is unset, the original hard-coded path is used as a fallback
# so existing setups keep working unchanged.
repository_location = os.environ.get(
    "NLU_REDDIT_TOXICITY_REPO",
    "/Users/ameliachu/repos/nlu-reddit-toxicity-dataset",
)

In [89]:
# Folder (inside the repository checkout) that holds the per-rater files.
interrater_data_folder = "{}/data/interrater-reliability".format(repository_location)

In [141]:
# List every entry in the inter-rater data folder (displayed as the cell output).
os.listdir(interrater_data_folder)

['interrater-ac4119-20220406.csv',
 'place-your-interrater-file-here.txt',
 'interrater-gm2858-20220406.csv']

In [92]:
# Keep only the entries whose name ends in '.csv'; anything else found in the
# folder is skipped.
interrater_files = list(
    filter(lambda name: name.endswith('.csv'), os.listdir(interrater_data_folder))
)

In [93]:
# Build a lookup keyed by rater id. Each entry holds that rater's ratings
# (loaded from CSV), the date taken from the file name, and the file path.
# The part of the file name before the first '.' must split on '-' into
# exactly three pieces: <file_type>-<rater_id>-<date_rated>.
interrater_info = {}
for file in interrater_files:
    stem = file.split('.')[0]
    file_type, rater_id, date_rated = stem.split('-')
    data_path = f'{interrater_data_folder}/{file}'
    interrater_info[rater_id] = dict(
        data=pd.read_csv(data_path),
        created_date=date_rated,
        file_path=data_path,
    )

In [94]:
# Collect the rater ids and randomise their order in place. A rater's position
# in this list is what the later melt loop stores as `file_id`.
raters = list(interrater_info)
random.shuffle(raters)

In [95]:
# Rating columns that are reshaped into long format below.
labels = ['toxicity', 'severe_toxicity', 'identity_attack', 'insult', 'profanity', 'threat']
selected_columns = ['example_index', 'comment_for_evaluation'] + labels

# One long-format frame per rater: a row per (example_index, label column),
# with the rating stored under 'label'.
interrater_assessments = []
for i, rater_id in enumerate(raters):
    individual_ratings = interrater_info[rater_id]['data'][selected_columns]
    interrater_assessment = (
        individual_ratings
        .melt(id_vars=['example_index'], value_vars=labels)
        .rename(columns={'value': 'label'})
        # Using file id instead of rater_id to anonymize results
        .assign(file_id=i)
    )
    interrater_assessments.append(interrater_assessment)

In [96]:
# Stack the per-rater long-format frames into one frame. The original row
# indices are kept (no ignore_index), so index values repeat across raters.
interrater_assessments_df = pd.concat(interrater_assessments)

In [97]:
# Display the combined long-format ratings as the cell output.
interrater_assessments_df

Unnamed: 0,example_index,variable,label,file_id
0,115811,toxicity,1.0,0
1,6509,toxicity,0.0,0
2,3,toxicity,0.0,0
3,49308,toxicity,1.0,0
4,201032,toxicity,0.0,0
...,...,...,...,...
49,288908,threat,0.0,1
50,157648,threat,0.0,1
51,71214,threat,,1
52,62874,threat,0.0,1


In [105]:
# Reshape to wide format: one row per (variable, example_index) and one column
# per file_id holding that rater's label.
# NOTE(review): missing labels are replaced with 0 before any correlation is
# computed - confirm that treating "not rated" as a 0 rating is intended.
# NOTE(review): this cell overwrites its own input, so it cannot be re-run on
# its own without first re-running the concat cell.
interrater_assessments_df = (
    interrater_assessments_df
    .pivot(index=['variable', 'example_index'], columns='file_id', values='label')
    .fillna(0)
    .reset_index()
)

In [119]:
# Every unordered pair of anonymised rater ids. The ids are the positions
# handed out by enumerate(raters) (stored as file_id), so they run from 0 to
# len(raters) - 1. `num_raters` is defined here because no other cell defines
# it, which made this cell fail on a fresh kernel.
num_raters = len(raters)
all_rater_combinations = list(combinations(range(num_raters), 2))

In [132]:
def _pearson_record(ratings, rater_x, rater_y, reliability_type):
    """Correlate two raters' labels over the rows of ``ratings``.

    Args:
        ratings: Wide-format frame with one column per rater id.
        rater_x: Column key of the first rater.
        rater_y: Column key of the second rater.
        reliability_type: Tag stored with the result to say which slice of the
            data was correlated.

    Returns:
        dict with the keys "rater_pair", "r_coef", "p_value" and
        "reliability_type".
    """
    # When either rater's labels are constant the correlation is undefined;
    # scipy then returns NaN for both values, which is recorded as-is.
    r_coef, p_value = pearsonr(ratings[rater_x].values, ratings[rater_y].values)
    return {"rater_pair": (rater_x, rater_y), "r_coef": r_coef, "p_value": p_value,
            "reliability_type": reliability_type}


# For every rater pair, record Pearson's r over all rows, then per label
# variable, then per example.
reliability = []
for rater_x, rater_y in all_rater_combinations:
    reliability.append(
        _pearson_record(interrater_assessments_df, rater_x, rater_y, "overall")
    )

    # sort=False keeps groups in order of first appearance, matching the order
    # that iterating over .unique() produced.
    for variable, variable_ratings in interrater_assessments_df.groupby('variable', sort=False):
        reliability.append(
            _pearson_record(variable_ratings, rater_x, rater_y, f"by_var_{variable}")
        )

    for example, example_ratings in interrater_assessments_df.groupby('example_index', sort=False):
        reliability.append(
            _pearson_record(example_ratings, rater_x, rater_y, f"by_example_{example}")
        )

identity_attack
insult
profanity
severe_toxicity
threat
toxicity




In [138]:
# Turn the list of result records into a table with a fixed column order.
report_columns = ['reliability_type', 'rater_pair', 'r_coef', 'p_value']
reliability_df = pd.DataFrame(reliability).loc[:, report_columns]

In [139]:
# Display the reliability table: one row per rater pair and reliability type.
reliability_df

Unnamed: 0,reliability_type,rater_pair,r_coef,p_value
0,overall,"(0, 1)",0.493624,1.052852e-05
1,by_var_identity_attack,"(0, 1)",1.0,1.328317e-79
2,by_var_insult,"(0, 1)",0.29277,0.3557557
3,by_var_profanity,"(0, 1)",-0.239046,0.4543014
4,by_var_severe_toxicity,"(0, 1)",1.0,1.328317e-79
5,by_var_threat,"(0, 1)",,
6,by_var_toxicity,"(0, 1)",0.555556,0.06074657
7,by_example_3,"(0, 1)",1.0,0.0
8,by_example_5499,"(0, 1)",,
9,by_example_6509,"(0, 1)",0.632456,0.1778078
