In [143]:
import os
import random
from itertools import combinations
import numpy as np
import pandas as pd
from scipy.stats import pearsonr, spearmanr

In [88]:
# Replace this location with where the repo is.
repository_location = "/Users/ameliachu/repos/nlu-reddit-toxicity-dataset"

In [89]:
interrater_data_folder = f"{repository_location}/data/interrater-reliability"

In [151]:
os.listdir(interrater_data_folder)

['interrater-ac4119-20220406.csv', 'place-your-interrater-file-here.txt']

In [92]:
# file_criteria = '.csv'
interrater_files = [f for f in os.listdir(interrater_data_folder) if f.endswith('.csv') ]

In [93]:
interrater_info = {}
for file in interrater_files:
    file_type, rater_id, date_rated = file.split('.')[0].split('-')
    data_path = f'{interrater_data_folder}/{file}'
    interrater_info[rater_id] = {
        "data": pd.read_csv(data_path),
        "created_date": date_rated,
        "file_path": data_path
                          }

Shuffling the Rater file order to preserve anonymity

In [94]:
raters = list(interrater_info.keys())
random.shuffle(raters)

In [95]:
interrater_assessments = []
labels = ['toxicity', 'severe_toxicity', 'identity_attack', 'insult', 'profanity', 'threat']
selected_columns = ['example_index','comment_for_evaluation'] + labels

for i, rater_id in enumerate(raters):
    individual_ratings = interrater_info[rater_id]['data'][selected_columns]
    interrater_assessment = pd.melt(individual_ratings,
                                    id_vars=['example_index'], 
                                    value_vars=labels)\
                              .rename(columns={'value':'label'})
    # Using file id instead of rater_id to anonymize results
    interrater_assessment["file_id"] = i
    interrater_assessments.append(interrater_assessment)

In [96]:
interrater_assessments_df = pd.concat(interrater_assessments)

In [105]:
interrater_assessments_df = pd.pivot(interrater_assessments_df, index=['variable', 'example_index'], columns='file_id', values='label').fillna(0).reset_index()

### Calculating Pearson's R for Reliability
https://en.wikipedia.org/wiki/Inter-rater_reliability#Correlation_coefficients

In [119]:
all_rater_combinations = list(combinations(range(num_raters), 2))

In [132]:
reliability = []
for rater_x, rater_y in all_rater_combinations:
    r_coef, p_value = pearsonr(interrater_assessments_df[rater_x].values, interrater_assessments_df[rater_y].values)
    data = {"rater_pair": (rater_x, rater_y), "r_coef": r_coef, "p_value": p_value, "reliability_type":"overall"}
    reliability.append(data)

    for variable in interrater_assessments_df['variable'].unique():
        print(variable)
        rater_x_values = interrater_assessments_df[interrater_assessments_df['variable']==variable][rater_x].values
        rater_y_values = interrater_assessments_df[interrater_assessments_df['variable']==variable][rater_y].values
        var_r_coef, var_p_value = pearsonr(rater_x_values, rater_y_values)
        data = {"rater_pair": (rater_x, rater_y), "r_coef": var_r_coef, "p_value": var_p_value, 
                "reliability_type":f"by_var_{variable}"}
        reliability.append(data)
    
    for example in interrater_assessments_df['example_index'].unique():
        rater_x_values = interrater_assessments_df[interrater_assessments_df['example_index']==example][rater_x].values
        rater_y_values = interrater_assessments_df[interrater_assessments_df['example_index']==example][rater_y].values
        ex_r_coef, ex_p_value = pearsonr(rater_x_values, rater_y_values)
        data = {"rater_pair": (rater_x, rater_y), "r_coef": ex_r_coef, "p_value": ex_p_value, 
                "reliability_type":f"by_example_{example}"}
        reliability.append(data)

identity_attack
insult
profanity
severe_toxicity
threat
toxicity




In [138]:
reliability_df = pd.DataFrame(reliability)[['reliability_type', 'rater_pair', 'r_coef','p_value']]

In [139]:
reliability_df.groupby('reliability_type').agg({'r_coef':np.avg})

Unnamed: 0,reliability_type,rater_pair,r_coef,p_value
0,overall,"(0, 1)",0.493624,1.052852e-05
1,by_var_identity_attack,"(0, 1)",1.0,1.328317e-79
2,by_var_insult,"(0, 1)",0.29277,0.3557557
3,by_var_profanity,"(0, 1)",-0.239046,0.4543014
4,by_var_severe_toxicity,"(0, 1)",1.0,1.328317e-79
5,by_var_threat,"(0, 1)",,
6,by_var_toxicity,"(0, 1)",0.555556,0.06074657
7,by_example_3,"(0, 1)",1.0,0.0
8,by_example_5499,"(0, 1)",,
9,by_example_6509,"(0, 1)",0.632456,0.1778078


In [147]:
### Calculating Spearman's R for Reliability

In [148]:
spearmanr_reliability = []
for rater_x, rater_y in all_rater_combinations:
    r_coef, p_value = spearmanr(interrater_assessments_df[rater_x].values, interrater_assessments_df[rater_y].values)
    data = {"rater_pair": (rater_x, rater_y), "r_coef": r_coef, "p_value": p_value, "spearmanr_reliability_type":"overall"}
    spearmanr_reliability.append(data)

    for variable in interrater_assessments_df['variable'].unique():
        print(variable)
        rater_x_values = interrater_assessments_df[interrater_assessments_df['variable']==variable][rater_x].values
        rater_y_values = interrater_assessments_df[interrater_assessments_df['variable']==variable][rater_y].values
        var_r_coef, var_p_value = spearmanr(rater_x_values, rater_y_values)
        data = {"rater_pair": (rater_x, rater_y), "r_coef": var_r_coef, "p_value": var_p_value, 
                "spearmanr_reliability_type":f"by_var_{variable}"}
        spearmanr_reliability.append(data)
    
    for example in interrater_assessments_df['example_index'].unique():
        rater_x_values = interrater_assessments_df[interrater_assessments_df['example_index']==example][rater_x].values
        rater_y_values = interrater_assessments_df[interrater_assessments_df['example_index']==example][rater_y].values
        ex_r_coef, ex_p_value = spearmanr(rater_x_values, rater_y_values)
        data = {"rater_pair": (rater_x, rater_y), "r_coef": ex_r_coef, "p_value": ex_p_value, 
                "spearmanr_reliability_type":f"by_example_{example}"}
        spearmanr_reliability.append(data)

identity_attack
insult
profanity
severe_toxicity
threat
toxicity




In [149]:
spearmanr_reliability_df = pd.DataFrame(spearmanr_reliability)[['spearmanr_reliability_type', 'rater_pair', 'r_coef','p_value']]

In [150]:
spearmanr_reliability_df 

Unnamed: 0,spearmanr_reliability_type,rater_pair,r_coef,p_value
0,overall,"(0, 1)",0.493624,1.1e-05
1,by_var_identity_attack,"(0, 1)",1.0,0.0
2,by_var_insult,"(0, 1)",0.29277,0.355756
3,by_var_profanity,"(0, 1)",-0.239046,0.454301
4,by_var_severe_toxicity,"(0, 1)",1.0,0.0
5,by_var_threat,"(0, 1)",,
6,by_var_toxicity,"(0, 1)",0.555556,0.060747
7,by_example_3,"(0, 1)",1.0,0.0
8,by_example_5499,"(0, 1)",,
9,by_example_6509,"(0, 1)",0.632456,0.177808
