In [39]:
import os
import random
from itertools import combinations
import numpy as np
import pandas as pd
from scipy.stats import pearsonr, spearmanr

In [40]:
# Replace this location with where the repo is.
repository_location = "/Users/ameliachu/repos/nlu-reddit-toxicity-dataset"

In [41]:
interrater_data_folder = f"{repository_location}/data/interrater-reliability"

In [42]:
os.listdir(interrater_data_folder)

['.DS_Store',
 'Interrater-yj2369-20220407.csv',
 'Interrater-yp2201-20220407.csv',
 'interrater-ac4119-20220406.csv',
 'place-your-interrater-file-here.txt',
 'interrater-gm2858-20220407.csv']

In [43]:
# file_criteria = '.csv'
interrater_files = [f for f in os.listdir(interrater_data_folder) if f.endswith('.csv') ]

In [44]:
interrater_info = {}
for file in interrater_files:
    file_type, rater_id, date_rated = file.split('.')[0].split('-')
    data_path = f'{interrater_data_folder}/{file}'
    interrater_info[rater_id] = {
        "data": pd.read_csv(data_path),
        "created_date": date_rated,
        "file_path": data_path
                          }

Shuffling the Rater file order to preserve anonymity

In [45]:
raters = list(interrater_info.keys())
random.shuffle(raters)

In [46]:
interrater_assessments = []
labels = ['toxicity', 'severe_toxicity', 'identity_attack', 'insult', 'profanity', 'threat']
selected_columns = ['example_index','comment_for_evaluation'] + labels

for i, rater_id in enumerate(raters):
    individual_ratings = interrater_info[rater_id]['data'][selected_columns]
    interrater_assessment = pd.melt(individual_ratings,
                                    id_vars=['example_index'], 
                                    value_vars=labels)\
                              .rename(columns={'value':'label'})
    # Using file id instead of rater_id to anonymize results
    interrater_assessment["file_id"] = i
    interrater_assessments.append(interrater_assessment)

In [47]:
interrater_assessments_df = pd.concat(interrater_assessments)

In [58]:
interrater_assessments_df[interrater_assessments_df['example_index']==157648]

file_id,variable,example_index,0,1,2,3
7,identity_attack,157648,0,0,0,0
16,insult,157648,0,0,0,0
25,profanity,157648,0,0,0,0
34,severe_toxicity,157648,0,0,0,0
43,threat,157648,0,0,0,0
52,toxicity,157648,0,0,0,0


In [49]:
interrater_assessments_df = pd.pivot(interrater_assessments_df, index=['variable', 'example_index'], columns='file_id', values='label').fillna(0).reset_index()

### Calculating Pearson's R for Reliability
https://en.wikipedia.org/wiki/Inter-rater_reliability#Correlation_coefficients

In [50]:
num_raters = len(raters)
all_rater_combinations = list(combinations(range(num_raters), 2))

In [51]:
reliability = []
for rater_x, rater_y in all_rater_combinations:
    r_coef, p_value = pearsonr(interrater_assessments_df[rater_x].values, interrater_assessments_df[rater_y].values)
    data = {"rater_pair": (rater_x, rater_y), "r_coef": r_coef, "p_value": p_value, "reliability_type":"overall"}
    reliability.append(data)

    for variable in interrater_assessments_df['variable'].unique():
#         print(variable)
        rater_x_values = interrater_assessments_df[interrater_assessments_df['variable']==variable][rater_x].values
        rater_y_values = interrater_assessments_df[interrater_assessments_df['variable']==variable][rater_y].values
        var_r_coef, var_p_value = pearsonr(rater_x_values, rater_y_values)
        data = {"rater_pair": (rater_x, rater_y), "r_coef": var_r_coef, "p_value": var_p_value, 
                "reliability_type":f"by_var_{variable}"}
        reliability.append(data)
    
    for example in interrater_assessments_df['example_index'].unique():
        rater_x_values = interrater_assessments_df[interrater_assessments_df['example_index']==example][rater_x].values
        rater_y_values = interrater_assessments_df[interrater_assessments_df['example_index']==example][rater_y].values
        ex_r_coef, ex_p_value = pearsonr(rater_x_values, rater_y_values)
        data = {"rater_pair": (rater_x, rater_y), "r_coef": ex_r_coef, "p_value": ex_p_value, 
                "reliability_type":f"by_example_{example}"}
        reliability.append(data)



In [52]:
reliability_df = pd.DataFrame(reliability)[['reliability_type', 'rater_pair', 'r_coef','p_value']]

In [53]:
reliability_df 

Unnamed: 0,reliability_type,rater_pair,r_coef,p_value
0,overall,"(0, 1)",0.209165,0.129034
1,by_var_identity_attack,"(0, 1)",-0.125000,0.748645
2,by_var_insult,"(0, 1)",-0.100000,0.797972
3,by_var_profanity,"(0, 1)",0.316228,0.407084
4,by_var_severe_toxicity,"(0, 1)",,
...,...,...,...,...
91,by_example_62874,"(2, 3)",0.447214,0.373901
92,by_example_71214,"(2, 3)",0.632456,0.177808
93,by_example_115811,"(2, 3)",0.707107,0.116117
94,by_example_157648,"(2, 3)",,


In [54]:
reliability_df.groupby('reliability_type').agg({'r_coef':np.mean})

Unnamed: 0_level_0,r_coef
reliability_type,Unnamed: 1_level_1
by_example_115811,0.804738
by_example_157648,
by_example_201032,0.723607
by_example_3,0.723607
by_example_49308,0.679738
by_example_5499,0.687723
by_example_62874,0.631476
by_example_6509,0.526556
by_example_71214,0.75497
by_var_identity_attack,0.25


In [147]:
### Calculating Spearman's R for Reliability

In [55]:
spearmanr_reliability = []
for rater_x, rater_y in all_rater_combinations:
    r_coef, p_value = spearmanr(interrater_assessments_df[rater_x].values, interrater_assessments_df[rater_y].values)
    data = {"rater_pair": (rater_x, rater_y), "r_coef": r_coef, "p_value": p_value, "spearmanr_reliability_type":"overall"}
    spearmanr_reliability.append(data)

    for variable in interrater_assessments_df['variable'].unique():
        print(variable)
        rater_x_values = interrater_assessments_df[interrater_assessments_df['variable']==variable][rater_x].values
        rater_y_values = interrater_assessments_df[interrater_assessments_df['variable']==variable][rater_y].values
        var_r_coef, var_p_value = spearmanr(rater_x_values, rater_y_values)
        data = {"rater_pair": (rater_x, rater_y), "r_coef": var_r_coef, "p_value": var_p_value, 
                "spearmanr_reliability_type":f"by_var_{variable}"}
        spearmanr_reliability.append(data)
    
    for example in interrater_assessments_df['example_index'].unique():
        rater_x_values = interrater_assessments_df[interrater_assessments_df['example_index']==example][rater_x].values
        rater_y_values = interrater_assessments_df[interrater_assessments_df['example_index']==example][rater_y].values
        ex_r_coef, ex_p_value = spearmanr(rater_x_values, rater_y_values)
        data = {"rater_pair": (rater_x, rater_y), "r_coef": ex_r_coef, "p_value": ex_p_value, 
                "spearmanr_reliability_type":f"by_example_{example}"}
        spearmanr_reliability.append(data)

identity_attack
insult
profanity
severe_toxicity
threat
toxicity
identity_attack
insult
profanity
severe_toxicity
threat
toxicity
identity_attack
insult
profanity
severe_toxicity
threat
toxicity
identity_attack
insult
profanity
severe_toxicity
threat
toxicity
identity_attack
insult
profanity
severe_toxicity
threat
toxicity
identity_attack
insult
profanity
severe_toxicity
threat
toxicity




In [56]:
spearmanr_reliability_df = pd.DataFrame(spearmanr_reliability)[['spearmanr_reliability_type', 'rater_pair', 'r_coef','p_value']]

In [57]:
spearmanr_reliability_df 

Unnamed: 0,spearmanr_reliability_type,rater_pair,r_coef,p_value
0,overall,"(0, 1)",0.209165,0.129034
1,by_var_identity_attack,"(0, 1)",-0.125000,0.748645
2,by_var_insult,"(0, 1)",-0.100000,0.797972
3,by_var_profanity,"(0, 1)",0.316228,0.407084
4,by_var_severe_toxicity,"(0, 1)",,
...,...,...,...,...
91,by_example_62874,"(2, 3)",0.447214,0.373901
92,by_example_71214,"(2, 3)",0.632456,0.177808
93,by_example_115811,"(2, 3)",0.707107,0.116117
94,by_example_157648,"(2, 3)",,
