In [1]:
import os
import random
from itertools import combinations
import numpy as np
import pandas as pd
from scipy.stats import pearsonr, spearmanr

In [2]:
# Replace this location with where the repo is.
repository_location = "/Users/ameliachu/repos/nlu-reddit-toxicity-dataset"


In [3]:
interrater_data_folder = f"{repository_location}/data/interrater-reliability"

In [4]:
os.listdir(interrater_data_folder)

['Interrater-yj2369-20220407.csv',
 'Interrater-yp2201-20220407.csv',
 'interrater2-ac4119-20220410.csv',
 'interrater2-yp2201-20220410.csv',
 'interrater-ac4119-20220406.csv',
 'interrater2-yj2369-20220410.csv',
 'place-your-interrater-file-here.txt',
 'interrater-gm2858-20220407.csv',
 'interrater2-gm2858-20220409.csv']

In [5]:
# file_criteria = '.csv'
interrater_files = [f for f in os.listdir(interrater_data_folder) if f.startswith('interrater2') ]

In [6]:
interrater_info = {}
for file in interrater_files:
    file_type, rater_id, date_rated = file.split('.')[0].split('-')
    data_path = f'{interrater_data_folder}/{file}'
    interrater_info[rater_id] = {
        "data": pd.read_csv(data_path),
        "created_date": date_rated,
        "file_path": data_path
                          }

Shuffling the Rater file order to preserve anonymity

In [7]:
raters = list(interrater_info.keys())
random.shuffle(raters)

In [8]:
interrater_assessments = []
labels = ['toxicity', 'severe_toxicity', 'identity_attack', 'insult', 'profanity', 'threat']
selected_columns = ['example_index','comment_for_evaluation'] + labels

for i, rater_id in enumerate(raters):
    individual_ratings = interrater_info[rater_id]['data'][selected_columns]
    interrater_assessment = pd.melt(individual_ratings,
                                    id_vars=['example_index'], 
                                    value_vars=labels)\
                              .rename(columns={'value':'label'})
    # Using file id instead of rater_id to anonymize results
    interrater_assessment["file_id"] = i
    interrater_assessments.append(interrater_assessment)

In [28]:
interrater_assessments_df = pd.concat(interrater_assessments)
interrater_assessments_df = pd.pivot(interrater_assessments_df, 
                                     index=['variable', 'example_index'], 
                                     columns='file_id', values='label').fillna(0).reset_index()

In [27]:
interrater_assessments_df.head(5)

file_id,variable,example_index,0,1,2,3
0,identity_attack,54305,0,0,0,0
1,identity_attack,60073,0,0,0,1
2,identity_attack,187366,0,0,0,0
3,identity_attack,217190,0,0,0,0
4,identity_attack,288906,1,0,0,0


### Calculating Pearson's R for Reliability
https://en.wikipedia.org/wiki/Inter-rater_reliability#Correlation_coefficients

In [2]:
num_raters = len(raters)
all_rater_combinations = list(combinations(range(num_raters), 2))

NameError: name 'combinations' is not defined

In [1]:
num_raters = len(raters)
all_rater_combinations = list(combinations(range(num_raters), 2))

NameError: name 'raters' is not defined

In [13]:
reliability = []
for rater_x, rater_y in all_rater_combinations:
    r_coef, p_value = pearsonr(interrater_assessments_df[rater_x].values, interrater_assessments_df[rater_y].values)
    data = {"rater_pair": (rater_x, rater_y), "r_coef": r_coef, "p_value": p_value, "reliability_type":"overall"}
    reliability.append(data)

    for variable in interrater_assessments_df['variable'].unique():
#         print(variable)
        rater_x_values = interrater_assessments_df[interrater_assessments_df['variable']==variable][rater_x].values
        rater_y_values = interrater_assessments_df[interrater_assessments_df['variable']==variable][rater_y].values
        var_r_coef, var_p_value = pearsonr(rater_x_values, rater_y_values)
        data = {"rater_pair": (rater_x, rater_y), "r_coef": var_r_coef, "p_value": var_p_value, 
                "reliability_type":f"by_var_{variable}"}
        reliability.append(data)
    
    for example in interrater_assessments_df['example_index'].unique():
        rater_x_values = interrater_assessments_df[interrater_assessments_df['example_index']==example][rater_x].values
        rater_y_values = interrater_assessments_df[interrater_assessments_df['example_index']==example][rater_y].values
        ex_r_coef, ex_p_value = pearsonr(rater_x_values, rater_y_values)
        data = {"rater_pair": (rater_x, rater_y), "r_coef": ex_r_coef, "p_value": ex_p_value, 
                "reliability_type":f"by_example_{example}"}
        reliability.append(data)



In [14]:
reliability_df = pd.DataFrame(reliability)[['reliability_type', 'rater_pair', 'r_coef','p_value']].fillna(1)

In [15]:
reliability_df 

Unnamed: 0,reliability_type,rater_pair,r_coef,p_value
0,overall,"(0, 1)",0.840548,1.420214e-10
1,by_var_identity_attack,"(0, 1)",1.000000,1.000000e+00
2,by_var_insult,"(0, 1)",1.000000,0.000000e+00
3,by_var_profanity,"(0, 1)",1.000000,1.000000e+00
4,by_var_severe_toxicity,"(0, 1)",1.000000,1.000000e+00
...,...,...,...,...
73,by_example_60073,"(2, 3)",0.447214,3.739010e-01
74,by_example_187366,"(2, 3)",0.632456,1.778078e-01
75,by_example_217190,"(2, 3)",0.707107,1.161165e-01
76,by_example_288906,"(2, 3)",0.316228,5.414697e-01


In [16]:
reliability_df.groupby('reliability_type').agg({'r_coef':np.mean})

Unnamed: 0_level_0,r_coef
reliability_type,Unnamed: 1_level_1
by_example_187366,0.877485
by_example_217190,0.853553
by_example_288906,0.457314
by_example_288908,1.0
by_example_54305,1.0
by_example_60073,0.656849
by_var_identity_attack,0.8
by_var_insult,0.853553
by_var_profanity,0.8
by_var_severe_toxicity,0.938743


### Calculating Spearman's R for Reliability

In [18]:
spearmanr_reliability = []
for rater_x, rater_y in all_rater_combinations:
    r_coef, p_value = spearmanr(interrater_assessments_df[rater_x].values, interrater_assessments_df[rater_y].values)
    data = {"rater_pair": (rater_x, rater_y), "r_coef": r_coef, "p_value": p_value, "spearmanr_reliability_type":"overall"}
    spearmanr_reliability.append(data)

    for variable in interrater_assessments_df['variable'].unique():
        print(variable)
        rater_x_values = interrater_assessments_df[interrater_assessments_df['variable']==variable][rater_x].values
        rater_y_values = interrater_assessments_df[interrater_assessments_df['variable']==variable][rater_y].values
        var_r_coef, var_p_value = spearmanr(rater_x_values, rater_y_values)
        data = {"rater_pair": (rater_x, rater_y), "r_coef": var_r_coef, "p_value": var_p_value, 
                "spearmanr_reliability_type":f"by_var_{variable}"}
        spearmanr_reliability.append(data)
    
    for example in interrater_assessments_df['example_index'].unique():
        rater_x_values = interrater_assessments_df[interrater_assessments_df['example_index']==example][rater_x].values
        rater_y_values = interrater_assessments_df[interrater_assessments_df['example_index']==example][rater_y].values
        ex_r_coef, ex_p_value = spearmanr(rater_x_values, rater_y_values)
        data = {"rater_pair": (rater_x, rater_y), "r_coef": ex_r_coef, "p_value": ex_p_value, 
                "spearmanr_reliability_type":f"by_example_{example}"}
        spearmanr_reliability.append(data)

identity_attack
insult
profanity
severe_toxicity
threat
toxicity
identity_attack
insult
profanity
severe_toxicity
threat
toxicity
identity_attack
insult
profanity
severe_toxicity
threat
toxicity
identity_attack
insult
profanity
severe_toxicity
threat
toxicity
identity_attack
insult
profanity
severe_toxicity
threat
toxicity
identity_attack
insult
profanity
severe_toxicity
threat
toxicity




In [19]:
spearmanr_reliability_df = pd.DataFrame(spearmanr_reliability)[['spearmanr_reliability_type', 'rater_pair', 'r_coef','p_value']].fillna(1)

In [20]:
spearmanr_reliability_df 

Unnamed: 0,spearmanr_reliability_type,rater_pair,r_coef,p_value
0,overall,"(0, 1)",0.840548,1.420214e-10
1,by_var_identity_attack,"(0, 1)",1.000000,1.000000e+00
2,by_var_insult,"(0, 1)",1.000000,0.000000e+00
3,by_var_profanity,"(0, 1)",1.000000,1.000000e+00
4,by_var_severe_toxicity,"(0, 1)",1.000000,1.000000e+00
...,...,...,...,...
73,by_example_60073,"(2, 3)",0.447214,3.739010e-01
74,by_example_187366,"(2, 3)",0.632456,1.778078e-01
75,by_example_217190,"(2, 3)",0.707107,1.161165e-01
76,by_example_288906,"(2, 3)",0.316228,5.414697e-01


In [21]:
spearmanr_reliability_df.groupby('spearmanr_reliability_type').agg({'r_coef':np.mean})

Unnamed: 0_level_0,r_coef
spearmanr_reliability_type,Unnamed: 1_level_1
by_example_187366,0.877485
by_example_217190,0.853553
by_example_288906,0.457314
by_example_288908,1.0
by_example_54305,1.0
by_example_60073,0.656849
by_var_identity_attack,0.8
by_var_insult,0.853553
by_var_profanity,0.8
by_var_severe_toxicity,0.938743


In [23]:
spearmanr_reliability_df.groupby('rater_pair').agg({'r_coef':np.mean})

Unnamed: 0_level_0,r_coef
rater_pair,Unnamed: 1_level_1
"(0, 1)",0.942674
"(0, 2)",0.749384
"(0, 3)",0.776147
"(1, 2)",0.812047
"(1, 3)",0.821422
"(2, 3)",0.72027


In [57]:
cohen_kappa_scores = []
for rater_x, rater_y in all_rater_combinations:
    data = {"rater_pair": (rater_x, rater_y), "cohen_kappa_score": cohen_kappa_score(interrater_assessments_df[rater_x].values, interrater_assessments_df[rater_y].values), "reliability_type":"overall"}
    cohen_kappa_scores.append(data)

    for variable in interrater_assessments_df['variable'].unique():
        rater_x_values = interrater_assessments_df[interrater_assessments_df['variable']==variable][rater_x].values
        rater_y_values = interrater_assessments_df[interrater_assessments_df['variable']==variable][rater_y].values
        data = {"rater_pair": (rater_x, rater_y), "cohen_kappa_score": cohen_kappa_score(rater_x_values, rater_y_values), 
                "reliability_type":f"by_var_{variable}"}
        cohen_kappa_scores.append(data)
    
    for example in interrater_assessments_df['example_index'].unique():
        rater_x_values = interrater_assessments_df[interrater_assessments_df['example_index']==example][rater_x].values
        rater_y_values = interrater_assessments_df[interrater_assessments_df['example_index']==example][rater_y].values
        data = {"rater_pair": (rater_x, rater_y), "cohen_kappa_score": cohen_kappa_score(rater_x_values, rater_y_values), 
                "reliability_type":f"by_example_{example}"}
        cohen_kappa_scores.append(data)

  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)


In [None]:
data = {"rater_pair": (rater_x, rater_y), 
        "cohen_kappa_score": cohen_kappa_score(rater_x_values, rater_y_values), 
        "reliability_type":"overall"}
    

In [58]:
cohen_kappa_df = pd.DataFrame(cohen_kappa_scores)[['reliability_type', 'rater_pair', 'cohen_kappa_score']]

In [59]:
cohen_kappa_df

Unnamed: 0,reliability_type,rater_pair,cohen_kappa_score
0,overall,"(0, 1)",0.828025
1,by_var_identity_attack,"(0, 1)",0.000000
2,by_var_insult,"(0, 1)",1.000000
3,by_var_profanity,"(0, 1)",0.000000
4,by_var_severe_toxicity,"(0, 1)",0.000000
...,...,...,...
73,by_example_60073,"(2, 3)",0.333333
74,by_example_187366,"(2, 3)",0.571429
75,by_example_217190,"(2, 3)",0.666667
76,by_example_288906,"(2, 3)",0.181818


In [60]:
cohen_kappa_df.groupby('reliability_type').agg({'cohen_kappa_score':np.mean})

Unnamed: 0_level_0,cohen_kappa_score
reliability_type,Unnamed: 1_level_1
by_example_187366,0.357143
by_example_217190,0.833333
by_example_288906,0.380051
by_example_288908,0.5
by_example_54305,1.0
by_example_60073,0.595238
by_var_identity_attack,-0.04
by_var_insult,0.833333
by_var_profanity,-0.04
by_var_severe_toxicity,0.114286
