In [41]:
import os
import random
from itertools import combinations
import numpy as np
import pandas as pd
from scipy.stats import pearsonr, spearmanr

pd.set_option('max_colwidth', None) # Setting to be able to view the entire comment

Assess Overall Interater Relia 

In [42]:
repo_dir = "/Users/ameliachu/repos/nlu-reddit-toxicity-dataset"
labelled_data_dir = f"{repo_dir}/data/labelled/"

In [43]:
selected_columns = ['example_id', 'comment_for_evaluation','toxicity', 'severe_toxicity', 'identity_attack',
       'insult', 'profanity', 'threat']

all_labelled_df = []
for file_name in os.listdir(labelled_data_dir):
    labelled_df = pd.read_csv(f"{labelled_data_dir}{file_name}")[selected_columns]
    labelled_df['rater_id'] = file_name.split("_")[0]
    all_labelled_df.append(labelled_df)

In [44]:
labelled_df = pd.concat(all_labelled_df)

In [45]:
n_raters_lookup = labelled_df.groupby("example_id").rater_id.nunique().reset_index()

In [46]:
rater_ids = list(labelled_df.rater_id.unique())
num_raters = len(rater_ids)
random.shuffle(rater_ids)

In [72]:
rater_ids

['ac4119', 'yp2201', 'gm2858', 'yj2369']

In [48]:
labelled_df['rater_id'] = labelled_df['rater_id'].apply(lambda x: rater_ids.index(x))

In [49]:
example_ids_for_interrater = n_raters_lookup[n_raters_lookup['rater_id'] == 2]['example_id'].values

In [50]:
labelled_df_1 = labelled_df[labelled_df['example_id'].isin(example_ids_for_interrater)]

In [51]:
labels = ['toxicity', 'severe_toxicity', 'identity_attack', 'insult', 'profanity', 'threat']

In [104]:
interrater_assessments_df = pd.melt(labelled_df_1,
                                    id_vars=['example_id','rater_id'], 
                                    value_vars=labels)\
                              .rename(columns={'value':'label'})

In [53]:
interrater_assessments_df.groupby(["example_id", "rater_id", "variable"]).count().sort_values(by='label',ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,label
example_id,rater_id,variable,Unnamed: 3_level_1
223,1,identity_attack,1
404585,3,threat,1
405298,2,identity_attack,1
405298,2,insult,1
405298,2,profanity,1
...,...,...,...
75177,2,severe_toxicity,0
75177,2,threat,0
75177,2,toxicity,0
82626,2,insult,0


In [105]:
interrater_assessments_df = pd.pivot(interrater_assessments_df , 
                                     index=['variable', 'example_id'], 
                                     columns='rater_id', values='label').reset_index()

In [106]:
interrater_assessments_df

rater_id,variable,example_id,0,1,2,3
0,identity_attack,223,,0.0,0.0,
1,identity_attack,687,,,0.0,0.0
2,identity_attack,1138,0.0,,0.0,
3,identity_attack,1757,0.0,,0.0,
4,identity_attack,3717,0.0,,0.0,
...,...,...,...,...,...,...
5767,toxicity,616547,,0.0,,0.0
5768,toxicity,617246,0.0,0.0,,
5769,toxicity,618295,,0.0,0.0,
5770,toxicity,618640,0.0,,0.0,


In [56]:
all_rater_combinations = list(combinations(range(num_raters), 2))

In [57]:
all_rater_combinations

[(0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3)]

In [99]:
interrater_assessments_df[rater_x].values

array([0., 0., 0., ..., 0., 0., 0.])

In [101]:
interrater_assessments_df[rater_y].values

array([0., 0., 0., ..., 0., 0., 0.])

In [102]:
interrater_assessments_df

rater_id,variable,example_id,0,1,2,3
0,identity_attack,223,0.0,0.0,0.0,0.0
1,identity_attack,687,0.0,0.0,0.0,0.0
2,identity_attack,1138,0.0,0.0,0.0,0.0
3,identity_attack,1757,0.0,0.0,0.0,0.0
4,identity_attack,3717,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...
5767,toxicity,616547,0.0,0.0,0.0,0.0
5768,toxicity,617246,0.0,0.0,0.0,0.0
5769,toxicity,618295,0.0,0.0,0.0,0.0
5770,toxicity,618640,0.0,0.0,0.0,0.0


In [107]:
spearmanr_reliability = []
for rater_x, rater_y in all_rater_combinations:
    r_coef, p_value = spearmanr(interrater_assessments_df[rater_x].values, interrater_assessments_df[rater_y].values)
    data = {"rater_pair": (rater_x, rater_y), "r_coef": r_coef, "p_value": p_value, "spearmanr_reliability_type":"overall"}
    spearmanr_reliability.append(data)

    for variable in interrater_assessments_df['variable'].unique():
        print(variable)
        rater_x_values = interrater_assessments_df[interrater_assessments_df['variable']==variable][rater_x].values
        rater_y_values = interrater_assessments_df[interrater_assessments_df['variable']==variable][rater_y].values
        var_r_coef, var_p_value = spearmanr(rater_x_values, rater_y_values)
        data = {"rater_pair": (rater_x, rater_y), "r_coef": var_r_coef, "p_value": var_p_value, 
                "spearmanr_reliability_type":f"by_var_{variable}"}
        spearmanr_reliability.append(data)

identity_attack
insult
profanity
severe_toxicity
threat
toxicity
identity_attack
insult
profanity
severe_toxicity
threat
toxicity
identity_attack
insult
profanity
severe_toxicity
threat
toxicity
identity_attack
insult
profanity
severe_toxicity
threat
toxicity
identity_attack
insult
profanity
severe_toxicity
threat
toxicity
identity_attack
insult
profanity
severe_toxicity
threat
toxicity


In [108]:
spearmanr_reliability_df = pd.DataFrame(spearmanr_reliability)[['spearmanr_reliability_type', 'rater_pair', 'r_coef','p_value']].fillna(1)

In [109]:
spearmanr_reliability_df

Unnamed: 0,spearmanr_reliability_type,rater_pair,r_coef,p_value
0,overall,"(0, 1)",1.0,1.0
1,by_var_identity_attack,"(0, 1)",1.0,1.0
2,by_var_insult,"(0, 1)",1.0,1.0
3,by_var_profanity,"(0, 1)",1.0,1.0
4,by_var_severe_toxicity,"(0, 1)",1.0,1.0
5,by_var_threat,"(0, 1)",1.0,1.0
6,by_var_toxicity,"(0, 1)",1.0,1.0
7,overall,"(0, 2)",1.0,1.0
8,by_var_identity_attack,"(0, 2)",1.0,1.0
9,by_var_insult,"(0, 2)",1.0,1.0


In [110]:
spearmanr_reliability_df.groupby('spearmanr_reliability_type').agg({'r_coef':np.mean})

Unnamed: 0_level_0,r_coef
spearmanr_reliability_type,Unnamed: 1_level_1
by_var_identity_attack,1.0
by_var_insult,1.0
by_var_profanity,1.0
by_var_severe_toxicity,1.0
by_var_threat,1.0
by_var_toxicity,1.0
overall,1.0


In [65]:
spearmanr_reliability_df.groupby('spearmanr_reliability_type').agg({'r_coef':np.mean})

Unnamed: 0_level_0,r_coef
spearmanr_reliability_type,Unnamed: 1_level_1
by_var_identity_attack,0.262714
by_var_insult,0.244487
by_var_profanity,0.20825
by_var_severe_toxicity,0.252228
by_var_threat,1.0
by_var_toxicity,0.24455
overall,0.26744


In [66]:
spearmanr_reliability_df.groupby('rater_pair').agg({'r_coef':np.mean})

Unnamed: 0_level_0,r_coef
rater_pair,Unnamed: 1_level_1
"(0, 1)",0.251144
"(0, 2)",0.408013
"(0, 3)",0.388979
"(1, 2)",0.432313
"(1, 3)",0.378446
"(2, 3)",0.266535


In [85]:
len(set(example_ids_for_interrater))

962

In [67]:
selected_interrater_examples = labelled_df[labelled_df['example_id'].isin(example_ids_for_interrater)]

In [98]:
len(set(selected_interrater_examples[selected_interrater_examples['profanity'] == 1]['example_id'].values))

270

In [117]:
review_table_list = []

In [118]:
label = 'profanity'

y = selected_interrater_examples[['example_id',label]].drop_duplicates()\
.groupby("example_id").agg({label :'nunique'}).reset_index()

label_disagreements = y[y[label] == 2]['example_id'].values
disagreement_examples = selected_interrater_examples[(selected_interrater_examples['example_id'].isin(label_disagreements))]

disagreement_examples[disagreement_examples[label]==0][['example_id','comment_for_evaluation', 'rater_id', label]].drop_duplicates() # .sort_values(by='example_id')
# disagreement_examples[disagreement_examples['rater_id']==3][['example_id','comment_for_evaluation',label]].sort_values(by='example_id')

for_review = disagreement_examples[['example_id','rater_id']]
for_review['attribute'] = label
review_table_list.append(for_review)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  for_review['attribute'] = label


In [119]:
for_review

Unnamed: 0,example_id,rater_id,attribute
14,147415,3,profanity
80,296431,3,profanity
191,503326,3,profanity
13,146432,2,profanity
35,180557,2,profanity
159,431566,2,profanity
193,516527,2,profanity
239,75801,2,profanity
37,211492,2,profanity
116,41179,2,profanity


In [120]:
label = 'identity_attack'

y = selected_interrater_examples[['example_id',label]].drop_duplicates()\
.groupby("example_id").agg({label :'nunique'}).reset_index()

label_disagreements = y[y[label] == 2]['example_id'].values
disagreement_examples = selected_interrater_examples[(selected_interrater_examples['example_id'].isin(label_disagreements))]

# disagreement_examples[['example_id','comment_for_evaluation',label]].sort_values(by='example_id')
disagreement_examples[['example_id','comment_for_evaluation']].drop_duplicates()
# disagreement_examples[disagreement_examples['rater_id']==3][['example_id','comment_for_evaluation',label]].sort_values(by='example_id')
for_review = disagreement_examples[['example_id','rater_id']]
for_review['attribute'] = label
review_table_list.append(for_review)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  for_review['attribute'] = label


In [121]:
label = 'insult'

y = selected_interrater_examples[['example_id',label]].drop_duplicates()\
.groupby("example_id").agg({label :'nunique'}).reset_index()

label_disagreements = y[y[label] == 2]['example_id'].values
disagreement_examples = selected_interrater_examples[(selected_interrater_examples['example_id'].isin(label_disagreements))]
for_review = disagreement_examples[['example_id','rater_id']]
for_review['attribute'] = label
review_table_list.append(for_review)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  for_review['attribute'] = label


In [112]:
x[10:]

Unnamed: 0,example_id,comment_for_evaluation
85,330394,Tesla with more crashes? NIO uses Lidar and has the top autonomous driving researcher in the world in Ren Shaoqing... NIO will overtake Tesla as the premium EV manufacturer by end of year.
88,335182,Hahahahahahaha okay sad bear. Time for your sponge bath
186,619248,No shit
1,106714,I wish all the fucking leaf blowers would come on the same fucking day. Fuck man.
188,6914,He likes having the little ✋ on his 🍆. Makes it look more impressive
6,113182,That’s why no one will remember your name
88,270830,"If the market crashes just load up on boomer stocks, you'll be grand if you just hold for the next few years"
128,349002,AMC cucks... don’t check ur portfolio today
137,373934,"Whatever happens today, don't forget to laugh at the Gibbons who thought they were gorillas, throwing poo at each other in their cage. The exhibit will be closing very soon"
160,43005,180? You paper hands really wanna peak at 180?


In [122]:
label = 'toxicity'

y = selected_interrater_examples[['example_id',label]].drop_duplicates()\
.groupby("example_id").agg({label :'nunique'}).reset_index()

label_disagreements = y[y[label] == 2]['example_id'].values
disagreement_examples = selected_interrater_examples[(selected_interrater_examples['example_id'].isin(label_disagreements))]

# disagreement_examples[disagreement_examples['rater_id']==3][['example_id','comment_for_evaluation',label]].sort_values(by='example_id')
disagreement_examples[['example_id','comment_for_evaluation']].drop_duplicates()
# disagreement_examples[disagreement_examples['rater_id']==3][['example_id','comment_for_evaluation',label]].sort_values(by='example_id')
for_review = disagreement_examples[['example_id','rater_id']]
for_review['attribute'] = label
review_table_list.append(for_review)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  for_review['attribute'] = label


In [123]:
label = 'severe_toxicity'

y = selected_interrater_examples[['example_id',label]].drop_duplicates()\
.groupby("example_id").agg({label :'nunique'}).reset_index()

label_disagreements = y[y[label] == 2]['example_id'].values
disagreement_examples = selected_interrater_examples[(selected_interrater_examples['example_id'].isin(label_disagreements))]

# disagreement_examples[disagreement_examples['rater_id']==3][['example_id','comment_for_evaluation',label]].sort_values(by='example_id')
disagreement_examples[['example_id','comment_for_evaluation']].drop_duplicates()
# disagreement_examples[disagreement_examples['rater_id']==3][['example_id','comment_for_evaluation',label]].sort_values(by='example_id')

for_review = disagreement_examples[['example_id','rater_id']]
for_review['attribute'] = label
review_table_list.append(for_review)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  for_review['attribute'] = label


In [115]:
label = 'threat'

y = selected_interrater_examples[['example_id',label]].drop_duplicates()\
.groupby("example_id").agg({label :'nunique'}).reset_index()

label_disagreements = y[y[label] == 2]['example_id'].values
disagreement_examples = selected_interrater_examples[(selected_interrater_examples['example_id'].isin(label_disagreements))]

disagreement_examples[['example_id','comment_for_evaluation', label]].sort_values(by='example_id')

Unnamed: 0,example_id,comment_for_evaluation,threat


In [127]:
review_table = pd.concat(review_table_list).sort_values(by='example_id')

In [128]:
review_table

Unnamed: 0,example_id,rater_id,attribute
188,6914,1,toxicity
188,6914,1,insult
243,6914,3,insult
243,6914,3,toxicity
3,11714,1,profanity
...,...,...,...
234,600164,0,profanity
124,609296,3,severe_toxicity
229,609296,2,severe_toxicity
240,619248,0,insult


In [132]:
for i, rater_id in enumerate(rater_ids):
    by_rater = review_table[review_table["rater_id"] == i][['example_id','attribute']].sort_values(by="attribute")
    by_rater.to_csv(f"{repo_dir}/data/{rater_id}_review.csv",index=False)