In [33]:
import os
import pandas as pd
import pickle
import random
import datetime as dt

In [34]:
repo_dir = "/Users/ameliachu/repos/nlu-reddit-toxicity-dataset"
labelled_data_dir = f"{repo_dir}/data/labelled/"

In [35]:
labelled_data_fnames = os.listdir(labelled_data_dir)
labelled_data_fnames

['yj2369_labelling_assignment_2022-04-13.csv',
 'gm2858_labelling_assignment_2022-04-09.csv',
 'yp2201_labelling_assignment_2022-04-09.csv',
 'ac4119_labelling_assignment_2022-04-09.csv',
 'ac4119_labelling_assignment_2022-04-13.csv',
 'yp2201_labelling_assignment_2022-04-13.csv',
 'gm2858_labelling_assignment_2022-04-13.csv',
 'yj2369_labelling_assignment_2022-04-09.csv']

In [36]:
labels = ['toxicity', 'severe_toxicity', 'identity_attack', 'insult', 'profanity', 'threat']
list_of_labelled_examples = []

for fname in labelled_data_fnames:
   
    labelled_data = pd.read_csv(f"{labelled_data_dir}{fname}")
    labelled_data['has_toxicity'] = labelled_data[labels].sum(axis=1)
    labelled_data['has_toxicity'] = labelled_data['has_toxicity'].apply(lambda x: 1 if x > 0 else 0)
    labelled_data['rater_id'] = fname.split("_")[0]
    labelled_data['assignment_date'] = fname.split("_")[-1][:-4]
    example_id_df = labelled_data[['assignment_date','example_id','has_toxicity','rater_id']]
    list_of_labelled_examples.append(example_id_df)

In [37]:
labelled_examples_lookup = pd.concat(list_of_labelled_examples)

In [38]:
n_raters_lookup = labelled_examples_lookup.groupby("example_id").rater_id.nunique().reset_index()

In [39]:
n_raters_lookup.columns = ['example_id', 'n_raters']

In [40]:
labelled_examples_lookup_2 = pd.merge(labelled_examples_lookup, n_raters_lookup, on=['example_id'])

In [41]:
len(labelled_examples_lookup)

1361

In [42]:
toxic_examples = labelled_examples_lookup_2['has_toxicity'] == 1
nontoxic_examples = labelled_examples_lookup_2['has_toxicity'] == 0
need_interrater = labelled_examples_lookup_2['n_raters'] <= 1

In [43]:
rater_ids = ['ac4119', 'gm2858', 'yj2369','yp2201']
n_raters = len(rater_ids)
n_other_raters = n_raters - 1

In [53]:
sample_ratio = {
    'nontoxic': 26,
    'toxic': 14
}

In [54]:
toxic_df = labelled_examples_lookup_2[toxic_examples & need_interrater]
nontoxic_df = labelled_examples_lookup_2[nontoxic_examples & need_interrater]
selected_examples = {}

for rater in rater_ids:
    toxic = toxic_df[toxic_df['rater_id'] == rater]['example_id']
    nontoxic = nontoxic_df[nontoxic_df['rater_id'] == rater]['example_id']
    print(rater, len(toxic),len(nontoxic))
    
    toxic = toxic.sample(sample_ratio['toxic'] * (n_raters-1))
    nontoxic = nontoxic.sample(sample_ratio['nontoxic'] * (n_raters-1))
    
    selected_examples[rater] = {
        'toxic': list(toxic.values),
        'nontoxic':  list(nontoxic.values),
    }

ac4119 99 242
gm2858 54 285
yj2369 61 277
yp2201 44 295


In [46]:
n_batches = (n_raters - 1) * 2 # 2 = has_toxicity TRUE, FALSE

In [58]:
batch_indices = {
    "toxic": [],
    "nontoxic": []
}

In [59]:

toxic_start_id = 0
nontoxic_start_id = 0

for i in range(n_other_raters):
    toxic_end_id = toxic_start_id + sample_ratio['toxic']
    nontoxic_end_id = nontoxic_start_id + sample_ratio['nontoxic']
    batch_indices['toxic'].append((toxic_start_id, toxic_end_id))
    batch_indices['nontoxic'].append((nontoxic_start_id, nontoxic_end_id))
    toxic_start_id = toxic_end_id
    nontoxic_start_id = nontoxic_end_id

In [60]:
batch_indices

{'toxic': [(0, 14), (14, 28), (28, 42)],
 'nontoxic': [(0, 26), (26, 52), (52, 78)]}

In [66]:
interrater_assignment = {r:[] for r in rater_ids}

for rater in rater_ids:
    other_raters = [r for r in rater_ids if r!=rater]
    for example_type in ['toxic','nontoxic']:
        example_list = selected_examples[rater][example_type]
        sample_size = len(example_list)
        sample_ranges = batch_indices[example_type]
        random.shuffle(other_raters)
        assign_batches_to_raters = list(zip(other_raters, sample_ranges))
        print(f'{example_type} assignment order:', assign_batches_to_raters)
        for other_rater, (start_id, end_id) in assign_batches_to_raters:
            interrater_assignment[other_rater] += selected_examples[rater][example_type][start_id:end_id] 

toxic assignment order: [('yp2201', (0, 14)), ('yj2369', (14, 28)), ('gm2858', (28, 42))]
nontoxic assignment order: [('gm2858', (0, 26)), ('yj2369', (26, 52)), ('yp2201', (52, 78))]
toxic assignment order: [('yp2201', (0, 14)), ('yj2369', (14, 28)), ('ac4119', (28, 42))]
nontoxic assignment order: [('yj2369', (0, 26)), ('ac4119', (26, 52)), ('yp2201', (52, 78))]
toxic assignment order: [('gm2858', (0, 14)), ('ac4119', (14, 28)), ('yp2201', (28, 42))]
nontoxic assignment order: [('yp2201', (0, 26)), ('ac4119', (26, 52)), ('gm2858', (52, 78))]
toxic assignment order: [('gm2858', (0, 14)), ('ac4119', (14, 28)), ('yj2369', (28, 42))]
nontoxic assignment order: [('gm2858', (0, 26)), ('yj2369', (26, 52)), ('ac4119', (52, 78))]


In [71]:
interrater_assignment_path =  f"{repo_dir}/data/interrater-reliability/interrater_assignment_{dt.date.today()}.p"
print(interrater_assignment_path )
pickle.dump(interrater_assignment, open( interrater_assignment_path, "wb" ) )

/Users/ameliachu/repos/nlu-reddit-toxicity-dataset/data/interrater-reliability/interrater_assignment_2022-04-21.p
