## Batch 4 (2022-04-28)

In [1]:
import os
import random
import datetime as dt
import pickle
import pandas as pd

In [4]:
# Root of the local checkout; every other path in this cell is derived from it.
repo_dir = "/Users/ameliachu/repos/nlu-reddit-toxicity-dataset"
_data_dir = f"{repo_dir}/data"

# Directory holding the labelled files, and the names of the files currently in it.
labelled_data_dir = f"{_data_dir}/labelled/"
labelled_data_fnames = os.listdir(labelled_data_dir)

# Pickle with the pre-randomized list of example_ids.
randomized_example_ids_path = f"{_data_dir}/randomized_example_ids.p"

# Pickle with the example_ids that require backfill due to the initial assignment issue.
remaining_backfill_path = f"{_data_dir}/backfill_example_ids.p"

# Pickle with the dictionary of example_ids that have been labelled once.
# See assign-examples-for-interrater.ipynb for more details.
interrater_assignment_date = '2022-04-21'
interrater_assignment_path = (
    f"{_data_dir}/interrater-reliability/"
    f"interrater_assignment_{interrater_assignment_date}.p"
)

In [5]:
def _load_pickle(path):
    """Return the object pickled at `path`, closing the file handle afterwards."""
    # NOTE: unpickling can execute arbitrary code. Only load files that were
    # generated locally by this project, never files from an untrusted source.
    with open(path, "rb") as pickle_file:
        return pickle.load(pickle_file)


# example_ids that still need backfilling.
remaining_examples_to_backfill = _load_pickle(remaining_backfill_path)
# Pre-randomized list of example_ids.
example_indices = _load_pickle(randomized_example_ids_path)
# Interrater assignment; indexed by rater_id further down in the notebook.
interrater_assignment = _load_pickle(interrater_assignment_path)

In [17]:
# Collect the example_ids of every example that already appears in a labelled file.

selected_columns = ['example_id']


def _read_selected_columns(fname):
    """Read one labelled file and return only the `selected_columns` of it."""
    labelled_frame = pd.read_csv(f"{labelled_data_dir}{fname}")
    return labelled_frame[selected_columns]


list_of_example_ids = list(map(_read_selected_columns, labelled_data_fnames))
example_ids_pd = pd.concat(list_of_example_ids)
labelled_examples = list(example_ids_pd['example_id'].values)

In [None]:
Defining raters who need to be assigned a batch. 

In [8]:
# Raters who receive a batch in this round, and how many of them there are.
rater_ids = [
    'ac4119',
    'gm2858',
    'yj2369',
    'yp2201',
]
num_raters = len(rater_ids)

In [31]:
# Offset into the pre-randomized example list from which candidates are taken,
# and the width of a full batch.
start_id, batch_size = 1360, 140

In [33]:
# Candidates for this batch: everything from `start_id` onwards in the
# pre-randomized list that has not been labelled yet.
# Membership is tested against a set so each lookup is O(1) instead of a scan
# over the whole list of labelled example_ids.
labelled_example_id_set = set(labelled_examples)
current_example_indices = [
    example_id
    for example_id in example_indices[start_id:]
    if example_id not in labelled_example_id_set
]

In [34]:
# Compute one (start, end) slice per rater over the list of remaining examples.
# The first slice is shortened by the number of examples awaiting backfill;
# every other slice is a full `batch_size` wide.
#
# The running cursor uses its own names (`batch_start` / `batch_end`) so that
# the configured `start_id` offset is not overwritten: re-running the cell that
# slices `example_indices[start_id:]` must keep giving the same result.
n_backfill = len(remaining_examples_to_backfill)
if n_backfill > batch_size:
    # A negative end bound would silently turn into a "count from the end" slice.
    raise ValueError(
        f"{n_backfill} examples to backfill do not fit in a batch of {batch_size}"
    )

supplmental_batches = []
batch_start = 0
for rater_position in range(num_raters):
    batch_width = batch_size - n_backfill if rater_position == 0 else batch_size
    batch_end = batch_start + batch_width
    supplmental_batches.append((batch_start, batch_end))
    batch_start = batch_end

In [35]:
# Display the (start, end) slice bounds computed for each batch.
supplmental_batches

[(0, 103), (103, 243), (243, 383), (383, 523)]

Randomizing the order of raters and assigning batches based on order.

In [15]:
# Seeded, non-mutating shuffle. Starting from the sorted ids with a fixed seed
# means that re-running this cell (or the whole notebook) always yields the same
# rater order, instead of reshuffling an already-shuffled list in place.
shuffle_seed = 20220428  # date of this batch, used as the seed
rater_ids = random.Random(shuffle_seed).sample(sorted(rater_ids), k=len(rater_ids))

In [16]:
# Pair each rater (in their current order) with the batch bounds at the same position.
assign_batches_to_raters = [
    (rater_id, batch_bounds)
    for rater_id, batch_bounds in zip(rater_ids, supplmental_batches)
]
print(assign_batches_to_raters)

[('yp2201', (1360, 1463)), ('yj2369', (1463, 1603)), ('gm2858', (1603, 1743)), ('ac4119', (1743, 1883))]


### Reading in pre-collected dataset

In [None]:
# Load the pre-collected comments; the column pandas reads as 'Unnamed: 0'
# is exposed under the name `example_id`.
daily_discussion_data_fname = "daily_master_data_1614250838_1618692612.csv"
daily_discussion_data_path = "/".join([repo_dir, "data", daily_discussion_data_fname])
daily_discussion_data = pd.read_csv(daily_discussion_data_path)
daily_discussion_data = daily_discussion_data.rename(columns={'Unnamed: 0': 'example_id'})

### Generating files for labelling based on batch assignment

This chunk uses each `rater_id` and its assigned indices `(start_ind, end_ind)` as inputs. It collects the relevant data and generates a file that is more conducive to labelling and to text classification training/scoring. Specifically, the cell below collects the context (i.e. `preceding_comment`, `following_comment`) and the `comment_for_evaluation`, and adds an empty column for each toxicity attribute label.

In [None]:
# Write one labelling file per rater. Each row holds an example to evaluate,
# the comments directly before and after it, and one empty column per label.
to_label_dir = f"{repo_dir}/data/to_label"
current_date = dt.date.today()
labels = ['toxicity', 'severe_toxicity', 'identity_attack', 'insult', 'profanity', 'threat']

# Make sure the export directory exists before anything is written into it.
os.makedirs(to_label_dir, exist_ok=True)

# Build the example_id -> body lookup once, instead of scanning the whole frame
# three times per example. `drop_duplicates` keeps the first row per example_id,
# which is the row the previous `.values[0]` selection returned.
body_by_example_id = (
    daily_discussion_data
    .drop_duplicates(subset='example_id')
    .set_index('example_id')['body']
)

for rater_id, (start_ind, end_ind) in assign_batches_to_raters:
    # The file is written as CSV; the name carries no extension.
    fname = f"{rater_id}_labelling_assignment_{current_date}"
    export_location = f"{to_label_dir}/{fname}"

    # The (start_ind, end_ind) bounds index into the list of examples that are
    # still unlabelled. This previously read from `missing_example_ids`, a name
    # that is not defined anywhere in the notebook.
    # NOTE(review): the first batch is shortened by the number of examples to
    # backfill, but `remaining_examples_to_backfill` is not added to any rater
    # here -- confirm whether that is handled elsewhere.
    assigned_indices = list(current_example_indices[start_ind:end_ind])
    assigned_indices += list(interrater_assignment[rater_id])

    # Long format: three rows per example (preceding / example / following),
    # all keyed by the id of the example being evaluated. A neighbour that is
    # missing from the dataset raises a KeyError naming the missing example_id.
    required_examples = []
    for ind in assigned_indices:
        neighbours = (
            ('preceding', ind - 1),
            ('example', ind),
            ('following', ind + 1),
        )
        for example_type, source_id in neighbours:
            required_examples.append({
                'example_id': str(ind),
                'example_type': example_type,
                'body': body_by_example_id.loc[source_id],
            })
    assigned_data = pd.DataFrame(required_examples)[['example_type', 'example_id', 'body']]

    # Wide format: one row per example_id with the three bodies side by side.
    assigned_examples_pivot = (
        assigned_data
        .pivot(index='example_id', columns='example_type', values='body')
        .reset_index()[['example_id', 'preceding', 'example', 'following']]
    )
    assigned_examples = assigned_examples_pivot.rename(columns={
        'preceding': 'preceding_comment',
        'following': 'following_comment',
        'example': 'comment_for_evaluation',
    })

    # Empty columns for the rater to fill in.
    for label in labels:
        assigned_examples[label] = ""
    assigned_examples.to_csv(export_location, index=False)
    