## Batch 4 (2022-04-28)

In [1]:
import os
import random
import datetime as dt
import pickle
import pandas as pd

In [2]:
# Root of the local checkout. The hard-coded default is kept for backwards
# compatibility; set the NLU_REPO_DIR environment variable to run elsewhere.
repo_dir = os.environ.get("NLU_REPO_DIR", "/Users/ameliachu/repos/nlu-reddit-toxicity-dataset")
labelled_data_dir = f"{repo_dir}/data/labelled/"

# Collecting all the labelled file names.
# Hidden entries (e.g. `.DS_Store`, `.ipynb_checkpoints`) and sub-directories are
# skipped because every name collected here is later passed to `pd.read_csv`.
# Sorting makes the order independent of the OS directory listing order.
labelled_data_fnames = sorted(
    f for f in os.listdir(labelled_data_dir)
    if not f.startswith(".") and os.path.isfile(os.path.join(labelled_data_dir, f))
)

# Pre-randomized list of example_ids
randomized_example_ids_path = f"{repo_dir}/data/randomized_example_ids.p"

# example_ids that require backfill due to an initial assignment issue
remaining_backfill_path = f'{repo_dir}/data/backfill_example_ids.p'

# Dictionary of example_ids that have been labelled once
# See assign-examples-for-interrater.ipynb for more details
interrater_assignment_date = '2022-04-28' 
interrater_assignment_path = f"{repo_dir}/data/interrater-reliability/interrater_assignment_{interrater_assignment_date}.p"

In [3]:
def _load_pickle(path):
    """Return the object pickled at ``path``, closing the file handle afterwards."""
    with open(path, "rb") as handle:
        return pickle.load(handle)


# NOTE(review): unpickling can execute arbitrary code, so these files must come
# from a trusted source (here they are read from the project's own data directory).
remaining_examples_to_backfill = _load_pickle(remaining_backfill_path)
example_indices = _load_pickle(randomized_example_ids_path)
interrater_assignment = _load_pickle(interrater_assignment_path)

In [4]:
# Build a flat list of every example_id that already appears in a labelled file.

selected_columns = ['example_id']


def _read_example_ids(fname):
    """Read one labelled file and keep only the selected column(s)."""
    labelled_frame = pd.read_csv(f"{labelled_data_dir}{fname}")
    return labelled_frame[selected_columns]


list_of_example_ids = list(map(_read_example_ids, labelled_data_fnames))
example_ids_pd = pd.concat(list_of_example_ids)
labelled_examples = list(example_ids_pd['example_id'].values)

In [5]:
# Defining raters who need to be assigned a batch. 

In [6]:
# Raters who receive a batch in this round, one id per line.
rater_ids = [
    'ac4119',
    'gm2858',
    'yj2369',
    'yp2201',
]
# Number of batches to generate (one per rater).
num_raters = len(rater_ids)

In [7]:
# Offset into the pre-randomized `example_indices` list; only ids from this
# position onwards are considered for the current batch.
# NOTE(review): presumably the position where the previous batch ended -- TODO confirm.
start_id = 1360
# Number of examples in each rater's slice before backfill / inter-rater examples are added.
batch_size = 140

In [8]:
# Candidate ids for this batch: everything from `start_id` onwards in the
# pre-randomized order, minus ids that already appear in a labelled file.
# A set gives constant-time membership checks; testing against the
# `labelled_examples` list would rescan it for every candidate id.
labelled_example_lookup = set(labelled_examples)
current_example_indices = [
    i for i in example_indices[start_id:] if i not in labelled_example_lookup
]

In [9]:
# Half-open (start, end) slice bounds into `current_example_indices`, one per rater.
# The first slice is shortened by the number of backfill examples -- presumably so
# that the backfill examples added to that batch later bring it back up to
# `batch_size`.
#
# The loop uses its own cursor (`batch_start`) instead of reassigning `start_id`:
# `start_id` is the offset into `example_indices` used when building
# `current_example_indices`, and overwriting it here would make that cell give a
# different result if it were re-run afterwards.
n_backfill = len(remaining_examples_to_backfill)

supplmental_batches = []
batch_start = 0
for rater_position in range(num_raters):
    batch_end = batch_start + batch_size
    if rater_position == 0:
        batch_end -= n_backfill
    supplmental_batches.append((batch_start, batch_end))
    batch_start = batch_end

In [None]:
supplmental_batches

Randomizing the order of raters and assigning batches based on order.

In [10]:
# Shuffles `rater_ids` in place, so re-running this cell reshuffles the already
# shuffled list. No `random.seed(...)` call is visible in this notebook, so the
# resulting rater order (and therefore the batch assignment) differs between runs.
random.shuffle(rater_ids)

In [11]:
# Pair each rater (in the current list order) with the batch slice at the same position.
assign_batches_to_raters = [
    (rater, batch_bounds)
    for rater, batch_bounds in zip(rater_ids, supplmental_batches)
]
print(assign_batches_to_raters)

[('yp2201', (0, 103)), ('ac4119', (103, 243)), ('yj2369', (243, 383)), ('gm2858', (383, 523))]


### Reading in pre-collected dataset

In [12]:
# Location of the pre-collected comments dump inside the repo's data directory.
daily_discussion_data_fname = "daily_master_data_1614250838_1618692612.csv"
daily_discussion_data_path = "/".join([repo_dir, "data", daily_discussion_data_fname])

# The unnamed first CSV column is exposed under the name `example_id`.
column_renames = {'Unnamed: 0': 'example_id'}
daily_discussion_data = pd.read_csv(daily_discussion_data_path).rename(columns=column_renames)

### Generating files for labelling based on batch assignment

This cell takes each `rater_id` and its assigned slice bounds `(start_ind, end_ind)` as inputs. It collects the relevant data and generates a file that is more conducive to labelling and to text-classification training/scoring. Specifically, the cell below collects the context (i.e. `preceding_comment`, `following_comment`) and the `comment_for_evaluation`, and adds an empty column for each toxicity attribute label.

In [16]:
# Generate one labelling file per rater.
#
# For every assigned example_id the file holds the comment itself plus the rows
# whose example_id is one lower / one higher (used as context), followed by one
# empty column per label for the rater to fill in.
to_label_dir = f"{repo_dir}/data/to_label"
current_date = dt.date.today()
labels = ['toxicity', 'severe_toxicity', 'identity_attack', 'insult', 'profanity', 'threat']

# Make sure the export directory exists before any file is written.
os.makedirs(to_label_dir, exist_ok=True)

# Build the example_id -> body lookup once instead of scanning the whole frame
# three times per example. `drop_duplicates` keeps the first row per example_id,
# matching the previous `.values[0]` selection.
body_by_example_id = (
    daily_discussion_data
    .drop_duplicates(subset='example_id')
    .set_index('example_id')['body']
    .to_dict()
)

# example_type -> offset of the row relative to the example being labelled.
context_offsets = {'preceding': -1, 'example': 0, 'following': 1}

for rater_id, (start_ind, end_ind) in assign_batches_to_raters:
    fname = f"{rater_id}_labelling_assignment_{current_date}"
    export_location = f"{to_label_dir}/{fname}"

    assigned_indices = list(current_example_indices[start_ind:end_ind])
    # The batch that starts at 0 is the one that was shortened to make room for
    # the backfill examples. This must test the loop variable `start_ind`; the
    # previous check against the global `start_id` was never true, so the
    # backfill examples were never assigned.
    if start_ind == 0:
        assigned_indices.extend(remaining_examples_to_backfill)
    assigned_indices.extend(interrater_assignment[rater_id])
    # Drop repeated ids (order preserved): a duplicated example_id would make
    # the pivot below fail with "Index contains duplicate entries".
    assigned_indices = list(dict.fromkeys(assigned_indices))

    if not assigned_indices:
        print(f"No examples assigned to {rater_id}; skipping {fname}")
        continue

    required_examples = []
    for ind in assigned_indices:
        if ind not in body_by_example_id:
            raise KeyError(f"example_id {ind} not found in daily_discussion_data")
        for example_type, offset in context_offsets.items():
            required_examples.append({
                'example_id': str(ind),
                'example_type': example_type,
                # A missing neighbour (e.g. at the edge of the dataset) becomes an
                # empty context cell instead of aborting the whole export.
                'body': body_by_example_id.get(ind + offset),
            })

    # Long format: one row per (example_id, example_type).
    assigned_data = pd.DataFrame(required_examples)[['example_type', 'example_id', 'body']]
    # Wide format: one row per example_id with the three comment columns.
    assigned_examples_pivot = (
        assigned_data
        .pivot(index='example_id', columns='example_type', values='body')
        .reset_index()[['example_id', 'preceding', 'example', 'following']]
    )
    assigned_examples = assigned_examples_pivot.rename(columns={
        'preceding': 'preceding_comment',
        'following': 'following_comment',
        'example': 'comment_for_evaluation'})
    # Empty label columns to be filled in by the rater.
    for label in labels:
        assigned_examples[label] = ""
    assigned_examples.to_csv(export_location, index=False)
    

In [14]:
ind

(0, 103)

In [None]:
daily_discussion_data[daily_discussion_data['example_id'] == ind-1]

In [15]:
assigned_indices

[(0, 103),
 (103, 243),
 (243, 383),
 (383, 523),
 290885,
 615643,
 86008,
 170161,
 122521,
 295359,
 448991,
 346447,
 558444,
 370363,
 110661,
 348478,
 276444,
 75306,
 54650,
 65805,
 374145,
 597705,
 202461,
 588864,
 455576,
 296815,
 16915,
 179361,
 417611,
 142273,
 46107,
 416841,
 368880,
 24710,
 427008,
 501180,
 595771,
 2072,
 535474,
 457356,
 465073,
 413921,
 86632,
 378037,
 554332,
 538144,
 395243,
 13786,
 82485,
 534685,
 23154,
 193771,
 511720,
 480746,
 326880,
 128832,
 568475,
 413686,
 265978,
 537192,
 31882,
 277373,
 70825,
 25974,
 38381,
 346470,
 269192,
 591880,
 504766,
 32952,
 529495,
 424081,
 609573,
 94141,
 512007,
 52314,
 285378,
 554146,
 566377,
 98356,
 182906,
 379993,
 398290,
 618295,
 350511,
 547861,
 71377,
 192843,
 280313,
 99989,
 153292,
 41179,
 367275,
 105125,
 400560,
 601750,
 522826,
 495207,
 197535,
 449552,
 152642,
 332182,
 453418,
 340637,
 551996,
 172999,
 88000,
 583256,
 524532,
 327299,
 78350,
 206977,
 916