In [1]:
import datetime as dt
import os
import pickle
import random

import pandas as pd

In [2]:
# Root of the local checkout. Defaults to the original author's location but can
# be overridden with the NLU_REDDIT_REPO_DIR environment variable so the notebook
# runs on other machines without editing this cell.
repo_dir = os.environ.get(
    "NLU_REDDIT_REPO_DIR",
    "/Users/ameliachu/repos/nlu-reddit-toxicity-dataset",
)

# Master comment table read by the loading cell.
daily_discussion_data_fname = "daily_master_data_1614250838_1618692612.csv"
daily_discussion_data_path = f"{repo_dir}/data/{daily_discussion_data_fname}"

# Pickled sequence of example ids; batches are sliced out of it by position.
randomized_example_ids_path = f"{repo_dir}/data/randomized_example_ids.p"

In [3]:
# Read the master comment table, then expose its 'Unnamed: 0' column under the
# name `example_id` (presumably the row index written by an earlier export --
# TODO confirm). Later cells look comments up by this id.
daily_discussion_data = pd.read_csv(daily_discussion_data_path)
daily_discussion_data = daily_discussion_data.rename(
    columns={'Unnamed: 0': 'example_id'}
)

In [4]:
# Load the pre-computed ordering of example ids.
# NOTE: unpickling can execute arbitrary code -- only load files generated by
# this project, never one obtained from an untrusted source.
# The context manager guarantees the file handle is closed after loading.
with open(randomized_example_ids_path, "rb") as ids_file:
    example_indices = pickle.load(ids_file)

In [24]:
# Count of distinct ids in the loaded ordering.
len({*example_indices})

619646

In [5]:
# Identifiers of the people who will each receive one labelling batch.
rater_ids = [
    'ac4119',
    'gm2858',
    'yj2369',
    'yp2201',
]
# One batch is generated per rater further down.
num_raters = len(rater_ids)

In [6]:
# Position in `example_indices` where the first batch begins, and the number of
# examples each rater receives.
start_id = 800
batch_size = 140

# One half-open (start, end) window per rater, laid out back to back so the
# windows never overlap; they are later used as slice bounds on
# `example_indices`. Computed in closed form so that `start_id` keeps its
# configured value after this cell runs instead of being advanced by a loop.
supplmental_batches = [
    (start_id + k * batch_size, start_id + (k + 1) * batch_size)
    for k in range(num_raters)
]

In [7]:
# Display the (start, end) windows for a visual sanity check.
supplmental_batches

[(800, 940), (940, 1080), (1080, 1220), (1220, 1360)]

In [8]:
# Randomise which rater gets which batch, reproducibly.
# Sorting first puts the list in a canonical order, and a dedicated seeded
# generator then produces the same permutation on every run -- so re-executing
# this cell (or restarting the kernel) yields the same assignment instead of a
# fresh random one. Change SHUFFLE_SEED to draw a different assignment.
SHUFFLE_SEED = 20210418
rater_ids.sort()
random.Random(SHUFFLE_SEED).shuffle(rater_ids)

In [9]:
# Pair raters with batch windows positionally: the i-th rater in the current
# order of `rater_ids` receives the i-th (start, end) window.
assign_batches_to_raters = [
    (rater, window)
    for rater, window in zip(rater_ids, supplmental_batches)
]
print(assign_batches_to_raters)

[('yp2201', (800, 940)), ('gm2858', (940, 1080)), ('ac4119', (1080, 1220)), ('yj2369', (1220, 1360))]


In [38]:
# Write one labelling sheet per rater: each row holds the comment to evaluate,
# the rows with ids `example_id - 1` / `example_id + 1` as surrounding context,
# and one empty column per label for the rater to fill in.
to_label_dir = f"{repo_dir}/data/to_label"
current_date = dt.date.today()
labels = ['toxicity', 'severe_toxicity', 'identity_attack', 'insult', 'profanity', 'threat']

# Build the id -> body lookup once so every comment fetch is a hash lookup
# rather than a boolean scan over the whole frame. keep='first' mirrors taking
# the first matching row when an id occurs more than once.
body_by_id = (
    daily_discussion_data
    .drop_duplicates(subset='example_id', keep='first')
    .set_index('example_id')['body']
)


def _context_body(example_id):
    """Return the body for `example_id`, or "" when no row has that id.

    Used for the neighbouring (context) comments only, so that an example at the
    edge of the table gets blank context instead of aborting the whole export.
    """
    if example_id in body_by_id.index:
        return body_by_id.loc[example_id]
    return ""


for rater_id, (start_ind, end_ind) in assign_batches_to_raters:
    fname = f"{rater_id}_labelling_assignment_{current_date}"
    export_location = f"{to_label_dir}/{fname}"
    assigned_indices = example_indices[start_ind:end_ind]

    # One wide row per assigned example. The comment under evaluation must
    # exist, so it is looked up strictly (KeyError if the id is unknown).
    assigned_examples = pd.DataFrame(
        [
            {
                'example_id': str(ind),
                'preceding_comment': _context_body(ind - 1),
                'comment_for_evaluation': body_by_id.loc[ind],
                'following_comment': _context_body(ind + 1),
            }
            for ind in assigned_indices
        ],
        columns=['example_id', 'preceding_comment', 'comment_for_evaluation', 'following_comment'],
    )

    # A repeated id inside one batch would give the rater the same row twice.
    assert assigned_examples['example_id'].is_unique, "duplicate example_id within a batch"

    # Order rows by the string form of the id, which is the order the earlier
    # pivot-based export produced, so regenerated sheets line up with old ones.
    assigned_examples = assigned_examples.sort_values('example_id').reset_index(drop=True)

    # Blank columns for the rater to fill in, one per label.
    for label in labels:
        assigned_examples[label] = ""
    assigned_examples.to_csv(export_location, index=False)
    

In [39]:
# `assigned_examples` is rebound on every pass of the export loop, so this is
# the row count of the last rater's sheet only.
assigned_examples.shape[0]

140