### Methodology for Selecting Examples to Label

In this notebook, we will be randomly selecting and assigning a batch of 200 examples to each rater. 


In [1]:
import pandas as pd
import random
import pickle
import datetime as dt

In [2]:
repo_dir = "/Users/ameliachu/repos/nlu-reddit-toxicity-dataset"

In [3]:
daily_discussion_data_fname = "daily_master_data_1614250838_1618692612.csv"
daily_discussion_data_path = f"{repo_dir}/data/{daily_discussion_data_fname}"

randomized_example_ids_path = f"{repo_dir}/data/randomized_example_ids.p"

In [4]:
daily_discussion_data = pd.read_csv(daily_discussion_data_path).rename(columns={'Unnamed: 0':'example_id'})

In [5]:
daily_discussion_data.head(5)

Unnamed: 0,example_id,sub_id,created_utc,body,score,author
0,0,ls42x6,1614251000.0,first,7,I_make_switch_a_roos
1,1,ls42x6,1614251000.0,Rise and shine bitches,41,LitenVarg
2,2,ls42x6,1614251000.0,Here we go. 🚀,14,readingtostrangers
3,3,ls42x6,1614251000.0,GME to 420.69 EOD,14,wottsraja
4,4,ls42x6,1614251000.0,Second retard,2,AceSouth


In [6]:
num_examples = len(daily_discussion_data)

In [7]:
example_indices = [i for i in range(num_examples)]
print(example_indices[:5])

[0, 1, 2, 3, 4]


In [9]:
random.seed(519)
random.shuffle(example_indices)
print(example_indices[:5])

[494030, 420324, 473177, 419306, 506755]


In [10]:
pickle.dump(example_indices, open(randomized_example_ids_path , "wb" ) )

In [29]:
example_indices = pickle.load( open(randomized_example_ids_path, "rb" ) )

In [31]:
len(set(example_indices ))

619646

In [32]:
len(example_indices)

619646

In [12]:
rater_ids = ['ac4119', 'gm2858', 'yj2369','yp2201']

In [14]:
init_batches = [(0, 200), (200,400), (400,600), (600,800)]

In [18]:
random.shuffle(rater_ids)

In [21]:
assign_batches_to_raters = list(zip(rater_ids,init_batches))

In [44]:
example_indices[400]

296235

In [45]:
assign_batches_to_raters = [('gm2858', (0, 200)),
 ('yp2201', (200, 400)),
 ('yj2369', (400, 600)),
 ('ac4119', (600, 800))]

In [46]:
to_label_dir = f"{repo_dir}/data/to_label"
current_date = dt.date.today()
labels = ['toxicity', 'severe_toxicity', 'identity_attack', 'insult', 'profanity', 'threat']

for rater_id, (start_ind, end_ind) in assign_batches_to_raters:
    fname = f"{rater_id}_labelling_assignment_{current_date}"
    export_location = f"{to_label_dir}/{fname}"
    assigned_indices = example_indices[start_ind:end_ind]
    index_map = {}
    required_indices = []
    for ind in assigned_indices:
        index_map[ind-1] = {
        'example_id': str(ind),
        'type': 'preceding'
    }
        index_map[ind] = {
        'example_id': str(ind),
        'type': 'example'
    }
        index_map[ind+1] = {
        'example_id': str(ind),
        'type': 'following'
    }
        required_indices += [ind-1,ind, ind+1]
    assigned_data = daily_discussion_data.iloc[required_indices]
    
    # Adding in the primary example info and the type of example for each row.
    assigned_data ['example_type'] = assigned_data .apply(lambda x: index_map.get(x['example_id'], {}).get("type"), axis=1)
    assigned_data ['example_id'] = assigned_data .apply(lambda x: index_map.get(x['example_id'], {}).get("example_id"), axis=1)
    assigned_data = assigned_data[['example_type','example_id','body']]
    assigned_examples_pivot = assigned_data.pivot(index='example_id', columns='example_type', values='body').reset_index()[['example_id','preceding', 'example','following']]
    assigned_examples = assigned_examples_pivot.rename(columns={
         'preceding':'preceding_comment',
         'following':'following_comment',
         'example':'comment_for_evaluation'})
    for label in labels:
         assigned_examples[label] = ""
    assigned_examples.to_csv(export_location, index=False)
    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  assigned_data ['example_type'] = assigned_data .apply(lambda x: index_map.get(x['example_id'], {}).get("type"), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  assigned_data ['example_id'] = assigned_data .apply(lambda x: index_map.get(x['example_id'], {}).get("example_id"), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing

ValueError: Index contains duplicate entries, cannot reshape

In [49]:
for ind in [296235]:
    index_map[ind-1] = {
    'example_id': str(ind),
    'type': 'preceding'
}
    index_map[ind] = {
    'example_id': str(ind),
    'type': 'example'
}
    index_map[ind+1] = {
    'example_id': str(ind),
    'type': 'following'
}
    required_indices = [ind-1,ind, ind+1]
    assigned_data = daily_discussion_data.iloc[required_indices]
    
    # Adding in the primary example info and the type of example for each row.
    assigned_data ['example_type'] = assigned_data .apply(lambda x: index_map.get(x['example_id'], {}).get("type"), axis=1)
    assigned_data ['example_id'] = assigned_data .apply(lambda x: index_map.get(x['example_id'], {}).get("example_id"), axis=1)
    assigned_data = assigned_data[['example_type','example_id','body']]
    assigned_examples_pivot = assigned_data.pivot(index='example_id', columns='example_type', values='body').reset_index()[['example_id','preceding', 'example','following']]
    assigned_examples = assigned_examples_pivot.rename(columns={
         'preceding':'preceding_comment',
         'following':'following_comment',
         'example':'comment_for_evaluation'})
    for label in labels:
         assigned_examples[label] = ""
    assigned_examples

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  assigned_data ['example_type'] = assigned_data .apply(lambda x: index_map.get(x['example_id'], {}).get("type"), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  assigned_data ['example_id'] = assigned_data .apply(lambda x: index_map.get(x['example_id'], {}).get("example_id"), axis=1)


In [53]:
 assigned_examples.to_csv(export_location, index=False)

In [52]:
export_location = "/Users/ameliachu/repos/nlu-reddit-toxicity-dataset/data/to_label/test.csv"

In [24]:
assigned_data_example = pd.read_csv(export_location)

In [25]:
assigned_data_example.head(5)

Unnamed: 0,example_id,preceding_comment,comment_for_evaluation,following_comment,toxicity,severe_toxicity,identity_attack,insult,profanity,threat
0,105580,I'm shorting JPOW,It's going back up,I got into so many good spacs today 🦘,,,,,,
1,106714,Were going to need to change the banner at the...,I wish all the fucking leaf blowers would come...,Waiting to load up on NASDAQ CFDs at around 12...,,,,,,
2,107332,"Lol, this is still correction territory.\n\nWa...",I fucking hope not,JAY FIRE THE PRINTERS! JAY?! JAYYYYYYYY,,,,,,
3,11714,I did 2 years ago all cards went like times 10,All these fucks have paper hands,AMC!!!!!!,,,,,,
4,117374,Yeah wtf do you think “I was close” means?,I like sugar with my margaritas. Not salt.,I like alts on top of alts. And boy did I get it,,,,,,


In [28]:
len(assigned_data_example)

200