## Setup

### Imports

In [15]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
import datasets
import pandas as pd
from sklearn.model_selection import train_test_split
from src.api_eooh import *
from src.data_utils import *

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# set device
# device = 'cuda' if torch.cuda.is_available() else 'cpu'
# print(f'using device: {device}')

using device: cuda


## Davidson

### Preprocess data

In [11]:
# Load the raw Davidson et al. (2017) CSV.
df_david = pd.read_csv('data/hs_davidson2017.csv')

# Quick sanity report: dataset size and the distribution of the 3-way 'class' column.
print(f'total # of samples: {len(df_david)}')
print(df_david['class'].value_counts())

# Harmonise the schema with the other datasets in this notebook:
#   * 'tweet' -> 'text'
#   * 'label' = 1 when class == 0 (treated here as hate speech), else 0
df_david = (
    df_david
    .rename(columns={'tweet': 'text'})
    .assign(label=lambda frame: frame['class'].eq(0).astype(int))
)

# Persist the cleaned frame (no index column).
df_david.to_csv('data/davidson_hs.csv', index=False)

total # of samples: 24783
class
1    19190
2     4163
0     1430
Name: count, dtype: int64


## UCB

### Preprocess UCB data

In [10]:
# Load the UCB "Measuring Hate Speech" dataset and convert it to a pandas DataFrame.
# The raw frame can hold several rows per comment_id (one per annotator); they are
# aggregated below.
ucb_dataset = datasets.load_dataset('ucberkeley-dlab/measuring-hate-speech', 'binary')
df_ucb = ucb_dataset['train'].to_pandas()

# list of columns to retain
cols_to_retain = [
    'comment_id', 'sentiment', 'respect', 'insult', 'humiliate', 'status',
    'dehumanize', 'violence', 'genocide', 'attack_defend', 'hatespeech',
    'hate_speech_score', 'text'
]

# filter the dataset
# NOTE: no 'label' column is created before this point -- it would be dropped by
# this selection anyway; the label is derived once, after aggregation.
df_ucb = df_ucb[cols_to_retain]

# aggregate scores from diff annotators per comment_id:
# numeric columns are averaged, everything else (e.g. the text) keeps its first value.
# is_numeric_dtype is used instead of comparing against 'object' so that text stored
# with a pandas string dtype is not mistakenly averaged.
agg_methods = {
    col: 'mean' if pd.api.types.is_numeric_dtype(df_ucb[col]) else 'first'
    for col in df_ucb.columns
    if col != 'comment_id'
}

# group by 'comment_id' and aggregate
df_ucb = df_ucb.groupby('comment_id').agg(agg_methods).reset_index()

print(f'# samples: {len(df_ucb)}, # unique comment_id: {len(df_ucb.comment_id.unique())}')

# create binary label from 'hate_speech_score' (1 when the score is above 0.5)
df_ucb['label'] = (df_ucb['hate_speech_score'] > 0.5).astype(int)

# Split the data into train (80%), validation (10%), and test (10%) sets,
# stratified on the binary label; the fixed random_state keeps the split reproducible.
train_df, temp_df = train_test_split(df_ucb, test_size=0.2, random_state=42, stratify=df_ucb['label'])
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42, stratify=temp_df['label'])

# add 'split' column to each DataFrame
# .assign returns a new frame, so we never write into a slice of df_ucb
# (avoids pandas' SettingWithCopyWarning).
train_df = train_df.assign(split='train')
val_df = val_df.assign(split='val')
test_df = test_df.assign(split='test')

# concatenate the DataFrames and save to a file
df_ucb_w_splits = pd.concat([train_df, val_df, test_df])
df_ucb_w_splits.to_csv('data/hs_ucb.csv', index=False)

# samples: 39565, # unique comment_id: 39565


## Dynabench

In [24]:
# Read the raw Dynabench hate-speech CSV.
df_hs_dyna = pd.read_csv('data/hs_dynabench.csv')

# Keep only the columns needed downstream.
cols_to_keep = ['acl.id', 'text', 'label', 'type', 'level', 'split']
df_hs_dyna = df_hs_dyna[cols_to_keep]

# Binarise the label: 1 where the raw label is 'hate', 0 otherwise.
df_hs_dyna['label'] = df_hs_dyna['label'].eq('hate').astype(int)

# The raw file names the validation split 'dev'; use 'val' instead.
df_hs_dyna['split'] = df_hs_dyna['split'].replace({'dev': 'val'})

# Sanity checks: row count vs. number of distinct ids, then split and label balance.
n_rows = len(df_hs_dyna)
n_ids = df_hs_dyna['acl.id'].nunique(dropna=False)
print(f'# samples: {n_rows}, # unique ids: {n_ids}')

split_counts = df_hs_dyna['split'].value_counts()
print(f'# samples per split:\n{split_counts}')

label_counts = df_hs_dyna['label'].value_counts()
print(f'# samples per label:\n{label_counts}')

# Saving is intentionally left disabled; uncomment to write the cleaned data.
# df_hs_dyna.to_csv('data/hs_dynabench_clean.csv', index=False)

# samples: 41144, # unique ids: 41144
# samples per split:
split
train    32924
test      4120
val       4100
Name: count, dtype: int64
# samples per label:
label
1    22175
0    18969
Name: count, dtype: int64


## EOOH

### Download channel data

In [3]:
# The helpers used here (get_paths, request_api, download_channel) and `requests`
# are not imported by name in this notebook; they come in via the star imports.

# Look up the API paths object.
paths = get_paths()

# Request the channel list and decode the JSON body.
channels_response = request_api(requests.get, paths.my_channels)
my_channels = channels_response.json()

# Use the first channel in the returned list.
channel = my_channels[0]
print('channel name: {}, channel uid: {}'.format(channel['name'], channel['uid']))

# Download that channel by uid and show whatever the helper returns.
response = download_channel(channel['uid'])
print(response)

channel name: LGBTQ, channel uid: 648dcd4b3708dcf1b65d70e9
{'success': True}


### Preprocess annotations

In [25]:
# Input: the EOOH annotation file; output: where the cleaned data should be saved.
data_path, output_path = (
    'data/hs_dynabench_eooh.xlsx',
    'data/hs_dynabench_eooh_clean.csv',
)

# Run the cleaning helper on the annotation file.
preprocess_eooh_annots(data_path, output_path)