In [50]:
import pandas as pd
import utils
import numpy as np
import math
from tqdm import tqdm

#### Jigsaw Subsets

In [51]:
from datasets import load_dataset, concatenate_datasets

# Read the Jigsaw Unintended Bias data from the local directory and merge the
# train split with both leaderboard test splits into a single Dataset.
jigsaw_splits = load_dataset(
    "google/jigsaw_unintended_bias",
    data_dir='../jigsaw-unintended-bias-in-toxicity-classification/',
)
split_names = ("train", "test_private_leaderboard", "test_public_leaderboard")
dataset = concatenate_datasets([jigsaw_splits[split] for split in split_names])

In [52]:
# Identity columns grouped by category; each group later becomes its own subset.

# Gender
gender_features = [
    'male',
    'female',
    'transgender',
    'other_gender',
]

# Race / ethnicity
race_features = [
    'asian',
    'black',
    'latino',
    'white',
    'other_race_or_ethnicity',
]

# Disability
disability_features = [
    'intellectual_or_learning_disability',
    'physical_disability',
    'psychiatric_or_mental_illness',
    'other_disability',
]

# Sexual orientation
sexual_orientation_features = [
    'heterosexual',
    'homosexual_gay_or_lesbian',
    'bisexual',
    'other_sexual_orientation',
]

# Flat list of every identity column, in category order.
features = [
    *gender_features,
    *race_features,
    *disability_features,
    *sexual_orientation_features,
]

In [None]:
import ast
import json
# Keep only comments where exactly one identity feature has a score above zero,
# and record that feature's name as the comment's label.
samples = []
for sample in tqdm(dataset):
    matched = [feature for feature in features if sample[feature] > 0.0]
    if len(matched) != 1:
        # Zero or several identities scored: skip, the label would be ambiguous.
        continue
    samples.append([sample['comment_text'], matched[0], sample['target'], sample['rating']])

# From here on `dataset` is a pandas DataFrame rather than the loaded Dataset.
columns = ['text', 'label', 'toxicity', 'approval']
dataset = pd.DataFrame(samples, columns=columns)

In [None]:
# Persist the filtered frame without its row index, then show a preview
# together with the number of rows.
dataset.to_csv('./data/jigsaw/main.csv', index=False)
dataset.head(), dataset.shape[0]

In [55]:
def save_dataset(name, groups):
    """Write the rows of the global ``dataset`` frame whose label is in ``groups``.

    Args:
        name: File stem; the subset is written to ``./data/jigsaw/{name}.csv``.
        groups: Collection of label values to keep (matched against the
            ``'label'`` column).
    """
    # Boolean-mask indexing already returns a new frame, so no explicit copy
    # of `dataset` is needed before filtering.
    subset = dataset[dataset['label'].isin(groups)]
    # index=False keeps the file layout consistent with main.csv and avoids
    # an extra unnamed index column when the CSV is read back.
    subset.to_csv(f'./data/jigsaw/{name}.csv', index=False)

In [56]:
# Export one CSV per identity category: file name -> feature list to keep.
subset_groups = {
    'gender': gender_features,
    'race': race_features,
    'disability': disability_features,
    'sexual_orientation': sexual_orientation_features,
}
names = list(subset_groups)
labels = list(subset_groups.values())
for name, groups in subset_groups.items():
    save_dataset(name, groups)

#### Ideology Labeled Data

In [57]:
# Load the annotated comments and keep only rows labelled 'left' or 'right'.
ideology_df = pd.read_csv('./data/processed_annotated_comments.csv')
has_side_label = ideology_df['label'].isin(['left', 'right'])
# dropna() with default arguments also discards any remaining row that has a
# missing value in any column, not just in 'label'.
ideology_df = ideology_df[has_side_label].dropna()

In [None]:
# Pool the text of both corpora into one flat list: Jigsaw comments first,
# ideology-labelled comments after.
comments = [*dataset['text'].tolist(), *ideology_df['text'].tolist()]
len(comments)

In [59]:
import pickle
# Serialize the pooled comment list using the highest pickle protocol
# available in the running interpreter.
with open('./data/comments.pkl', 'wb') as pkl_file:
    pickle.dump(comments, pkl_file, protocol=pickle.HIGHEST_PROTOCOL)