In [24]:
import json
import re
from pathlib import Path

import numpy as np
import pandas as pd
from transformers import AutoTokenizer

In [25]:
# Lift pandas' display limits so cell text, rows and columns are never truncated.
for display_option in ('display.max_colwidth', 'display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

In [26]:
# Folder the SBIC split CSVs are read from (absolute, machine-specific path).
corpus_path = "/Users/admin/hate_speech/datasets/sbic/"
# pathlib handle used with the `/` operator to build each split's file path.
p = Path(corpus_path)

In [7]:
# Splits carrying the original category labels.
# NOTE: the following cell rebinds these same four names to the relabeled
# files, so whichever of the two cells ran last decides which CSVs are loaded
# and which folder the X-Class files are written to.
csv_trn, csv_dev, csv_tst = (
    "cate-SBIC.v2.trn.csv",
    "cate-SBIC.v2.dev.csv",
    "cate-SBIC.v2.tst.csv",
)

output_path = "/Users/admin/hate_speech/XClass/data/datasets/sbic"

In [27]:
# Splits of the relabeled dataset.
# NOTE: this rebinds the names set by the "original labels" cell above; when
# both cells are run in order, the relabeled files and output folder win.
csv_trn, csv_dev, csv_tst = (
    "new-cat-SBIC.v2.trn.csv",
    "new-cat-SBIC.v2.dev.csv",
    "new-cat-SBIC.v2.tst.csv",
)

output_path = "/Users/admin/hate_speech/XClass/data/datasets/sbic-relabel"

In [28]:
# Read the train / dev / test CSVs from the corpus folder, in that order.
df_trn, df_dev, df_tst = (
    pd.read_csv(p / split_csv, low_memory=False)
    for split_csv in (csv_trn, csv_dev, csv_tst)
)

In [29]:
# (rows, columns) of the train, dev and test frames.
tuple(split.shape for split in (df_trn, df_dev, df_tst))

((12053, 3), (1807, 3), (1925, 3))

In [30]:
# Remove training rows that have no targetCategory (modifies df_trn in place);
# the dev and test frames are not filtered here.
df_trn.dropna(subset=['targetCategory'], inplace=True)
# Shape of the training frame after the drop.
df_trn.shape

(11754, 3)

In [31]:
# Patterns are written as raw strings so regex escapes such as \s and \S reach
# the regex engine untouched instead of relying on Python's deprecated handling
# of unrecognised escapes in ordinary string literals.
emoji_pattern = re.compile(r"&#[0-9]+;")  # numeric character references such as "&#128514;"
user_pattern = re.compile(r"@[A-Za-z0-9_]+")  # Twitter naming convention
url_pattern = re.compile(r"(^|\s)https?://\S+")  # link at the start of the text or after whitespace
space_pattern = re.compile(r" +")


def preprocess(post: str) -> str:
    """Normalise one post, following the TweetEval paper.

    Steps, in order:
      1. numeric character references (``&#NNN;``, how emojis appear in the
         data) are removed  -- NEW, not part of TweetEval;
      2. line breaks (``\\n``) are replaced by a space;
      3. user mentions are anonymised to ``@user``;
      4. website links are replaced by a space (the whitespace character in
         front of the link is consumed with it);
      5. runs of spaces are collapsed to a single space.

    Args:
        post: raw post text.

    Returns:
        The cleaned text. It is not stripped, so a single leading or trailing
        space can remain (e.g. when the post ended with a link or an emoji).
    """
    post = emoji_pattern.sub("", post)
    post = post.replace("\n", " ")
    post = user_pattern.sub("@user", post)
    post = url_pattern.sub(" ", post)
    post = space_pattern.sub(" ", post)
    return post

In [32]:
# Clean the post text of every split. Series.map hands each post straight to
# preprocess, instead of materialising a whole row object per record as
# DataFrame.apply(axis=1) does, and the loop keeps the three splits in sync.
for split_df in (df_trn, df_dev, df_tst):
    split_df['post'] = split_df['post'].map(preprocess)

In [33]:
# Load the pretrained "bert-base-uncased" tokenizer; the cells below use it to
# count tokens per post.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [38]:
# Sanity check on a three-word sentence: request the length and suppress the
# attention-mask / token-type fields. The reported length (5) includes two
# extra ids (101 and 102) around the word ids -- presumably BERT's [CLS]/[SEP]
# special tokens.
tokenizer("I love you", return_length=True, return_attention_mask=False, return_token_type_ids=False)

{'input_ids': [101, 1045, 2293, 2017, 102], 'length': 5}

In [39]:
# Tokenize every training post in one batch; only the ids and lengths are kept.
train_posts = df_trn['post'].tolist()
tokens = tokenizer(
    train_posts,
    return_length=True,
    return_attention_mask=False,
    return_token_type_ids=False,
)

2

In [41]:
# The tokenizer's length counts the two wrapper ids seen in the demo above
# (101 ... 102); subtract them to get the number of content tokens per post.
N_WRAPPER_TOKENS = 2
df_trn['n_tokens'] = [total - N_WRAPPER_TOKENS for total in tokens['length']]

In [47]:
# Inspect the shortest training posts (five content tokens or fewer).
is_short = df_trn['n_tokens'] <= 5
df_trn[is_short]

Unnamed: 0,post,targetCategory,targetMinority,n_tokens
2,Molly make bitches crazy,gender,women,5
29,Only bitches talk shit,gender,women,5
142,You tha hoe,gender,women,4
146,Chris a faggot,gender,"lesbian women, gay men",5
175,Smh queer,gender,gay men,3
189,@user faggot,gender,"gay men, trans women, trans men, bisexual women, bisexual men",5
220,Shower rod that hoe,gender,women,5
230,@user faggot,gender,gay men,5
245,Feminist aka a bitch.,gender,women,5
279,bitches,gender,women,2


## Data format for X-Class
We also describe the dataset format for potential use of new datasets.  
All files should be placed in a folder named after the dataset, inside this directory. The required
files are:
- dataset.txt 
    - A text file containing documents, one per line. We will use BERT's tokenizer for tokenization.
- classes.txt
    - A text file containing the class names, one per line.
- labels.txt
    - A text file containing the class (index) of each document in `dataset.txt`, one label per line.
The file names must match the ones listed above exactly.

In [48]:
# Count training rows per target category, then order the categories from the
# most to the least frequent (by number of posts).
category_counts = df_trn.groupby(['targetCategory']).count().reset_index()
df_count = category_counts.sort_values(by="post", ascending=False)
df_count

Unnamed: 0,targetCategory,post,targetMinority,n_tokens
7,race,3797,3797,3797
4,gender,3249,3249,3249
9,victim,1581,1581,1581
8,religion,1572,1572,1572
2,disabled,723,723,723
5,political,319,319,319
0,age,183,183,183
1,appearance,178,178,178
3,feminist,81,81,81
6,poor,71,71,71


In [49]:
# Class ids follow the row order of df_count: the first category gets id 0,
# the next one id 1, and so on.
categories = df_count['targetCategory'].tolist()
label2id = dict(zip(categories, range(len(categories))))
label2id

{'race': 0,
 'gender': 1,
 'victim': 2,
 'religion': 3,
 'disabled': 4,
 'political': 5,
 'age': 6,
 'appearance': 7,
 'feminist': 8,
 'poor': 9}

In [50]:
# Vectorised lookup of each row's class id (replaces a row-wise apply).
# Series.map yields NaN for a category missing from label2id; the int64 cast
# then raises instead of letting a float/NaN label column slip through.
df_trn['label'] = df_trn['targetCategory'].map(label2id).astype('int64')

In [51]:
p = Path(output_path)
# Create the dataset folder if it is missing so the writes below cannot fail
# on a fresh checkout.
p.mkdir(parents=True, exist_ok=True)

# X-Class expects plain text with one document per line. DataFrame.to_csv
# applies CSV quoting, so any post containing a comma or a double quote was
# written wrapped in quotes (with inner quotes doubled), corrupting the text.
# Write the lines directly instead; preprocess() has already replaced "\n"
# inside posts, so each post stays on its own line.
with open(p / "dataset.txt", "w", encoding="utf-8") as f:
    f.writelines(f"{post}\n" for post in df_trn["post"])

# One class index per line, in the same row order as dataset.txt.
with open(p / "labels.txt", "w", encoding="utf-8") as f:
    f.writelines(f"{label}\n" for label in df_trn["label"])

In [52]:
# One class name per line, in the order of `categories`; no newline is written
# after the last name.
classes_file = p / "classes.txt"
with classes_file.open("w+") as handle:
    handle.write("\n".join(categories))