In [1]:
import json
import numpy as np
import pandas as pd
from pathlib import Path

In [2]:
# Lift pandas' display limits so long posts and wide/long frames are shown untruncated.
for display_option in ("display.max_colwidth", "display.max_rows", "display.max_columns"):
    pd.set_option(display_option, None)

In [7]:
# Directory containing the SBIC CSV files that are read below.
corpus_path = "/Users/admin/hate_speech/datasets/sbic/"
p = Path(corpus_path)

# File names of the three splits (trn / dev / tst) inside that directory.
csv_trn, csv_dev, csv_tst = (
    "cate-SBIC.v2.trn.csv",
    "cate-SBIC.v2.dev.csv",
    "cate-SBIC.v2.tst.csv",
)

In [26]:
# Destination folder for the generated X-Class input files
# (dataset.txt, labels.txt and classes.txt are written here further down).
output_path = "/Users/admin/hate_speech/XClass/data/datasets/sbic"

In [8]:
# Load each split into its own DataFrame; low_memory=False makes pandas parse each
# file in one pass instead of in chunks.
_read_opts = {"low_memory": False}
df_trn = pd.read_csv(p.joinpath(csv_trn), **_read_opts)
df_dev = pd.read_csv(p.joinpath(csv_dev), **_read_opts)
df_tst = pd.read_csv(p.joinpath(csv_tst), **_read_opts)

In [9]:
# (rows, columns) of the trn, dev and tst frames, in that order.
tuple(frame.shape for frame in (df_trn, df_dev, df_tst))

((12131, 2), (1807, 2), (1926, 2))

## Data format for X-Class
This section describes the dataset format X-Class expects, so that new datasets can be added.  
All files should be placed in a folder named after the dataset, inside X-Class's `data/datasets` directory. The files to
include are:
- dataset.txt 
    - A text file containing documents, one per line. We will use BERT's tokenizer for tokenization.
- classes.txt
    - A text file containing the class names, one per line.
- labels.txt
    - A text file containing the class (index) of each document in `dataset.txt`, one label per line.

All files must use exactly the names listed above.

In [17]:
# Count the non-null posts in each target category, then order the categories
# from most to least frequent. The original group positions stay in the index.
posts_per_category = df_trn.groupby(['targetCategory']).count()
df_count = posts_per_category.reset_index().sort_values("post", ascending=False)
df_count

Unnamed: 0,targetCategory,post
4,race,3462
3,gender,3297
1,culture,2137
6,victim,1606
2,disabled,713
5,social,538
0,body,378


In [20]:
# Category names in the order of df_count (most frequent first) ...
categories = list(df_count["targetCategory"])
# ... and each name's position in that list as its integer class id.
label2id = dict(zip(categories, range(len(categories))))
label2id

{'race': 0,
 'gender': 1,
 'culture': 2,
 'victim': 3,
 'disabled': 4,
 'social': 5,
 'body': 6}

In [22]:
# Encode every post's target category as its integer class id.
# Series.map performs the lookup on the column directly, instead of calling a Python
# lambda on every full row as DataFrame.apply(axis=1) does.
label_ids = df_trn['targetCategory'].map(label2id)

# map() yields NaN for values missing from label2id, whereas a plain dict lookup
# raises KeyError; keep failing loudly rather than writing NaN labels.
if label_ids.isna().any():
    unknown = sorted(set(df_trn.loc[label_ids.isna(), 'targetCategory'].astype(str)))
    raise KeyError(f"targetCategory values missing from label2id: {unknown}")

df_trn['label'] = label_ids.astype(int)

In [28]:
# Folder that receives the X-Class input files. Note that `p` is rebound here from the
# corpus directory to the output directory; the classes.txt cell relies on this value.
p = Path(output_path)
p.mkdir(parents=True, exist_ok=True)  # neither open() nor to_csv() creates missing folders

# dataset.txt must contain exactly one raw document per line. DataFrame.to_csv applies CSV
# quoting: a post containing a comma or a double quote is wrapped in quotes (with inner
# quotes doubled), and a post with an embedded line break spans several lines, which
# misaligns dataset.txt with labels.txt. Write plain text instead and collapse any line
# breaks inside a post into a single space.
posts = df_trn["post"].fillna("").astype(str).str.replace(r"[\r\n]+", " ", regex=True)
with open(p/"dataset.txt", "w", encoding="utf-8") as f:
    f.writelines(post + "\n" for post in posts)

# Labels are bare integers, so CSV output is already one unquoted value per line.
df_trn.to_csv(str(p/"labels.txt"), columns=["label"], header=False, index=False)

In [29]:
# classes.txt: one class name per line, in the same order used to number the labels
# (no newline after the last name).
classes_text = "\n".join(categories)
with (p / "classes.txt").open("w+") as f:
    f.write(classes_text)