In [1]:
import json
import os
import pickle
from collections import Counter

import nltk
import numpy as np
import pandas as pd
from cleantext import clean
from datasets import load_dataset
from sklearn.model_selection import train_test_split

In [2]:
def most_common(lst):
    """Return the element that occurs most often in ``lst``.

    The occurrences are tallied with ``collections.Counter``; only the single
    top-ranked entry is requested and its element (not its count) is returned.

    Args:
        lst: Iterable of hashable items.

    Returns:
        The item with the highest occurrence count.

    Raises:
        IndexError: If ``lst`` is empty (there is no top entry to index).
    """
    top_entries = Counter(lst).most_common(1)
    winner, _count = top_entries[0]
    return winner

def format_hatexplain(dataset):
    """Merge the HateXplain splits into one frame with a majority-vote label.

    The "train", "validation" and "test" splits are converted to pandas,
    stacked in that order and re-indexed from zero. ``post_tokens`` is renamed
    to ``unprocessed_docs``; the per-annotator labels are pulled out of the
    ``annotators`` column into ``labels``, and ``final_label`` holds the most
    frequent of those labels. The ``annotators`` and ``rationales`` columns
    are removed from the result.

    Args:
        dataset: Mapping with "train", "validation" and "test" entries, each
            exposing ``to_pandas()``.

    Returns:
        pandas.DataFrame: The combined frame described above.
    """
    split_frames = [
        dataset[split_name].to_pandas()
        for split_name in ("train", "validation", "test")
    ]
    merged = (
        pd.concat(split_frames)
        .reset_index(drop=True)
        .rename(columns={'post_tokens': 'unprocessed_docs'})
    )

    # Each 'annotators' entry is read with .get('label') to obtain the votes.
    merged['labels'] = merged['annotators'].apply(lambda annotation: annotation.get('label'))
    merged['final_label'] = merged['labels'].apply(most_common)

    return merged.drop(['annotators', 'rationales'], axis=1)

<h4>Format Hatexplain</h4>

In [3]:
# Read the Hugging Face access token from the HF_TOKEN environment variable
# instead of embedding it in the notebook: a token committed to source is a
# leaked credential (the one previously hardcoded here should be revoked).
# If HF_TOKEN is unset, `token` is None and is passed through unchanged.
token = os.environ.get("HF_TOKEN")
hatexplain = load_dataset("hatexplain", token=token)

In [4]:
# Flatten the loaded splits into one DataFrame with per-annotator labels and a
# majority-vote 'final_label' column (see format_hatexplain).
df_hatexplain = format_hatexplain(hatexplain)

<h4>Preprocess Hatexplain Dataset</h4>

In [5]:
def expand_contractions(tokens, contractions):
    """Replace contraction tokens with their expansions, one word per token.

    Every token is looked up in ``contractions`` as-is and, failing that, in
    lower-cased form; tokens with no entry are kept unchanged. The chosen
    text is split on whitespace so that a multi-word expansion contributes
    several tokens to the output.

    Args:
        tokens: Iterable of string tokens.
        contractions: Mapping from contraction to its expanded text.

    Returns:
        list[str]: Flat list of tokens after expansion.
    """
    expanded = []
    for tok in tokens:
        # The lower-cased lookup is resolved first so it can act as the
        # default when the exact-case lookup misses.
        fallback = contractions.get(tok.lower(), tok)
        replacement = contractions.get(tok, fallback)
        expanded.extend(replacement.split())
    return expanded

In [6]:
# Load the contraction lookup table that expand_contractions consults.
with open('contractions_dict.json', 'r', encoding='utf-8') as contractions_file:
    contractions_dict = json.load(contractions_file)

# Expand contractions in every token list; the lookup table is forwarded to
# expand_contractions as its second positional argument.
df_hatexplain['expanded_contractions'] = df_hatexplain['unprocessed_docs'].apply(
    expand_contractions, args=(contractions_dict,)
)

In [7]:
def clean_text(text):
    """Normalise a tokenised document and return it as a list of tokens.

    The tokens are joined with single spaces, passed through cleantext's
    ``clean`` with emoji removal enabled, and the cleaned string is split
    back into tokens on whitespace.

    Args:
        text: Iterable of string tokens (despite the name, not one string).

    Returns:
        list[str]: The cleaned tokens; an empty list when nothing is left
        after cleaning.
    """
    joined = ' '.join(text)
    cleaned = clean(joined, no_emoji=True)
    # split() without a separator never yields empty strings, whereas
    # ''.split(' ') == [''] would turn a document that cleans down to nothing
    # into one containing a single bogus empty token.
    return cleaned.split()

In [8]:
# Run clean_text over every expanded token list to build the 'documents' column.
df_hatexplain['documents'] = df_hatexplain['expanded_contractions'].apply(clean_text)

In [9]:
# Keep only the columns used downstream, in a fixed order. The selection by
# itself leaves out 'unprocessed_docs' and 'expanded_contractions', so no
# in-place drops are needed; without them this cell can be re-run without
# raising KeyError on columns that are already gone.
df_hatexplain = df_hatexplain[['id', 'documents', 'labels', 'final_label']]

In [10]:
# print(df_hatexplain["expanded_contractions"][10148])
# df_hatexplain["preprocessed_docs"][10148]

In [11]:
# Materialise the cleaned token lists as a plain Python list (one entry per row).
documents = df_hatexplain["documents"].tolist()

In [12]:
# Spot-check the first cleaned document.
print(documents[0])

['you', 'really', 'think', 'i', 'would', 'not', 'have', 'been', 'raped', 'by', 'feral', 'hindu', 'or', 'muslim', 'back', 'in', 'india', 'or', 'bangladesh', 'and', 'a', 'neo', 'nazi', 'would', 'rape', 'me', 'as', 'well', 'just', 'to', 'see', 'me', 'cry']


<h4>Data splitting</h4>

In [13]:
# Stratified 80/20 split on the majority label so both partitions keep the
# same class proportions; the fixed seed makes the split reproducible.
df_hatexplain_train, df_hatexplain_test = train_test_split(
    df_hatexplain,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=df_hatexplain['final_label'],
)
train_documents = df_hatexplain_train['documents'].tolist()
test_documents = df_hatexplain_test['documents'].tolist()

# Report the class counts of each partition to confirm the stratification.
train_distribution = df_hatexplain_train['final_label'].value_counts()
test_distribution = df_hatexplain_test['final_label'].value_counts()
print("\nTraining Set Label Distribution:", train_distribution, sep="\n")
print("\nTest Set Label Distribution:", test_distribution, sep="\n")


Training Set Label Distribution:
final_label
1    6251
0    4748
2    4384
Name: count, dtype: int64

Test Set Label Distribution:
final_label
1    1563
0    1187
2    1096
Name: count, dtype: int64


In [14]:
# Three-class label vectors as NumPy arrays, taken from the same frames (and
# therefore in the same row order) as train_documents / test_documents.
y_train = df_hatexplain_train['final_label'].to_numpy()
y_test = df_hatexplain_test['final_label'].to_numpy()

<h4>Binary Data Creation and Splitting</h4>

In [15]:
# Partition the posts by majority label: 0 = hate, 2 = offensive, 1 = normal.
hate_df = df_hatexplain.loc[df_hatexplain['final_label'].eq(0)]
offensive_df = df_hatexplain.loc[df_hatexplain['final_label'].eq(2)]
normal_df = df_hatexplain.loc[df_hatexplain['final_label'].eq(1)]

# Size the non-hate side to match the hate class. Offensive and normal posts
# are drawn in proportion to their share of all non-hate posts; int()
# truncates the offensive quota and the normal quota absorbs the remainder.
non_hate_total = len(offensive_df) + len(normal_df)
num_samples_offensive = int(len(hate_df) * (len(offensive_df) / non_hate_total))
num_samples_normal = len(hate_df) - num_samples_offensive

offensive_sampled = offensive_df.sample(n=num_samples_offensive, random_state=42)
normal_sampled = normal_df.sample(n=num_samples_normal, random_state=42)

df_hatexplain_binary = pd.concat([hate_df, offensive_sampled, normal_sampled])
# Binary target: 1 for normal (1) or offensive (2) posts, 0 otherwise (hate).
df_hatexplain_binary['final_binary_label'] = (
    df_hatexplain_binary['final_label'].isin([1, 2]).astype('int64')
)
# Shuffle the rows reproducibly and renumber them from zero.
df_hatexplain_binary = df_hatexplain_binary.sample(frac=1, random_state=42)
df_hatexplain_binary = df_hatexplain_binary.reset_index(drop=True)

5935
5480
7814


In [16]:
# Token lists of the balanced binary dataset as a plain Python list (one entry per row).
documents_binary = df_hatexplain_binary["documents"].tolist()

In [17]:
# Stratified 90/10 split on the binary label so both partitions stay
# balanced; the fixed seed makes the split reproducible.
df_hatexplain_binary_train, df_hatexplain_binary_test = train_test_split(
    df_hatexplain_binary,
    test_size=0.10,
    random_state=42,
    shuffle=True,
    stratify=df_hatexplain_binary['final_binary_label'],
)
train_documents_binary = df_hatexplain_binary_train['documents'].tolist()
test_documents_binary = df_hatexplain_binary_test['documents'].tolist()

# Report the class counts of each partition to confirm the stratification.
train_distribution_binary = df_hatexplain_binary_train['final_binary_label'].value_counts()
test_distribution_binary = df_hatexplain_binary_test['final_binary_label'].value_counts()
print("\nTraining Set Label Distribution:", train_distribution_binary, sep="\n")
print("\nTest Set Label Distribution:", test_distribution_binary, sep="\n")


Training Set Label Distribution:
final_binary_label
1    5342
0    5341
Name: count, dtype: int64

Test Set Label Distribution:
final_binary_label
0    594
1    593
Name: count, dtype: int64


In [18]:
# Binary label vectors as NumPy arrays, taken from the same frames (and
# therefore in the same row order) as train_documents_binary / test_documents_binary.
y_train_binary = df_hatexplain_binary_train['final_binary_label'].to_numpy()
y_test_binary = df_hatexplain_binary_test['final_binary_label'].to_numpy()

In [19]:
# Spot-check the first training document of the three-class split and of the
# binary split.
print(train_documents[0])
print(train_documents_binary[0])

['oprah', 'is', 'a', 'stupid', 'nigger', 'that', 'only', 'got', 'into', 'the', 'entertainment', 'business', 'because', 'she', 'fucked', 'a', 'kike', 'with', 'jungle', 'fever']


In [20]:
# Bundle everything produced above under string keys: the full, train and
# test token lists together with the train/test label arrays, for both the
# three-class task and the binary task.
data_to_save = dict(
    documents=documents,
    train_documents=train_documents,
    test_documents=test_documents,
    y_train=y_train,
    y_test=y_test,
    documents_binary=documents_binary,
    train_documents_binary=train_documents_binary,
    test_documents_binary=test_documents_binary,
    y_train_binary=y_train_binary,
    y_test_binary=y_test_binary,
)

# Serialise the whole bundle into a single pickle file.
with open('hatexplain_data.pickle', 'wb') as pickle_file:
    pickle.dump(data_to_save, pickle_file)