# Load and Preprocess Datasets

In [None]:
#loading and importing required dependencies
!pip install --upgrade datasets
!pip install --upgrade transformers[torch]
import transformers
import datasets
from datasets import load_dataset

In [None]:
# Load the "hatexplain" dataset with Hugging Face `datasets`.
df1 = load_dataset("hatexplain")
# Bare expression: shows the loaded object as the cell output.
df1

I will preprocess the dataset. First, I will join the tokens into a sentence. Then I will give each text a single label, chosen by the majority vote of its annotators.

In [None]:
def process_tokens(example):
    """Build a ``text`` field by space-joining ``example["post_tokens"]``.

    Tokens that are exactly the empty string or a lone comma are dropped
    before joining. The example is updated in place and returned.
    """
    skipped = ('', ',')
    kept_tokens = [token for token in example["post_tokens"] if token not in skipped]
    example["text"] = ' '.join(kept_tokens)
    return example

# Run `process_tokens` over `df1` via `map`, adding the joined `text` field
# to each example.
df1 = df1.map(process_tokens)

In [None]:
# Define a function to compute the majority label
def compute_majority_label(example):
    """Store the most frequent annotator label under ``example["labels"]``.

    Reads the list ``example["annotators"]["label"]``, tallies it, and writes
    the winning label to a ``labels`` key. When several labels share the top
    count, the one that appears first in the list wins. The example is
    modified in place and returned.
    """
    votes = example["annotators"]["label"]

    # Tally the votes; dict insertion order follows first appearance in `votes`.
    tally = {}
    for vote in votes:
        tally[vote] = tally.get(vote, 0) + 1

    # max() returns the first key that reaches the highest count.
    example["labels"] = max(tally, key=tally.get)
    return example

# Run `compute_majority_label` over `df1` via `map`, adding a single
# `labels` value to each example.
df1 = df1.map(compute_majority_label)

In [None]:
# Drop the raw HateXplain columns; only `text` and `labels` are used from here on.
columns = ["id","rationales","annotators","post_tokens"]
# Use the dedicated `remove_columns` method instead of `map()` with no function,
# which runs a mapping pass only to drop columns.
df1 = df1.remove_columns(columns)

In [None]:
from datasets import concatenate_datasets

# Merge the train, validation and test splits into one dataset.
# Note: `df1` is rebound from the split container to the merged dataset, so this
# cell cannot be re-run without re-running the cells above it.
df1 = concatenate_datasets([df1["train"], df1["validation"], df1["test"]])


In [None]:
# Inspect the column schema and the distinct values in `labels`.
print(df1.features)
print(df1.unique('labels'))

Here, the dataset has three labels: hatespeech (0), normal (1), and offensive (2). I will merge 'hatespeech' and 'offensive' into one common category, 'hate', so that the dataset has two labels: normal (0) and hate (1).

In [None]:
# Define a function to map labels
def map_labels(example):
    """Collapse the three-way label into a binary one, in place.

    A label of 1 becomes 0; labels 0 and 2 both become 1. Any other value is
    left untouched. The (mutated) example is returned.
    """
    current = example['labels']
    if current == 1:
        example['labels'] = 0
    elif current in (0, 2):
        example['labels'] = 1
    return example

# Rewrite every example's `labels` value with `map_labels`.
df1 = df1.map(map_labels)

In [None]:
# Load the "hate_speech18" dataset with Hugging Face `datasets`.
df2 = load_dataset('hate_speech18')
# Bare expression: shows the loaded object as the cell output.
df2

In [None]:
# Drop the metadata columns that are not used for classification.
columns = ['user_id','subforum_id','num_contexts']
df2 = df2.remove_columns(columns)
# Trainer expects text and labels as feature names so we rename columns
df2 = df2.rename_column("label", "labels")
# Convert the DatasetDict to a plain Dataset. `concatenate_datasets`, used later to
# merge all corpora, only accepts Dataset objects and raises on a DatasetDict.
df2 = df2["train"]

In [None]:
# Inspect the column schema and the distinct values in `labels`.
print(df2.features)
print(df2.unique('labels'))

I will drop the texts labelled 'relation' or 'idk/skip'. As with the preceding dataset, the remaining labels are normal (0) and hate (1).

In [None]:
# Keep only the rows whose label is 0 or 1 (nohate / hate); rows with any
# other label value (idk/skip, relation) are discarded.
df2 = df2.filter(lambda example: example['labels'] in (0, 1))
# Confirm which label values remain.
print(df2.unique('labels'))

In [None]:
# Cast df2 to df1's feature schema so both datasets share the same column types.
df2 = df2.cast(df1.features)
df2

In [None]:
# Load the "goodwin278/labelled_hatespeech" dataset with Hugging Face `datasets`.
df3 = load_dataset("goodwin278/labelled_hatespeech")
# Bare expression: shows the loaded object as the cell output.
df3

In [None]:
# Drop the unused column. `remove_columns` is used instead of `map()` with no
# function, which runs a mapping pass only to drop columns.
columns = ['Platform']
df3 = df3.remove_columns(columns)
# Rename the remaining columns to `text` and `labels`.
df3 = df3.rename_column("Comment", "text")
df3 = df3.rename_column("Hateful", "labels")
# Convert from a DatasetDict to a plain Dataset by selecting its `train` split.
df3 = df3['train']
df3

In [None]:
# Inspect the column schema and the distinct values in `labels`.
print(df3.features)
print(df3.unique('labels'))

In [None]:
# Cast df3 to df1's feature schema so both datasets share the same column types.
df3 = df3.cast(df1.features)
df3

In [None]:
# Load the "christinacdl/binary_hate_speech" dataset with Hugging Face `datasets`.
df4 = load_dataset("christinacdl/binary_hate_speech")
# Bare expression: shows the loaded object as the cell output.
df4

In [None]:
# Rename the label column to `labels`, the name used by the other datasets here.
df4 = df4.rename_column('label','labels')

# Merge the train, validation and test splits into one dataset.
df4 = concatenate_datasets([df4["train"], df4["validation"], df4["test"]])

# `df4` now holds the rows of all three splits.
df4

In [None]:
# Inspect the column schema and the distinct values in `labels` before remapping.
print(df4.features)
print(df4.unique('labels'))

In [None]:
#map the labels
# String label -> integer class id.
label_mapping = {'NOT_OFF_HATEFUL_TOXIC': 0, 'OFF_HATEFUL_TOXIC': 1}


def map_labels(example):
    """Replace the string in ``example['labels']`` with its integer id.

    The lookup goes through ``label_mapping``; a label that is not one of its
    keys raises ``KeyError``. The example is updated in place and returned.
    """
    text_label = example['labels']
    example['labels'] = label_mapping[text_label]
    return example

# Rewrite every example's `labels` value with `map_labels`.
df4 = df4.map(map_labels)
df4

In [None]:
# Re-inspect the schema and label values after remapping.
print(df4.features)
print(df4.unique('labels'))

In [None]:
# Cast df4 to df1's feature schema so both datasets share the same column types.
df4 = df4.cast(df1.features)
df4

In [None]:
# Load the "tweets_hate_speech_detection" dataset with Hugging Face `datasets`.
df5 = load_dataset("tweets_hate_speech_detection")
# Bare expression: shows the loaded object as the cell output.
df5

In [None]:
# Rename the columns to `labels` and `text`, the names used by the other datasets here.
df5 = df5.rename_column('label','labels')
df5 = df5.rename_column('tweet','text')

# Merge the train and test splits into one dataset.
df5 = concatenate_datasets([df5["train"],df5["test"]])

# `df5` now holds the rows of both the train and test splits.
df5

In [None]:
# Cast df5 to df1's feature schema so both datasets share the same column types.
df5 = df5.cast(df1.features)
df5

In [None]:
# Inspect the column schema and the distinct values in `labels`.
print(df5.features)
print(df5.unique('labels'))

In [None]:
# Select only the examples whose label is -1, into a separate variable
# (`df5` itself is left unchanged here).
filtered_df5 = df5.filter(lambda example: example['labels'] == -1)

# Print the first five of those examples for manual inspection.
print(filtered_df5[:5])

Texts that have the label -1 do not seem to carry much meaning, so I will filter them out.

In [None]:
# Keep only the rows whose label is 0 or 1 (nohate / hate); this drops the
# rows labelled -1.
df5 = df5.filter(lambda example: example['labels'] in (0, 1))
# Confirm which label values remain.
print(df5.unique('labels'))

In [None]:
# Re-inspect the schema and label values after filtering
# (the label values repeat the output of the previous cell).
print(df5.features)
print(df5.unique('labels'))

In [None]:
# Concatenate the five prepared datasets into one.
# NOTE(review): `concatenate_datasets` expects every element to be a plain
# Dataset, not a DatasetDict — confirm each of df1..df5 has been reduced to one.
new_df = concatenate_datasets([df1, df2, df3, df4, df5])

# Display the combined dataset to verify the result.
new_df

In [None]:
# Encode `labels` as a class-label column; the stratified split below is
# keyed on this column.
new_df = new_df.class_encode_column("labels")

In [None]:
#train test split
# 80/20 split, stratified on `labels`. A fixed seed makes the shuffle, and
# therefore the split, reproducible across kernel restarts.
new_df = new_df.train_test_split(test_size=0.2, shuffle=True, stratify_by_column="labels", seed=42)
new_df

## bert-base tokenizer

AutoTokenizer is a class in the Transformers library that provides a convenient way to automatically select the appropriate tokenizer for a given pre-trained model. The AutoTokenizer class uses heuristics to determine the type of tokenizer that should be used based on the architecture and configuration of the pre-trained model. This can be useful when working with a variety of pre-trained models, because it allows you to use the appropriate tokenizer without having to manually select one for each model.

In [None]:
from transformers import AutoTokenizer

In [None]:
# Load the tokenizer that matches the 'bert-base-cased' checkpoint.
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

This code instantiates a tokenizer for the BERT (Bidirectional Encoder Representations from Transformers) pre-trained model with the bert-base-cased configuration. The bert-base-cased configuration refers to a version of the BERT model that has a cased vocabulary. The same idea is used to load the tokenizer for any other model, such as DistilBERT. By instantiating a tokenizer for the bert-base-cased model using AutoTokenizer.from_pretrained(), you can tokenize text according to the same scheme used during pre-training of the BERT model. This is useful when fine-tuning the pre-trained model on a specific task, because it ensures that the input data is pre-processed in the same way as the data used to train the original model.

In [None]:
#define function to tokenize the text function
def tokenize_data(example):
    """Tokenize the ``text`` field with the notebook-level ``tokenizer``.

    Sequences are padded with ``padding='max_length'`` and truncated.
    """
    texts = example['text']
    return tokenizer(texts, padding='max_length', truncation=True)

# Tokenize every split of `new_df`; `batched=True` passes batches of examples to the function.
tokenized_dataset = new_df.map(tokenize_data, batched=True)


## Load a pretrained bert base model

Load a pretrained model, specifying the number of labels in our dataset for fine-tuning. AutoModelForSequenceClassification is a class in the Transformers library that is used for sequence classification tasks, where the input is a sequence of text and the output is a label or category assigned to that sequence.

In [None]:
from transformers import AutoModelForSequenceClassification
# Load 'bert-base-cased' with a sequence-classification head for 2 labels.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)

In [None]:
# Collect every parameter name, in the order the model yields them.
names = [param_name for param_name, _ in model.named_parameters()]

names

In [None]:
#freezing layers
# Only the last 20 named parameters stay trainable.
trainable_params = names[-20:]

# Turn off gradients for every parameter outside that tail.
for name, param in model.named_parameters():
    if name in trainable_params:
        continue
    param.requires_grad = False

In [None]:
#define training arguments
from transformers import TrainingArguments, Trainer
training_args = TrainingArguments(
    # --- where checkpoints/results are written and how the run is named ---
    output_dir='./results/bert-base',
    run_name='bert-base',
    # --- optimisation ---
    num_train_epochs=4,
    learning_rate=1e-4,
    weight_decay=0.01,
    warmup_steps=500,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=16,   # gradients from 16 batches are accumulated per optimizer step
    # --- evaluation and checkpointing, both every 300 steps ---
    evaluation_strategy="steps",
    eval_steps=300,
    save_strategy="steps",
    save_steps=300,
    save_total_limit=5,
    load_best_model_at_end=True,      # reload the best checkpoint when training finishes
    # --- logging ---
    logging_steps=100,
)

In [None]:
#define the early stopping callback
from transformers import EarlyStoppingCallback
# patience=3 and threshold=0.01 are forwarded to the callback as its stopping criteria.
early_stopping = EarlyStoppingCallback(
    early_stopping_patience=3,
    early_stopping_threshold=0.01,
)

# NOTE: callbacks are registered through `Trainer(callbacks=[...])`.
# Trainer does not read a `callbacks` attribute from TrainingArguments, so the
# former `training_args.callbacks = [early_stopping]` assignment had no effect
# and has been removed.

In [None]:
#define evaluation metrics
import numpy as np

def compute_metrics(eval_pred):
    """Return the accuracy of the model's predictions for the Trainer.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class of each
            row is the argmax of ``logits`` over the last axis.

    Returns:
        dict: ``{"accuracy": <fraction of predictions equal to labels>}``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # Accuracy is computed directly with NumPy: `datasets.load_metric` is deprecated
    # and absent from recent `datasets` releases, which the install cell upgrades to.
    return {"accuracy": float(np.mean(predictions == np.asarray(labels)))}

In [None]:
#define the trainer object
trainer_base = Trainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_dataset['train'],
    eval_dataset = tokenized_dataset['test'],
    compute_metrics = compute_metrics,
    # Callbacks must be registered on the Trainer itself; without this the
    # `early_stopping` callback defined above is never invoked during training.
    callbacks = [early_stopping],
)

In [None]:
# Fine-tune the bert-base model configured in `trainer_base`.
trainer_base.train()

## Load a pretrained bertweet model


In [None]:
from transformers import AutoModelForSequenceClassification
# Load 'vinai/bertweet-base' with a sequence-classification head for 2 labels.
# This rebinds `model`, replacing the previously loaded model.
model = AutoModelForSequenceClassification.from_pretrained("vinai/bertweet-base", num_labels=2)

In [None]:
# Collect every parameter name, in the order the model yields them.
names = [param_name for param_name, _ in model.named_parameters()]

names

In [None]:
#freezing layers
# Only the last 20 named parameters stay trainable.
trainable_params = names[-20:]

# Turn off gradients for every parameter outside that tail.
for name, param in model.named_parameters():
    if name in trainable_params:
        continue
    param.requires_grad = False

## bertweet-base tokenizer

In [None]:
# Load the tokenizer that matches 'vinai/bertweet-base';
# `use_fast=False` requests the non-fast tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False)

In [None]:
#define function to tokenize the text function
def tokenize_data(example, max_length=128):
    """Tokenize the ``text`` field, padded and truncated to ``max_length`` tokens.

    The length is passed explicitly rather than relying on the tokenizer's
    ``model_max_length``: if that attribute is not set to a usable value,
    ``padding='max_length'`` and ``truncation=True`` have no length to work
    with and the resulting sequences are ragged or over-long.

    NOTE(review): 128 is the sequence length reported for BERTweet —
    confirm against the model card before changing it.

    Args:
        example: A batch with a ``text`` field.
        max_length: Number of tokens every sequence is padded/truncated to.
    """
    return tokenizer(example['text'], padding='max_length', truncation=True, max_length=max_length)

#apply the transformation function to dataset using the 'map' method
tokenized_dataset = new_df.map(tokenize_data, batched=True)

In [None]:
#define training arguments
from transformers import TrainingArguments, Trainer
training_args = TrainingArguments(
    # --- where checkpoints/results are written and how the run is named ---
    output_dir='./results/bertweet-base',
    run_name='bertweet-base',
    # --- optimisation ---
    num_train_epochs=4,
    learning_rate=1e-4,
    weight_decay=0.01,
    warmup_steps=500,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=16,   # gradients from 16 batches are accumulated per optimizer step
    # --- evaluation and checkpointing, both every 300 steps ---
    evaluation_strategy="steps",
    eval_steps=300,
    save_strategy="steps",
    save_steps=300,
    save_total_limit=5,
    load_best_model_at_end=True,      # reload the best checkpoint when training finishes
    # --- logging ---
    logging_steps=100,
)

In [None]:
#define metrics
import numpy as np

def compute_metrics(eval_pred):
    """Return the accuracy of the model's predictions for the Trainer.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class of each
            row is the argmax of ``logits`` over the last axis.

    Returns:
        dict: ``{"accuracy": <fraction of predictions equal to labels>}``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # Accuracy is computed directly with NumPy: `datasets.load_metric` is deprecated
    # and absent from recent `datasets` releases, which the install cell upgrades to.
    return {"accuracy": float(np.mean(predictions == np.asarray(labels)))}

In [None]:
#define trainer base
# Wire the model, its training configuration, the metric function and both
# splits of the tokenized dataset into a Trainer.
trainer_base = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
)

In [None]:
# Fine-tune the bertweet model configured in `trainer_base`.
trainer_base.train()

## Load a pretrained roberta model

RoBERTa has the same architecture as BERT, but uses a byte-level BPE as a tokenizer (same as GPT-2) and uses a different pretraining scheme.

In [None]:
from transformers import AutoModelForSequenceClassification
# Load 'roberta-base' with a sequence-classification head for 2 labels.
# This rebinds `model`, replacing the previously loaded model.
model = AutoModelForSequenceClassification.from_pretrained('roberta-base', num_labels=2)

In [None]:
# Collect every parameter name, in the order the model yields them.
names = [param_name for param_name, _ in model.named_parameters()]

names

In [None]:
#freezing layers
# Only the last 20 named parameters stay trainable.
trainable_params = names[-20:]

# Turn off gradients for every parameter outside that tail.
for name, param in model.named_parameters():
    if name in trainable_params:
        continue
    param.requires_grad = False

## roberta-base tokenizer

In [None]:
from transformers import RobertaTokenizer
# `truncation` is a call-time argument (it is passed where the text is tokenized),
# and roberta-base is a case-sensitive checkpoint, so neither `truncation=True`
# nor `do_lower_case=True` belongs in `from_pretrained`.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

In [None]:
#define function to tokenize the text function
def tokenize_data(example):
    """Tokenize the ``text`` field with the notebook-level ``tokenizer``.

    Sequences are padded with ``padding='max_length'`` and truncated.
    """
    texts = example['text']
    return tokenizer(texts, padding='max_length', truncation=True)

# Tokenize every split of `new_df`; `batched=True` passes batches of examples to the function.
tokenized_dataset = new_df.map(tokenize_data, batched=True)

In [None]:
#define training arguments
from transformers import TrainingArguments, Trainer
training_args = TrainingArguments(
    # --- where checkpoints/results are written and how the run is named ---
    output_dir='./results/roberta-base',
    run_name='roberta-base',
    # --- optimisation ---
    num_train_epochs=4,
    learning_rate=1e-4,
    weight_decay=0.01,
    warmup_steps=500,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=16,   # gradients from 16 batches are accumulated per optimizer step
    # --- evaluation and checkpointing, both every 300 steps ---
    evaluation_strategy="steps",
    eval_steps=300,
    save_strategy="steps",
    save_steps=300,
    save_total_limit=5,
    load_best_model_at_end=True,      # reload the best checkpoint when training finishes
    # --- logging ---
    logging_steps=100,
)

In [None]:
#define metrics
import numpy as np

def compute_metrics(eval_pred):
    """Return the accuracy of the model's predictions for the Trainer.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class of each
            row is the argmax of ``logits`` over the last axis.

    Returns:
        dict: ``{"accuracy": <fraction of predictions equal to labels>}``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # Accuracy is computed directly with NumPy: `datasets.load_metric` is deprecated
    # and absent from recent `datasets` releases, which the install cell upgrades to.
    return {"accuracy": float(np.mean(predictions == np.asarray(labels)))}

In [None]:
#define trainer base
# Wire the model, its training configuration, the metric function and both
# splits of the tokenized dataset into a Trainer.
trainer_base = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
)

In [None]:
# Fine-tune the roberta model configured in `trainer_base`.
trainer_base.train()

## Load a pretrained distilbert model

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load 'distilbert-base-uncased' with a sequence-classification head for 2 labels.
# This rebinds `model`, replacing the previously loaded model.
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased",num_labels=2)

In [None]:
# Collect every parameter name, in the order the model yields them.
names = [param_name for param_name, _ in model.named_parameters()]

names

In [None]:
#freezing layers
# Only the last 20 named parameters stay trainable.
trainable_params = names[-20:]

# Turn off gradients for every parameter outside that tail.
for name, param in model.named_parameters():
    if name in trainable_params:
        continue
    param.requires_grad = False

## distilbert tokenizer

In [None]:
# Load the tokenizer that matches the 'distilbert-base-uncased' checkpoint.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [None]:
#define function to tokenize the text function
def tokenize_data(example):
    """Tokenize the ``text`` field with the notebook-level ``tokenizer``.

    Sequences are padded with ``padding='max_length'`` and truncated.
    """
    texts = example['text']
    return tokenizer(texts, padding='max_length', truncation=True)

# Tokenize every split of `new_df`; `batched=True` passes batches of examples to the function.
tokenized_dataset = new_df.map(tokenize_data, batched=True)

In [None]:
#define the training arguments
from transformers import TrainingArguments, Trainer
training_args = TrainingArguments(
    # --- where checkpoints/results are written and how the run is named ---
    output_dir='./results/distilbert-base',
    run_name='distilbert-base',
    # --- optimisation ---
    num_train_epochs=4,
    learning_rate=1e-4,
    weight_decay=0.01,
    warmup_steps=500,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=16,   # gradients from 16 batches are accumulated per optimizer step
    # --- evaluation and checkpointing, both every 300 steps ---
    evaluation_strategy="steps",
    eval_steps=300,
    save_strategy="steps",
    save_steps=300,
    save_total_limit=5,
    load_best_model_at_end=True,      # reload the best checkpoint when training finishes
    # --- logging ---
    logging_steps=100,
)

In [None]:
#define metrics
import numpy as np

def compute_metrics(eval_pred):
    """Return the accuracy of the model's predictions for the Trainer.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class of each
            row is the argmax of ``logits`` over the last axis.

    Returns:
        dict: ``{"accuracy": <fraction of predictions equal to labels>}``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # Accuracy is computed directly with NumPy: `datasets.load_metric` is deprecated
    # and absent from recent `datasets` releases, which the install cell upgrades to.
    return {"accuracy": float(np.mean(predictions == np.asarray(labels)))}

In [None]:
#define trainer base
# Wire the model, its training configuration, the metric function and both
# splits of the tokenized dataset into a Trainer.
trainer_base = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
)

In [None]:
# Fine-tune the distilbert model configured in `trainer_base`.
trainer_base.train()