In [13]:
# !pip install transformers -q
# !pip install datasets -q
# !pip install evaluate -q

In [14]:
import os
os.environ["WANDB_DISABLED"] = "true"
import evaluate
import pandas as pd

df_train = pd.read_csv('/home/aflah20082/ML_Project/Data/PreprocessedData/english_train_preprocess.csv')
df_test = pd.read_csv('/home/aflah20082/ML_Project/Data/PreprocessedData/english_test_preprocess.csv')
df_val = pd.read_csv('/home/aflah20082/ML_Project/Data/PreprocessedData/english_dev_preprocess.csv')

In [15]:
label_replacement = {
    'Hope_speech': 0,
    'Non_hope_speech': 1,
    'not-English': 2,
}

df_train['label'] = df_train['label'].replace(label_replacement)
df_test['label'] = df_test['label'].replace(label_replacement)
df_val['label'] = df_val['label'].replace(label_replacement)

# Drop rows with label not-English
df_train = df_train[df_train['label'] != 2]
df_test = df_test[df_test['label'] != 2]
df_val = df_val[df_val['label'] != 2]

In [16]:
df_train = df_train[['preprocessed_text', 'label']]
df_test = df_test[['preprocessed_text', 'label']]
df_val = df_val[['preprocessed_text', 'label']]

In [17]:
df_train = df_train.rename(columns={'preprocessed_text': 'text'})
df_test = df_test.rename(columns={'preprocessed_text': 'text'})
df_val = df_val.rename(columns={'preprocessed_text': 'text'})

In [18]:
from datasets import Dataset, DatasetDict
tds = Dataset.from_pandas(df_train[['text', 'label']], preserve_index=False)
vds = Dataset.from_pandas(df_val[['text', 'label']], preserve_index=False)
testds = Dataset.from_pandas(df_test[['text', 'label']], preserve_index=False)


ds = DatasetDict()

ds['train'] = tds
ds['validation'] = vds
ds['test'] = testds

In [19]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("GroNLP/hateBERT")

# tokenizer.add_special_tokens(["<intent>", "</intent>"])

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_datasets = ds.map(tokenize_function, batched=True)

  0%|          | 0/23 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

In [20]:
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42)
small_eval_dataset = tokenized_datasets["validation"].shuffle(seed=42)

In [21]:
import numpy as np
metric1 = evaluate.load("accuracy")
metric2 = evaluate.load("precision")
metric3 = evaluate.load("recall")
metric4 = evaluate.load("f1")

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [22]:
from transformers import TrainingArguments, Trainer, AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("GroNLP/hateBERT", num_labels=2) 

training_args = TrainingArguments(output_dir=f"Model Dumps/hatebert_finetune", evaluation_strategy="epoch")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)
trainer.train()

Some weights of the model checkpoint at GroNLP/hateBERT were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at GroNLP/hateBERT and are newly

Epoch,Training Loss,Validation Loss,Accuracy
1,0.2324,0.2691,0.922914
2,0.1697,0.318708,0.927842
3,0.0869,0.382246,0.923618


Saving model checkpoint to Model Dumps/hatebert_finetune/checkpoint-500
Configuration saved in Model Dumps/hatebert_finetune/checkpoint-500/config.json
Model weights saved in Model Dumps/hatebert_finetune/checkpoint-500/pytorch_model.bin
Saving model checkpoint to Model Dumps/hatebert_finetune/checkpoint-1000
Configuration saved in Model Dumps/hatebert_finetune/checkpoint-1000/config.json
Model weights saved in Model Dumps/hatebert_finetune/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to Model Dumps/hatebert_finetune/checkpoint-1500
Configuration saved in Model Dumps/hatebert_finetune/checkpoint-1500/config.json
Model weights saved in Model Dumps/hatebert_finetune/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to Model Dumps/hatebert_finetune/checkpoint-2000
Configuration saved in Model Dumps/hatebert_finetune/checkpoint-2000/config.json
Model weights saved in Model Dumps/hatebert_finetune/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to Model Dumps/

TrainOutput(global_step=8529, training_loss=0.16271105957612692, metrics={'train_runtime': 7833.0789, 'train_samples_per_second': 8.709, 'train_steps_per_second': 1.089, 'total_flos': 1.79494361966592e+16, 'train_loss': 0.16271105957612692, 'epoch': 3.0})

In [23]:
predictions = trainer.predict(tokenized_datasets["validation"])
preds = np.argmax(predictions.predictions, axis=-1)
accuracy = metric1.compute(predictions=preds, references=predictions.label_ids)
precision = metric2.compute(predictions=preds, references=predictions.label_ids)
recall = metric3.compute(predictions=preds, references=predictions.label_ids)

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Prediction *****
  Num examples = 2841
  Batch size = 8


In [24]:
print("Accuracy: ", accuracy)
print("Precision: ", precision)
print("Recall: ", recall)

Accuracy:  {'accuracy': 0.9236184442097853}
Precision:  {'precision': 0.9495412844036697}
Recall:  {'recall': 0.9669131957960296}


In [25]:
# Predictions on test set
predictions = trainer.predict(tokenized_datasets["test"])
preds = np.argmax(predictions.predictions, axis=-1)
accuracy = metric1.compute(predictions=preds, references=predictions.label_ids)
precision = metric2.compute(predictions=preds, references=predictions.label_ids)
recall = metric3.compute(predictions=preds, references=predictions.label_ids)
macro_f1 = metric4.compute(predictions=preds, references=predictions.label_ids, average='macro')
print("Accuracy: ", accuracy)
print("Precision: ", precision)
print("Recall: ", recall)
print("Macro F1: ", macro_f1)

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Prediction *****
  Num examples = 2843
  Batch size = 8


Accuracy:  {'accuracy': 0.9282448118185016}
Precision:  {'precision': 0.9540098821740783}
Recall:  {'recall': 0.9679907443116081}
Macro F1:  {'f1': 0.7596955112269047}
