In [77]:
# imports
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch
from torch.utils.data import Dataset
from datasets import load_dataset
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

## Load Emotions Dataset

In [78]:
# Load the three emotion CSV files (read as ISO-8859-1), one dataset object per file.
emo_train, emo_valid, emo_test = (
    load_dataset("csv", data_files=csv_path, encoding="ISO-8859-1")
    for csv_path in (
        "./Swahili Emotion Data/emo_train.csv",
        "./Swahili Emotion Data/emo_valid.csv",
        "./Swahili Emotion Data/emo_test.csv",
    )
)

In [79]:
# Emotion label ids (Swahili name in parentheses):
#   0 - neutral
#   1 - joy (furaha)
#   2 - anger (hasira)
#   3 - sadness (huzuni)
#   4 - disgust (machukizo)
#   5 - surprise (mshangao)
#   6 - fear (woga)

# The position of a name in `classes` is its label id.
# The spelling 'suprise' is a runtime string passed to the model config
# (id2label / label2id), so it is deliberately left as-is.
classes = ['neutral','joy','anger','sadness','disgust','suprise','fear']
class2id = dict(zip(classes, range(len(classes))))
id2class = dict(enumerate(classes))

## Load Pre-Trained Model
### AfriBERTa

In [80]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForTokenClassification, AutoModelForSequenceClassification

# Tokenizer and sequence-classification model are both loaded from the same
# AfriBERTa checkpoint. One output per emotion class, configured for
# multi-label classification.
tokenizer = AutoTokenizer.from_pretrained("castorini/afriberta_base")
model = AutoModelForSequenceClassification.from_pretrained(
    "castorini/afriberta_base",
    problem_type="multi_label_classification",
    num_labels=len(classes),
    label2id=class2id,
    id2label=id2class,
)

def model_init(trial):
    """Build a fresh, untrained-head model for one hyperparameter-search trial.

    Must create the same architecture as the module-level `model`: a
    sequence-level multi-label classifier. The previous version built a
    token-classification model, which predicts one label vector per token
    and would not match the per-sentence emotion labels.

    Args:
        trial: Trial object handed in by the Trainer's hyperparameter
            search; it is not used when constructing the model.

    Returns:
        A sequence-classification model loaded from
        "castorini/afriberta_base" with one output per emotion class.
    """
    return AutoModelForSequenceClassification.from_pretrained(
        "castorini/afriberta_base",
        num_labels=len(classes),
        id2label=id2class,
        label2id=class2id,
        problem_type = "multi_label_classification"
    )

# Cap the tokenizer's maximum sequence length at 512 tokens.
# NOTE(review): tokenize_function calls the tokenizer with padding="max_length"
# and truncation=True but no explicit max_length, so it presumably falls back
# to this value — confirm.
tokenizer.model_max_length = 512

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at castorini/afriberta_base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Preprocess Data

In [81]:
# Preprocess the emotion dataset for the Trainer: parse the label strings, then tokenize the text

import ast

def process_label(data):
    """Parse string-encoded label lists into lists of ints (batched map callback).

    Each entry of ``data['labels']`` is expected to be a string such as
    ``"[0, 3]"``. Brackets and spaces are stripped and the remainder is split
    on commas. Empty fragments are skipped, so an empty list (``"[]"``) yields
    ``[]`` instead of raising ``ValueError`` on ``int("")``.

    Args:
        data: Batch mapping with a ``'labels'`` entry holding one string per
            example.

    Returns:
        The same mapping, modified in place, with ``data['labels']`` replaced
        by a list of integer lists.

    Raises:
        ValueError: If a non-empty fragment is not a valid integer.
    """
    parsed_labels = []
    for raw in data['labels']:
        stripped = raw.replace("[", "").replace("]", "").replace(" ", "")
        # `if token` drops the empty fragments produced by "[]" or stray commas.
        parsed_labels.append([int(token) for token in stripped.split(",") if token])
    data['labels'] = parsed_labels
    return data

# Convert the string-encoded labels of every split into integer lists
# (batched). Despite the variable names, no tokenization has happened yet.
tokenised_trained_emotion, tokenised_valid_emotion, tokenised_test_emotion = (
    split.map(process_label, batched=True)
    for split in (emo_train, emo_valid, emo_test)
)


def tokenize_function(data):
    """Tokenize a single example and attach a multi-hot label vector.

    Args:
        data: One example with a ``"text"`` field and a ``'labels'`` field
            holding the integer class ids that apply to it.

    Returns:
        The tokenizer output (padded to the maximum length, truncated), with
        a ``'labels'`` entry of ``len(classes)`` floats: 1.0 at every class id
        listed for the example and 0.0 elsewhere.
    """
    multi_hot = [0.0] * len(classes)
    for class_index in data['labels']:
        multi_hot[class_index] = 1.0

    encoded = tokenizer(data["text"], padding="max_length", truncation=True)
    encoded['labels'] = multi_hot
    return encoded


# Tokenize each split example by example, then rename the target column
# from "labels" to "label".
tokenised_trained_emotion, tokenised_valid_emotion, tokenised_test_emotion = (
    split.map(tokenize_function).rename_column("labels", "label")
    for split in (
        tokenised_trained_emotion,
        tokenised_valid_emotion,
        tokenised_test_emotion,
    )
)

#print(tokenised_trained_emotion['train']['label'])

# 100-example subsets of the training and validation data, shuffled with a
# fixed seed so the selection is repeatable.
small_train_dataset = (
    tokenised_trained_emotion["train"].shuffle(seed=42).select(range(100))
)
small_eval_dataset = (
    tokenised_valid_emotion["train"].shuffle(seed=42).select(range(100))
)


In [82]:
from transformers import DataCollatorWithPadding

# Collator handed to the Trainer to assemble batches, padding with `tokenizer`.
# NOTE(review): tokenize_function already pads every example to the maximum
# length, so there is presumably nothing left for this collator to pad — confirm.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


In [83]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback

import evaluate

# Metric objects loaded through the `evaluate` library.
# NOTE(review): compute_metrics in this cell uses scikit-learn instead, so
# these two objects look unused in the visible notebook — confirm before removing.
metric = evaluate.load("accuracy")
clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])


def optuna_hp_space(trial):
    """Describe the hyperparameter search space for one Optuna trial.

    Only the learning rate is searched: a float between 1e-5 and 1e-4,
    requested from the trial with ``log=True``.

    Args:
        trial: Object exposing ``suggest_float`` (an Optuna trial).

    Returns:
        Dict with the single key ``"learning_rate"``.
    """
    search_space = {}
    search_space["learning_rate"] = trial.suggest_float(
        "learning_rate", 1e-5, 1e-4, log=True
    )
    return search_space


def sigmoid(x):
    """Element-wise logistic function 1 / (1 + exp(-x)), computed stably.

    The naive formula evaluates ``exp(-x)``, which overflows (with a
    RuntimeWarning) for large-magnitude negative inputs. Here only
    ``exp(-|x|)`` is evaluated; it is always in (0, 1], so it cannot overflow.

    Args:
        x: Scalar or array-like of logits.

    Returns:
        Sigmoid of ``x`` with the same shape: a NumPy scalar for scalar
        input, an ndarray otherwise.
    """
    x = np.asarray(x)
    decay = np.exp(-np.abs(x))
    # For x >= 0: 1 / (1 + e^-x). For x < 0: e^x / (1 + e^x). Same function.
    result = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Indexing with () turns a 0-d array back into a scalar and leaves
    # higher-dimensional arrays unchanged.
    return result[()]

class CustomTrainer(Trainer):
    """Trainer whose loss is binary cross-entropy over the multi-label logits."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute BCE-with-logits loss between model logits and ``inputs['labels']``.

        Args:
            model: Model to run; called with ``input_ids`` and
                ``attention_mask`` only, so it does not compute its own loss.
            inputs: Batch mapping containing ``'input_ids'``,
                ``'attention_mask'`` and ``'labels'``.
            return_outputs: When true, also return the model outputs.
            num_items_in_batch: Accepted because newer ``Trainer`` versions
                pass this keyword to ``compute_loss``; without it they raise
                ``TypeError``. It is not used: the loss is averaged by
                ``BCEWithLogitsLoss`` itself.

        Returns:
            ``loss`` alone, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        outputs = model(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
        )
        cls_logits = outputs.logits
        # Both operands are cast to float, as BCEWithLogitsLoss needs float targets.
        loss = torch.nn.BCEWithLogitsLoss()(cls_logits.float(),
                                            inputs['labels'].float())
        return (loss, outputs) if return_outputs else loss
    

def compute_metrics(eval_pred):
    """Score thresholded predictions against labels, pooled over all label cells.

    Logits go through ``sigmoid`` and are cut at 0.5. Predictions and labels
    are then flattened to one long binary vector, so every metric here
    (including "accuracy") is per label cell rather than per example.

    Args:
        eval_pred: Pair ``(predictions, labels)``; predictions are raw logits.

    Returns:
        Dict with keys ``"accuracy"``, ``"f1"``, ``"precision"``, ``"recall"``.
    """
    logits, labels = eval_pred
    flat_pred = (sigmoid(logits) > 0.5).astype(int).reshape(-1)
    flat_true = labels.astype(int).reshape(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(
        flat_true, flat_pred, average="binary"
    )
    return {
        "accuracy": accuracy_score(flat_true, flat_pred),
        "f1": f1,
        "precision": precision,
        "recall": recall,
    }



# Evaluate and checkpoint once per epoch so the best checkpoint can be
# reloaded when training finishes.
training_args = TrainingArguments(
    output_dir="emotion_model",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    learning_rate=2.3123902791176186e-05,
    adam_epsilon=1e-8,  # default
)

# Each CSV was loaded as its own dataset object, so the validation rows are
# also stored under the 'train' key.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenised_trained_emotion['train'],
    eval_dataset=tokenised_valid_emotion['train'],
    compute_metrics=compute_metrics,
    #model_init=model_init,
    # Instantiated explicitly with its default settings.
    callbacks=[EarlyStoppingCallback()],
)

#best_trial = trainer.hyperparameter_search(
#    direction="maximize",
#    backend="optuna",
#    hp_space=optuna_hp_space,
#    n_trials=5,
#)

trainer.train()

  0%|          | 0/3651 [00:00<?, ?it/s]

{'loss': 0.3444, 'grad_norm': 3.328479766845703, 'learning_rate': 1.99571124883583e-05, 'epoch': 0.41}
{'loss': 0.2771, 'grad_norm': 2.6560094356536865, 'learning_rate': 1.679032218554042e-05, 'epoch': 0.82}


  0%|          | 0/163 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 512}


{'eval_loss': 0.25942298769950867, 'eval_accuracy': 0.8972353783456328, 'eval_f1': 0.6084767100293748, 'eval_precision': 0.7382892057026477, 'eval_recall': 0.5174875089221984, 'eval_runtime': 20.7258, 'eval_samples_per_second': 62.579, 'eval_steps_per_second': 7.865, 'epoch': 1.0}
{'loss': 0.247, 'grad_norm': 2.880528688430786, 'learning_rate': 1.3623531882722536e-05, 'epoch': 1.23}
{'loss': 0.2056, 'grad_norm': 2.581974506378174, 'learning_rate': 1.0456741579904651e-05, 'epoch': 1.64}


  0%|          | 0/163 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 512}


{'eval_loss': 0.2587554454803467, 'eval_accuracy': 0.8991078312589492, 'eval_f1': 0.645784996133024, 'eval_precision': 0.7046413502109705, 'eval_recall': 0.5960028551034975, 'eval_runtime': 25.4176, 'eval_samples_per_second': 51.028, 'eval_steps_per_second': 6.413, 'epoch': 2.0}
{'loss': 0.1957, 'grad_norm': 2.3895490169525146, 'learning_rate': 7.289951277086768e-06, 'epoch': 2.05}
{'loss': 0.1553, 'grad_norm': 1.16203773021698, 'learning_rate': 4.1231609742688845e-06, 'epoch': 2.47}
{'loss': 0.1522, 'grad_norm': 3.737990379333496, 'learning_rate': 9.563706714510009e-07, 'epoch': 2.88}


Non-default generation parameters: {'max_length': 512}


  0%|          | 0/163 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 512}


{'eval_loss': 0.26952168345451355, 'eval_accuracy': 0.8970150897675956, 'eval_f1': 0.6462353386303443, 'eval_precision': 0.6876006441223832, 'eval_recall': 0.609564596716631, 'eval_runtime': 24.9564, 'eval_samples_per_second': 51.971, 'eval_steps_per_second': 6.531, 'epoch': 3.0}
{'train_runtime': 4010.2616, 'train_samples_per_second': 7.28, 'train_steps_per_second': 0.91, 'train_loss': 0.22202677323177591, 'epoch': 3.0}


TrainOutput(global_step=3651, training_loss=0.22202677323177591, metrics={'train_runtime': 4010.2616, 'train_samples_per_second': 7.28, 'train_steps_per_second': 0.91, 'total_flos': 5139287096684544.0, 'train_loss': 0.22202677323177591, 'epoch': 3.0})

In [86]:
import numpy as np
from transformers import AutoModelForSequenceClassification
from datasets import load_dataset
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
import torch

def evaluate_model(model, test_dataset, threshold=0.5):
    """Evaluate a multi-label classifier example by example.

    Each example is run through the model on its own (batch size 1). The
    logits go through a sigmoid and are cut at ``threshold`` to give a binary
    prediction vector. Accuracy is computed on whole label vectors; F1,
    recall and precision are macro-averaged over the label columns.

    Args:
        model: Model exposing ``eval()``, ``device`` and returning an object
            with ``.logits``. It is left in eval mode.
        test_dataset: Iterable of examples (mappings) with tokenizer features
            and a multi-hot ``'label'`` entry.
        threshold: Probability above which a label counts as predicted.
            Defaults to 0.5, the previously hard-coded value.

    Returns:
        Dict with keys ``'accuracy'``, ``'f1_score'``, ``'recall'``, ``'precision'``.
    """
    model.eval()
    feature_keys = ('input_ids', 'attention_mask', 'token_type_ids')
    all_predictions = []
    all_true_labels = []

    with torch.no_grad():
        for example in test_dataset:
            # Keep only the model inputs and add a leading batch axis of size 1.
            inputs = {
                key: torch.tensor(value).unsqueeze(0).to(model.device)
                for key, value in example.items()
                if key in feature_keys
            }
            logits = model(**inputs).logits
            # squeeze(0) removes only the batch axis; a bare squeeze() would
            # also collapse the label axis when there is a single label.
            probabilities = torch.sigmoid(logits).squeeze(0).cpu().numpy()

            all_predictions.append((probabilities > threshold).astype(int))
            all_true_labels.append(example['label'])

    all_predictions = np.array(all_predictions)
    # Labels are stored as floats (0.0 / 1.0); cast so both arrays are int.
    all_true_labels = np.array(all_true_labels).astype(int)

    # zero_division=0 gives the same scores as the default but without a
    # warning when a label column has no positive predictions or labels.
    accuracy = accuracy_score(all_true_labels, all_predictions)
    f1 = f1_score(all_true_labels, all_predictions, average='macro', zero_division=0)
    recall = recall_score(all_true_labels, all_predictions, average='macro', zero_division=0)
    precision = precision_score(all_true_labels, all_predictions, average='macro', zero_division=0)

    return {
        'accuracy': accuracy,
        'f1_score': f1,
        'recall': recall,
        'precision': precision
    }




# Evaluate the model
# Evaluate the model on the test data (stored under the 'train' key of its
# own dataset object).
metrics = evaluate_model(model, tokenised_test_emotion['train'])

# Print results. The loop variable is not named `metric`, because that would
# overwrite the module-level `metric` object created with evaluate.load().
for metric_name, value in metrics.items():
    print(f"{metric_name}: {value:.4f}")

accuracy: 0.5629
f1_score: 0.5947
recall: 0.5484
precision: 0.6773


In [87]:
# Save the tokenizer and the fine-tuned model into the same directory so the
# two can be reloaded together from that path.
tokenizer.save_pretrained("./first_finetuning_model")
model.save_pretrained("./first_finetuning_model")

Non-default generation parameters: {'max_length': 512}
