In [None]:
# imports
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch
from torch.utils.data import Dataset
from datasets import load_dataset

## Load Emotions Dataset

In [2]:
# Read the three emotion CSV splits (train / validation / test) with identical
# loader settings; the files are decoded as ISO-8859-1.
emo_train, emo_valid, emo_test = (
    load_dataset("csv", data_files=csv_path, encoding="ISO-8859-1")
    for csv_path in (
        "./Swahili Emotion Data/emo_train.csv",
        "./Swahili Emotion Data/emo_valid.csv",
        "./Swahili Emotion Data/emo_test.csv",
    )
)

In [3]:
# emo_train_df = pd.read_csv("./Swahili Emotion Data/emo_train.csv", encoding = "ISO-8859-1")
# display(emo_train_df)

# emo_valid_df = pd.read_csv("./Swahili Emotion Data/emo_valid.csv", encoding = "ISO-8859-1")
# display(emo_valid_df)

# emo_test_df = pd.read_csv("./Swahili Emotion Data/emo_test.csv", encoding = "ISO-8859-1")
# display(emo_test_df)

## Load Pre-Trained Model
### AfriBERTa

In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Tokenizer and model are both loaded from the same "castorini/afriberta_base" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("castorini/afriberta_base")
# Cap the tokenizer's maximum sequence length at 512.
tokenizer.model_max_length = 512

# NOTE(review): this loads a *token*-classification head with 7 labels, while the
# training cell only reads the logits at sequence position 0. A sequence-classification
# head may be what is intended -- confirm before changing, since the loss and metric
# code index the logits as [:, 0].
model = AutoModelForTokenClassification.from_pretrained(
    "castorini/afriberta_base",
    num_labels=7,
)

## Preprocess Data

In [5]:
import torch
from torch.utils.data import Dataset




In [6]:
# Preprocessing the Emotion Dataset using the Trainer

import ast

def tokenize_function(data):
    """Tokenise the "text" field of a batch with the module-level tokenizer.

    Sequences are padded to the tokenizer's maximum length and truncated when
    longer. Returns whatever the tokenizer call returns.
    """
    texts = data["text"]
    encoded = tokenizer(texts, padding="max_length", truncation=True)
    return encoded

# Tokenise every split in batches, then rename the "labels" column to "label".
tokenised_trained_emotion = (
    emo_train
    .map(tokenize_function, batched=True)
    .rename_column("labels", "label")
)
tokenised_valid_emotion = (
    emo_valid
    .map(tokenize_function, batched=True)
    .rename_column("labels", "label")
)
tokenised_test_emotion = (
    emo_test
    .map(tokenize_function, batched=True)
    .rename_column("labels", "label")
)


        
 # Function to process each label
def process_label(label):
    """Reduce a raw label value to a single label.

    Args:
        label: Either the textual form of a Python literal (e.g. ``"[3, 1]"``
            or ``"3"``) or an already-parsed value (e.g. ``3`` or ``[3, 1]``).

    Returns:
        The first element when the parsed value is a non-empty list or tuple;
        the parsed value itself when it is not a list or tuple; the original
        ``label`` unchanged when the parsed list or tuple is empty.

    Raises:
        ValueError / SyntaxError: propagated from ``ast.literal_eval`` when a
            string label is not a valid Python literal.
    """
    # Only strings need parsing; already-parsed values pass through, so
    # re-applying this function to processed data does not raise.
    parsed = ast.literal_eval(label) if isinstance(label, str) else label

    if isinstance(parsed, (list, tuple)):
        # An empty list carries no label to pick: hand the input back unchanged.
        return parsed[0] if parsed else label

    # A scalar literal (e.g. "3") is already a single label.
    return parsed

# Rewrite the 'label' column of every split, one example at a time, using process_label.
def _keep_first_label(example):
    """Return the replacement 'label' value for a single example."""
    return {'label': process_label(example['label'])}


tokenised_trained_emotion = tokenised_trained_emotion.map(_keep_first_label)
tokenised_valid_emotion = tokenised_valid_emotion.map(_keep_first_label)
tokenised_test_emotion = tokenised_test_emotion.map(_keep_first_label)
        

# Draw a reproducibly shuffled subset (seed 42) of at most 1000 rows from the
# training and validation data. The size is capped at the number of available
# rows so that select() is never asked for indices past the end of a smaller split.
train_split = tokenised_trained_emotion["train"]
valid_split = tokenised_valid_emotion["train"]

small_train_dataset = train_split.shuffle(seed=42).select(range(min(1000, len(train_split))))
small_eval_dataset = valid_split.shuffle(seed=42).select(range(min(1000, len(valid_split))))


In [None]:
from transformers import TrainingArguments, Trainer

import evaluate


# Load the "accuracy" metric via the `evaluate` library; compute_metrics() calls metric.compute().
metric = evaluate.load("accuracy")


class CustomTrainer(Trainer):
    """Trainer whose loss is cross-entropy on the logits at sequence position 0.

    The model is called with only ``input_ids`` and ``attention_mask``; labels
    are not passed to the model, so the loss is computed here.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the training/evaluation loss for one batch.

        Args:
            model: The model being trained; called with ``input_ids`` and
                ``attention_mask`` from ``inputs``.
            inputs: Batch mapping that provides ``'input_ids'``,
                ``'attention_mask'`` and ``'labels'``.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                just ``loss``.
            num_items_in_batch: Accepted only so the override stays compatible
                with Trainer versions that pass this keyword to
                ``compute_loss``; it is not used here.

        Returns:
            The loss tensor, or a ``(loss, outputs)`` tuple when
            ``return_outputs`` is true.
        """
        outputs = model(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
        )
        # Only the logits at index 0 of the second axis are scored.
        # assumes logits are (batch, seq_len, num_labels) -- TODO confirm
        first_position_logits = outputs['logits'][:, 0]
        loss = torch.nn.CrossEntropyLoss()(first_position_logits, inputs['labels'])
        return (loss, outputs) if return_outputs else loss
    
    

def compute_metrics(eval_pred):
    """Score predictions with the module-level ``metric``.

    ``eval_pred`` unpacks into ``(logits, labels)``. The predicted class for
    each example is the argmax over axis 1 of ``logits[:, 0]``.
    """
    raw_logits, gold_labels = eval_pred
    first_position_logits = raw_logits[:, 0]
    predicted_classes = np.argmax(first_position_logits, axis=1)
    return metric.compute(predictions=predicted_classes, references=gold_labels)


# Training configuration: evaluate once per epoch and use "test_trainer" as the output directory.
training_args = TrainingArguments(
    eval_strategy="epoch",
    output_dir="test_trainer",
)

# Wire the model, the 1000-example subsets and the metric callback into the custom trainer.
trainer = CustomTrainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
)

# Kept as the cell's last expression so the training result is displayed as its output.
trainer.train()