In [45]:
# imports
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch
from torch.utils.data import Dataset
from datasets import load_dataset
# Compute device for the session: the CUDA GPU when one is available, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

## Load Emotions Dataset

In [46]:
# Read the train / validation / test CSV files (ISO-8859-1 encoded), in that order,
# each through the same `load_dataset("csv", ...)` call.
emo_train, emo_valid, emo_test = (
    load_dataset("csv", data_files=csv_path, encoding="ISO-8859-1")
    for csv_path in (
        "./Swahili Emotion Data/emo_train.csv",
        "./Swahili Emotion Data/emo_valid.csv",
        "./Swahili Emotion Data/emo_test.csv",
    )
)

In [47]:
#  	0 - neutral
#	1 - joy (furaha)
#	2 - anger (hasira)
#	3 - sadness (huzuni)
#	4 - disgust (machukizo)
#	5 - suprise (mshangao)
#	6 - fear (woga)


# Emotion names; a name's position in this list is its integer class id.
# The spelling 'suprise' is kept as-is because these strings are handed to the
# model as its id2label / label2id names.
classes = ['neutral', 'joy', 'anger', 'sadness', 'disgust', 'suprise', 'fear']

# Name -> id and id -> name lookup tables derived from the list order.
class2id = dict(zip(classes, range(len(classes))))
id2class = dict(enumerate(classes))

## Load Pre-Trained Model
### AfriBERTa

In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Tokenizer for the AfriBERTa base checkpoint, with its maximum sequence length set to 512.
tokenizer = AutoTokenizer.from_pretrained("castorini/afriberta_base")
tokenizer.model_max_length = 512

# The same checkpoint with a token-classification head sized to the emotion classes.
# This head produces one logit vector per token; the training and metric code
# further down only reads the vector at token position 0.
model = AutoModelForTokenClassification.from_pretrained(
    "castorini/afriberta_base",
    num_labels=len(classes),
    id2label=id2class,
    label2id=class2id,
    problem_type="multi_label_classification",
)

## Preprocess Data

In [None]:
# Preprocess the emotion dataset (parse the label strings, then tokenise the text) so it can be fed to the Trainer

import ast

def process_label(data):
    """Parse the stringified label lists of a batch into lists of ints.

    Args:
        data: A batch mapping whose 'labels' entry is a sequence of strings
            such as "[1, 3]". Brackets and spaces are optional, so "1,3"
            is accepted as well.

    Returns:
        The same mapping object, with data['labels'] replaced by a list of
        integer lists (one per input string). A string that contains no
        labels ("[]" or "") becomes an empty list.

    Raises:
        ValueError: If a non-empty string contains a token that is not an
            integer literal.
    """
    parsed_labels = []
    for raw in data['labels']:
        stripped = raw.replace("[", "").replace("]", "").replace(" ", "")
        if not stripped:
            # "".split(",") yields [""], and int("") raises; an example that
            # carries no labels is represented by an empty list instead.
            parsed_labels.append([])
            continue
        parsed_labels.append([int(token) for token in stripped.split(",")])
    data['labels'] = parsed_labels
    return data

# First preprocessing pass: turn each split's label strings into lists of ints.
# process_label works on whole batches, hence batched=True.
tokenised_trained_emotion, tokenised_valid_emotion, tokenised_test_emotion = (
    raw_split.map(process_label, batched=True)
    for raw_split in (emo_train, emo_valid, emo_test)
)


def tokenize_function(data):
    """Tokenise one example and attach its multi-hot label vector.

    Args:
        data: A single example providing 'text' and 'labels', where 'labels'
            is an iterable of integer class ids (indices into `classes`).

    Returns:
        The tokenizer output for data["text"] (padded to the maximum length
        and truncated), with a 'labels' entry holding a list of
        len(classes) floats: 1.0 at every class id present, 0.0 elsewhere.
    """
    # Build the multi-hot target first: one slot per class, switched on per label id.
    multi_hot = [0.] * len(classes)
    for class_idx in data['labels']:
        multi_hot[class_idx] = 1.

    encoded = tokenizer(data["text"], padding="max_length", truncation=True)
    encoded['labels'] = multi_hot
    return encoded


# Second preprocessing pass: tokenise every split one example at a time
# (tokenize_function is written for single examples, so no batched=True),
# then rename the target column from "labels" to "label".
tokenised_trained_emotion = tokenised_trained_emotion.map(tokenize_function).rename_column("labels", "label")
tokenised_valid_emotion = tokenised_valid_emotion.map(tokenize_function).rename_column("labels", "label")
tokenised_test_emotion = tokenised_test_emotion.map(tokenize_function).rename_column("labels", "label")

# Fixed-seed subsets for quick experiments. Each loaded CSV is accessed through
# its "train" key. The requested size is clamped to the split length so that a
# split with fewer than SMALL_SUBSET_SIZE rows is used in full instead of
# `select` failing on out-of-range indices.
SMALL_SUBSET_SIZE = 1000
_train_rows = tokenised_trained_emotion["train"]
_valid_rows = tokenised_valid_emotion["train"]
small_train_dataset = _train_rows.shuffle(seed=42).select(range(min(SMALL_SUBSET_SIZE, len(_train_rows))))
small_eval_dataset = _valid_rows.shuffle(seed=42).select(range(min(SMALL_SUBSET_SIZE, len(_valid_rows))))


In [50]:
from transformers import DataCollatorWithPadding

# Collator that assembles examples into batches, padding them with this tokenizer.
data_collator = DataCollatorWithPadding(tokenizer)


In [None]:
from transformers import TrainingArguments, Trainer

import evaluate


# Combined classification metrics used by compute_metrics below.
clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])
# Stand-alone accuracy metric (not referenced by the code in this cell).
metric = evaluate.load("accuracy")

def sigmoid(x):
    """Numerically stable element-wise logistic function, 1 / (1 + exp(-x)).

    Args:
        x: A real scalar or array-like of real numbers (e.g. raw logits).

    Returns:
        Values in [0, 1] with the same shape as `x`. Floating-point inputs
        keep their dtype; integer and boolean inputs are computed in float64.
        A scalar input yields a NumPy scalar.
    """
    values = np.asarray(x)
    if values.dtype.kind in "biu":
        values = values.astype(np.float64)
    # exp(-|x|) always lies in (0, 1], so neither branch below can overflow,
    # unlike exp(-x), which blows up for large negative x (notably in float32).
    decay = np.exp(-np.abs(values))
    result = np.where(values >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Hand back a scalar rather than a 0-d array when a scalar was passed in.
    return result[()] if result.ndim == 0 else result

class CustomTrainer(Trainer):
    """Trainer whose loss is a multi-label BCE on the logits at token position 0."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute binary cross-entropy between the position-0 logits and the labels.

        Args:
            model: The model being trained; called with `input_ids` and
                `attention_mask` only, so it does not compute a loss itself.
            inputs: Batch mapping providing 'input_ids', 'attention_mask'
                and 'labels' (one multi-hot vector per example).
            return_outputs: When true, return `(loss, outputs)` instead of
                just the loss.
            num_items_in_batch: Accepted for compatibility with Trainer
                versions that pass this keyword to `compute_loss`; it is not
                used, because the loss below is already a mean over the batch.

        Returns:
            The scalar loss, or a `(loss, outputs)` tuple when
            `return_outputs` is true.
        """
        outputs = model(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
        )
        # The logits carry one vector per token; keep only the vector at
        # position 0 of every sequence as the sentence-level prediction.
        cls_logits = outputs['logits'][:, 0, :]
        loss = torch.nn.BCEWithLogitsLoss()(cls_logits.float(),
                                            inputs['labels'].float())
        return (loss, outputs) if return_outputs else loss
    

def compute_metrics(eval_pred):
    """Score thresholded position-0 predictions against the reference labels.

    Args:
        eval_pred: A `(predictions, labels)` pair. `predictions` holds raw
            logits indexed as [example, token, class]; `labels` holds the
            multi-hot reference vectors.

    Returns:
        The result of `clf_metrics.compute` over the flattened 0/1
        predictions and references, i.e. every (example, class) cell is
        scored as one binary decision.
    """
    logits, references = eval_pred
    # Keep the logits at token position 0 and squash them to probabilities.
    cls_probs = sigmoid(logits[:, 0, :])
    # A class counts as predicted when its probability exceeds 0.5.
    flat_preds = (cls_probs > 0.5).astype(int).reshape(-1)
    flat_refs = references.astype(int).reshape(-1)
    return clf_metrics.compute(predictions=flat_preds, references=flat_refs)


#training_args = TrainingArguments(output_dir="test_trainer", eval_strategy="epoch", learning_rate = 5e-5, adam_epsilon = 1e-8, lr_scheduler_type = "reduce_lr_on_plateau", lr_scheduler_kwargs = {'patience':5} )

# Hyper-parameters for the fine-tuning run. Per-device batch sizes are not set
# here, so the library defaults apply.
training_args = TrainingArguments(
    output_dir="my_awesome_model",
    num_train_epochs=2,
    # Optimiser settings.
    learning_rate=5e-5,
    adam_epsilon=1e-8,
    weight_decay=0.01,
    # Learning-rate schedule.
    lr_scheduler_type="reduce_lr_on_plateau",
    lr_scheduler_kwargs={'patience': 5},
    # Evaluate and checkpoint once per epoch, then reload the best checkpoint
    # when training ends.
    # NOTE(review): some transformers releases name the first argument
    # `eval_strategy` - confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Wire the model, the small train/eval subsets, the collator and the metric
# callback into the custom trainer.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

In [None]:
# Write the tokenizer, then the fine-tuned model, into the same output directory.
for artefact in (tokenizer, model):
    artefact.save_pretrained("./first_finetuning_model")