In [1]:
# imports
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch
from torch.utils.data import Dataset
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


## Load Emotions Dataset

In [2]:
emo_train = load_dataset("csv", data_files="./Swahili Emotion Data/emo_train.csv", encoding = "ISO-8859-1")
emo_valid = load_dataset("csv", data_files="./Swahili Emotion Data/emo_valid.csv", encoding = "ISO-8859-1")
emo_test = load_dataset("csv", data_files="./Swahili Emotion Data/emo_test.csv", encoding = "ISO-8859-1")

In [3]:
# emo_train_df = pd.read_csv("./Swahili Emotion Data/emo_train.csv", encoding = "ISO-8859-1")
# display(emo_train_df)

# emo_valid_df = pd.read_csv("./Swahili Emotion Data/emo_valid.csv", encoding = "ISO-8859-1")
# display(emo_valid_df)

# emo_test_df = pd.read_csv("./Swahili Emotion Data/emo_test.csv", encoding = "ISO-8859-1")
# display(emo_test_df)

## Load Pre-Trained Model
### AfriBerta

In [4]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("castorini/afriberta_base")
model = AutoModelForTokenClassification.from_pretrained("castorini/afriberta_base", num_labels=7)
print(f"Number of labels: {model.config.num_labels}")

tokenizer.model_max_length = 512 

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at castorini/afriberta_base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Number of labels: 7


## Preprocess Data

In [5]:
import torch
from torch.utils.data import Dataset




In [6]:
# Preprocessing the Emotion Dataset using the Trainer

import ast

def tokenize_function(data):
    return tokenizer(data["text"], padding = "max_length", truncation=True)

tokenised_trained_emotion = emo_train.map(tokenize_function, batched=True)
tokenised_valid_emotion = emo_valid.map(tokenize_function, batched=True)
tokenised_test_emotion = emo_test.map(tokenize_function, batched=True)

tokenised_trained_emotion = tokenised_trained_emotion.rename_column("labels","label")
tokenised_valid_emotion = tokenised_valid_emotion.rename_column("labels","label")
tokenised_test_emotion = tokenised_test_emotion.rename_column("labels","label")

print(tokenised_trained_emotion)
print(tokenised_trained_emotion["train"])
print(tokenised_trained_emotion["train"]["text"])
print(tokenised_trained_emotion["train"]["label"])

        
 # Function to process each label
def process_label(label):
    element_list = ast.literal_eval(label)
    #if len(element_list) >= 1:
    #    new_value = element_list[0]
    #    return new_value # Return the first element as string
    return element_list  # Return unchanged if only one element

# Apply the function to the entire 'label' column
tokenised_trained_emotion = tokenised_trained_emotion.map(lambda fixlabel: {'label': process_label(fixlabel['label'])})       

tokenised_valid_emotion = tokenised_valid_emotion.map(lambda fixlabel: {'label': process_label(fixlabel['label'])})     

tokenised_test_emotion = tokenised_test_emotion.map(lambda fixlabel: {'label': process_label(fixlabel['label'])})   

print(tokenised_trained_emotion["train"]["label"])
        

small_train_dataset = tokenised_trained_emotion["train"].shuffle(seed=42).select(range(1000))
small_eval_dataset = tokenised_valid_emotion["train"].shuffle(seed=42).select(range(1000))


DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 9732
    })
})
Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 9732
})
['Ni zaidi ya Asha ngedere', 'Tuendelee kuchapa kazi ndugu zangu tena kwa furaha isiyo na kifani.', 'waziri anasoma utadhani uamefungwa mashine', ' Ukweli ni kwamba mjinga ndie ambae haoni Magu anachokifanya.', 'Hapo mlijiridhisha nn sasa??', 'samahani . Nimesahau yote kabisa .', 'Kutoka mahali fulani gizani, kikifuatana na kicheko, sauti ya kike  ilianza kuugua na kisha tukasikia kama kilio kwa mbalii', 'ninajisikia maumivu katika ovari zangu, kuzungumza na mimi kama mimi kama kuweka', 'Ilikuwa matarajio ya kupata dhahabu ambayo yaliwachochea wanaume hawa wenye usongo wa maisha kufanya juhudi kubwa za kufungua kampuni ', 'Huyo zuchu anavyojikuta sasa mmmh', 'Lily alipiga risasi ya kutisha haraka juu na chini barabarani. ', 'Kwa bahati mbaya, watu wenye wasi

In [14]:
from transformers import TrainingArguments, Trainer

import evaluate


metric = evaluate.load("accuracy")

print(tokenised_trained_emotion["train"][0])
print(model(tokenised_trained_emotion["train"][0]))


class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        print(f"Labels shape: {inputs['labels'].shape}")
        outputs = model(
            input_ids = inputs['input_ids'],
            attention_mask = inputs['attention_mask'],
        )
        loss = torch.nn.BCEWithLogitsLoss()(outputs['logits'], inputs['labels'].float())
        #loss = torch.nn.CrossEntropyLoss(reduction="mean")(outputs['logits'][:,0,:], inputs['labels'])
        return (loss, outputs) if return_outputs else loss
    
    

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    return metric.compute(predictions=predictions, references=labels)


training_args = TrainingArguments(output_dir="test_trainer", eval_strategy="epoch", learning_rate = 5e-5, adam_epsilon = 1e-8, lr_scheduler_type = "reduce_lr_on_plateau", lr_scheduler_kwargs = {'patience':5} )

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)
trainer.train()

{'text': 'Ni zaidi ya Asha ngedere', 'label': [4], 'input_ids': [0, 886, 720, 268, 24080, 261, 2948, 27241, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

TypeError: unhashable type: 'list'