In [None]:
# Fine-Tuning Afro-XLM-RoBERTa with 5-Fold CV on 80% Train Set + Final Test on 20% Holdout
# Dataset: 477 Offensive, 500 Non-Offensive Samples

"""
Fine-tuning Afro-XLM-R with 5-fold cross-validation on an 80% training split,
followed by final evaluation on a 20% holdout test set.

Dataset (after preprocessing and splitting):
- 477 Offensive samples
- 500 Non-offensive samples

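This script assumes you already have:
- training_data2.csv  (80% of data, used for 5-fold cross-validation)
- testing_data2.csv   (20% holdout, used only for the final evaluation)

both stored in a local ./data/ directory, each with a TEXT column and a
TARGET column whose values are 'Offensive' or 'Non-offensive'.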
"""

!pip install transformers datasets torch scikit-learn

import os
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, roc_auc_score, matthews_corrcoef
from sklearn.model_selection import StratifiedKFold
from torch.nn import CrossEntropyLoss
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoTokenizer

# ---------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------
# Base checkpoint that gets fine-tuned. Replace this with the Hugging Face
# model ID you want to start from, e.g. "Davlan/afro-xlmr-base".

model_name = 'Davlan/afro-xlmr-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Optional: if your experiments use explicit trigger markers, register them
# here (and resize the model's token embeddings to len(tokenizer) afterwards):
# SPECIAL_TOKENS = {"additional_special_tokens": ["<TRIGGER>", "</TRIGGER>"]}
# tokenizer.add_special_tokens(SPECIAL_TOKENS)

# Directory where preprocessed CSV files are stored.
# For public code, we assume a local ./data/ folder in the repo.
DATA_DIR = Path("data")

# Build the CSV paths from DATA_DIR itself; a literal 'DATA_DIR/...' string
# would look for a directory that is actually named "DATA_DIR".
train_df = pd.read_csv(DATA_DIR / 'training_data2.csv')
test_df = pd.read_csv(DATA_DIR / 'testing_data2.csv')

# Cast TEXT to str so the tokenizer always receives strings, even for cells
# that pandas parsed as numbers or NaN.
train_df['TEXT'] = train_df['TEXT'].astype(str)
test_df['TEXT'] = test_df['TEXT'].astype(str)

def tokenize_batch(batch):
    """Tokenize the TEXT field of a batch with the module-level tokenizer.

    Every sequence is truncated and padded to exactly 128 tokens, so all
    encoded examples share the same length.
    """
    texts = batch['TEXT']
    encoded = tokenizer(
        texts,
        truncation=True,
        max_length=128,
        padding='max_length',
    )
    return encoded

def compute_metrics(eval_pred):
    """Compute evaluation metrics from a (logits, labels) pair.

    Returns a dict with accuracy, Matthews correlation, ROC AUC (scored on
    the softmax probability of class 1), macro-averaged F1 over classes 0 and
    1, and the recall of class 1.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)

    # Per-class scores; index 0 is class 0 and index 1 is class 1.
    _, per_class_recall, per_class_f1, _ = precision_recall_fscore_support(
        labels, predicted, average=None, labels=[0, 1]
    )

    # ROC AUC needs a continuous score, so use the class-1 probability.
    class_probs = torch.softmax(torch.tensor(logits), dim=-1)
    positive_scores = class_probs[:, 1].numpy()

    results = {}
    results['accuracy'] = accuracy_score(labels, predicted)
    results['mcc'] = matthews_corrcoef(labels, predicted)
    results['roc_auc'] = roc_auc_score(labels, positive_scores)
    results['f1_macro'] = per_class_f1.mean()
    results['recall_1'] = per_class_recall[1]
    return results

def compute_weighted_loss(model, inputs, return_outputs=False, class_weights=(1.0, 2.0)):
    """Compute a class-weighted cross-entropy loss for one batch.

    Args:
        model: Callable model; invoked as ``model(**inputs_without_labels)``
            and expected to return an object exposing ``.logits``.
        inputs: Mapping of model inputs that also contains a ``'labels'``
            tensor. The mapping is left unmodified.
        return_outputs: When True, return ``(loss, outputs)`` instead of just
            the loss.
        class_weights: Per-class loss weights, one entry per output class.
            The default weights class 1 twice as heavily as class 0.

    Returns:
        The scalar loss tensor, or ``(loss, outputs)`` if ``return_outputs``.
    """
    labels = inputs['labels']
    # Work on a shallow copy so the caller's batch still holds its labels.
    model_inputs = {key: value for key, value in inputs.items() if key != 'labels'}
    outputs = model(**model_inputs)
    logits = outputs.logits

    # Take device/dtype from the logits rather than model.device, which is
    # not available when the model is wrapped (e.g. torch.nn.DataParallel).
    weight = torch.tensor(class_weights, dtype=logits.dtype, device=logits.device)
    loss_fct = CrossEntropyLoss(weight=weight)
    loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
    return (loss, outputs) if return_outputs else loss

# Hyperparameters shared by every cross-validation fold.
# No checkpoints are written (save_strategy='no') and the best checkpoint is
# not reloaded (load_best_model_at_end=False), so each fold ends with the
# weights from its final epoch.
training_args = TrainingArguments(
    # Build the path from DATA_DIR; the literal string 'DATA_DIR/...' would
    # create a directory that is actually named "DATA_DIR".
    output_dir=os.path.join(DATA_DIR, 'AfroXLMR-80-20-noTrigger'),
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=5,
    learning_rate=1e-5,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_strategy='no',
    eval_strategy='epoch',  # run compute_metrics on the validation fold each epoch
    metric_for_best_model='recall_1',
    greater_is_better=True,
    load_best_model_at_end=False
)

class WeightedLossTrainer(Trainer):
    """Trainer whose loss is computed by compute_weighted_loss.

    Assigning a function to ``trainer.loss_function`` has no effect because
    Trainer never reads that attribute; overriding ``compute_loss`` is the
    supported way to plug in a custom loss.
    """

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        # **kwargs absorbs extra arguments (such as num_items_in_batch) that
        # newer transformers releases pass to compute_loss.
        return compute_weighted_loss(model, inputs, return_outputs=return_outputs)


# String class names in the TARGET column -> integer class ids.
label_mapping = {'Non-offensive': 0, 'Offensive': 1}


def _to_model_dataset(frame):
    """Turn a DataFrame with TEXT/TARGET columns into a torch-formatted Dataset."""
    dataset = Dataset.from_pandas(frame).map(tokenize_batch, batched=True)
    dataset = dataset.rename_column('TARGET', 'label')
    dataset = dataset.map(
        lambda examples: {'label': [label_mapping[x] for x in examples['label']]},
        batched=True,
    )
    dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
    return dataset


kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold_results = []
best_models = []

for fold, (train_idx, val_idx) in enumerate(kfold.split(train_df['TEXT'], train_df['TARGET'])):
    print(f'\n=== Fold {fold + 1}/{kfold.get_n_splits()} ===')
    train_dataset = _to_model_dataset(train_df.iloc[train_idx])
    val_dataset = _to_model_dataset(train_df.iloc[val_idx])

    # Start every fold from the same pretrained checkpoint so folds are independent.
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2, ignore_mismatched_sizes=True)
    # model.resize_token_embeddings(len(tokenizer))  # needed only if special tokens were added

    trainer = WeightedLossTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
    )
    trainer.train()
    metrics = trainer.evaluate()
    fold_results.append(metrics)
    # Park the finished model on the CPU so the fold models do not pile up
    # in accelerator memory; a Trainer moves it back to the device when needed.
    best_models.append(model.cpu())
    print(metrics)

# Select the fold whose model reached the highest validation recall on the
# Offensive class (label 1). Note that each fold was scored on a different
# validation split.
best_fold_index = int(np.argmax([r['eval_recall_1'] for r in fold_results]))
best_model = best_models[best_fold_index]
print(f'\nBest Fold: Fold {best_fold_index + 1}')

# Prepare the holdout set in the same way as the training folds.
test_dataset = Dataset.from_pandas(test_df).map(tokenize_batch, batched=True)
test_dataset = test_dataset.rename_column('TARGET', 'label')

# label_mapping is the module-level dict created in the cross-validation section.
test_dataset = test_dataset.map(lambda examples: {'label': [label_mapping[x] for x in examples['label']]}, batched=True)
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Evaluate once on the holdout set. The evaluation used to be run twice with
# identical inputs and identical results; the redundant copy was removed.
test_trainer = Trainer(
    model=best_model,
    args=TrainingArguments(output_dir='./tmp-test-afroxlmr', per_device_eval_batch_size=64),
    compute_metrics=compute_metrics
)
test_metrics = test_trainer.evaluate(test_dataset)
print('\n=== Test Set Metrics ===')
print(test_metrics)

# Build the save path from DATA_DIR rather than the literal text 'DATA_DIR'.
save_dir = os.path.join(DATA_DIR, 'AfroXLMR-best-fold-model-trigger-free')
best_model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)
print(f"Best performing model saved to: {save_dir}")


Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_c

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/398 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]


=== Fold 1/5 ===


Map:   0%|          | 0/624 [00:00<?, ? examples/s]

Map:   0%|          | 0/156 [00:00<?, ? examples/s]

Map:   0%|          | 0/624 [00:00<?, ? examples/s]

Map:   0%|          | 0/156 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at Davlan/afro-xlmr-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mbkekgathetse[0m ([33mbkekgathetse-university-of-pretoria[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,Mcc,Roc Auc,F1 Macro,Recall 1
1,0.6885,0.659175,0.698718,0.494246,0.919612,0.66574,0.38961
2,0.6134,0.475366,0.858974,0.718231,0.941476,0.858974,0.87013
3,0.4999,0.359675,0.871795,0.749332,0.940325,0.871032,0.805195
4,0.4289,0.329935,0.884615,0.772977,0.941147,0.884139,0.831169
5,0.4531,0.324967,0.878205,0.757731,0.939668,0.877959,0.844156


{'eval_loss': 0.3249669671058655, 'eval_accuracy': 0.8782051282051282, 'eval_mcc': 0.7577312882940623, 'eval_roc_auc': 0.9396679270096991, 'eval_f1_macro': 0.877959402149298, 'eval_recall_1': 0.8441558441558441, 'eval_runtime': 0.9269, 'eval_samples_per_second': 168.297, 'eval_steps_per_second': 3.236, 'epoch': 5.0}

=== Fold 2/5 ===


Map:   0%|          | 0/624 [00:00<?, ? examples/s]

Map:   0%|          | 0/156 [00:00<?, ? examples/s]

Map:   0%|          | 0/624 [00:00<?, ? examples/s]

Map:   0%|          | 0/156 [00:00<?, ? examples/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at Davlan/afro-xlmr-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Mcc,Roc Auc,F1 Macro,Recall 1
1,0.6804,0.643396,0.724359,0.448755,0.809305,0.724348,0.727273
2,0.5815,0.560401,0.737179,0.525958,0.85106,0.726374,0.948052
3,0.4598,0.44621,0.801282,0.604066,0.890679,0.801209,0.831169
4,0.3595,0.418745,0.826923,0.656832,0.898241,0.826745,0.87013
5,0.3406,0.405178,0.826923,0.656832,0.90268,0.826745,0.87013


{'eval_loss': 0.40517792105674744, 'eval_accuracy': 0.8269230769230769, 'eval_mcc': 0.6568323438998138, 'eval_roc_auc': 0.9026795988821306, 'eval_f1_macro': 0.8267450948130476, 'eval_recall_1': 0.8701298701298701, 'eval_runtime': 0.9241, 'eval_samples_per_second': 168.815, 'eval_steps_per_second': 3.246, 'epoch': 5.0}

=== Fold 3/5 ===


Map:   0%|          | 0/624 [00:00<?, ? examples/s]

Map:   0%|          | 0/156 [00:00<?, ? examples/s]

Map:   0%|          | 0/624 [00:00<?, ? examples/s]

Map:   0%|          | 0/156 [00:00<?, ? examples/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at Davlan/afro-xlmr-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Mcc,Roc Auc,F1 Macro,Recall 1
1,0.6731,0.649119,0.679487,0.368345,0.7253,0.676885,0.779221
2,0.5284,0.553691,0.711538,0.430206,0.802565,0.710097,0.792208
3,0.417,0.483771,0.794872,0.589735,0.845307,0.794737,0.779221
4,0.3664,0.46828,0.814103,0.628135,0.856978,0.814034,0.805195
5,0.367,0.462849,0.814103,0.628426,0.861581,0.813911,0.792208


{'eval_loss': 0.4628492593765259, 'eval_accuracy': 0.8141025641025641, 'eval_mcc': 0.6284261228115992, 'eval_roc_auc': 0.8615814565181654, 'eval_f1_macro': 0.8139113981325325, 'eval_recall_1': 0.7922077922077922, 'eval_runtime': 0.927, 'eval_samples_per_second': 168.279, 'eval_steps_per_second': 3.236, 'epoch': 5.0}

=== Fold 4/5 ===


Map:   0%|          | 0/624 [00:00<?, ? examples/s]

Map:   0%|          | 0/156 [00:00<?, ? examples/s]

Map:   0%|          | 0/624 [00:00<?, ? examples/s]

Map:   0%|          | 0/156 [00:00<?, ? examples/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at Davlan/afro-xlmr-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Mcc,Roc Auc,F1 Macro,Recall 1
1,0.6844,0.673239,0.692308,0.419485,0.751767,0.682065,0.883117
2,0.6016,0.546046,0.74359,0.4874,0.831333,0.74321,0.714286
3,0.4652,0.47898,0.782051,0.565279,0.851718,0.781477,0.74026
4,0.3629,0.458609,0.794872,0.592268,0.865856,0.794025,0.74026
5,0.3661,0.451414,0.801282,0.604066,0.870623,0.801209,0.831169


{'eval_loss': 0.4514135718345642, 'eval_accuracy': 0.8012820512820513, 'eval_mcc': 0.604065969281972, 'eval_roc_auc': 0.8706230478382376, 'eval_f1_macro': 0.8012085337279566, 'eval_recall_1': 0.8311688311688312, 'eval_runtime': 0.9306, 'eval_samples_per_second': 167.63, 'eval_steps_per_second': 3.224, 'epoch': 5.0}

=== Fold 5/5 ===


Map:   0%|          | 0/624 [00:00<?, ? examples/s]

Map:   0%|          | 0/156 [00:00<?, ? examples/s]

Map:   0%|          | 0/624 [00:00<?, ? examples/s]

Map:   0%|          | 0/156 [00:00<?, ? examples/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at Davlan/afro-xlmr-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Mcc,Roc Auc,F1 Macro,Recall 1
1,0.6756,0.658789,0.679487,0.404651,0.791913,0.661458,0.448718
2,0.6037,0.554626,0.769231,0.540062,0.817719,0.768889,0.807692
3,0.5361,0.475865,0.820513,0.641236,0.856673,0.820483,0.833333
4,0.4009,0.440794,0.826923,0.65433,0.876561,0.826859,0.846154
5,0.3817,0.429645,0.820513,0.641026,0.879191,0.820513,0.820513


{'eval_loss': 0.4296453893184662, 'eval_accuracy': 0.8205128205128205, 'eval_mcc': 0.6410256410256411, 'eval_roc_auc': 0.8791913214990137, 'eval_f1_macro': 0.8205128205128205, 'eval_recall_1': 0.8205128205128205, 'eval_runtime': 0.93, 'eval_samples_per_second': 167.75, 'eval_steps_per_second': 3.226, 'epoch': 5.0}

Best Fold: Fold 2


Map:   0%|          | 0/196 [00:00<?, ? examples/s]

Map:   0%|          | 0/196 [00:00<?, ? examples/s]


=== Test Set Metrics ===
{'eval_loss': 0.38945093750953674, 'eval_model_preparation_time': 0.0027, 'eval_accuracy': 0.8622448979591837, 'eval_mcc': 0.7228973640941718, 'eval_roc_auc': 0.9014675052410901, 'eval_f1_macro': 0.8603214823026368, 'eval_recall_1': 0.8111111111111111, 'eval_runtime': 1.1695, 'eval_samples_per_second': 167.586, 'eval_steps_per_second': 3.42}



=== Test Set Metrics ===
{'eval_loss': 0.38945093750953674, 'eval_model_preparation_time': 0.0029, 'eval_accuracy': 0.8622448979591837, 'eval_mcc': 0.7228973640941718, 'eval_roc_auc': 0.9014675052410901, 'eval_f1_macro': 0.8603214823026368, 'eval_recall_1': 0.8111111111111111, 'eval_runtime': 1.1634, 'eval_samples_per_second': 168.468, 'eval_steps_per_second': 3.438}
Best performing model saved to: /content/drive/My Drive/Colab Notebooks/AfroXLMR-best-fold-model-80-20-noTriggers
