In [None]:
# Fine-Tuning PuoBERTa with 5-Fold CV on 80% Train Set + Final Test on 20% Holdout
# Dataset: 477 Offensive, 500 Non-Offensive Samples

"""
Fine-tuning PuoBERTa with 5-fold cross-validation on an 80% training split,
followed by final evaluation on a 20% holdout test set.

Dataset (after preprocessing and splitting):
- 477 Offensive samples
- 500 Non-offensive samples

This script assumes you already have:
- train.csv  (80% of data)
- test.csv   (20% holdout)

both stored in a local ./data/ directory.
"""
!pip install transformers datasets torch scikit-learn

import os
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, roc_auc_score, matthews_corrcoef
from sklearn.model_selection import StratifiedKFold
from torch.nn import CrossEntropyLoss
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, RobertaForSequenceClassification, RobertaTokenizer

# Directory where the preprocessed CSV files are stored.
# For public code, we assume a local ./data/ folder in the repo (the path is
# relative to the current working directory).
DATA_DIR = Path("data")

# Load the training (80%) and held-out test (20%) CSV files.
# The paths are built from DATA_DIR; a literal 'DATA_DIR/train.csv' string
# would look for a folder that is actually named "DATA_DIR".
train_df = pd.read_csv(DATA_DIR / 'train.csv')
test_df = pd.read_csv(DATA_DIR / 'test.csv')

# Cast TEXT to str so the tokenizer always receives strings
# (note: missing cells become the literal string 'nan').
train_df['TEXT'] = train_df['TEXT'].astype(str)
test_df['TEXT'] = test_df['TEXT'].astype(str)

# Load the PuoBERTa tokenizer. Trigger tokens are currently NOT added: the
# add_special_tokens call below is disabled.
tokenizer = RobertaTokenizer.from_pretrained('dsfsi/PuoBERTa')
# tokenizer.add_special_tokens({'additional_special_tokens': ['<TRIGGER>', '</TRIGGER>']})
# NOTE: re-enabling the line above also requires resizing the model embeddings
# with model.resize_token_embeddings(len(tokenizer)).

def tokenize_batch(batch):
    """Tokenize the 'TEXT' entries of a batch.

    Every sequence is padded to, and truncated at, 128 tokens. The result of
    the module-level ``tokenizer`` call is returned unchanged.
    """
    encode_options = dict(padding='max_length', truncation=True, max_length=128)
    return tokenizer(batch['TEXT'], **encode_options)

# Define evaluation metrics
def compute_metrics(eval_pred):
    """Compute evaluation metrics from a ``(logits, labels)`` pair.

    Returns a dict with accuracy, Matthews correlation coefficient, ROC AUC
    (scored with the softmax probability of class 1), macro-averaged F1 and
    the recall of class 1.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)

    # Per-class scores, ordered as [class 0, class 1]; precision and support
    # are not reported.
    _, per_class_recall, per_class_f1, _ = precision_recall_fscore_support(
        labels, predicted, average=None, labels=[0, 1]
    )

    accuracy = accuracy_score(labels, predicted)
    mcc = matthews_corrcoef(labels, predicted)

    # ROC AUC needs a continuous score: use the softmax probability of class 1.
    class_probabilities = torch.softmax(torch.tensor(logits), dim=-1)
    positive_scores = class_probabilities[:, 1].numpy()
    roc_auc = roc_auc_score(labels, positive_scores)

    return {
        'accuracy': accuracy,
        'mcc': mcc,
        'roc_auc': roc_auc,
        'f1_macro': per_class_f1.mean(),
        'recall_1': per_class_recall[1],
    }

# Define custom loss function for class imbalance
def compute_weighted_loss(model, inputs, return_outputs=False, class_weights=(1.0, 2.0)):
    """Compute a class-weighted cross-entropy loss for one batch.

    Args:
        model: Callable invoked as ``model(**inputs_without_labels)``; its
            result must expose a ``logits`` attribute.
        inputs: Mapping of model inputs that also contains a ``'labels'``
            entry. The mapping is not modified.
        return_outputs: If True, return ``(loss, outputs)`` instead of the
            loss alone.
        class_weights: Per-class loss weights, indexed by class id. The
            default ``(1.0, 2.0)`` weights class 1 twice as heavily as
            class 0.

    Returns:
        The scalar loss tensor, or ``(loss, outputs)`` when
        ``return_outputs`` is True.

    Raises:
        KeyError: If ``inputs`` has no ``'labels'`` entry.
    """
    labels = inputs['labels']
    # Forward everything except the labels, without mutating the caller's dict.
    model_inputs = {key: value for key, value in inputs.items() if key != 'labels'}
    outputs = model(**model_inputs)
    logits = outputs.logits

    # Take device and dtype from the logits rather than from `model.device`,
    # which wrapped models (e.g. torch.nn.DataParallel) do not expose.
    weight = torch.as_tensor(class_weights, dtype=logits.dtype, device=logits.device)
    loss_fct = CrossEntropyLoss(weight=weight)

    # Flatten to (N, num_classes) using the actual size of the class dimension.
    loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
    return (loss, outputs) if return_outputs else loss

# Training arguments (one shared instance, reused by every cross-validation fold)
training_args = TrainingArguments(
    # Built from DATA_DIR; the literal string 'DATA_DIR/...' would name a
    # folder called "DATA_DIR" instead of using the configured directory.
    output_dir=str(DATA_DIR / 'PuoBERTa-finetuned'),
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=5,
    learning_rate=1e-5,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    # No checkpoints are written during training.
    save_strategy='no',
    # Run evaluation (and compute_metrics) at the end of every epoch.
    eval_strategy='epoch',
    # NOTE: the two settings below only take effect together with
    # load_best_model_at_end=True; as configured, each fold keeps the weights
    # from its final epoch.
    metric_for_best_model='recall_1',
    greater_is_better=True,
    load_best_model_at_end=False
)

# Apply 5-fold stratified cross-validation on the training split
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold_results = []   # trainer.evaluate() metrics dict, one per fold
best_models = []    # trained model of each fold (weights from its last epoch)

# String class names in the TARGET column -> integer class ids.
label_mapping = {'Non-offensive': 0, 'Offensive': 1}


def _build_dataset(frame):
    """Turn a DataFrame slice into a tokenized, torch-formatted Dataset."""
    dataset = Dataset.from_pandas(frame).map(tokenize_batch, batched=True)
    dataset = dataset.rename_column('TARGET', 'label')
    # Replace the string labels with their integer ids.
    dataset = dataset.map(
        lambda examples: {'label': [label_mapping[x] for x in examples['label']]},
        batched=True,
    )
    dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
    return dataset


class WeightedLossTrainer(Trainer):
    """Trainer whose loss is computed by ``compute_weighted_loss``.

    Assigning a function to ``trainer.loss_function`` has no effect because
    Trainer never reads that attribute; overriding ``compute_loss`` is the
    supported way to plug in a custom loss.
    """

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        # **kwargs absorbs extra arguments (e.g. `num_items_in_batch`) that
        # newer transformers releases pass to compute_loss.
        return compute_weighted_loss(model, inputs, return_outputs=return_outputs)


for fold, (train_idx, val_idx) in enumerate(kfold.split(train_df['TEXT'], train_df['TARGET'])):
    print(f'\n=== Fold {fold + 1}/5 ===')
    train_dataset = _build_dataset(train_df.iloc[train_idx])
    val_dataset = _build_dataset(train_df.iloc[val_idx])

    # Start every fold from the pretrained checkpoint so folds stay independent.
    # NOTE: if special tokens (e.g. <TRIGGER>, </TRIGGER>) are ever added to the
    # tokenizer, the embeddings must be resized as well:
    # model.resize_token_embeddings(len(tokenizer))
    model = RobertaForSequenceClassification.from_pretrained('dsfsi/PuoBERTa', num_labels=2)

    trainer = WeightedLossTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
    )
    trainer.train()

    # Validation metrics of the final-epoch model; the reported eval_loss is
    # the weighted loss as well.
    metrics = trainer.evaluate()
    fold_results.append(metrics)
    best_models.append(model)
    print(metrics)
# Final evaluation on the held-out 20% test set
# Select the fold whose model achieved the highest validation recall on class 1.
best_fold_index = int(np.argmax([r['eval_recall_1'] for r in fold_results]))
best_model = best_models[best_fold_index]
print(f'\nBest Fold: Fold {best_fold_index + 1}')

# Prepare the test split the same way as the training folds:
# tokenize, then rename the TARGET column to 'label'.
test_dataset = Dataset.from_pandas(test_df).map(tokenize_batch, batched=True)
test_dataset = test_dataset.rename_column('TARGET', 'label')

# Convert the string labels to integer ids before setting the torch format
label_mapping = {'Non-offensive': 0, 'Offensive': 1}
test_dataset = test_dataset.map(lambda examples: {'label': [label_mapping[x] for x in examples['label']]}, batched=True)

test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Evaluation-only Trainer: no training dataset is supplied.
test_trainer = Trainer(
    model=best_model,
    args=TrainingArguments(output_dir='./tmp-test', per_device_eval_batch_size=64),
    compute_metrics=compute_metrics
)
test_metrics = test_trainer.evaluate(test_dataset)
print('\n=== Test Set Metrics ===')
print(test_metrics)

# Save the best model and tokenizer under DATA_DIR. The path is built from the
# DATA_DIR variable; the literal string 'DATA_DIR/...' would create a folder
# that is actually named "DATA_DIR".
save_dir = str(DATA_DIR / 'PuoBERTa-setswana-offensive-v11')
best_model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)
print(f"Best performing model saved to: {save_dir}")


Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_c

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/877k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/523k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.28M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/636 [00:00<?, ?B/s]


=== Fold 1/5 ===


Map:   0%|          | 0/624 [00:00<?, ? examples/s]

Map:   0%|          | 0/156 [00:00<?, ? examples/s]

Map:   0%|          | 0/624 [00:00<?, ? examples/s]

Map:   0%|          | 0/156 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/334M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at dsfsi/PuoBERTa and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mbkekgathetse[0m ([33mbkekgathetse-university-of-pretoria[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,Mcc,Roc Auc,F1 Macro,Recall 1
1,0.6363,0.539493,0.839744,0.685734,0.911228,0.838623,0.766234
2,0.4653,0.357652,0.891026,0.782116,0.942956,0.891021,0.896104
3,0.321,0.290056,0.884615,0.772977,0.957587,0.884139,0.831169
4,0.2127,0.267911,0.891026,0.783451,0.958573,0.890806,0.857143
5,0.2526,0.265818,0.897436,0.797002,0.960053,0.897165,0.857143


{'eval_loss': 0.2658180892467499, 'eval_accuracy': 0.8974358974358975, 'eval_mcc': 0.797002456468804, 'eval_roc_auc': 0.9600526056222259, 'eval_f1_macro': 0.897165458141068, 'eval_recall_1': 0.8571428571428571, 'eval_runtime': 0.4536, 'eval_samples_per_second': 343.881, 'eval_steps_per_second': 6.613, 'epoch': 5.0}

=== Fold 2/5 ===


Map:   0%|          | 0/624 [00:00<?, ? examples/s]

Map:   0%|          | 0/156 [00:00<?, ? examples/s]

Map:   0%|          | 0/624 [00:00<?, ? examples/s]

Map:   0%|          | 0/156 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at dsfsi/PuoBERTa and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Mcc,Roc Auc,F1 Macro,Recall 1
1,0.5855,0.541961,0.762821,0.531716,0.842512,0.762028,0.831169
2,0.3833,0.455522,0.788462,0.588834,0.880322,0.786982,0.883117
3,0.3033,0.406928,0.807692,0.61565,0.898241,0.807692,0.818182
4,0.1733,0.395047,0.807692,0.61565,0.907776,0.807692,0.818182
5,0.1974,0.391837,0.807692,0.61565,0.908927,0.807692,0.818182


{'eval_loss': 0.3918374478816986, 'eval_accuracy': 0.8076923076923077, 'eval_mcc': 0.6156501726121979, 'eval_roc_auc': 0.9089265165214532, 'eval_f1_macro': 0.8076923076923077, 'eval_recall_1': 0.8181818181818182, 'eval_runtime': 0.4707, 'eval_samples_per_second': 331.415, 'eval_steps_per_second': 6.373, 'epoch': 5.0}

=== Fold 3/5 ===


Map:   0%|          | 0/624 [00:00<?, ? examples/s]

Map:   0%|          | 0/156 [00:00<?, ? examples/s]

Map:   0%|          | 0/624 [00:00<?, ? examples/s]

Map:   0%|          | 0/156 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at dsfsi/PuoBERTa and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Mcc,Roc Auc,F1 Macro,Recall 1
1,0.5913,0.537495,0.75,0.499877,0.817195,0.749908,0.74026
2,0.4102,0.488597,0.788462,0.578707,0.855006,0.787755,0.74026
3,0.2698,0.459315,0.801282,0.602483,0.887062,0.801209,0.792208
4,0.2236,0.453702,0.801282,0.602483,0.89709,0.801209,0.792208
5,0.2421,0.459539,0.807692,0.615321,0.898241,0.807661,0.805195


{'eval_loss': 0.4595390856266022, 'eval_accuracy': 0.8076923076923077, 'eval_mcc': 0.6153213874732862, 'eval_roc_auc': 0.8982409995068222, 'eval_f1_macro': 0.8076606937366431, 'eval_recall_1': 0.8051948051948052, 'eval_runtime': 0.4633, 'eval_samples_per_second': 336.684, 'eval_steps_per_second': 6.475, 'epoch': 5.0}

=== Fold 4/5 ===


Map:   0%|          | 0/624 [00:00<?, ? examples/s]

Map:   0%|          | 0/156 [00:00<?, ? examples/s]

Map:   0%|          | 0/624 [00:00<?, ? examples/s]

Map:   0%|          | 0/156 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at dsfsi/PuoBERTa and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Mcc,Roc Auc,F1 Macro,Recall 1
1,0.6058,0.511529,0.833333,0.670578,0.875719,0.833086,0.883117
2,0.3891,0.398618,0.846154,0.696325,0.903995,0.845926,0.896104
3,0.2972,0.373991,0.826923,0.653786,0.916982,0.826859,0.818182
4,0.1595,0.376427,0.839744,0.67978,0.920434,0.839579,0.818182
5,0.2163,0.372283,0.852564,0.705186,0.922735,0.852558,0.857143


{'eval_loss': 0.37228333950042725, 'eval_accuracy': 0.8525641025641025, 'eval_mcc': 0.7051861616651155, 'eval_roc_auc': 0.9227354923557456, 'eval_f1_macro': 0.8525580439695911, 'eval_recall_1': 0.8571428571428571, 'eval_runtime': 0.4712, 'eval_samples_per_second': 331.059, 'eval_steps_per_second': 6.367, 'epoch': 5.0}

=== Fold 5/5 ===


Map:   0%|          | 0/624 [00:00<?, ? examples/s]

Map:   0%|          | 0/156 [00:00<?, ? examples/s]

Map:   0%|          | 0/624 [00:00<?, ? examples/s]

Map:   0%|          | 0/156 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at dsfsi/PuoBERTa and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Mcc,Roc Auc,F1 Macro,Recall 1
1,0.6323,0.510782,0.807692,0.620505,0.916502,0.806899,0.74359
2,0.4296,0.353,0.852564,0.706581,0.935733,0.852412,0.820513
3,0.3037,0.313025,0.871795,0.744569,0.938199,0.871711,0.897436
4,0.2237,0.314455,0.865385,0.73373,0.937541,0.865113,0.910256
5,0.206,0.309591,0.871795,0.743834,0.938856,0.871774,0.884615


{'eval_loss': 0.30959075689315796, 'eval_accuracy': 0.8717948717948718, 'eval_mcc': 0.7438343052617367, 'eval_roc_auc': 0.9388560157790927, 'eval_f1_macro': 0.8717737958244287, 'eval_recall_1': 0.8846153846153846, 'eval_runtime': 0.4731, 'eval_samples_per_second': 329.722, 'eval_steps_per_second': 6.341, 'epoch': 5.0}

Best Fold: Fold 5


Map:   0%|          | 0/196 [00:00<?, ? examples/s]

Map:   0%|          | 0/196 [00:00<?, ? examples/s]


=== Test Set Metrics ===
{'eval_loss': 0.3381184935569763, 'eval_model_preparation_time': 0.0027, 'eval_accuracy': 0.8673469387755102, 'eval_mcc': 0.7326205066429489, 'eval_roc_auc': 0.9288259958071279, 'eval_f1_macro': 0.8662186285834296, 'eval_recall_1': 0.8444444444444444, 'eval_runtime': 0.5897, 'eval_samples_per_second': 332.398, 'eval_steps_per_second': 6.784}
Best performing model saved to: /content/drive/My Drive/Colab Notebooks/PuoBERTa-best-fold-model-80-20
