### References
- Multi-Label Classification Model From Scratch: Step-by-Step Tutorial (https://huggingface.co/blog/Valerii-Knowledgator/multi-label-classification)
- https://github.com/NielsRogge/Transformers-Tutorials/blob/master/BERT/Fine_tuning_BERT_(and_friends)_for_multi_label_text_classification.ipynb
- https://github.com/huggingface/notebooks/blob/main/examples/text_classification.ipynb

# Libraries

In [19]:
%pip install -q -U datasets transformers accelerate sentencepiece

In [20]:
import os
import random
import numpy as np
import torch
import transformers
from pprint import pprint
from datetime import datetime
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding, Trainer, TrainingArguments
from sklearn.metrics import f1_score

# Config

In [21]:
seed = 42
lang = 'sun'
project_name = f'RoBERTa-Base-SE2025T11A-{lang}-v{datetime.now().strftime("%Y%m%d%H%M%S")}'
print("Project name:", project_name)

# hf_model_id = 'bhadresh-savani/roberta-base-emotion'
# hf_model_id = 'bhadresh-savani/bert-base-uncased-emotion'
# hf_model_id = 'w11wo/sundanese-bert-base-emotion-classifier'
hf_model_id = 'w11wo/sundanese-roberta-base-emotion-classifier'
# hf_model_id = 'w11wo/sundanese-roberta-base'
# hf_model_id = 'alxxtexxr/XLM-RoBERTa-Base-Sundanese-Emotion-Classifier-v20241222170134'
hf_tokenizer_id = hf_model_id
hf_data_id = 'alxxtexxr/SemEval2025-Task11-Dataset'
# hf_data_config = 'track_a_sun_70_15_15_stratify_v2'
hf_data_config = 'track_a_sun_go_emotions_80_10_10'
# hf_data_config = 'track_a_sun_go_emotions_70_15_15_balanced'

num_epochs = 2

Project name: RoBERTa-Base-SE2025T11A-sun-v20250111150219


In [22]:
def set_seed(seed):
    # Set random seed for NumPy
    np.random.seed(seed)

    # Set random seed for Torch
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # if using multi-GPU
    torch.backends.cudnn.deterministic = True  # Ensures deterministic results
    torch.backends.cudnn.benchmark = False  # Avoids non-deterministic algorithms

    # Set random seed for Transformers
    transformers.set_seed(seed)

    # Optionally set random seed for sklearn and Python's own random module
    random.seed(seed)

    # Set random seed for os
    os.environ['PYTHONHASHSEED'] = str(seed)

    print(f"Random seed set to: {seed}")

set_seed(seed)

Random seed set to: 42


# Data

## Load Data

In [23]:
datasets = load_dataset(hf_data_id, hf_data_config)
print(datasets)
print()

cols = list(datasets['train'].features)
emotion_cols = [col for col in cols if col not in ['Unnamed: 0', 'text', 'emotion', 'stratify', 'aug_go_emotions']]
splits = [*datasets.keys()]

print("Data columns:", cols)
print("Emotions columns:", emotion_cols)

(…)ack_a/sun_go_emotions_80_10_10/train.csv:   0%|          | 0.00/198k [00:00<?, ?B/s]

(…)track_a/sun_go_emotions_80_10_10/val.csv:   0%|          | 0.00/22.8k [00:00<?, ?B/s]

(…)rack_a/sun_go_emotions_80_10_10/test.csv:   0%|          | 0.00/24.3k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating val split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'emotion', 'marah', 'jijik', 'takut', 'senang', 'sedih', 'terkejut', 'biasa'],
        num_rows: 2014
    })
    val: Dataset({
        features: ['text', 'emotion', 'marah', 'jijik', 'takut', 'senang', 'sedih', 'terkejut', 'biasa'],
        num_rows: 240
    })
    test: Dataset({
        features: ['text', 'emotion', 'marah', 'jijik', 'takut', 'senang', 'sedih', 'terkejut', 'biasa'],
        num_rows: 241
    })
})

Data columns: ['text', 'emotion', 'marah', 'jijik', 'takut', 'senang', 'sedih', 'terkejut', 'biasa']
Emotions columns: ['marah', 'jijik', 'takut', 'senang', 'sedih', 'terkejut', 'biasa']


In [24]:
class2id = {class_:id for id, class_ in enumerate(emotion_cols)}
id2class = {id:class_ for class_, id in class2id.items()}

print("Class to ID:")
pprint(class2id, width=1)
print()
print("ID to Class:")
pprint(id2class, width=1)

Class to ID:
{'biasa': 6,
 'jijik': 1,
 'marah': 0,
 'sedih': 4,
 'senang': 3,
 'takut': 2,
 'terkejut': 5}

ID to Class:
{0: 'marah',
 1: 'jijik',
 2: 'takut',
 3: 'senang',
 4: 'sedih',
 5: 'terkejut',
 6: 'biasa'}


## Preprocess Data

In [25]:
tokenizer = AutoTokenizer.from_pretrained(hf_tokenizer_id)

In [26]:
def one_hot_encode_emotion(emotion, emotion_cols):
    emotions = emotion.replace(" ", "").split(",")
    one_hot_emotion = [1.0 if emotion_col in emotions else 0.0 for emotion_col in emotion_cols] # Ensure that the label is float, not int
    return one_hot_emotion

def preprocess_function(data):
   text = data['text']
   emotion = data['emotion']
   labels = one_hot_encode_emotion(emotion, emotion_cols)
   data = tokenizer(text, truncation=True)
   data['labels'] = labels
   return data

tokenized_datasets = {split: datasets[split].map(preprocess_function) for split in splits}

Map:   0%|          | 0/2014 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/240 [00:00<?, ? examples/s]

Map:   0%|          | 0/241 [00:00<?, ? examples/s]

In [27]:
# Sanity check
data = tokenized_datasets['train'][5]

print("Text:", data['text'])
print("Emotion(s):", data['emotion'])
print("Labels:", data['labels'], '-->', emotion_cols)

Text: min ai mnh teu cape lulumpatan wae :v kumaha weh ngagawe keun jelma mah ceuk mang dana ge :v
Emotion(s): senang, terkejut
Labels: [0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0] --> ['marah', 'jijik', 'takut', 'senang', 'sedih', 'terkejut', 'biasa']


In [28]:
data_collator = DataCollatorWithPadding(tokenizer)

# Model

In [29]:
# model = AutoModelForSequenceClassification.from_pretrained(
#     hf_model_id, 
#     num_labels=len(emotion_cols),
#     id2label=id2class, 
#     label2id=class2id,
#     problem_type = "multi_label_classification",
# )

################ TRANSFER LEARNING ################
model = AutoModelForSequenceClassification.from_pretrained(
    hf_model_id, 
    # num_labels=len(emotion_cols),
    # id2label=id2class, 
    # label2id=class2id,
    problem_type = "multi_label_classification",
)
print(model.classifier)

model.classifier.out_proj = torch.nn.Linear(in_features=768, out_features=len(emotion_cols), bias=True)
print(model.classifier)

# Freeze the rest of the layers for transfer learning
# for param in model.parameters():
#     param.requires_grad = False

RobertaClassificationHead(
  (dense): Linear(in_features=768, out_features=768, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (out_proj): Linear(in_features=768, out_features=4, bias=True)
)
RobertaClassificationHead(
  (dense): Linear(in_features=768, out_features=768, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (out_proj): Linear(in_features=768, out_features=7, bias=True)
)


# Finetuning

In [30]:
def sigmoid(x):
   return 1/(1 + np.exp(-x))

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    probs = sigmoid(predictions)
    y_pred = (probs > 0.5).astype(int)
    y_true = labels.astype(int)

    # Compute F1 score for each type of averaging method
    f1_micro = f1_score(y_true, y_pred, average='micro', zero_division=0.0)
    f1_macro = f1_score(y_true, y_pred, average='macro', zero_division=0.0)
    f1_weighted = f1_score(y_true, y_pred, average='weighted', zero_division=0.0)
    f1_samples = f1_score(y_true, y_pred, average='samples', zero_division=0.0)
    f1_labels = f1_score(y_true, y_pred, average=None, zero_division=0.0)
    f1_labels_dict = {f'f1_label_{emotion_cols[i]}': f1_labels[i] for i in range(len(f1_labels))}

    return {
        'f1_macro': f1_macro,
        'f1_micro': f1_micro,
        'f1_weighted': f1_weighted,
        'f1_samples': f1_samples,
        **f1_labels_dict,
    }

In [31]:
train_args = TrainingArguments(
    # Training config
    per_device_train_batch_size=2,
    num_train_epochs=num_epochs,
    learning_rate=2e-5,
    weight_decay=0.01,

    # Logging config for training
    logging_strategy='steps',
    logging_steps=100,

    # Evaluation config during training
    per_device_eval_batch_size=2,
    eval_strategy='steps',
    eval_steps=100,

    # Model saving config
    output_dir=project_name,
    save_strategy='epoch',
    # load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['val'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [32]:
trainer.train()

Step,Training Loss,Validation Loss,F1 Macro,F1 Micro,F1 Weighted,F1 Samples,F1 Label Marah,F1 Label Jijik,F1 Label Takut,F1 Label Senang,F1 Label Sedih,F1 Label Terkejut,F1 Label Biasa
100,0.4884,0.410573,0.114975,0.225434,0.160473,0.142361,0.0,0.0,0.0,0.596491,0.208333,0.0,0.0
200,0.4219,0.370646,0.247847,0.387597,0.297266,0.282639,0.086957,0.0,0.434783,0.784615,0.428571,0.0,0.0
300,0.3919,0.356718,0.297764,0.442822,0.350903,0.338889,0.0,0.0,0.470588,0.8,0.628571,0.185185,0.0
400,0.3807,0.355507,0.385381,0.491304,0.440722,0.430556,0.4375,0.177778,0.372093,0.806202,0.59375,0.310345,0.0
500,0.3436,0.347267,0.396919,0.52,0.450993,0.45,0.37931,0.0,0.566667,0.817518,0.603175,0.411765,0.0
600,0.3494,0.335597,0.423649,0.511737,0.472542,0.419444,0.37037,0.333333,0.530612,0.783333,0.597015,0.350877,0.0
700,0.3607,0.324403,0.454008,0.535469,0.491914,0.447222,0.235294,0.352941,0.5,0.81203,0.626866,0.392857,0.258065
800,0.3299,0.328007,0.463136,0.552846,0.517104,0.511111,0.454545,0.4,0.5,0.832117,0.59375,0.461538,0.0
900,0.3168,0.31695,0.510548,0.584677,0.556984,0.548611,0.520833,0.421053,0.52,0.846154,0.617647,0.5,0.148148
1000,0.3832,0.314899,0.496566,0.565957,0.537455,0.505556,0.481013,0.326531,0.542373,0.816,0.628571,0.474576,0.206897


TrainOutput(global_step=2014, training_loss=0.3204741385629421, metrics={'train_runtime': 224.2165, 'train_samples_per_second': 17.965, 'train_steps_per_second': 8.982, 'total_flos': 43847986865700.0, 'train_loss': 0.3204741385629421, 'epoch': 2.0})

In [33]:
trainer.push_to_hub()

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

events.out.tfevents.1736607778.2f4d2fe78b76.1312.2:   0%|          | 0.00/27.9k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.37k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/alxxtexxr/RoBERTa-Base-SE2025T11A-sun-v20250111150219/commit/d7f67ae58d2779c2bc259980d90a6c399b322931', commit_message='End of training', commit_description='', oid='d7f67ae58d2779c2bc259980d90a6c399b322931', pr_url=None, repo_url=RepoUrl('https://huggingface.co/alxxtexxr/RoBERTa-Base-SE2025T11A-sun-v20250111150219', endpoint='https://huggingface.co', repo_type='model', repo_id='alxxtexxr/RoBERTa-Base-SE2025T11A-sun-v20250111150219'), pr_revision=None, pr_num=None)

# Evaluation

In [34]:
eval = trainer.evaluate(eval_dataset=tokenized_datasets['test'])

print("Evaluation to copy:")
f1_keys = [eval_key for eval_key in eval.keys() if 'f1' in eval_key]
for i, k in enumerate(f1_keys): print(eval[k], end="\t" if i + 1 < len(f1_keys) else "")
print()
print()
print("Evaluation full results:")
pprint(eval)

Evaluation to copy:
0.6025994780411946	0.6301369863013698	0.6223253289738249	0.6147994467496541	0.6578947368421053	0.5	0.5925925925925926	0.8031496062992126	0.5753424657534246	0.5526315789473685	0.5365853658536586

Evaluation full results:
{'epoch': 2.0,
 'eval_f1_label_biasa': 0.5365853658536586,
 'eval_f1_label_jijik': 0.5,
 'eval_f1_label_marah': 0.6578947368421053,
 'eval_f1_label_sedih': 0.5753424657534246,
 'eval_f1_label_senang': 0.8031496062992126,
 'eval_f1_label_takut': 0.5925925925925926,
 'eval_f1_label_terkejut': 0.5526315789473685,
 'eval_f1_macro': 0.6025994780411946,
 'eval_f1_micro': 0.6301369863013698,
 'eval_f1_samples': 0.6147994467496541,
 'eval_f1_weighted': 0.6223253289738249,
 'eval_loss': 0.2972015142440796,
 'eval_runtime': 1.3082,
 'eval_samples_per_second': 184.227,
 'eval_steps_per_second': 92.496}


# Inference

In [35]:
data = datasets['val'][1]
text = data['text']
emotion_true = data['emotion']

inputs = tokenizer(text, return_tensors='pt').to(model.device)

outputs = trainer.model(**inputs)
logits = outputs.logits
probs = sigmoid(logits.squeeze().detach().cpu().numpy()) # apply sigmoid + threshold
labels_pred = (probs > 0.5).astype(int)
emotion_pred = [id2class[idx] for idx, label in enumerate(labels_pred) if label == 1.0] # turn predicted id's into actual label names

print("Text:", text)
print("True emotion(s):", emotion_true)
print("Predicted emotion(s):", ", ".join(emotion_pred))

Text: Request doel sumbang mang anu naha salah judul na
True emotion(s): takut, senang
Predicted emotion(s): senang
