### References
- Multi-Label Classification Model From Scratch: Step-by-Step Tutorial (https://huggingface.co/blog/Valerii-Knowledgator/multi-label-classification)
- https://github.com/NielsRogge/Transformers-Tutorials/blob/master/BERT/Fine_tuning_BERT_(and_friends)_for_multi_label_text_classification.ipynb
- https://github.com/huggingface/notebooks/blob/main/examples/text_classification.ipynb

# Libraries

In [19]:
%pip install -q -U datasets transformers accelerate sentencepiece

In [20]:
import os
import random
import numpy as np
import torch
import transformers
from pprint import pprint
from datetime import datetime
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding, Trainer, TrainingArguments
from sklearn.metrics import f1_score

# Config

In [21]:
seed = 42
lang = 'sun'
project_name = f'RoBERTa-Base-SE2025T11A-{lang}-v{datetime.now().strftime("%Y%m%d%H%M%S")}'
print("Project name:", project_name)

# hf_model_id = 'bhadresh-savani/roberta-base-emotion'
# hf_model_id = 'bhadresh-savani/bert-base-uncased-emotion'
# hf_model_id = 'w11wo/sundanese-bert-base-emotion-classifier'
hf_model_id = 'w11wo/sundanese-roberta-base-emotion-classifier'
# hf_model_id = 'w11wo/sundanese-roberta-base'
# hf_model_id = 'alxxtexxr/XLM-RoBERTa-Base-Sundanese-Emotion-Classifier-v20241222170134'
hf_tokenizer_id = hf_model_id
hf_data_id = 'alxxtexxr/SemEval2025-Task11-Dataset'
# hf_data_config = 'track_a_sun_70_15_15_stratify_v2'
# hf_data_config = 'track_a_sun_go_emotions_70_15_15_v2'
hf_data_config = 'track_a_sun_go_emotions_70_15_15_v6'
# hf_data_config = 'track_a_sun_go_emotions_70_15_15_balanced'

num_epochs = 2

Project name: RoBERTa-Base-SE2025T11A-sun-v20250112124824


In [22]:
def set_seed(seed):
    # Set random seed for NumPy
    np.random.seed(seed)

    # Set random seed for Torch
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # if using multi-GPU
    torch.backends.cudnn.deterministic = True  # Ensures deterministic results
    torch.backends.cudnn.benchmark = False  # Avoids non-deterministic algorithms

    # Set random seed for Transformers
    transformers.set_seed(seed)

    # Optionally set random seed for sklearn and Python's own random module
    random.seed(seed)

    # Set random seed for os
    os.environ['PYTHONHASHSEED'] = str(seed)

    print(f"Random seed set to: {seed}")

set_seed(seed)

Random seed set to: 42


# Data

## Load Data

In [23]:
datasets = load_dataset(hf_data_id, hf_data_config)
print(datasets)
print()

cols = list(datasets['train'].features)
emotion_cols = [col for col in cols if col not in ['Unnamed: 0', 'text', 'emotion', 'stratify', 'aug_go_emotions']]
splits = [*datasets.keys()]

print("Data columns:", cols)
print("Emotions columns:", emotion_cols)

README.md:   0%|          | 0.00/5.55k [00:00<?, ?B/s]

(…)_a/sun_go_emotions_70_15_15_v5/train.csv:   0%|          | 0.00/175k [00:00<?, ?B/s]

(…)ck_a/sun_go_emotions_70_15_15_v5/val.csv:   0%|          | 0.00/33.9k [00:00<?, ?B/s]

(…)k_a/sun_go_emotions_70_15_15_v5/test.csv:   0%|          | 0.00/35.4k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating val split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'emotion', 'marah', 'jijik', 'takut', 'senang', 'sedih', 'terkejut', 'biasa'],
        num_rows: 1764
    })
    val: Dataset({
        features: ['text', 'emotion', 'marah', 'jijik', 'takut', 'senang', 'sedih', 'terkejut', 'biasa'],
        num_rows: 365
    })
    test: Dataset({
        features: ['text', 'emotion', 'marah', 'jijik', 'takut', 'senang', 'sedih', 'terkejut', 'biasa'],
        num_rows: 365
    })
})

Data columns: ['text', 'emotion', 'marah', 'jijik', 'takut', 'senang', 'sedih', 'terkejut', 'biasa']
Emotions columns: ['marah', 'jijik', 'takut', 'senang', 'sedih', 'terkejut', 'biasa']


In [24]:
class2id = {class_:id for id, class_ in enumerate(emotion_cols)}
id2class = {id:class_ for class_, id in class2id.items()}

print("Class to ID:")
pprint(class2id, width=1)
print()
print("ID to Class:")
pprint(id2class, width=1)

Class to ID:
{'biasa': 6,
 'jijik': 1,
 'marah': 0,
 'sedih': 4,
 'senang': 3,
 'takut': 2,
 'terkejut': 5}

ID to Class:
{0: 'marah',
 1: 'jijik',
 2: 'takut',
 3: 'senang',
 4: 'sedih',
 5: 'terkejut',
 6: 'biasa'}


## Preprocess Data

In [25]:
tokenizer = AutoTokenizer.from_pretrained(hf_tokenizer_id)

In [26]:
def one_hot_encode_emotion(emotion, emotion_cols):
    emotions = emotion.replace(" ", "").split(",")
    one_hot_emotion = [1.0 if emotion_col in emotions else 0.0 for emotion_col in emotion_cols] # Ensure that the label is float, not int
    return one_hot_emotion

def preprocess_function(data):
   text = data['text']
   emotion = data['emotion']
   labels = one_hot_encode_emotion(emotion, emotion_cols)
   data = tokenizer(text, truncation=True)
   data['labels'] = labels
   return data

tokenized_datasets = {split: datasets[split].map(preprocess_function) for split in splits}

Map:   0%|          | 0/1764 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/365 [00:00<?, ? examples/s]

Map:   0%|          | 0/365 [00:00<?, ? examples/s]

In [27]:
# Sanity check
data = tokenized_datasets['train'][5]

print("Text:", data['text'])
print("Emotion(s):", data['emotion'])
print("Labels:", data['labels'], '-->', emotion_cols)

Text: Abdi reuwas yen babaturan sa kamar anjeun nepi-ngancurkeun sesa dahareun anu masih alus ku cara kitu.
Emotion(s): marah, jijik
Labels: [1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0] --> ['marah', 'jijik', 'takut', 'senang', 'sedih', 'terkejut', 'biasa']


In [28]:
data_collator = DataCollatorWithPadding(tokenizer)

# Model

In [29]:
# model = AutoModelForSequenceClassification.from_pretrained(
#     hf_model_id, 
#     num_labels=len(emotion_cols),
#     id2label=id2class, 
#     label2id=class2id,
#     problem_type = "multi_label_classification",
# )

################ TRANSFER LEARNING ################
model = AutoModelForSequenceClassification.from_pretrained(
    hf_model_id, 
    # num_labels=len(emotion_cols),
    # id2label=id2class, 
    # label2id=class2id,
    problem_type = "multi_label_classification",
)
print(model.classifier)

model.classifier.out_proj = torch.nn.Linear(in_features=768, out_features=len(emotion_cols), bias=True)
print(model.classifier)

# Freeze the rest of the layers for transfer learning
# for param in model.parameters():
#     param.requires_grad = False

RobertaClassificationHead(
  (dense): Linear(in_features=768, out_features=768, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (out_proj): Linear(in_features=768, out_features=4, bias=True)
)
RobertaClassificationHead(
  (dense): Linear(in_features=768, out_features=768, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (out_proj): Linear(in_features=768, out_features=7, bias=True)
)


# Finetuning

In [30]:
def sigmoid(x):
   return 1/(1 + np.exp(-x))

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    probs = sigmoid(predictions)
    y_pred = (probs > 0.5).astype(int)
    y_true = labels.astype(int)

    # Compute F1 score for each type of averaging method
    f1_micro = f1_score(y_true, y_pred, average='micro', zero_division=0.0)
    f1_macro = f1_score(y_true, y_pred, average='macro', zero_division=0.0)
    f1_weighted = f1_score(y_true, y_pred, average='weighted', zero_division=0.0)
    f1_samples = f1_score(y_true, y_pred, average='samples', zero_division=0.0)
    f1_labels = f1_score(y_true, y_pred, average=None, zero_division=0.0)
    f1_labels_dict = {f'f1_label_{emotion_cols[i]}': f1_labels[i] for i in range(len(f1_labels))}

    return {
        'f1_macro': f1_macro,
        'f1_micro': f1_micro,
        'f1_weighted': f1_weighted,
        'f1_samples': f1_samples,
        **f1_labels_dict,
    }

In [31]:
train_args = TrainingArguments(
    # Training config
    per_device_train_batch_size=2,
    num_train_epochs=num_epochs,
    learning_rate=2e-5,
    weight_decay=0.01,

    # Logging config for training
    logging_strategy='steps',
    logging_steps=100,

    # Evaluation config during training
    per_device_eval_batch_size=2,
    eval_strategy='steps',
    eval_steps=100,

    # Model saving config
    output_dir=project_name,
    save_strategy='epoch',
    # load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['val'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [32]:
trainer.train()

Step,Training Loss,Validation Loss,F1 Macro,F1 Micro,F1 Weighted,F1 Samples,F1 Label Marah,F1 Label Jijik,F1 Label Takut,F1 Label Senang,F1 Label Sedih,F1 Label Terkejut,F1 Label Biasa
100,0.4934,0.415543,0.117673,0.185484,0.152603,0.111872,0.0,0.0,0.229508,0.536232,0.057971,0.0,0.0
200,0.4028,0.387063,0.228627,0.397351,0.278948,0.29589,0.090909,0.0,0.506667,0.816143,0.186667,0.0,0.0
300,0.4013,0.367298,0.320676,0.468606,0.370072,0.373242,0.325,0.0,0.560976,0.78733,0.571429,0.0,0.0
400,0.3868,0.335209,0.38799,0.514706,0.442523,0.431507,0.444444,0.0,0.556962,0.789744,0.656489,0.268293,0.0
500,0.3711,0.322632,0.423273,0.550642,0.482325,0.480822,0.561404,0.035088,0.506667,0.814815,0.598131,0.446809,0.0
600,0.3469,0.329818,0.447038,0.551724,0.500128,0.483744,0.35443,0.412371,0.506667,0.812785,0.612903,0.430108,0.0
700,0.3827,0.312046,0.45406,0.56528,0.509645,0.492694,0.568966,0.133333,0.604651,0.82,0.621359,0.430108,0.0
800,0.3608,0.305962,0.47171,0.577031,0.526896,0.518265,0.53211,0.215385,0.536585,0.837438,0.65,0.479167,0.051282
900,0.3453,0.302716,0.513106,0.596206,0.561676,0.543379,0.54902,0.379747,0.58427,0.831683,0.637931,0.509091,0.1
1000,0.3131,0.29154,0.569907,0.61457,0.593536,0.579452,0.591304,0.378378,0.55814,0.813725,0.642857,0.489796,0.515152


TrainOutput(global_step=1764, training_loss=0.3298908685610678, metrics={'train_runtime': 220.7751, 'train_samples_per_second': 15.98, 'train_steps_per_second': 7.99, 'total_flos': 38993575530600.0, 'train_loss': 0.3298908685610678, 'epoch': 2.0})

In [33]:
trainer.push_to_hub()

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.37k [00:00<?, ?B/s]

events.out.tfevents.1736686140.fdd44adb4231.11455.2:   0%|          | 0.00/24.6k [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/alxxtexxr/RoBERTa-Base-SE2025T11A-sun-v20250112124824/commit/c2ceeaf749bfec85486e86a733ddd63e81717483', commit_message='End of training', commit_description='', oid='c2ceeaf749bfec85486e86a733ddd63e81717483', pr_url=None, repo_url=RepoUrl('https://huggingface.co/alxxtexxr/RoBERTa-Base-SE2025T11A-sun-v20250112124824', endpoint='https://huggingface.co', repo_type='model', repo_id='alxxtexxr/RoBERTa-Base-SE2025T11A-sun-v20250112124824'), pr_revision=None, pr_num=None)

# Evaluation

In [34]:
eval = trainer.evaluate(eval_dataset=tokenized_datasets['test'])

print("Evaluation to copy:")
f1_keys = [eval_key for eval_key in eval.keys() if 'f1' in eval_key]
for i, k in enumerate(f1_keys): print(eval[k], end="\t" if i + 1 < len(f1_keys) else "")
print()
print()
print("Evaluation full results:")
pprint(eval)

Evaluation to copy:
0.6265821781658331	0.6503856041131105	0.640286862353187	0.6273059360730594	0.6111111111111112	0.4883720930232558	0.6595744680851063	0.8186528497409327	0.6666666666666666	0.5210084033613446	0.6206896551724138

Evaluation full results:
{'epoch': 2.0,
 'eval_f1_label_biasa': 0.6206896551724138,
 'eval_f1_label_jijik': 0.4883720930232558,
 'eval_f1_label_marah': 0.6111111111111112,
 'eval_f1_label_sedih': 0.6666666666666666,
 'eval_f1_label_senang': 0.8186528497409327,
 'eval_f1_label_takut': 0.6595744680851063,
 'eval_f1_label_terkejut': 0.5210084033613446,
 'eval_f1_macro': 0.6265821781658331,
 'eval_f1_micro': 0.6503856041131105,
 'eval_f1_samples': 0.6273059360730594,
 'eval_f1_weighted': 0.640286862353187,
 'eval_loss': 0.28505367040634155,
 'eval_runtime': 1.9063,
 'eval_samples_per_second': 191.465,
 'eval_steps_per_second': 95.995}


# Inference

In [19]:
data = datasets['val'][1]
text = data['text']
emotion_true = data['emotion']

inputs = tokenizer(text, return_tensors='pt').to(model.device)

outputs = trainer.model(**inputs)
logits = outputs.logits
probs = sigmoid(logits.squeeze().detach().cpu().numpy()) # apply sigmoid + threshold
labels_pred = (probs > 0.5).astype(int)
emotion_pred = [id2class[idx] for idx, label in enumerate(labels_pred) if label == 1.0] # turn predicted id's into actual label names

print("Text:", text)
print("True emotion(s):", emotion_true)
print("Predicted emotion(s):", ", ".join(emotion_pred))

Text: Abdi hanjakal pisan
True emotion(s): sedih
Predicted emotion(s): sedih
