### References
- Multi-Label Classification Model From Scratch: Step-by-Step Tutorial (https://huggingface.co/blog/Valerii-Knowledgator/multi-label-classification)
- https://github.com/NielsRogge/Transformers-Tutorials/blob/master/BERT/Fine_tuning_BERT_(and_friends)_for_multi_label_text_classification.ipynb
- https://github.com/huggingface/notebooks/blob/main/examples/text_classification.ipynb

# Libraries

In [1]:
%pip install -q -U datasets transformers accelerate sentencepiece # evaluate

In [2]:
import os
import random
import numpy as np
# import pandas as pd
import torch
import transformers
# import evaluate
from pprint import pprint
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding, Trainer, TrainingArguments
from sklearn.metrics import f1_score

# Config

In [3]:
# Random seed; handed to set_seed() so runs are repeatable
seed = 42
# Language code; only used here to build the project name
lang = 'eng'
# Hugging Face Hub ID of the base checkpoint that gets fine-tuned
hf_model_id = 'google-bert/bert-base-uncased'
# hf_model_id = 'alxxtexxr/BERT-Base-SE2025T11A-eng-v0.3'
# Hugging Face Hub dataset ID and the dataset config passed to load_dataset()
hf_data_id = 'alxxtexxr/SemEval2025-Task11-Dataset'
hf_data_config = 'track_b_eng_70_15_15'
# Run name; also used as the Trainer output directory
project_name = f'BERT-Base-SE2025T11B-{lang}-v0.1'
print("Project name:", project_name)

Project name: BERT-Base-SE2025T11B-eng-v0.1


In [4]:
def set_seed(seed):
    """Seed every random number generator used by this notebook.

    Seeds Python's ``random``, NumPy, PyTorch (CPU and all CUDA devices) and
    the Transformers helper, exports ``PYTHONHASHSEED``, and puts cuDNN into
    deterministic, non-benchmarking mode. Prints a confirmation line.

    Args:
        seed: Integer seed applied to every generator.
    """
    # Exported as a string because environment variables must be strings.
    # NOTE(review): presumably this only affects interpreters started after this point — confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Pure-Python and NumPy generators
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch: CPU generator, current CUDA device, then every CUDA device
    for torch_seed_fn in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        torch_seed_fn(seed)

    # Ask cuDNN for deterministic kernels and disable its auto-tuner
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    # Transformers' own seeding helper
    transformers.set_seed(seed)

    print(f"Random seed set to: {seed}")

set_seed(seed)

Random seed set to: 42


# Data

## Load Data

In [5]:
# Fetch every split of the configured dataset from the Hugging Face Hub
datasets = load_dataset(hf_data_id, hf_data_config)

# Column names as declared on the train split
cols = list(datasets['train'].features)

# Every column that is not the text or a bookkeeping column is treated as a label column
emotion_cols = [name for name in cols if name not in {'Unnamed: 0', 'text', 'emotion'}]

print("Data columns:", cols)
print("Emotions columns:", emotion_cols)

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


(…)ssed_data/track_b/eng_70_15_15/train.csv:   0%|          | 0.00/235k [00:00<?, ?B/s]

(…)cessed_data/track_b/eng_70_15_15/val.csv:   0%|          | 0.00/50.4k [00:00<?, ?B/s]

(…)essed_data/track_b/eng_70_15_15/test.csv:   0%|          | 0.00/48.0k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating val split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Data columns: ['text', 'anger_0', 'anger_1', 'anger_2', 'anger_3', 'fear_0', 'fear_1', 'fear_2', 'fear_3', 'joy_0', 'joy_1', 'joy_2', 'joy_3', 'sad_0', 'sad_1', 'sad_2', 'sad_3', 'surprise_0', 'surprise_1', 'surprise_2', 'surprise_3']
Emotions columns: ['anger_0', 'anger_1', 'anger_2', 'anger_3', 'fear_0', 'fear_1', 'fear_2', 'fear_3', 'joy_0', 'joy_1', 'joy_2', 'joy_3', 'sad_0', 'sad_1', 'sad_2', 'sad_3', 'surprise_0', 'surprise_1', 'surprise_2', 'surprise_3']


In [6]:
# Label-column name -> integer class ID, numbered in emotion_cols order
class2id = dict(zip(emotion_cols, range(len(emotion_cols))))
# Inverse lookup: integer class ID -> label-column name
id2class = dict((idx, name) for name, idx in class2id.items())

# width=1 forces pprint to put one mapping entry per line
print("Class to ID:\n")
pprint(class2id, width=1)
print()
print("ID to Class:\n")
pprint(id2class, width=1)

Class to ID:

{'anger_0': 0,
 'anger_1': 1,
 'anger_2': 2,
 'anger_3': 3,
 'fear_0': 4,
 'fear_1': 5,
 'fear_2': 6,
 'fear_3': 7,
 'joy_0': 8,
 'joy_1': 9,
 'joy_2': 10,
 'joy_3': 11,
 'sad_0': 12,
 'sad_1': 13,
 'sad_2': 14,
 'sad_3': 15,
 'surprise_0': 16,
 'surprise_1': 17,
 'surprise_2': 18,
 'surprise_3': 19}

ID to Class:

{0: 'anger_0',
 1: 'anger_1',
 2: 'anger_2',
 3: 'anger_3',
 4: 'fear_0',
 5: 'fear_1',
 6: 'fear_2',
 7: 'fear_3',
 8: 'joy_0',
 9: 'joy_1',
 10: 'joy_2',
 11: 'joy_3',
 12: 'sad_0',
 13: 'sad_1',
 14: 'sad_2',
 15: 'sad_3',
 16: 'surprise_0',
 17: 'surprise_1',
 18: 'surprise_2',
 19: 'surprise_3'}


## Preprocess Data

In [7]:
# Load the tokenizer that belongs to the base checkpoint named in the config cell
tokenizer = AutoTokenizer.from_pretrained(hf_model_id)

In [8]:
def get_labels(data, emotion_cols):
    """Build the label vector of one example.

    Args:
        data: Mapping from column name to value for a single example.
        emotion_cols: Column names to read, in output order.

    Returns:
        A list with ``float(data[col])`` for each column in ``emotion_cols``.
    """
    return list(map(float, (data[col_name] for col_name in emotion_cols)))

def preprocess_function(data):
    """Tokenize one example and attach its label vector.

    Args:
        data: A single dataset row containing ``'text'`` and every column
            listed in the module-level ``emotion_cols``.

    Returns:
        The tokenizer output for the text (truncation enabled) with an extra
        ``'labels'`` entry holding the list of float labels.
    """
    raw_text = data['text']
    label_vector = get_labels(data, emotion_cols)

    encoded = tokenizer(raw_text, truncation=True)
    encoded['labels'] = label_vector
    return encoded

# Apply the preprocessing to each split; the result is a plain dict keyed by split name
tokenized_datasets = dict(
    (split_name, datasets[split_name].map(preprocess_function))
    for split_name in ('train', 'val', 'test')
)
tokenized_datasets

Map:   0%|          | 0/1937 [00:00<?, ? examples/s]

Map:   0%|          | 0/415 [00:00<?, ? examples/s]

Map:   0%|          | 0/416 [00:00<?, ? examples/s]

{'train': Dataset({
     features: ['text', 'anger_0', 'anger_1', 'anger_2', 'anger_3', 'fear_0', 'fear_1', 'fear_2', 'fear_3', 'joy_0', 'joy_1', 'joy_2', 'joy_3', 'sad_0', 'sad_1', 'sad_2', 'sad_3', 'surprise_0', 'surprise_1', 'surprise_2', 'surprise_3', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
     num_rows: 1937
 }),
 'val': Dataset({
     features: ['text', 'anger_0', 'anger_1', 'anger_2', 'anger_3', 'fear_0', 'fear_1', 'fear_2', 'fear_3', 'joy_0', 'joy_1', 'joy_2', 'joy_3', 'sad_0', 'sad_1', 'sad_2', 'sad_3', 'surprise_0', 'surprise_1', 'surprise_2', 'surprise_3', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
     num_rows: 415
 }),
 'test': Dataset({
     features: ['text', 'anger_0', 'anger_1', 'anger_2', 'anger_3', 'fear_0', 'fear_1', 'fear_2', 'fear_3', 'joy_0', 'joy_1', 'joy_2', 'joy_3', 'sad_0', 'sad_1', 'sad_2', 'sad_3', 'surprise_0', 'surprise_1', 'surprise_2', 'surprise_3', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
     n

In [9]:
# Sanity check
data0 = tokenized_datasets['train'][0]

print("Text:", data0['text'])
print("Labels:", data0['labels'], '-->', emotion_cols)

Text: Just got x-rays, and at least my knee does not appear to be broken ( although they are sending the x-rays to a specialist to be sure ).
Labels: [1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0] --> ['anger_0', 'anger_1', 'anger_2', 'anger_3', 'fear_0', 'fear_1', 'fear_2', 'fear_3', 'joy_0', 'joy_1', 'joy_2', 'joy_3', 'sad_0', 'sad_1', 'sad_2', 'sad_3', 'surprise_0', 'surprise_1', 'surprise_2', 'surprise_3']


In [10]:
# Collator built from the tokenizer; passed to the Trainer as its data_collator
data_collator = DataCollatorWithPadding(tokenizer)

# Model

In [11]:
# Load the base checkpoint with a classification head sized to the label columns.
# problem_type is set to multi-label, and the ID<->name mappings are stored on the model config.
model = AutoModelForSequenceClassification.from_pretrained(
    hf_model_id, 
    num_labels=len(emotion_cols),
    id2label=id2class, label2id=class2id,
    problem_type='multi_label_classification',
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Finetuning

In [12]:
# clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

def sigmoid(x):
    """Numerically stable element-wise logistic function ``1 / (1 + exp(-x))``.

    The naive formula evaluates ``exp(-x)``, which overflows (and emits a
    RuntimeWarning) for large negative inputs. This version only ever
    exponentiates non-positive numbers, so it cannot overflow.

    Args:
        x: Scalar or NumPy array of logits.

    Returns:
        Probabilities with the same shape as ``x``; floating-point array
        inputs keep their dtype, and scalar inputs return a NumPy scalar.
    """
    x = np.asarray(x)
    # exp(-|x|) lies in (0, 1], so it can never overflow
    decay = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + exp(-x));  x < 0: exp(x) / (1 + exp(x)) — the same value, rewritten
    result = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Indexing with () turns a 0-d result back into a scalar and leaves real arrays untouched
    return result[()]

def compute_metrics(eval_pred):
    """Compute micro, macro and per-label F1 for multi-label predictions.

    Args:
        eval_pred: Pair ``(predictions, labels)``; predictions are raw logits
            and labels are the reference label arrays.

    Returns:
        Dict with ``'f1_micro'``, ``'f1_macro'`` and one
        ``'f1_label_<column>'`` entry per label column, in ``emotion_cols`` order.
    """
    logits, references = eval_pred

    # Logits -> probabilities -> hard 0/1 decisions at the 0.5 threshold
    y_pred = (sigmoid(logits) > 0.5).astype(int)
    y_true = references.astype(int)

    def _f1(average):
        # zero_division=0.0 scores labels with no positives as 0 instead of warning
        return f1_score(y_true, y_pred, average=average, zero_division=0.0)

    metrics = {
        'f1_micro': _f1('micro'),
        'f1_macro': _f1('macro'),
    }
    # average=None yields one score per label column
    for idx, label_score in enumerate(_f1(None)):
        metrics[f'f1_label_{emotion_cols[idx]}'] = label_score
    return metrics

In [13]:
# Hyperparameters plus the logging / evaluation / checkpoint schedule for fine-tuning
train_args = TrainingArguments(
    # Training config
    per_device_train_batch_size=2,
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,

    # Reproducibility: pass the notebook-wide seed explicitly. Without it the Trainer
    # seeds itself with its own default, so editing `seed` in the config cell
    # would not change the training run.
    seed=seed,

    # Logging config for training
    logging_strategy='steps',
    logging_steps=50,

    # Evaluation config during training
    per_device_eval_batch_size=2,
    eval_strategy='steps',
    eval_steps=50,

    # Model saving config
    output_dir=project_name,
    save_strategy='epoch',
    # load_best_model_at_end=True,
)

# Wire model, data, collator and metrics into the Trainer.
# The tokenizer is passed as `processing_class`: the older `tokenizer=` keyword is
# deprecated on Trainer and triggers a warning when this cell runs.
trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['val'],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [14]:
# Run fine-tuning; the returned TrainOutput is shown as the cell result
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33malimtegar[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,F1 Micro,F1 Macro,F1 Label Anger 0,F1 Label Anger 1,F1 Label Anger 2,F1 Label Anger 3,F1 Label Fear 0,F1 Label Fear 1,F1 Label Fear 2,F1 Label Fear 3,F1 Label Joy 0,F1 Label Joy 1,F1 Label Joy 2,F1 Label Joy 3,F1 Label Sad 0,F1 Label Sad 1,F1 Label Sad 2,F1 Label Sad 3,F1 Label Surprise 0,F1 Label Surprise 1,F1 Label Surprise 2,F1 Label Surprise 3
50,0.5862,0.479669,0.686038,0.201388,0.935897,0.0,0.0,0.0,0.576,0.029197,0.0,0.0,0.85989,0.0,0.0,0.0,0.805755,0.0,0.0,0.0,0.821023,0.0,0.0,0.0
100,0.4352,0.390799,0.669344,0.171385,0.935897,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.85989,0.0,0.0,0.0,0.810888,0.0,0.0,0.0,0.821023,0.0,0.0,0.0
150,0.3807,0.374277,0.691958,0.20141,0.935897,0.0,0.0,0.0,0.600509,0.0,0.0,0.0,0.85989,0.0,0.0,0.0,0.810888,0.0,0.0,0.0,0.821023,0.0,0.0,0.0
200,0.376,0.367024,0.669344,0.171385,0.935897,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.85989,0.0,0.0,0.0,0.810888,0.0,0.0,0.0,0.821023,0.0,0.0,0.0
250,0.375,0.364125,0.669191,0.17175,0.935897,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.85989,0.0,0.0,0.0,0.818182,0.0,0.0,0.0,0.821023,0.0,0.0,0.0
300,0.3738,0.355381,0.689564,0.196176,0.935897,0.0,0.0,0.0,0.478088,0.0,0.0,0.0,0.85989,0.0,0.0,0.0,0.828614,0.0,0.0,0.0,0.821023,0.0,0.0,0.0
350,0.3587,0.352302,0.705344,0.207935,0.935897,0.0,0.0,0.0,0.707022,0.0,0.0,0.0,0.85989,0.0,0.0,0.0,0.834862,0.0,0.0,0.0,0.821023,0.0,0.0,0.0
400,0.3732,0.34189,0.697161,0.20294,0.935897,0.0,0.0,0.0,0.61324,0.0,0.0,0.0,0.85989,0.0,0.0,0.0,0.828746,0.0,0.0,0.0,0.821023,0.0,0.0,0.0
450,0.3414,0.340273,0.696842,0.201779,0.935897,0.0,0.0,0.0,0.585185,0.0,0.0,0.0,0.85989,0.0,0.0,0.0,0.833583,0.0,0.0,0.0,0.821023,0.0,0.0,0.0
500,0.3332,0.335232,0.70008,0.205347,0.935897,0.0,0.0,0.0,0.655172,0.0,0.0,0.0,0.85989,0.0,0.0,0.0,0.834951,0.0,0.0,0.0,0.821023,0.0,0.0,0.0


TrainOutput(global_step=1938, training_loss=0.3184622050069803, metrics={'train_runtime': 270.2634, 'train_samples_per_second': 14.334, 'train_steps_per_second': 7.171, 'total_flos': 55533114532104.0, 'train_loss': 0.3184622050069803, 'epoch': 2.0})

In [15]:
# trainer.push_to_hub()

# Evaluation

In [18]:
# Score the fine-tuned model on the held-out test split (metrics come back prefixed with 'eval_')
trainer.evaluate(eval_dataset=tokenized_datasets['test'])

{'eval_loss': 0.2822621762752533,
 'eval_f1_micro': 0.7237311385459534,
 'eval_f1_macro': 0.23613747684154168,
 'eval_f1_label_anger_0': 0.9391979301423027,
 'eval_f1_label_anger_1': 0.0,
 'eval_f1_label_anger_2': 0.0,
 'eval_f1_label_anger_3': 0.0,
 'eval_f1_label_fear_0': 0.7125,
 'eval_f1_label_fear_1': 0.0,
 'eval_f1_label_fear_2': 0.1276595744680851,
 'eval_f1_label_fear_3': 0.0,
 'eval_f1_label_joy_0': 0.9138461538461539,
 'eval_f1_label_joy_1': 0.0,
 'eval_f1_label_joy_2': 0.0,
 'eval_f1_label_joy_3': 0.0,
 'eval_f1_label_sad_0': 0.8735244519392917,
 'eval_f1_label_sad_1': 0.0,
 'eval_f1_label_sad_2': 0.0,
 'eval_f1_label_sad_3': 0.0,
 'eval_f1_label_surprise_0': 0.8795986622073578,
 'eval_f1_label_surprise_1': 0.2764227642276423,
 'eval_f1_label_surprise_2': 0.0,
 'eval_f1_label_surprise_3': 0.0,
 'eval_runtime': 2.4533,
 'eval_samples_per_second': 169.568,
 'eval_steps_per_second': 84.784,
 'epoch': 2.0}

# Inference

In [51]:
# Emotion names in the order their column groups appear in `emotion_cols`.
# Order-preserving de-duplication (dict.fromkeys) replaces alphabetical sorting so the
# names stay aligned with the label vector even if the columns are not alphabetical.
# rsplit on the last underscore strips only the trailing intensity suffix.
emotions = list(dict.fromkeys(col.rsplit('_', 1)[0] for col in emotion_cols))
emotions

['anger', 'fear', 'joy', 'sad', 'surprise']

In [83]:
def labels2intensities(labels, num_levels=4):
    """Decode a flat label (or score) vector into one intensity per emotion.

    The vector is viewed as consecutive groups of ``num_levels`` entries, one
    group per emotion; the intensity of an emotion is the index of the largest
    entry in its group (the first such index on ties).

    Args:
        labels: Flat sequence, NumPy array or tensor whose length is a
            multiple of ``num_levels``.
        num_levels: Number of intensity levels per emotion. Defaults to 4,
            which reproduces the previous fixed 5 x 4 layout for 20 labels.

    Returns:
        1-D integer tensor with one intensity index per emotion.
    """
    # as_tensor avoids an extra copy (and the copy-construct warning) for array/tensor inputs;
    # -1 lets the number of emotions follow from the input length instead of being fixed at 5
    grouped = torch.as_tensor(labels).reshape(-1, num_levels)
    return torch.argmax(grouped, dim=1)

def print_emotion_intensiies(emotions, intensities):
    """Print one ``- <emotion>: <intensity>`` line per emotion.

    Args:
        emotions: Iterable of emotion names.
        intensities: Iterable of scalar array/tensor elements, paired with
            ``emotions`` by position; each is converted with ``.item()``.
    """
    for name, level in zip(emotions, intensities):
        level_value = level.item()
        print(f"- {name}: {level_value}")

# Take one validation example and decode its ground-truth intensity per emotion
data = datasets['val'][0]
text = data['text']
labels_true = get_labels(data, emotion_cols)
intensities_true = labels2intensities(labels_true)

# Tokenize as in preprocessing (truncation on) so an over-long text cannot exceed
# the model's maximum input length, and move the tensors to the model's device
inputs = tokenizer(text, truncation=True, return_tensors='pt').to(trainer.model.device)

# Inference only: put the model in eval mode explicitly instead of relying on whatever
# mode an earlier cell left it in, and skip gradient tracking
trainer.model.eval()
with torch.no_grad():
    outputs = trainer.model(**inputs)
logits = outputs.logits
probs = sigmoid(logits.squeeze().cpu().numpy())
labels_pred = (probs > 0.5).astype(int)  # thresholded multi-label view, same rule as compute_metrics

# Decode the intensity from the probabilities rather than the thresholded 0/1 vector:
# argmax over 0/1 values falls back to the lowest index whenever no level, or more than
# one level, crosses the threshold, which hides the level the model actually prefers
intensities_pred = labels2intensities(probs)

print("Text:", text)
print()
print("True emotion(s):")
print_emotion_intensiies(emotions, intensities_true)
print()
print("Predicted emotion(s):")
print_emotion_intensiies(emotions, intensities_pred)

Text: and Christian fish bumper stickers, it caught my eye and made me grin.

True emotion(s):
- anger: 0
- fear: 0
- joy: 2
- sad: 0
- surprise: 1

Predicted emotion(s):
- anger: 0
- fear: 0
- joy: 1
- sad: 0
- surprise: 0
