### References

- Finetune T5 for classification and multiple choice (https://colab.research.google.com/github/patil-suraj/exploring-T5/blob/master/t5_fine_tuning.ipynb)

- Finetune T5 for sentiment span extraction (https://colab.research.google.com/github/enzoampil/t5-intro/blob/master/t5_qa_training_pytorch_span_extraction.ipynb)

# Libraries

In [3]:
%pip install -q pytorch_lightning
%pip install -q -U datasets

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m472.7/472.7 kB[0m [31m26.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [4]:
import os
import random
import argparse
import logging
import numpy as np
import pandas as pd
import torch
import pytorch_lightning as pl
from pprint import pprint
from datasets import load_dataset, Dataset
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, get_linear_schedule_with_warmup
from sklearn.metrics import f1_score

# Config

In [5]:
args_dict = dict(
    seed = 42,
    lang = 'eng', # 'eng' | 'deu' | 'ptbr' | 'rus' | 'sun'
    project_name = 'T5-Base-Emotion-eng',
    hf_data_id = 'alxxtexxr/SemEval2025-Task11-Dataset',
    
    # Model for English
    hf_model_id = 'google-t5/t5-base',

    # Model for Non-English
    # hf_model_id = 'google/mt5-small',

    # Model for Indonesian/Sundanese
    # hf_model_id = 'indonlp/cendol-mt5-small-inst',

    max_seq_length = 512,
    learning_rate = 3e-4,
    weight_decay = 0.0,
    adam_epsilon = 1e-8,
    warmup_steps = 0,
    train_batch_size = 8,
    eval_batch_size = 8,
    num_train_epochs = 2,
    gradient_accumulation_steps = 16,
    n_gpu = -1, # use all available GPUs
    early_stop_callback = False,
    fp_16 = False, # if you want to enable 16-bit training then install apex and set this to true. I have tried to set this to true and the result was SHIT! (F1 score: 0.03-0.04)
    opt_level = 'O1', # you can find out more on optimisation levels here https://nvidia.github.io/apex/amp.html#opt-levels-and-properties
    max_grad_norm = 1.0, # if you enable 16-bit training then set this to a sensible value, 0.5 is a good default
)
args = argparse.Namespace(**args_dict)
pprint(args_dict, sort_dicts=False)

{'seed': 42,
 'lang': 'eng',
 'project_name': 'T5-Base-Emotion-eng',
 'hf_data_id': 'alxxtexxr/SemEval2025-Task11-Dataset',
 'hf_model_id': 'google-t5/t5-base',
 'max_seq_length': 512,
 'learning_rate': 0.0003,
 'weight_decay': 0.0,
 'adam_epsilon': 1e-08,
 'warmup_steps': 0,
 'train_batch_size': 8,
 'eval_batch_size': 8,
 'num_train_epochs': 2,
 'gradient_accumulation_steps': 16,
 'n_gpu': -1,
 'early_stop_callback': False,
 'fp_16': False,
 'opt_level': 'O1',
 'max_grad_norm': 1.0}


In [6]:
def set_seed(seed):
    # Set random seed for NumPy
    np.random.seed(seed)

    # Set random seed for Torch
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)  # if using multi-GPU
    torch.backends.cudnn.deterministic = True  # Ensures deterministic results
    torch.backends.cudnn.benchmark = False  # Avoids non-deterministic algorithms

    # Optionally set random seed for sklearn and Python's own random module
    random.seed(seed)

    # Set random seed for os
    os.environ['PYTHONHASHSEED'] = str(seed)

    print(f"Random seed set to: {seed}")

set_seed(args.seed)

Random seed set to: 42


# Data

## Load Data

In [7]:
data_files = {
    'train': f'preprocessed_data/{args.lang}/train.csv', 
    'val': f'preprocessed_data/{args.lang}/val.csv',
    'test': f'preprocessed_data/{args.lang}/test.csv',
}
dataset = load_dataset(args.hf_data_id, data_files=data_files)

splits = data_files.keys()
df = {split: pd.DataFrame(dataset[split]) for split in splits}

cols = list(df['train'].columns)
print("DF columns:", cols)

emotion_cols = [col for col in cols if col not in ['Unnamed: 0', 'text', 'emotion']]
# neutral_emotion = df['train'][df['train'][emotion_cols].sum(axis=1) == 0]['emotion'].iloc[0]
# emotions = emotion_cols + [neutral_emotion]
print("Emotions columns:", emotion_cols)
print()

print("Train DF size:", len(df['train']))
print("Validation DF size:", len(df['val']))
print("Testing DF size:", len(df['test']))

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


preprocessed_data/eng/train.csv:   0%|          | 0.00/236k [00:00<?, ?B/s]

preprocessed_data/eng/val.csv:   0%|          | 0.00/57.3k [00:00<?, ?B/s]

preprocessed_data/eng/test.csv:   0%|          | 0.00/9.55k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating val split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DF columns: ['Unnamed: 0', 'text', 'emotion', 'anger', 'fear', 'joy', 'sad', 'surprise']
Emotions columns: ['anger', 'fear', 'joy', 'sad', 'surprise']

Train DF size: 2214
Validation DF size: 554
Testing DF size: 116


In [8]:
df['train']

Unnamed: 0.1,Unnamed: 0,text,emotion,anger,fear,joy,sad,surprise
0,2113,I now have 12 of those canker sore suckers in ...,"fear, sad",0,1,0,1,0
1,2422,It just... went away.,"fear, sad, surprise",0,1,0,1,1
2,2061,I naively walked up and stuck my head in the d...,"fear, surprise",0,1,0,0,1
3,298,i just cracked my head into a glass door.A del...,"anger, fear",1,1,0,0,0
4,413,I was so wrapping up in the boy in my arms.,joy,0,0,1,0,0
...,...,...,...,...,...,...,...,...
2209,2522,My foot was tingling for sometime afterwards b...,fear,0,1,0,0,0
2210,1667,"The sky was only slightly disturbed, light gra...",fear,0,1,0,0,0
2211,2349,"Dont wait, or say a single vow.","fear, sad",0,1,0,1,0
2212,899,"""a woman is going to get an abortion.","fear, sad, surprise",0,1,0,1,1


## Create PyTorch Datasets

In [9]:
tokenizer = AutoTokenizer.from_pretrained(args.hf_model_id, clean_up_tokenization_spaces=False)
# model = AutoModelForSeq2SeqLM.from_pretrained(args.tokenizer_name_or_path)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [10]:
emotion_token_lengths = {emotion: len(tokenizer.encode(emotion)) for emotion in df['train']['emotion'].unique()}
print("Emotion token lengths:")
pprint(emotion_token_lengths, width=1)
print()

target_max_len = max(emotion_token_lengths.values())
print("Target max. length:", target_max_len)

Emotion token lengths:
{'anger': 2,
 'anger, fear': 4,
 'anger, fear, joy, sad': 8,
 'anger, fear, joy, sad, surprise': 10,
 'anger, fear, sad': 6,
 'anger, fear, sad, surprise': 8,
 'anger, fear, surprise': 6,
 'anger, joy': 4,
 'anger, joy, surprise': 6,
 'anger, sad': 4,
 'anger, sad, surprise': 6,
 'anger, surprise': 4,
 'fear': 2,
 'fear, joy': 4,
 'fear, joy, sad': 6,
 'fear, joy, sad, surprise': 8,
 'fear, joy, surprise': 6,
 'fear, sad': 4,
 'fear, sad, surprise': 6,
 'fear, surprise': 4,
 'joy': 2,
 'joy, sad': 4,
 'joy, sad, surprise': 6,
 'joy, surprise': 4,
 'neutral': 2,
 'sad': 2,
 'sad, surprise': 4,
 'surprise': 2}

Target max. length: 10


In [11]:
class EmotionDataset(Dataset):
  def __init__(self, tokenizer,
               one_hot_class_columns,
               df, data_split,
               data_column='text', class_column='emotion', max_len=512):
    self.data_column = data_column
    self.one_hot_class_columns = one_hot_class_columns
    self.class_column = class_column

    self.data = df[data_split]

    self.max_len = max_len
    self.tokenizer = tokenizer
    self.inputs = []
    self.targets = []
    self.one_hot_targets = []

    self._build()

  def __len__(self):
    return len(self.inputs)

  def __getitem__(self, index):
    source_ids = self.inputs[index]["input_ids"].squeeze()
    target_ids = self.targets[index]["input_ids"].squeeze()

    src_mask    = self.inputs[index]["attention_mask"].squeeze()  # might need to squeeze
    target_mask = self.targets[index]["attention_mask"].squeeze()  # might need to squeeze

    return {
        "source_ids": source_ids,
        "source_mask": src_mask,
        "target_ids": target_ids,
        "target_mask": target_mask,
        "one_hot_target": self.one_hot_targets[index],
    }

  def _build(self):
    for idx in range(len(self.data)):
      input_, target, one_hot_target = self.data.loc[idx, self.data_column], self.data.loc[idx, self.class_column], self.data.loc[idx, self.one_hot_class_columns]
      one_hot_target = one_hot_target.values.tolist()

      # input_ = input_ + ' </s>'
      # target = target + " </s>"

      # tokenize inputs
      tokenized_inputs = self.tokenizer.batch_encode_plus(
          [input_], max_length=self.max_len, return_tensors="pt", padding='max_length', truncation=True,
      )

      # tokenize targets
      tokenized_targets = self.tokenizer.batch_encode_plus(
          [target], max_length=target_max_len, return_tensors="pt", padding='max_length', truncation=True,
      )

      self.inputs.append(tokenized_inputs)
      self.targets.append(tokenized_targets)
      self.one_hot_targets.append(one_hot_target)

In [12]:
def get_dataset(tokenizer, type_path, args):
    return EmotionDataset(tokenizer=tokenizer, one_hot_class_columns=emotion_cols,
                          df=df, data_split=type_path, max_len=args.max_seq_length)

In [13]:
train_set = get_dataset(tokenizer, 'train', args)

for i in range(3):
    d = train_set[i]
    print(tokenizer.decode(d['source_ids']))
    print(tokenizer.decode(d['target_ids']))
    print(d['one_hot_target'])
    print()

I now have 12 of those canker sore suckers in my mouth along with a fever since friday.</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad

In [14]:
train_loader = DataLoader(train_set, batch_size=2)
batch = next(iter(train_loader))
print(torch.stack(batch['one_hot_target']).T.tolist())

[[0, 1, 0, 1, 0], [0, 1, 0, 1, 1]]


# Training

In [15]:
class T5FineTuner(pl.LightningModule):
  def __init__(self, hparams):
    super(T5FineTuner, self).__init__()
    self.automatic_optimization = False

    # self.hparams = hparams
    self.save_hyperparameters(hparams)

    self.model = AutoModelForSeq2SeqLM.from_pretrained(hparams.hf_model_id)
    self.tokenizer = AutoTokenizer.from_pretrained(hparams.hf_model_id, clean_up_tokenization_spaces=False)

    self.training_step_outputs = []
    self.validation_step_outputs = []

  def is_logger(self):
    # return self.trainer.proc_rank <= 0
    return self.trainer.global_rank <= 0

  def forward(
      self, input_ids, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, labels=None
  ):
    return self.model(
        input_ids,
        attention_mask=attention_mask,
        decoder_input_ids=decoder_input_ids,
        decoder_attention_mask=decoder_attention_mask,
        labels=labels,
    )

  def _step(self, batch):
    labels = batch["target_ids"]
    labels[labels[:, :] == self.tokenizer.pad_token_id] = -100

    outputs = self(
        input_ids=batch["source_ids"],
        attention_mask=batch["source_mask"],
        labels=labels,
        decoder_attention_mask=batch['target_mask']
    )

    loss = outputs[0]
    return loss

  def training_step(self, batch, batch_idx):
    opt = self.optimizers()

    # scale losses by 1/N (for N batches of gradient accumulation)
    N = self.hparams.gradient_accumulation_steps
    loss = self._step(batch) / N
    self.manual_backward(loss)

    # accumulate gradients of N batches
    if (batch_idx + 1) % N == 0:
        # clip gradients
        self.clip_gradients(opt, gradient_clip_val=self.hparams.max_grad_norm, gradient_clip_algorithm="norm")

        opt.step()
        opt.zero_grad()
        
        self.lr_scheduler.step()

    tensorboard_logs = {"train_loss": loss}
    self.training_step_outputs.append({"loss": loss})
    return {"loss": loss, "log": tensorboard_logs}

  # def training_epoch_end(self, outputs):
  def on_train_epoch_end(self):
    avg_train_loss = torch.stack([x["loss"] for x in self.training_step_outputs]).mean()
    tensorboard_logs = {"avg_train_loss": avg_train_loss}
    return {"avg_train_loss": avg_train_loss, "log": tensorboard_logs, 'progress_bar': tensorboard_logs}

  def validation_step(self, batch, batch_idx):
    loss = self._step(batch)
    self.validation_step_outputs.append({"val_loss": loss})
    return {"val_loss": loss}

  # def validation_epoch_end(self, outputs):
  def on_validation_epoch_end(self):
    avg_loss = torch.stack([x["val_loss"] for x in self.validation_step_outputs]).mean()
    tensorboard_logs = {"val_loss": avg_loss}
    return {"avg_val_loss": avg_loss, "log": tensorboard_logs, 'progress_bar': tensorboard_logs}

  def configure_optimizers(self):
    "Prepare optimizer and schedule (linear warmup and decay)"

    model = self.model
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": self.hparams.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon)
    self.opt = optimizer
    return [optimizer]

#   def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, second_order_closure=None):
#     # if self.trainer.use_tpu:
#     #   xm.optimizer_step(optimizer)
#     # else:
#     #   optimizer.step()
#     optimizer.step()
#     optimizer.zero_grad()
#     self.lr_scheduler.step()

  def get_tqdm_dict(self):
    tqdm_dict = {"loss": "{:.3f}".format(self.trainer.avg_loss), "lr": self.lr_scheduler.get_last_lr()[-1]}
    return tqdm_dict

  def train_dataloader(self):
    train_dataset = get_dataset(tokenizer=self.tokenizer, type_path="train", args=self.hparams)
    dataloader = DataLoader(train_dataset, batch_size=self.hparams.train_batch_size,
                            drop_last=True, shuffle=True, num_workers=2)
    t_total = (
        (len(dataloader.dataset) // (self.hparams.train_batch_size * max(1, self.hparams.n_gpu)))
        // self.hparams.gradient_accumulation_steps
        * float(self.hparams.num_train_epochs)
    )
    scheduler = get_linear_schedule_with_warmup(
        self.opt, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=t_total
    )
    self.lr_scheduler = scheduler
    return dataloader

  def val_dataloader(self):
    val_dataset = get_dataset(tokenizer=self.tokenizer, type_path="val", args=self.hparams)
    return DataLoader(val_dataset, batch_size=self.hparams.eval_batch_size, num_workers=2)

In [16]:
logger = logging.getLogger(__name__)

class LoggingCallback(pl.Callback):
  def on_validation_end(self, trainer, pl_module):
    logger.info("***** Validation results *****")
    if pl_module.is_logger():
      metrics = trainer.callback_metrics
      # Log results
      for key in sorted(metrics):
        if key not in ["log", "progress_bar"]:
          logger.info("{} = {}\n".format(key, str(metrics[key])))

  def on_test_end(self, trainer, pl_module):
    logger.info("***** Test results *****")

    if pl_module.is_logger():
      metrics = trainer.callback_metrics

      # Log and save results to file
      output_test_results_file = os.path.join(pl_module.hparams.output_dir, "test_results.txt")
      with open(output_test_results_file, "w") as writer:
        for key in sorted(metrics):
          if key not in ["log", "progress_bar"]:
            logger.info("{} = {}\n".format(key, str(metrics[key])))
            writer.write("{} = {}\n".format(key, str(metrics[key])))

In [17]:
!mkdir -p $args.project_name

In [18]:
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    # filepath=args.project_name,
    # prefix="checkpoint",
    monitor = "val_loss",
    mode = "min",
    save_top_k = 5
)

In [19]:
model = T5FineTuner(args)

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [20]:
train_params = dict(
    # accumulate_grad_batches = args.gradient_accumulation_steps,
    # gpus = args.n_gpu,
    max_epochs = args.num_train_epochs,
    # early_stop_callback = False,
    precision =  '16-mixed' if args.fp_16 else 32,
    # amp_level = args.opt_level,
    # gradient_clip_val = args.max_grad_norm,
    # checkpoint_callback = checkpoint_callback,
    callbacks = [LoggingCallback()],
    accelerator = 'gpu', # 'auto' | 'cpu' | 'gpu' | 'tpu'
    devices = -1, # use all available GPUs
    strategy = 'auto', # 'auto' | 'dp' | 'ddp'
)
trainer = pl.Trainer(**train_params)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [21]:
trainer.fit(model)

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                       | Params | Mode
------------------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M  | eval
------------------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)
0         Modules in train mode
541       Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=2` reached.


# Evaluation

In [22]:
val_set = get_dataset(tokenizer, 'val', args)
val_loader = DataLoader(val_set, batch_size=32, shuffle=True)

In [23]:
val_batch0 = next(iter(val_loader))
# one_hot_targets = torch.stack(batch['one_hot_target']).T.tolist()

output_ids = model.model.cuda().generate(input_ids=val_batch0['source_ids'].cuda(),
                                        attention_mask=val_batch0['source_mask'].cuda(),
                                        max_length=target_max_len)
outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
outputs

['fear, surprise',
 'fear, surprise',
 'joy, surprise',
 'fear, surprise',
 'fear, surprise',
 'fear, sad, surprise',
 'joy',
 'fear, surprise',
 'fear',
 'fear, surprise',
 'fear, surprise',
 'fear, surprise',
 'fear, surprise',
 'fear, surprise',
 'fear, sad',
 'neutral',
 'fear, sad',
 'anger, fear, sad',
 'anger, fear, sad',
 'fear, sad, surprise',
 'fear, sad',
 'fear, sad',
 'fear, surprise',
 'joy',
 'fear, sad',
 'joy',
 'fear, sad',
 'fear, sad',
 'fear',
 'fear, sad',
 'fear, surprise',
 'fear, sad']

In [24]:
def one_hot_encode_emotion(emotion):
    emotions = emotion.replace(' ', '').split(',')
    one_hot_emotion = [1 if emotion_col in emotions else 0 for emotion_col in emotion_cols]
    return one_hot_emotion

y_true = []
y_pred = []
for batch in val_loader:
    one_hot_targets = torch.stack(batch['one_hot_target']).T.tolist()

    output_ids = model.model.cuda().generate(input_ids=batch['source_ids'].cuda(),
                                          attention_mask=batch['source_mask'].cuda(),
                                          max_length=target_max_len)
    outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    one_outputs = [one_hot_encode_emotion(output) for output in outputs]

    assert len(one_outputs) == len(one_hot_targets)
    y_true += one_hot_targets
    y_pred += one_outputs

In [25]:
y_true_0 = df['val'].apply(lambda row: row[emotion_cols].tolist(), axis=1).tolist()
print(f"True Y ({len(y_true_0)}):")
y_true_0[:10]

True Y (554):


[[0, 1, 0, 1, 0],
 [0, 1, 0, 0, 1],
 [0, 0, 0, 1, 1],
 [0, 0, 0, 1, 0],
 [1, 1, 0, 0, 1],
 [0, 1, 0, 0, 0],
 [0, 1, 0, 0, 0],
 [0, 1, 0, 0, 1],
 [0, 1, 0, 1, 1],
 [1, 1, 0, 0, 0]]

In [26]:
# Calculate F1 score for each type of averaging method
f1_micro = f1_score(y_true, y_pred, average='micro', zero_division=0.0)
f1_macro = f1_score(y_true, y_pred, average='macro', zero_division=0.0)
# f1_weighted = f1_score(y_true, y_pred, average='weighted', zero_division=0.0)
# f1_samples = f1_score(y_true, y_pred, average='samples', zero_division=0.0)
f1_per_label = f1_score(y_true, y_pred, average=None, zero_division=0.0)

print(f'F1 Score (Micro-Average): {f1_micro}')
print(f'F1 Score (Macro-Average): {f1_macro}')
# print(f'F1 Score (Weighted-Average): {f1_weighted}')
# print(f'F1 Score (Samples-Average): {f1_samples}')
print()

# F1 score per label
for label, f1 in zip(emotion_cols, f1_per_label):
    print(f"F1 Score for '{label}': {f1}")

F1 Score (Micro-Average): 0.6885964912280702
F1 Score (Macro-Average): 0.6155755742307234

F1 Score for 'anger': 0.3695652173913043
F1 Score for 'fear': 0.8050847457627118
F1 Score for 'joy': 0.6212121212121212
F1 Score for 'sad': 0.6522781774580336
F1 Score for 'surprise': 0.6297376093294461
