## Imports

In [29]:
## Imports
import os
import re
import numpy as np

from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

import gensim.downloader
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)

## Tokenizer

In [30]:
# A token is a run of at least two word characters; one-character words never match.
word_tokenize_pattern = re.compile(r"(?u)\b\w\w+\b")


def word_tokenize(s: str):
    """Return the lowercase word tokens of ``s`` in order of appearance.

    Tokens are the non-overlapping matches of ``word_tokenize_pattern``;
    punctuation and single-character words are therefore dropped.
    """
    lowered_tokens = []
    for match in word_tokenize_pattern.finditer(s):
        lowered_tokens.append(match.group(0).lower())
    return lowered_tokens

## Metrics

In [31]:

# Class names, in the order used for label ids and for per-class reporting.
EMOTIONS = ["Angry", "Happy", "Relaxed", "Sad"]

def print_results(gold_labels, predicted_labels):
    """Print macro-averaged and per-emotion precision/recall/F1 plus accuracy.

    Args:
        gold_labels: sequence of reference emotion names (strings from ``EMOTIONS``).
        predicted_labels: sequence of predicted emotion names, same length.

    Returns:
        None. Everything is written to stdout.
    """
    # Overall scores, macro-averaged over the classes present in the data.
    p, r, f, _ = precision_recall_fscore_support(
        gold_labels, predicted_labels, average="macro", zero_division=0
    )
    acc = accuracy_score(gold_labels, predicted_labels)

    print("=== Overall (Macro Avg) ===")
    print("Precision:", p)
    print("Recall:", r)
    print("F1:", f)
    print("Accuracy:", acc)
    print()

    # Per-emotion metrics. `labels=EMOTIONS` pins both the set and the order of
    # the returned rows; without it the rows follow the sorted labels that occur
    # in the data, so a class missing from both gold and predictions would shift
    # every later row (or raise IndexError) when indexed by EMOTIONS position.
    p_i, r_i, f_i, _ = precision_recall_fscore_support(
        gold_labels,
        predicted_labels,
        labels=EMOTIONS,
        average=None,
        zero_division=0,
    )

    print("=== Per Emotion (Class) Metrics ===")
    for emotion, prec, rec, f1 in zip(EMOTIONS, p_i, r_i, f_i):
        print(f"{emotion}:")
        print("  Precision:", prec)
        print("  Recall:   ", rec)
        print("  F1:       ", f1)
    print()



def compute_metrics(eval_pred):
    """Turn a ``(logits, labels)`` pair into macro precision/recall/F1 and accuracy.

    The predicted class for each row is the arg-max over the last axis of the
    logits. The result is a dict with the keys ``precision``, ``recall``,
    ``f1`` and ``accuracy``.
    """
    logits, labels = eval_pred
    predicted_ids = logits.argmax(axis=-1)

    precision, recall, f1, _support = precision_recall_fscore_support(
        labels, predicted_ids, average="macro", zero_division=0
    )
    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "accuracy": accuracy_score(labels, predicted_ids),
    }




## Dataset

In [32]:
# Root folder of the lyrics corpus, resolved relative to the working directory.
DATASET_DIR = "NJU_MusicMood_v1.0"

# Two-way mapping between emotion names and integer class ids; the id of an
# emotion is its position in EMOTIONS.
id2label = dict(enumerate(EMOTIONS))
label2id = {name: idx for idx, name in id2label.items()}

# Bracketed mm:ss or mm:ss.xx markers such as [00:29] or [01:02.33].
timestamp_pattern = re.compile(r"\[\d{2}:\d{2}(?:\.\d{2})?\]")


def clean_lyrics(text: str) -> str:
    """Normalise raw lyrics into a single lowercase line.

    Steps, in order: drop bracketed timestamps, lowercase, straighten curly
    quotes, blank out runs of dots and of underscores, remove a trailing
    "end" marker, turn newlines into spaces, replace every character that is
    not a-z, 0-9, apostrophe or space with a space, and collapse whitespace.
    """
    cleaned = timestamp_pattern.sub("", text).lower()

    # Curly quotes become their ASCII counterparts.
    for curly, straight in (("’", "'"), ("“", '"'), ("”", '"')):
        cleaned = cleaned.replace(curly, straight)

    # Ellipses / repeated dots, then long underscores, become a single space.
    for filler in (r"\.{2,}", r"_{2,}"):
        cleaned = re.sub(filler, " ", cleaned)

    # A final "end" (optionally followed by dots/whitespace) closes some files.
    cleaned = re.sub(r"\bend[.\s]*$", "", cleaned.strip())

    cleaned = cleaned.replace("\n", " ")
    cleaned = re.sub(r"[^a-z0-9' ]+", " ", cleaned)
    return re.sub(r"\s+", " ", cleaned).strip()



def get_lyrics(path: str) -> str:
    """Read the UTF-8 text file at ``path`` and return its cleaned lyrics."""
    with open(path, mode="r", encoding="utf-8") as handle:
        return clean_lyrics(handle.read())


def get_lyrics_and_labels(split: str):
    """Load every lyric file of one split together with its emotion label.

    Files are read from ``DATASET_DIR/<emotion>/<split>`` for each emotion in
    ``EMOTIONS``. Emotion folders that do not exist are skipped, as are
    non-``.txt`` files, ``info.txt`` and files whose cleaned text is empty.

    Args:
        split: name of the sub-folder inside each emotion folder.

    Returns:
        ``(texts, labels)``: two parallel lists holding the cleaned lyrics and
        the emotion name of each song.
    """
    texts, labels = [], []
    for emotion in EMOTIONS:
        folder = os.path.join(DATASET_DIR, emotion, split)
        if not os.path.isdir(folder):
            continue

        # os.listdir returns entries in arbitrary, filesystem-dependent order.
        # Sorting keeps the example order (and any seeded shuffle built on it)
        # identical across machines and runs.
        for fname in sorted(os.listdir(folder)):
            if not fname.endswith(".txt"):
                continue
            if fname.lower() == "info.txt":
                continue

            txt = get_lyrics(os.path.join(folder, fname))
            if txt.strip():
                texts.append(txt)
                labels.append(emotion)
    return texts, labels


# Load both splits; the folder named "Test" is what this notebook calls "dev".
train_texts, train_labels = get_lyrics_and_labels("Train")
dev_texts, dev_labels = get_lyrics_and_labels("Test")

assert len(train_texts) == len(train_labels)
assert len(dev_texts) == len(dev_labels)


def _as_dataset(texts, labels):
    """Pair lyrics with their integer class ids in a Dataset."""
    return Dataset.from_dict(
        {"text": texts, "label": [label2id[name] for name in labels]}
    )


# Datasets consumed by the transformer fine-tuning helpers
train_ds = _as_dataset(train_texts, train_labels)
dev_ds = _as_dataset(dev_texts, dev_labels)


## Baseline: Bag of Words & Logistic Regression

In [33]:
print("=== Baseline: Bag of Words & Logistic Regression ===")

# The vectorizer uses the notebook's own word_tokenize as its analyzer.
# Its vocabulary is fit on the training lyrics only; dev lyrics are merely
# transformed with that vocabulary.
count_vectorizer = CountVectorizer(analyzer=word_tokenize)
train_counts = count_vectorizer.fit_transform(train_texts)
dev_counts = count_vectorizer.transform(dev_texts)

# Trained on emotion names (strings), so predictions are emotion names too.
lr_bow = LogisticRegression(max_iter=500, random_state=0)
lr_bow.fit(train_counts, train_labels)

# count_vectorizer and lr_bow are reused by the segment-evaluation cell.
lr_bow_dev_predictions = lr_bow.predict(dev_counts)
print_results(dev_labels, lr_bow_dev_predictions)

=== Baseline: Bag of Words & Logistic Regression ===
=== Overall (Macro Avg) ===
Precision: 0.38026220704863894
Recall: 0.37264480168452674
F1: 0.3731436538331033
Accuracy: 0.363395225464191

=== Per Emotion (Class) Metrics ===
Angry:
  Precision: 0.49295774647887325
  Recall:    0.49295774647887325
  F1:        0.49295774647887325
Happy:
  Precision: 0.4567901234567901
  Recall:    0.3490566037735849
  F1:        0.39572192513368987
Relaxed:
  Precision: 0.3053435114503817
  Recall:    0.39603960396039606
  F1:        0.3448275862068966
Sad:
  Precision: 0.26595744680851063
  Recall:    0.25252525252525254
  F1:        0.25906735751295334



## Baseline 2: Word2Vec & Logistic Regression

In [34]:
print("=== Word2Vec & Logistic Regression ===")

# Pretrained vectors fetched through gensim's downloader.
# NOTE(review): presumably a large download that is cached locally after the
# first call - confirm before running on a fresh machine.
w2v_model = gensim.downloader.load("word2vec-google-news-300")
# Embedding width, used for the all-zero fallback vector in vec_for_doc.
VECTOR_SIZE = w2v_model.vector_size


def vec_for_doc(tokenized_doc):
    """Represent a tokenised document as the mean of its word vectors.

    Tokens missing from ``w2v_model`` are ignored. When no token is known, a
    float32 zero vector of length ``VECTOR_SIZE`` is returned instead.
    """
    known_vectors = []
    for token in tokenized_doc:
        if token in w2v_model.key_to_index:
            known_vectors.append(w2v_model[token])

    if len(known_vectors) == 0:
        return np.zeros(VECTOR_SIZE, dtype="float32")
    return np.mean(known_vectors, axis=0)


# One averaged embedding per song, built from the same tokenizer as the BoW model.
train_vecs = [vec_for_doc(word_tokenize(x)) for x in train_texts]
dev_vecs = [vec_for_doc(word_tokenize(x)) for x in dev_texts]

# Same classifier settings as the BoW baseline; only the features differ.
lr_w2v = LogisticRegression(max_iter=500, random_state=0)
lr_w2v.fit(train_vecs, train_labels)

# lr_w2v is reused by the segment-evaluation cell.
w2v_dev_predictions = lr_w2v.predict(dev_vecs)
print_results(dev_labels, w2v_dev_predictions)

=== Word2Vec & Logistic Regression ===
=== Overall (Macro Avg) ===
Precision: 0.46144573621456286
Recall: 0.4781614522068736
F1: 0.4553408410787571
Accuracy: 0.4535809018567639

=== Per Emotion (Class) Metrics ===
Angry:
  Precision: 0.6129032258064516
  Recall:    0.8028169014084507
  F1:        0.6951219512195121
Happy:
  Precision: 0.4803921568627451
  Recall:    0.46226415094339623
  F1:        0.47115384615384615
Relaxed:
  Precision: 0.3358208955223881
  Recall:    0.44554455445544555
  F1:        0.3829787234042553
Sad:
  Precision: 0.4166666666666667
  Recall:    0.20202020202020202
  F1:        0.272108843537415



## Split lyrics into start, middle, and end segments

In [35]:
def get_segment(text, segment="start", portion=0.3):
    """
    Extract one positional slice of the lyrics, measured in whitespace tokens.

    Args:
        text: the lyrics as a single string.
        segment: "start" (first tokens), "end" (last tokens) or "middle".
            Any other value returns ``text`` unchanged.
        portion: fraction of the tokens kept for "start" and "end"
            (0.3 means 30%). The "middle" slice ignores it and always spans
            tokens from 35% to 65% of the song.

    Returns:
        The selected tokens joined by single spaces, or "" when ``text`` has no
        tokens or when ``portion`` selects fewer than one token.
    """
    tokens = text.split()
    n = len(tokens)
    if n == 0:
        return ""

    if segment == "middle":
        # Fixed central window; independent of `portion`.
        return " ".join(tokens[int(n * 0.35):int(n * 0.65)])

    if segment not in ("start", "end"):
        return text

    cut = int(n * portion)  # number of tokens for start/end
    if cut <= 0:
        # tokens[-0:] is tokens[0:], i.e. the whole song, so "end" must not
        # slice with a zero count; "start" already yields nothing here.
        return ""

    if segment == "start":
        return " ".join(tokens[:cut])
    return " ".join(tokens[-cut:])


# Build the three positional views of every dev song (same order as dev_texts).
dev_start_texts, dev_middle_texts, dev_end_texts = (
    [get_segment(lyrics, part) for lyrics in dev_texts]
    for part in ("start", "middle", "end")
)



## Evaluate BoW Model on Segmented Lyrics

In [36]:
print("=== BoW Logistic Regression: Position-based Evaluation (NO retraining) ===")


def _eval_bow_on_segment(banner, segment_texts):
    """Score the already-trained BoW model on one set of dev segments.

    The texts are vectorised with the fitted ``count_vectorizer``, classified
    with ``lr_bow``, and the metrics are printed under ``banner``. Returns the
    count matrix and the predictions.
    """
    segment_counts = count_vectorizer.transform(segment_texts)
    segment_preds = lr_bow.predict(segment_counts)
    print(banner)
    print_results(dev_labels, segment_preds)
    return segment_counts, segment_preds


# START
dev_start_counts, bow_start_preds = _eval_bow_on_segment(
    "\n--- BoW on START segment only ---", dev_start_texts
)

# MIDDLE
dev_middle_counts, bow_middle_preds = _eval_bow_on_segment(
    "\n--- BoW on MIDDLE segment only ---", dev_middle_texts
)

# END
dev_end_counts, bow_end_preds = _eval_bow_on_segment(
    "\n--- BoW on END segment only ---", dev_end_texts
)


=== BoW Logistic Regression: Position-based Evaluation (NO retraining) ===

--- BoW on START segment only ---
=== Overall (Macro Avg) ===
Precision: 0.41852128751856615
Recall: 0.3243719867601961
F1: 0.27986468063888137
Accuracy: 0.32625994694960214

=== Per Emotion (Class) Metrics ===
Angry:
  Precision: 0.5882352941176471
  Recall:    0.28169014084507044
  F1:        0.38095238095238093
Happy:
  Precision: 0.6
  Recall:    0.11320754716981132
  F1:        0.19047619047619047
Relaxed:
  Precision: 0.29537366548042704
  Recall:    0.8217821782178217
  F1:        0.43455497382198954
Sad:
  Precision: 0.19047619047619047
  Recall:    0.08080808080808081
  F1:        0.11347517730496454


--- BoW on MIDDLE segment only ---
=== Overall (Macro Avg) ===
Precision: 0.40280577696950937
Recall: 0.3146209717968874
F1: 0.27415579397176265
Accuracy: 0.3183023872679045

=== Per Emotion (Class) Metrics ===
Angry:
  Precision: 0.5625
  Recall:    0.2535211267605634
  F1:        0.34951456310679613
Ha

## Evaluate Word2Vec on Segmented Lyrics

In [37]:
print("=== Word2Vec Logistic Regression: Position-based Evaluation (NO retraining) ===")


def _eval_w2v_on_segment(banner, segment_texts):
    """Score the already-trained Word2Vec model on one set of dev segments.

    Each text is tokenised and averaged into one embedding, classified with
    ``lr_w2v``, and the metrics are printed under ``banner``. Returns the
    list of embeddings and the predictions.
    """
    segment_vecs = [vec_for_doc(word_tokenize(lyrics)) for lyrics in segment_texts]
    segment_preds = lr_w2v.predict(segment_vecs)
    print(banner)
    print_results(dev_labels, segment_preds)
    return segment_vecs, segment_preds


# START
dev_start_vecs, w2v_start_preds = _eval_w2v_on_segment(
    "\n--- W2V on START segment only ---", dev_start_texts
)

# MIDDLE
dev_middle_vecs, w2v_middle_preds = _eval_w2v_on_segment(
    "\n--- W2V on MIDDLE segment only ---", dev_middle_texts
)

# END
dev_end_vecs, w2v_end_preds = _eval_w2v_on_segment(
    "\n--- W2V on END segment only ---", dev_end_texts
)

=== Word2Vec Logistic Regression: Position-based Evaluation (NO retraining) ===

--- W2V on START segment only ---
=== Overall (Macro Avg) ===
Precision: 0.4094516545709824
Recall: 0.4308409946761351
F1: 0.4092941134519704
Accuracy: 0.41114058355437666

=== Per Emotion (Class) Metrics ===
Angry:
  Precision: 0.5051546391752577
  Recall:    0.6901408450704225
  F1:        0.5833333333333334
Happy:
  Precision: 0.43434343434343436
  Recall:    0.4056603773584906
  F1:        0.4195121951219512
Relaxed:
  Precision: 0.3464566929133858
  Recall:    0.43564356435643564
  F1:        0.38596491228070173
Sad:
  Precision: 0.35185185185185186
  Recall:    0.1919191919191919
  F1:        0.24836601307189543


--- W2V on MIDDLE segment only ---
=== Overall (Macro Avg) ===
Precision: 0.4279684703083266
Recall: 0.45646796941187606
F1: 0.42801924337609987
Accuracy: 0.4323607427055703

=== Per Emotion (Class) Metrics ===
Angry:
  Precision: 0.5445544554455446
  Recall:    0.7746478873239436
  F1:    

## Helpers

In [23]:
def tokenize_dataset(dataset, tokenizer, max_length: int = 256):
    """Tokenise the ``text`` column of ``dataset`` for model input.

    Every example is truncated and padded to exactly ``max_length`` tokens.
    The raw ``text`` column is dropped afterwards and the dataset is switched
    to the "torch" output format before being returned.
    """

    def encode_batch(examples):
        # Called with a batch of rows because map() runs with batched=True.
        return tokenizer(
            examples["text"],
            truncation=True,
            padding="max_length",
            max_length=max_length,
        )

    encoded = dataset.map(encode_batch, batched=True).remove_columns(["text"])
    encoded.set_format(type="torch")
    return encoded


def train_and_eval_transformer(
    model_name: str,
    train_dataset: Dataset,
    dev_dataset: Dataset,
    output_dir: str,
    num_epochs: int,
    learning_rate: float,
    train_bs: int,
    eval_bs: int,
    set_pad_token_eos: bool = False,
    seed: int = 42,
):
    """Fine-tune a sequence-classification model and report dev-set metrics.

    Args:
        model_name: checkpoint name passed to the Auto* ``from_pretrained`` calls.
        train_dataset: dataset with ``text`` and ``label`` columns used for training.
        dev_dataset: dataset with the same columns, evaluated after every epoch
            and once more at the end.
        output_dir: directory handed to ``TrainingArguments`` (checkpoints are
            saved there every epoch).
        num_epochs: number of training epochs.
        learning_rate: learning rate passed to ``TrainingArguments``.
        train_bs: per-device training batch size.
        eval_bs: per-device evaluation batch size.
        set_pad_token_eos: when True, the tokenizer's EOS token is reused as
            the padding token and the model config is updated to match.
        seed: random seed applied before the model is created and passed on to
            ``TrainingArguments``.

    Returns:
        ``(trainer, eval_results, pred_labels)``: the fitted ``Trainer``, the
        metrics dict from ``trainer.evaluate()``, and the predicted emotion
        names for ``dev_dataset``.
    """
    import inspect

    from transformers import set_seed

    print(f"=== Fine-tuning {model_name} ===")

    # Seed before the model is built: the classification head is newly
    # initialised inside from_pretrained, which happens before the Trainer
    # exists, so seeding only via TrainingArguments would leave it unseeded.
    set_seed(seed)

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    if set_pad_token_eos:
        tokenizer.pad_token = tokenizer.eos_token

    tokenized_train = tokenize_dataset(train_dataset, tokenizer)
    tokenized_dev = tokenize_dataset(dev_dataset, tokenizer)

    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=len(EMOTIONS),
        label2id=label2id,
        id2label=id2label,
        ignore_mismatched_sizes=True,
    )

    if set_pad_token_eos:
        model.config.pad_token_id = tokenizer.eos_token_id

    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=num_epochs,
        learning_rate=learning_rate,
        per_device_train_batch_size=train_bs,
        per_device_eval_batch_size=eval_bs,
        weight_decay=0.01,
        eval_strategy="epoch",
        logging_steps=16,
        log_level="error",
        report_to="none",
        save_strategy="epoch",
        dataloader_pin_memory=False,
        seed=seed,
    )

    # Newer transformers releases take the tokenizer as `processing_class` and
    # deprecate the `tokenizer` keyword; use whichever this version accepts.
    trainer_params = inspect.signature(Trainer.__init__).parameters
    tokenizer_kwarg = (
        "processing_class" if "processing_class" in trainer_params else "tokenizer"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_dev,
        compute_metrics=compute_metrics,
        **{tokenizer_kwarg: tokenizer},
    )

    trainer.train()

    eval_results = trainer.evaluate()
    print(f"{model_name} dev results:", eval_results)

    pred_output = trainer.predict(tokenized_dev)
    logits = pred_output.predictions
    pred_ids = np.argmax(logits, axis=-1)
    pred_labels = [id2label[int(i)] for i in pred_ids]

    # Gold labels come from the dataset that was actually scored, not from the
    # notebook-level dev_labels, so the report stays correct for any dev_dataset.
    gold_labels = [id2label[int(i)] for i in pred_output.label_ids]

    print(f"{model_name} classification report:")
    print_results(gold_labels, pred_labels)

    return trainer, eval_results, pred_labels

def eval_transformer_on_segments(model_name: str, trainer: Trainer):
    """
    Evaluate a fine-tuned transformer (trainer) on START/MIDDLE/END
    segments of the dev set, without retraining.

    Args:
        model_name: display name used in the printed section headers.
        trainer: a ``Trainer`` holding the fine-tuned model and its tokenizer.

    The segments are cut from the notebook-level ``dev_texts`` and scored
    against ``dev_labels``; results are printed, nothing is returned.
    """
    # Newer transformers releases expose the tokenizer as `processing_class`
    # and deprecate `Trainer.tokenizer`; fall back to the old attribute only
    # when the new one is absent.
    tokenizer = getattr(trainer, "processing_class", None)
    if tokenizer is None:
        tokenizer = trainer.tokenizer

    # Labels are identical for every segment, so encode them once.
    gold_ids = [label2id[name] for name in dev_labels]

    for seg in ["start", "middle", "end"]:
        # Build segmented dev texts
        seg_texts = [get_segment(t, seg) for t in dev_texts]

        # Build a segmented dev Dataset with same labels
        seg_dev_ds = Dataset.from_dict({"text": seg_texts, "label": gold_ids})

        # Tokenize segmented dataset
        tokenized_seg_dev = tokenize_dataset(seg_dev_ds, tokenizer)

        # Predict
        pred_output = trainer.predict(tokenized_seg_dev)
        logits = pred_output.predictions
        pred_ids = np.argmax(logits, axis=-1)
        pred_labels = [id2label[int(i)] for i in pred_ids]

        print(f"\n=== {model_name} on {seg.upper()} segment only ===")
        print_results(dev_labels, pred_labels)


## DistilGPT2

In [27]:
# Fine-tune DistilGPT2 on the full lyrics. set_pad_token_eos=True makes the
# helper reuse the tokenizer's EOS token as the padding token and copy its id
# into model.config.pad_token_id.
# NOTE(review): presumably needed because this tokenizer ships without a pad
# token - confirm.
gpt2_trainer, gpt2_results, gpt2_pred_labels = train_and_eval_transformer(
    model_name="distilgpt2",
    train_dataset=train_ds,
    dev_dataset=dev_ds,
    output_dir="./distilgpt2_output",
    num_epochs=5,
    learning_rate=5e-5,
    train_bs=4,
    eval_bs=4,
    set_pad_token_eos=True,
)

=== Fine-tuning distilgpt2 ===


Map: 100%|██████████| 400/400 [00:00<00:00, 1184.45 examples/s]
Map: 100%|██████████| 377/377 [00:00<00:00, 1583.07 examples/s]
  trainer = Trainer(


{'loss': 2.3599, 'grad_norm': 29.54327964782715, 'learning_rate': 4.85e-05, 'epoch': 0.16}
{'loss': 1.8893, 'grad_norm': 92.1890640258789, 'learning_rate': 4.69e-05, 'epoch': 0.32}
{'loss': 1.5923, 'grad_norm': 33.16315841674805, 'learning_rate': 4.53e-05, 'epoch': 0.48}
{'loss': 1.428, 'grad_norm': 21.53084373474121, 'learning_rate': 4.3700000000000005e-05, 'epoch': 0.64}
{'loss': 1.4135, 'grad_norm': 29.720989227294922, 'learning_rate': 4.21e-05, 'epoch': 0.8}
{'loss': 1.3911, 'grad_norm': 48.5584602355957, 'learning_rate': 4.05e-05, 'epoch': 0.96}
{'eval_loss': 1.3949381113052368, 'eval_precision': 0.3240610760433206, 'eval_recall': 0.3534003446850133, 'eval_f1': 0.31606123048252804, 'eval_accuracy': 0.35013262599469497, 'eval_runtime': 102.2248, 'eval_samples_per_second': 3.688, 'eval_steps_per_second': 0.929, 'epoch': 1.0}
{'loss': 1.3335, 'grad_norm': 22.279226303100586, 'learning_rate': 3.8900000000000004e-05, 'epoch': 1.12}
{'loss': 1.2804, 'grad_norm': 43.611995697021484, 'lea

## DistilBERT

In [26]:
# Fine-tune DistilBERT on the full lyrics with the same learning rate and batch
# sizes as the DistilGPT2 run, but for 3 epochs instead of 5 and with the
# tokenizer's own padding token (set_pad_token_eos=False).
distilbert_trainer, distilbert_results, distilbert_pred_labels = train_and_eval_transformer(
    model_name="distilbert/distilbert-base-uncased",
    train_dataset=train_ds,
    dev_dataset=dev_ds,
    output_dir="./distilbert_musicmood",
    num_epochs=3,
    learning_rate=5e-5,
    train_bs=4,
    eval_bs=4,
    set_pad_token_eos=False,
)

=== Fine-tuning distilbert/distilbert-base-uncased ===


Map: 100%|██████████| 400/400 [00:00<00:00, 1531.62 examples/s]
Map: 100%|██████████| 377/377 [00:00<00:00, 2254.85 examples/s]
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,1.3758,1.303551,0.356097,0.48559,0.407268,0.458886
2,1.1797,1.196877,0.475669,0.50132,0.434963,0.469496
3,0.8546,1.145496,0.53545,0.559977,0.543441,0.535809


distilbert/distilbert-base-uncased dev results: {'eval_loss': 1.1454964876174927, 'eval_precision': 0.5354502471523748, 'eval_recall': 0.559977496553801, 'eval_f1': 0.5434413100952952, 'eval_accuracy': 0.5358090185676393, 'eval_runtime': 96.5857, 'eval_samples_per_second': 3.903, 'eval_steps_per_second': 0.984, 'epoch': 3.0}
distilbert/distilbert-base-uncased classification report:
=== Overall (Macro Avg) ===
Precision: 0.5354502471523748
Recall: 0.559977496553801
F1: 0.5434413100952952
Accuracy: 0.5358090185676393

=== Per Emotion (Class) Metrics ===
Angry:
  Precision: 0.648936170212766
  Recall:    0.8591549295774648
  F1:        0.7393939393939394
Happy:
  Precision: 0.5252525252525253
  Recall:    0.49056603773584906
  F1:        0.5073170731707317
Relaxed:
  Precision: 0.4888888888888889
  Recall:    0.43564356435643564
  F1:        0.4607329842931937
Sad:
  Precision: 0.4787234042553192
  Recall:    0.45454545454545453
  F1:        0.46632124352331605



## Segment Test

In [38]:
# Score both fine-tuned models on the start/middle/end dev segments. This
# reuses the trainers returned by the two fine-tuning cells, so those cells
# must have been run first; nothing is retrained here.
eval_transformer_on_segments("DistilGPT2", gpt2_trainer)
eval_transformer_on_segments("DistilBERT", distilbert_trainer)

Map: 100%|██████████| 377/377 [00:00<00:00, 1679.46 examples/s]



=== DistilGPT2 on START segment only ===
=== Overall (Macro Avg) ===
Precision: 0.4040863872739266
Recall: 0.3989931347639149
F1: 0.3790618725102379
Accuracy: 0.3793103448275862

=== Per Emotion (Class) Metrics ===
Angry:
  Precision: 0.6027397260273972
  Recall:    0.6197183098591549
  F1:        0.6111111111111112
Happy:
  Precision: 0.42105263157894735
  Recall:    0.3018867924528302
  F1:        0.3516483516483517
Relaxed:
  Precision: 0.3
  Recall:    0.1188118811881188
  F1:        0.1702127659574468
Sad:
  Precision: 0.2925531914893617
  Recall:    0.5555555555555556
  F1:        0.3832752613240418



Map: 100%|██████████| 377/377 [00:00<00:00, 2803.16 examples/s]



=== DistilGPT2 on MIDDLE segment only ===
=== Overall (Macro Avg) ===
Precision: 0.46718459839149495
Recall: 0.47073696540528354
F1: 0.450027881236454
Accuracy: 0.4509283819628647

=== Per Emotion (Class) Metrics ===
Angry:
  Precision: 0.6533333333333333
  Recall:    0.6901408450704225
  F1:        0.6712328767123288
Happy:
  Precision: 0.527027027027027
  Recall:    0.36792452830188677
  F1:        0.43333333333333335
Relaxed:
  Precision: 0.3148148148148148
  Recall:    0.16831683168316833
  F1:        0.21935483870967742
Sad:
  Precision: 0.3735632183908046
  Recall:    0.6565656565656566
  F1:        0.47619047619047616



Map: 100%|██████████| 377/377 [00:00<00:00, 3696.14 examples/s]



=== DistilGPT2 on END segment only ===
=== Overall (Macro Avg) ===
Precision: 0.42503704628704625
Recall: 0.4216326828005681
F1: 0.41430704626535536
Accuracy: 0.40318302387267907

=== Per Emotion (Class) Metrics ===
Angry:
  Precision: 0.5769230769230769
  Recall:    0.6338028169014085
  F1:        0.6040268456375839
Happy:
  Precision: 0.44155844155844154
  Recall:    0.32075471698113206
  F1:        0.37158469945355194
Relaxed:
  Precision: 0.375
  Recall:    0.26732673267326734
  F1:        0.31213872832369943
Sad:
  Precision: 0.30666666666666664
  Recall:    0.46464646464646464
  F1:        0.36947791164658633



Map: 100%|██████████| 377/377 [00:00<00:00, 2446.04 examples/s]



=== DistilBERT on START segment only ===
=== Overall (Macro Avg) ===
Precision: 0.5102223594065909
Recall: 0.49575194697253405
F1: 0.4983741388589917
Accuracy: 0.48010610079575594

=== Per Emotion (Class) Metrics ===
Angry:
  Precision: 0.7164179104477612
  Recall:    0.676056338028169
  F1:        0.6956521739130435
Happy:
  Precision: 0.5263157894736842
  Recall:    0.37735849056603776
  F1:        0.43956043956043955
Relaxed:
  Precision: 0.4375
  Recall:    0.48514851485148514
  F1:        0.460093896713615
Sad:
  Precision: 0.36065573770491804
  Recall:    0.4444444444444444
  F1:        0.39819004524886875



Map: 100%|██████████| 377/377 [00:00<00:00, 1037.54 examples/s]



=== DistilBERT on MIDDLE segment only ===
=== Overall (Macro Avg) ===
Precision: 0.5329355898019736
Recall: 0.5139028983950746
F1: 0.5186916469487651
Accuracy: 0.4960212201591512

=== Per Emotion (Class) Metrics ===
Angry:
  Precision: 0.7536231884057971
  Recall:    0.7323943661971831
  F1:        0.7428571428571429
Happy:
  Precision: 0.6233766233766234
  Recall:    0.4528301886792453
  F1:        0.5245901639344263
Relaxed:
  Precision: 0.3888888888888889
  Recall:    0.4158415841584158
  F1:        0.4019138755980861
Sad:
  Precision: 0.36585365853658536
  Recall:    0.45454545454545453
  F1:        0.40540540540540543



Map: 100%|██████████| 377/377 [00:00<00:00, 3232.96 examples/s]



=== DistilBERT on END segment only ===
=== Overall (Macro Avg) ===
Precision: 0.5313176355256792
Recall: 0.5292664431205278
F1: 0.5297445809428624
Accuracy: 0.5145888594164456

=== Per Emotion (Class) Metrics ===
Angry:
  Precision: 0.7083333333333334
  Recall:    0.7183098591549296
  F1:        0.7132867132867133
Happy:
  Precision: 0.5789473684210527
  Recall:    0.5188679245283019
  F1:        0.5472636815920398
Relaxed:
  Precision: 0.42452830188679247
  Recall:    0.44554455445544555
  F1:        0.43478260869565216
Sad:
  Precision: 0.41346153846153844
  Recall:    0.43434343434343436
  F1:        0.4236453201970443

