## Imports

In [1]:
import os
import re
import numpy as np

from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

import gensim.downloader
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)

  from .autonotebook import tqdm as notebook_tqdm


## Tokenizer

In [2]:
# A token is a run of two or more word characters delimited by word
# boundaries, so single-character words are never emitted.
word_tokenize_pattern = re.compile(r"(?u)\b\w\w+\b")


def word_tokenize(s: str):
    """Return the lowercase word tokens found in ``s``.

    Tokens are the non-overlapping matches of ``word_tokenize_pattern``,
    in order of appearance; each match is lowercased after extraction.
    """
    return [match.group(0).lower() for match in word_tokenize_pattern.finditer(s)]

## Metrics

In [4]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

# Emotion class names. The position of each name in this list is the integer
# class id assigned to it in `label2id`.
EMOTIONS = ["Angry", "Happy", "Relaxed", "Sad"]

def print_results(gold_labels, predicted_labels):
    """Print macro-averaged and per-emotion classification metrics.

    Args:
        gold_labels: Reference emotion names, one per example.
        predicted_labels: Predicted emotion names, aligned with
            ``gold_labels``.

    Prints precision, recall, F1 and accuracy for the whole set, followed by
    precision, recall and F1 for every entry of ``EMOTIONS``.
    """
    # Overall (macro-averaged) scores.
    p, r, f, _ = precision_recall_fscore_support(
        gold_labels, predicted_labels, average="macro", zero_division=0
    )
    acc = accuracy_score(gold_labels, predicted_labels)

    print("=== Overall (Macro Avg) ===")
    print("Precision:", p)
    print("Recall:", r)
    print("F1:", f)
    print("Accuracy:", acc)
    print()

    # Per-emotion metrics. The label order is pinned to EMOTIONS so that
    # position i of each returned array always belongs to EMOTIONS[i]; without
    # `labels=` the arrays only cover the labels present in the inputs, which
    # misaligns (or under-fills) the arrays when a class never occurs.
    p_i, r_i, f_i, _ = precision_recall_fscore_support(
        gold_labels,
        predicted_labels,
        labels=EMOTIONS,
        average=None,
        zero_division=0,
    )

    print("=== Per Emotion (Class) Metrics ===")
    for emotion, precision, recall, f1 in zip(EMOTIONS, p_i, r_i, f_i):
        print(f"{emotion}:")
        print("  Precision:", precision)
        print("  Recall:   ", recall)
        print("  F1:       ", f1)
    print()



def compute_metrics(eval_pred):
    """Turn a ``(logits, labels)`` pair into macro-averaged metrics.

    The predicted class of each example is the index of its largest logit
    along the last axis.

    Returns:
        dict with the keys ``precision``, ``recall``, ``f1`` (macro averages)
        and ``accuracy``.
    """
    logits, labels = eval_pred
    predicted_ids = logits.argmax(axis=-1)

    precision, recall, f1, _support = precision_recall_fscore_support(
        labels, predicted_ids, average="macro", zero_division=0
    )
    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "accuracy": accuracy_score(labels, predicted_ids),
    }




## Dataset

In [5]:
# Root folder of the lyrics corpus.
DATASET_DIR = "NJU_MusicMood_v1.0"

# Two-way mapping between emotion names and integer class ids; an emotion's id
# is its position in EMOTIONS.
label2id = dict(zip(EMOTIONS, range(len(EMOTIONS))))
id2label = {class_id: name for name, class_id in label2id.items()}

# Bracketed time tags of the form "[dd:dd]" or "[dd:dd.dd]" (d = one digit).
timestamp_pattern = re.compile(r"\[\d{2}:\d{2}(?:\.\d{2})?\]")


def clean_lyrics(text: str) -> str:
    """Strip time tags and blank lines from raw lyrics.

    Every match of ``timestamp_pattern`` is removed, each remaining line is
    stripped of surrounding whitespace, lines that end up empty are dropped,
    and the survivors are joined with a newline.
    """
    without_timestamps = timestamp_pattern.sub("", text)

    kept_lines = []
    for raw_line in without_timestamps.splitlines():
        stripped = raw_line.strip()
        if stripped:
            kept_lines.append(stripped)

    return "\n".join(kept_lines)


def get_lyrics(path: str) -> str:
    """Read the UTF-8 text file at ``path`` and return its cleaned lyrics."""
    with open(path, encoding="utf-8") as lyrics_file:
        return clean_lyrics(lyrics_file.read())


def get_lyrics_and_labels(split: str):
    """Collect cleaned lyrics and their emotion labels for one split.

    For every emotion in ``EMOTIONS`` the folder
    ``DATASET_DIR/<emotion>/<split>`` is scanned for ``.txt`` files. Files
    named ``info.txt`` (any casing) are ignored, as are files whose cleaned
    lyrics are empty. Emotion folders that do not exist are skipped.

    Args:
        split: Name of the split sub-folder, e.g. ``"Train"``.

    Returns:
        ``(texts, labels)``: two parallel lists holding the cleaned lyrics
        and the emotion name of each kept file.
    """
    texts, labels = [], []
    for emotion in EMOTIONS:
        folder = os.path.join(DATASET_DIR, emotion, split)
        if not os.path.isdir(folder):
            continue

        # os.listdir returns entries in arbitrary order; sorting makes the
        # example order (and everything trained on it) reproducible.
        for fname in sorted(os.listdir(folder)):
            if not fname.endswith(".txt"):
                continue
            if fname.lower() == "info.txt":
                continue

            path = os.path.join(folder, fname)
            txt = get_lyrics(path)
            if txt.strip():
                texts.append(txt)
                labels.append(emotion)
    return texts, labels


# Load data
# NOTE(review): the "Test" split is what the cells below use as the dev set.
train_texts, train_labels = get_lyrics_and_labels("Train")
dev_texts, dev_labels = get_lyrics_and_labels("Test")

# The loader skips missing folders without complaint, so fail here with a
# clear message instead of letting an empty dataset break a later cell.
assert train_texts, f"No training lyrics found under {DATASET_DIR!r}"
assert dev_texts, f"No dev lyrics found under {DATASET_DIR!r}"
assert len(train_texts) == len(train_labels)
assert len(dev_texts) == len(dev_labels)

# Datasets: raw text plus the integer class id of each emotion name.
train_ds = Dataset.from_dict(
    {"text": train_texts, "label": [label2id[name] for name in train_labels]}
)
dev_ds = Dataset.from_dict(
    {"text": dev_texts, "label": [label2id[name] for name in dev_labels]}
)


## Baseline: Bag of Words & Logistic Regression

In [6]:
print("=== Baseline: Bag of Words & Logistic Regression ===")

# Bag-of-words counts: the vocabulary is fitted on the training lyrics only
# and then applied unchanged to the dev lyrics.
count_vectorizer = CountVectorizer(analyzer=word_tokenize)
train_counts = count_vectorizer.fit_transform(train_texts)
dev_counts = count_vectorizer.transform(dev_texts)

# fit() returns the estimator itself, so building and training are chained.
lr_bow = LogisticRegression(max_iter=500, random_state=0).fit(
    train_counts, train_labels
)

lr_bow_dev_predictions = lr_bow.predict(dev_counts)
print_results(dev_labels, lr_bow_dev_predictions)

=== Baseline: Bag of Words & Logistic Regression ===
=== Overall (Macro Avg) ===
Precision: 0.3826099694894021
Recall: 0.3761659284450901
F1: 0.3761515331158733
Accuracy: 0.3660477453580902

=== Per Emotion (Class) Metrics ===
Angry:
  Precision: 0.5
  Recall:    0.5070422535211268
  F1:        0.5034965034965035
Happy:
  Precision: 0.4567901234567901
  Recall:    0.3490566037735849
  F1:        0.39572192513368987
Relaxed:
  Precision: 0.3076923076923077
  Recall:    0.39603960396039606
  F1:        0.3463203463203463
Sad:
  Precision: 0.26595744680851063
  Recall:    0.25252525252525254
  F1:        0.25906735751295334



## Baseline 2: Word2Vec & Logistic Regression

In [7]:
print("=== Word2Vec & Logistic Regression ===")

# Load the "word2vec-google-news-300" vectors through gensim's downloader.
w2v_model = gensim.downloader.load("word2vec-google-news-300")
# Length of one word vector; vec_for_doc uses it to size its all-zero fallback.
VECTOR_SIZE = w2v_model.vector_size


def vec_for_doc(tokenized_doc):
    """Embed a tokenized document as the mean of its known word vectors.

    Tokens missing from ``w2v_model.key_to_index`` are ignored. When no token
    is known, a float32 zero vector of length ``VECTOR_SIZE`` is returned.
    """
    known_tokens = [tok for tok in tokenized_doc if tok in w2v_model.key_to_index]
    if len(known_tokens) == 0:
        return np.zeros(VECTOR_SIZE, dtype="float32")

    word_vectors = [w2v_model[tok] for tok in known_tokens]
    return np.mean(word_vectors, axis=0)


# One averaged word2vec vector per document, for both splits.
train_vecs = [vec_for_doc(tokens) for tokens in map(word_tokenize, train_texts)]
dev_vecs = [vec_for_doc(tokens) for tokens in map(word_tokenize, dev_texts)]

# fit() returns the estimator itself, so building and training are chained.
lr_w2v = LogisticRegression(max_iter=500, random_state=0).fit(
    train_vecs, train_labels
)

w2v_dev_predictions = lr_w2v.predict(dev_vecs)
print_results(dev_labels, w2v_dev_predictions)

=== Word2Vec & Logistic Regression ===
=== Overall (Macro Avg) ===
Precision: 0.4596175291565
Recall: 0.47568620468212114
F1: 0.45246239781207337
Accuracy: 0.4509283819628647

=== Per Emotion (Class) Metrics ===
Angry:
  Precision: 0.6063829787234043
  Recall:    0.8028169014084507
  F1:        0.6909090909090909
Happy:
  Precision: 0.47572815533980584
  Recall:    0.46226415094339623
  F1:        0.4688995215311005
Relaxed:
  Precision: 0.3308270676691729
  Recall:    0.43564356435643564
  F1:        0.37606837606837606
Sad:
  Precision: 0.425531914893617
  Recall:    0.20202020202020202
  F1:        0.273972602739726



## Transformer fine-tuning helpers

In [8]:
def tokenize_dataset(dataset, tokenizer, max_length: int = 256):
    """Tokenize the ``"text"`` column of ``dataset`` for model input.

    Texts are truncated and padded to exactly ``max_length`` tokens in a
    batched ``map`` call. The raw ``"text"`` column is then removed and the
    result is switched to the ``"torch"`` format.

    Args:
        dataset: Dataset-like object with a ``"text"`` column.
        tokenizer: Callable that tokenizes a batch of texts.
        max_length: Length every sequence is truncated/padded to.

    Returns:
        The tokenized dataset without its ``"text"`` column.
    """
    encode_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": max_length,
    }

    def _encode_batch(batch):
        return tokenizer(batch["text"], **encode_options)

    encoded = dataset.map(_encode_batch, batched=True).remove_columns(["text"])
    encoded.set_format(type="torch")
    return encoded


def train_and_eval_transformer(
    model_name: str,
    train_dataset: Dataset,
    dev_dataset: Dataset,
    output_dir: str,
    num_epochs: int,
    learning_rate: float,
    train_bs: int,
    eval_bs: int,
    set_pad_token_eos: bool = False,
):
    """Fine-tune a pretrained sequence classifier and report dev metrics.

    Args:
        model_name: Checkpoint name passed to the ``from_pretrained`` loaders.
        train_dataset: Training set with ``"text"`` and ``"label"`` columns.
        dev_dataset: Evaluation set with the same columns.
        output_dir: Directory handed to ``TrainingArguments``.
        num_epochs: Number of training epochs.
        learning_rate: Learning rate handed to ``TrainingArguments``.
        train_bs: Per-device training batch size.
        eval_bs: Per-device evaluation batch size.
        set_pad_token_eos: If true, reuse the tokenizer's EOS token as the
            padding token, on the tokenizer and in the model config.

    Returns:
        ``(trainer, eval_results, pred_labels)``: the trainer, the dict
        returned by ``trainer.evaluate()``, and the predicted emotion names
        for ``dev_dataset``.
    """
    print(f"=== Fine-tuning {model_name} ===")

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    if set_pad_token_eos:
        tokenizer.pad_token = tokenizer.eos_token

    tokenized_train = tokenize_dataset(train_dataset, tokenizer)
    tokenized_dev = tokenize_dataset(dev_dataset, tokenizer)

    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=len(EMOTIONS),
        label2id=label2id,
        id2label=id2label,
        ignore_mismatched_sizes=True,
    )

    if set_pad_token_eos:
        model.config.pad_token_id = tokenizer.eos_token_id

    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=num_epochs,
        learning_rate=learning_rate,
        per_device_train_batch_size=train_bs,
        per_device_eval_batch_size=eval_bs,
        weight_decay=0.01,
        eval_strategy="epoch",
        logging_steps=16,
        log_level="error",
        report_to="none",
        save_strategy="epoch",
        dataloader_pin_memory=False,
    )

    # NOTE(review): the recorded cell output shows a warning pointing at this
    # Trainer(...) call, presumably about the `tokenizer=` argument; confirm
    # against the installed transformers version.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_dev,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
    )

    trainer.train()

    eval_results = trainer.evaluate()
    print(f"{model_name} dev results:", eval_results)

    pred_output = trainer.predict(tokenized_dev)
    pred_ids = np.argmax(pred_output.predictions, axis=-1)
    pred_labels = [id2label[int(class_id)] for class_id in pred_ids]
    # Take the gold labels from the dataset that was actually evaluated rather
    # than from a notebook-level variable, so the report is correct for any
    # `dev_dataset` passed in.
    gold_labels = [id2label[int(class_id)] for class_id in pred_output.label_ids]

    print(f"{model_name} classification report:")
    print_results(gold_labels, pred_labels)

    return trainer, eval_results, pred_labels

## DistilGPT2

In [15]:
# Fine-tune distilgpt2 for 5 epochs. set_pad_token_eos=True makes the helper
# reuse the tokenizer's EOS token as the padding token (on the tokenizer and
# in the model config).
gpt2_trainer, gpt2_results, gpt2_pred_labels = train_and_eval_transformer(
    model_name="distilgpt2",
    train_dataset=train_ds,
    dev_dataset=dev_ds,
    output_dir="./distilgpt2_output",
    num_epochs=5,
    learning_rate=5e-5,
    train_bs=4,
    eval_bs=4,
    set_pad_token_eos=True,
)

=== Fine-tuning distilgpt2 ===


Map: 100%|██████████| 400/400 [00:00<00:00, 841.22 examples/s]
Map: 100%|██████████| 377/377 [00:00<00:00, 1382.43 examples/s]
  trainer = Trainer(


{'loss': 2.0607, 'grad_norm': 30.447505950927734, 'learning_rate': 4.85e-05, 'epoch': 0.16}
{'loss': 1.7591, 'grad_norm': 70.75331115722656, 'learning_rate': 4.69e-05, 'epoch': 0.32}
{'loss': 1.7095, 'grad_norm': 41.877601623535156, 'learning_rate': 4.53e-05, 'epoch': 0.48}
{'loss': 1.4226, 'grad_norm': 28.678590774536133, 'learning_rate': 4.3700000000000005e-05, 'epoch': 0.64}
{'loss': 1.3951, 'grad_norm': 38.81130599975586, 'learning_rate': 4.21e-05, 'epoch': 0.8}
{'loss': 1.466, 'grad_norm': 61.95573043823242, 'learning_rate': 4.05e-05, 'epoch': 0.96}
{'eval_loss': 1.3817614316940308, 'eval_precision': 0.4059052922244166, 'eval_recall': 0.3381907749637573, 'eval_f1': 0.25614746543778805, 'eval_accuracy': 0.2917771883289125, 'eval_runtime': 179.307, 'eval_samples_per_second': 2.103, 'eval_steps_per_second': 0.53, 'epoch': 1.0}
{'loss': 1.2712, 'grad_norm': 22.22853660583496, 'learning_rate': 3.8900000000000004e-05, 'epoch': 1.12}
{'loss': 1.278, 'grad_norm': 40.40504837036133, 'learn

## DistilBERT

In [9]:
# Fine-tune distilbert-base-uncased for 3 epochs. set_pad_token_eos=False
# leaves the tokenizer's own padding token in place.
distilbert_trainer, distilbert_results, distilbert_pred_labels = train_and_eval_transformer(
    model_name="distilbert/distilbert-base-uncased",
    train_dataset=train_ds,
    dev_dataset=dev_ds,
    output_dir="./distilbert_musicmood",
    num_epochs=3,
    learning_rate=5e-5,
    train_bs=4,
    eval_bs=4,
    set_pad_token_eos=False,
)

=== Fine-tuning distilbert/distilbert-base-uncased ===


Map: 100%|██████████| 400/400 [00:00<00:00, 973.00 examples/s] 
Map: 100%|██████████| 377/377 [00:00<00:00, 1314.74 examples/s]
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,1.4289,1.381346,0.283878,0.356637,0.241124,0.307692
2,1.1695,1.169916,0.451267,0.484077,0.421109,0.453581
3,1.011,1.109788,0.527291,0.546339,0.533452,0.522546


distilbert/distilbert-base-uncased dev results: {'eval_loss': 1.109787940979004, 'eval_precision': 0.52729114236282, 'eval_recall': 0.5463388486071568, 'eval_f1': 0.5334521378486706, 'eval_accuracy': 0.5225464190981433, 'eval_runtime': 86.7383, 'eval_samples_per_second': 4.346, 'eval_steps_per_second': 1.095, 'epoch': 3.0}
distilbert/distilbert-base-uncased classification report:
=== Overall (Macro Avg) ===
Precision: 0.52729114236282
Recall: 0.5463388486071568
F1: 0.5334521378486706
Accuracy: 0.5225464190981433

=== Per Emotion (Class) Metrics ===
Angry:
  Precision: 0.6896551724137931
  Recall:    0.8450704225352113
  F1:        0.759493670886076
Happy:
  Precision: 0.5555555555555556
  Recall:    0.4716981132075472
  F1:        0.5102040816326531
Relaxed:
  Precision: 0.4594594594594595
  Recall:    0.504950495049505
  F1:        0.4811320754716981
Sad:
  Precision: 0.4044943820224719
  Recall:    0.36363636363636365
  F1:        0.3829787234042553

