## Imports

In [18]:
## Imports
import os
import re
import numpy as np
import pandas as pd

from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

import gensim.downloader
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)

## Tokenizer

In [2]:
word_tokenize_pattern = re.compile(r"(?u)\b\w\w+\b")


def word_tokenize(s: str):
    """Return the lowercase word tokens of ``s``.

    A token is a run of two or more word characters delimited by word
    boundaries, so single-character words are dropped.
    """
    tokens = []
    for match in word_tokenize_pattern.finditer(s):
        tokens.append(match.group().lower())
    return tokens

## Metrics

In [63]:

# Class names; print_results indexes its per-class metric arrays by position in this list.
EMOTIONS = ["Angry", "Happy", "Relaxed", "Sad"]
# Accumulates one overall-metrics dict per model name across print_results calls.
metrics_result = []
# Accumulates per-emotion metric dicts across print_results calls.
emotions_metric_result = []





def print_results(gold_labels, predicted_labels, model_name="", display=True):
    """Record metrics for one model and optionally print the comparison tables.

    Computes macro-averaged precision/recall/F1 plus accuracy, and per-class
    precision/recall/F1, then upserts them into the module-level
    ``metrics_result`` and ``emotions_metric_result`` lists keyed by
    ``model_name`` (and emotion). Calling again with the same ``model_name``
    replaces that model's rows instead of appending duplicates.

    Args:
        gold_labels: True labels.
        predicted_labels: Predicted labels, aligned with ``gold_labels``.
        model_name: Key under which the results are stored and shown.
        display: When True, print the accumulated tables for every model
            recorded so far; when False, only record the results.
    """
    # Overall (macro-averaged) metrics.
    p, r, f, _ = precision_recall_fscore_support(
        gold_labels, predicted_labels, average="macro", zero_division=0
    )
    acc = accuracy_score(gold_labels, predicted_labels)

    result = {"Model": model_name, "Precision": p, "Recall": r, "F1": f, "Accuracy": acc}

    index_found = next(
        (idx for idx, d in enumerate(metrics_result) if d.get("Model") == model_name),
        None,
    )
    if index_found is None:
        metrics_result.append(result)
    else:
        metrics_result[index_found] = result

    if display:
        print("=== Overall (Macro Avg) ===")
        print(pd.DataFrame(metrics_result).to_string(index=False))
        print()

    # Per-emotion metrics.
    # NOTE(review): the arrays below are indexed by position in EMOTIONS; this
    # presumes the labels seen in gold/predicted cover every emotion and sort
    # into the same order as EMOTIONS - confirm if the label set ever changes.
    p_i, r_i, f_i, _ = precision_recall_fscore_support(
        gold_labels, predicted_labels, average=None, zero_division=0
    )
    for i, emotion in enumerate(EMOTIONS):
        emotion_result = {
            "Model": model_name,
            "Emotion": emotion,
            "Precision": p_i[i],
            "Recall": r_i[i],
            "F1": f_i[i],
        }
        # Match on BOTH model and emotion. The previous lookup evaluated
        # d.get("Emotion" == emotion), i.e. d.get(False), which never matched,
        # so repeated calls appended duplicate rows.
        emotion_index_found = next(
            (
                idx
                for idx, d in enumerate(emotions_metric_result)
                if d.get("Model") == model_name and d.get("Emotion") == emotion
            ),
            None,
        )
        if emotion_index_found is None:
            emotions_metric_result.append(emotion_result)
        else:
            emotions_metric_result[emotion_index_found] = emotion_result

    if display:
        print("=== Per Emotion (Class) Metrics ===")
        # Build the table once, after all rows for this call are in place.
        df_emotions = pd.DataFrame(emotions_metric_result)
        for emotion in EMOTIONS:
            print(f"{emotion}:")
            print(df_emotions[df_emotions["Emotion"] == emotion].to_string(index=False))
            print()
    
    
    
   


def compute_metrics(eval_pred):
    """Turn a ``(logits, labels)`` pair into macro-averaged metrics.

    The predicted class is the argmax over the last axis of the logits.
    Returns a dict with "precision", "recall", "f1" and "accuracy".
    """
    logits, labels = eval_pred
    predicted_ids = logits.argmax(axis=-1)

    precision, recall, f1, _support = precision_recall_fscore_support(
        labels, predicted_ids, average="macro", zero_division=0
    )
    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "accuracy": accuracy_score(labels, predicted_ids),
    }




## Dataset

In [4]:
DATASET_DIR = "NJU_MusicMood_v1.0"

# Integer class ids follow the position of each label in EMOTIONS;
# id2label is the inverse mapping of label2id.
label2id = dict(zip(EMOTIONS, range(len(EMOTIONS))))
id2label = dict(zip(label2id.values(), label2id.keys()))

timestamp_pattern = re.compile(r"\[\d{2}:\d{2}(?:\.\d{2})?\]")


def clean_lyrics(text: str) -> str:
    """Normalize raw lyric text into a single lowercase line.

    Steps, in order: drop timestamps such as ``[00:29]``, lowercase, map curly
    quotes to straight ones, blank out runs of dots and underscores, remove a
    trailing "end" marker, then keep only ``a-z``, ``0-9``, apostrophes and
    single spaces. The result has no leading or trailing whitespace.
    """
    curly_to_straight = str.maketrans({"’": "'", "“": '"', "”": '"'})
    lowered = timestamp_pattern.sub("", text).lower().translate(curly_to_straight)

    # Ellipses / repeated dots, then long underscores, each become one space.
    for filler in (r"\.{2,}", r"_{2,}"):
        lowered = re.sub(filler, " ", lowered)

    # A closing "end" marker is only recognized at the very end of the text.
    body = re.sub(r"\bend[.\s]*$", "", lowered.strip())

    # Flatten line breaks, then replace everything outside the allowed set.
    body = body.replace("\n", " ")
    body = re.sub(r"[^a-z0-9' ]+", " ", body)

    return re.sub(r"\s+", " ", body).strip()



def get_lyrics(path: str) -> str:
    """Read the UTF-8 file at ``path`` and return its cleaned lyric text."""
    with open(path, "r", encoding="utf-8") as handle:
        return clean_lyrics(handle.read())


def get_lyrics_and_labels(split: str):
    """Collect cleaned lyrics and their emotion labels for one dataset split.

    Walks ``DATASET_DIR/<emotion>/<split>`` for every emotion in ``EMOTIONS``
    and reads each ``.txt`` file except ``info.txt``. Files whose cleaned text
    is empty are skipped, as are emotion folders that do not exist.

    Args:
        split: Name of the split sub-folder (e.g. "Train" or "Test").

    Returns:
        A ``(texts, labels)`` pair of equal-length lists; ``labels`` holds the
        emotion name for each text.
    """
    texts, labels = [], []
    for emotion in EMOTIONS:
        folder = os.path.join(DATASET_DIR, emotion, split)
        if not os.path.isdir(folder):
            continue

        # os.listdir returns entries in arbitrary order; sort so the example
        # order (and anything seeded on it downstream) is reproducible.
        for fname in sorted(os.listdir(folder)):
            if not fname.endswith(".txt") or fname.lower() == "info.txt":
                continue

            txt = get_lyrics(os.path.join(folder, fname))
            if txt.strip():
                texts.append(txt)
                labels.append(emotion)
    return texts, labels


# Load data (the "Test" folder of the dataset is used as the dev split)
train_texts, train_labels = get_lyrics_and_labels("Train")
dev_texts, dev_labels = get_lyrics_and_labels("Test")

# get_lyrics_and_labels silently skips missing folders, so fail fast here if
# DATASET_DIR is wrong instead of training on an empty dataset later.
assert train_texts, f"No training lyrics found under {DATASET_DIR!r}"
assert dev_texts, f"No dev lyrics found under {DATASET_DIR!r}"
assert len(train_texts) == len(train_labels)
assert len(dev_texts) == len(dev_labels)

# Datasets (labels stored as integer ids via label2id)
train_ds = Dataset.from_dict(
    {"text": train_texts, "label": [label2id[l] for l in train_labels]}
)
dev_ds = Dataset.from_dict(
    {"text": dev_texts, "label": [label2id[l] for l in dev_labels]}
)


## Baseline: Bag of Words & Logistic Regression

In [64]:
print("=== Baseline: Bag of Words & Logistic Regression ===")

# Bag-of-words features: the vocabulary is fitted on the training lyrics only,
# then reused unchanged to vectorize the dev lyrics.
count_vectorizer = CountVectorizer(analyzer=word_tokenize)
train_counts = count_vectorizer.fit_transform(train_texts)
dev_counts = count_vectorizer.transform(dev_texts)

# Logistic regression trained directly on the string emotion labels.
lr_bow = LogisticRegression(max_iter=500, random_state=0)
lr_bow.fit(train_counts, train_labels)

# Score the dev split and record the metrics under the "BoW & LR" model name.
lr_bow_dev_predictions = lr_bow.predict(dev_counts)
print_results(dev_labels, lr_bow_dev_predictions, "BoW & LR")

=== Baseline: Bag of Words & Logistic Regression ===
=== Overall (Macro Avg) ===
   Model  Precision   Recall       F1  Accuracy
BoW & LR   0.380262 0.372645 0.373144  0.363395

=== Per Emotion (Class) Metrics ===
Angry:
   Model Emotion  Precision   Recall       F1
BoW & LR   Angry   0.492958 0.492958 0.492958

Happy:
   Model Emotion  Precision   Recall       F1
BoW & LR   Happy    0.45679 0.349057 0.395722

Relaxed:
   Model Emotion  Precision  Recall       F1
BoW & LR Relaxed   0.305344 0.39604 0.344828

Sad:
   Model Emotion  Precision   Recall       F1
BoW & LR     Sad   0.265957 0.252525 0.259067



## Baseline 2: Word2Vec & Logistic Regression

In [65]:
print("=== Word2Vec & Logistic Regression ===")

# Load the pretrained "word2vec-google-news-300" vectors via gensim's downloader.
w2v_model = gensim.downloader.load("word2vec-google-news-300")
# Dimensionality of each word vector; vec_for_doc uses it for its all-zero fallback.
VECTOR_SIZE = w2v_model.vector_size


def vec_for_doc(tokenized_doc):
    """Average the word2vec vectors of the in-vocabulary tokens of a document.

    Tokens missing from ``w2v_model`` are ignored. If no token is known, a
    float32 zero vector of length ``VECTOR_SIZE`` is returned instead.
    """
    known_tokens = [tok for tok in tokenized_doc if tok in w2v_model.key_to_index]
    if not known_tokens:
        return np.zeros(VECTOR_SIZE, dtype="float32")
    return np.mean([w2v_model[tok] for tok in known_tokens], axis=0)


# One averaged word vector per document, using the same tokenizer as the BoW baseline.
train_vecs = [vec_for_doc(word_tokenize(x)) for x in train_texts]
dev_vecs = [vec_for_doc(word_tokenize(x)) for x in dev_texts]

# Logistic regression on the averaged vectors, with the same settings as lr_bow.
lr_w2v = LogisticRegression(max_iter=500, random_state=0)
lr_w2v.fit(train_vecs, train_labels)

# Score the dev split and record the metrics under the "Word2Vec & LR" model name.
w2v_dev_predictions = lr_w2v.predict(dev_vecs)
print_results(dev_labels, w2v_dev_predictions, "Word2Vec & LR")

=== Word2Vec & Logistic Regression ===
=== Overall (Macro Avg) ===
        Model  Precision   Recall       F1  Accuracy
     BoW & LR   0.380262 0.372645 0.373144  0.363395
Word2Vec & LR   0.461446 0.478161 0.455341  0.453581

=== Per Emotion (Class) Metrics ===
Angry:
        Model Emotion  Precision   Recall       F1
     BoW & LR   Angry   0.492958 0.492958 0.492958
Word2Vec & LR   Angry   0.612903 0.802817 0.695122

Happy:
        Model Emotion  Precision   Recall       F1
     BoW & LR   Happy   0.456790 0.349057 0.395722
Word2Vec & LR   Happy   0.480392 0.462264 0.471154

Relaxed:
        Model Emotion  Precision   Recall       F1
     BoW & LR Relaxed   0.305344 0.396040 0.344828
Word2Vec & LR Relaxed   0.335821 0.445545 0.382979

Sad:
        Model Emotion  Precision   Recall       F1
     BoW & LR     Sad   0.265957 0.252525 0.259067
Word2Vec & LR     Sad   0.416667 0.202020 0.272109



## Split lyrics into start, middle, and end segments

In [66]:
def get_segment(text, segment="start", portion=0.3):
    """
    Extract a portion of the lyrics, measured in whitespace-separated tokens.

    Args:
        text: The lyrics text.
        segment: "start" for the first ``portion`` of tokens, "end" for the
            last ``portion`` of tokens, or "middle" for the fixed window from
            35% to 65% of the tokens (``portion`` is not used for "middle").
            Any other value returns ``text`` unchanged.
        portion: Fraction of tokens to keep for "start"/"end"; 0.3 means 30%.
            The resulting token count is clamped to ``[0, number of tokens]``.

    Returns:
        The selected tokens joined by single spaces, or "" when ``text`` has
        no tokens or the selected window is empty.
    """
    tokens = text.split()
    n = len(tokens)
    if n == 0:
        return ""

    if segment == "middle":
        return " ".join(tokens[int(n * 0.35):int(n * 0.65)])

    if segment not in ("start", "end"):
        return text

    # Number of tokens for start/end, kept within the valid range.
    cut = min(n, max(0, int(n * portion)))

    if segment == "start":
        return " ".join(tokens[:cut])

    # Slice from n - cut rather than -cut: tokens[-0:] is the WHOLE list, which
    # made short texts return everything for "end" but nothing for "start".
    return " ".join(tokens[n - cut:])


# Build the start / middle / end views of every dev lyric (default 30% portion).
dev_start_texts, dev_middle_texts, dev_end_texts = (
    [get_segment(lyric, position) for lyric in dev_texts]
    for position in ("start", "middle", "end")
)



## Evaluate BoW model on Segmented Lyrics

In [67]:
print("=== BoW Logistic Regression: Position-based Evaluation (NO retraining) ===")

# The already-fitted vectorizer and classifier are reused on each segment.
# Only the final call displays the tables, which then include every model so far.

# START segment
dev_start_counts = count_vectorizer.transform(dev_start_texts)
bow_start_preds = lr_bow.predict(dev_start_counts)
print_results(dev_labels, bow_start_preds, model_name="BoW Segmented_START", display=False)

# MIDDLE segment
dev_middle_counts = count_vectorizer.transform(dev_middle_texts)
bow_middle_preds = lr_bow.predict(dev_middle_counts)
print_results(dev_labels, bow_middle_preds, model_name="BoW Segmented_MIDDLE", display=False)

# END segment
dev_end_counts = count_vectorizer.transform(dev_end_texts)
bow_end_preds = lr_bow.predict(dev_end_counts)
print_results(dev_labels, bow_end_preds, model_name="BoW Segmented_END", display=True)


=== BoW Logistic Regression: Position-based Evaluation (NO retraining) ===
=== Overall (Macro Avg) ===
               Model  Precision   Recall       F1  Accuracy
            BoW & LR   0.380262 0.372645 0.373144  0.363395
       Word2Vec & LR   0.461446 0.478161 0.455341  0.453581
 BoW Segmented_START   0.414294 0.320851 0.275773  0.323607
BoW Segmented_MIDDLE   0.405777 0.314621 0.274071  0.318302
   BoW Segmented_END   0.341373 0.301432 0.271661  0.307692

=== Per Emotion (Class) Metrics ===
Angry:
               Model Emotion  Precision   Recall       F1
            BoW & LR   Angry   0.492958 0.492958 0.492958
       Word2Vec & LR   Angry   0.612903 0.802817 0.695122
 BoW Segmented_START   Angry   0.575758 0.267606 0.365385
BoW Segmented_MIDDLE   Angry   0.562500 0.253521 0.349515
   BoW Segmented_END   Angry   0.441176 0.211268 0.285714

Happy:
               Model Emotion  Precision   Recall       F1
            BoW & LR   Happy   0.456790 0.349057 0.395722
       Word2Vec & LR 

## Evaluate Word2Vec on Segmented Lyrics

In [68]:
print("=== Word2Vec Logistic Regression: Position-based Evaluation (NO retraining) ===")

# The already-fitted classifier is reused on averaged vectors of each segment.
# Only the final call displays the tables, which then include every model so far.

# START segment
dev_start_vecs = [vec_for_doc(word_tokenize(x)) for x in dev_start_texts]
w2v_start_preds = lr_w2v.predict(dev_start_vecs)
print_results(dev_labels, w2v_start_preds, model_name="W2V Segmented_START", display=False)

# MIDDLE segment
dev_middle_vecs = [vec_for_doc(word_tokenize(x)) for x in dev_middle_texts]
w2v_middle_preds = lr_w2v.predict(dev_middle_vecs)
print_results(dev_labels, w2v_middle_preds, model_name="W2V Segmented_MIDDLE", display=False)

# END segment
dev_end_vecs = [vec_for_doc(word_tokenize(x)) for x in dev_end_texts]
w2v_end_preds = lr_w2v.predict(dev_end_vecs)
print_results(dev_labels, w2v_end_preds, model_name="W2V Segmented_END", display=True)

=== Word2Vec Logistic Regression: Position-based Evaluation (NO retraining) ===
=== Overall (Macro Avg) ===
               Model  Precision   Recall       F1  Accuracy
            BoW & LR   0.380262 0.372645 0.373144  0.363395
       Word2Vec & LR   0.461446 0.478161 0.455341  0.453581
 BoW Segmented_START   0.414294 0.320851 0.275773  0.323607
BoW Segmented_MIDDLE   0.405777 0.314621 0.274071  0.318302
   BoW Segmented_END   0.341373 0.301432 0.271661  0.307692
 W2V Segmented_START   0.409452 0.430841 0.409294  0.411141
W2V Segmented_MIDDLE   0.427968 0.456468 0.428019  0.432361
   W2V Segmented_END   0.388004 0.411518 0.388873  0.389920

=== Per Emotion (Class) Metrics ===
Angry:
               Model Emotion  Precision   Recall       F1
            BoW & LR   Angry   0.492958 0.492958 0.492958
       Word2Vec & LR   Angry   0.612903 0.802817 0.695122
 BoW Segmented_START   Angry   0.575758 0.267606 0.365385
BoW Segmented_MIDDLE   Angry   0.562500 0.253521 0.349515
   BoW Segmented_E

## Helpers

In [69]:
def tokenize_dataset(dataset, tokenizer, max_length: int = 256):
    """Tokenize the "text" column of ``dataset`` and return it in torch format.

    Each batch of texts is truncated and padded to exactly ``max_length``.
    The raw "text" column is removed from the returned dataset.
    """
    def encode_batch(batch):
        return tokenizer(
            batch["text"],
            truncation=True,
            padding="max_length",
            max_length=max_length,
        )

    encoded = dataset.map(encode_batch, batched=True).remove_columns(["text"])
    encoded.set_format(type="torch")
    return encoded


def train_and_eval_transformer(
    model_name: str,
    train_dataset: Dataset,
    dev_dataset: Dataset,
    output_dir: str,
    num_epochs: int,
    learning_rate: float,
    train_bs: int,
    eval_bs: int,
    set_pad_token_eos: bool = False,
):
    """Fine-tune a sequence-classification transformer and evaluate it.

    Args:
        model_name: Checkpoint name passed to the ``Auto*`` loaders; also used
            as the model name under which ``print_results`` records metrics.
        train_dataset: Dataset with "text" and integer "label" columns.
        dev_dataset: Dataset with the same columns, used for evaluation.
        output_dir: Directory handed to ``TrainingArguments``.
        num_epochs: Number of training epochs.
        learning_rate: Learning rate.
        train_bs: Per-device training batch size.
        eval_bs: Per-device evaluation batch size.
        set_pad_token_eos: When True, use the tokenizer's EOS token as the
            padding token on both the tokenizer and the model config.

    Returns:
        ``(trainer, eval_results, pred_labels)`` where ``eval_results`` is the
        dict returned by ``trainer.evaluate()`` and ``pred_labels`` are the
        predicted emotion names for ``dev_dataset``.
    """
    print(f"=== Fine-tuning {model_name} ===")

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    if set_pad_token_eos:
        tokenizer.pad_token = tokenizer.eos_token

    tokenized_train = tokenize_dataset(train_dataset, tokenizer)
    tokenized_dev = tokenize_dataset(dev_dataset, tokenizer)

    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=len(EMOTIONS),
        label2id=label2id,
        id2label=id2label,
        ignore_mismatched_sizes=True,
    )

    if set_pad_token_eos:
        model.config.pad_token_id = tokenizer.eos_token_id

    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=num_epochs,
        learning_rate=learning_rate,
        per_device_train_batch_size=train_bs,
        per_device_eval_batch_size=eval_bs,
        weight_decay=0.01,
        eval_strategy="epoch",
        logging_steps=16,
        log_level="error",
        report_to="none",
        save_strategy="epoch",
        dataloader_pin_memory=False,
    )

    # NOTE(review): newer transformers releases appear to prefer
    # `processing_class=` over `tokenizer=` here - confirm against the
    # installed version before switching.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_dev,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
    )

    trainer.train()

    eval_results = trainer.evaluate()
    print(f"{model_name} dev results:", eval_results)

    pred_output = trainer.predict(tokenized_dev)
    logits = pred_output.predictions
    pred_ids = np.argmax(logits, axis=-1)
    pred_labels = [id2label[int(i)] for i in pred_ids]

    # Take the gold labels from the dataset that was actually evaluated rather
    # than the global dev_labels, so passing a different dev_dataset cannot
    # score predictions against the wrong labels.
    gold_labels = [id2label[int(i)] for i in dev_dataset["label"]]

    print(f"{model_name} classification report:")
    print_results(gold_labels, pred_labels, model_name)

    return trainer, eval_results, pred_labels

def eval_transformer_on_segments(model_name: str, trainer: Trainer):
    """
    Evaluate a fine-tuned transformer (trainer) on START/MIDDLE/END
    segments of the dev set, without retraining.

    Each segment is recorded under its own name,
    ``"<model_name> Segmented_<SEGMENT>"``, matching the naming used for the
    BoW and Word2Vec segment evaluations. The result tables are displayed once,
    after the last segment.

    Args:
        model_name: Display name of the model being evaluated.
        trainer: A trained ``Trainer`` whose tokenizer is reused for the
            segmented texts.
    """
    # Prefer `processing_class` where the Trainer provides it and fall back to
    # the `tokenizer` attribute otherwise.
    tokenizer = getattr(trainer, "processing_class", None)
    if tokenizer is None:
        tokenizer = trainer.tokenizer

    # The labels are identical for every segment; build the id list once.
    dev_label_ids = [label2id[l] for l in dev_labels]
    segments = ["start", "middle", "end"]

    for seg in segments:
        # Build segmented dev texts
        seg_texts = [get_segment(t, seg) for t in dev_texts]

        # Build a segmented dev Dataset with same labels
        seg_dev_ds = Dataset.from_dict({"text": seg_texts, "label": dev_label_ids})

        # Tokenize segmented dataset
        tokenized_seg_dev = tokenize_dataset(seg_dev_ds, tokenizer)

        # Predict
        pred_output = trainer.predict(tokenized_seg_dev)
        pred_ids = np.argmax(pred_output.predictions, axis=-1)
        pred_labels = [id2label[int(i)] for i in pred_ids]

        print(f"\n=== {model_name} on {seg.upper()} segment only ===")

        # A distinct name per segment: print_results replaces rows that share a
        # model name, so reusing `model_name` kept only the last segment (and
        # could overwrite the full-text result recorded under the same name).
        segment_model_name = f"{model_name} Segmented_{seg.upper()}"
        print_results(
            dev_labels,
            pred_labels,
            segment_model_name,
            seg == segments[-1],
        )


## DistilBERT

In [71]:
# Fine-tune distilbert-base-uncased on the full training lyrics for 3 epochs
# (batch size 4, learning rate 5e-5) and evaluate it on the dev split.
distilbert_trainer, distilbert_results, distilbert_pred_labels = train_and_eval_transformer(
    model_name="distilbert/distilbert-base-uncased",
    train_dataset=train_ds,
    dev_dataset=dev_ds,
    output_dir="./distilbert_musicmood",
    num_epochs=3,
    learning_rate=5e-5,
    train_bs=4,
    eval_bs=4,
    set_pad_token_eos=False,
)

=== Fine-tuning distilbert/distilbert-base-uncased ===


Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map:   0%|          | 0/377 [00:00<?, ? examples/s]

  trainer = Trainer(


{'loss': 1.4044, 'grad_norm': 2.96364426612854, 'learning_rate': 4.75e-05, 'epoch': 0.16}
{'loss': 1.4013, 'grad_norm': 4.3935956954956055, 'learning_rate': 4.483333333333333e-05, 'epoch': 0.32}
{'loss': 1.4317, 'grad_norm': 4.00533390045166, 'learning_rate': 4.216666666666667e-05, 'epoch': 0.48}
{'loss': 1.4107, 'grad_norm': 2.9911444187164307, 'learning_rate': 3.9500000000000005e-05, 'epoch': 0.64}
{'loss': 1.3998, 'grad_norm': 3.5910935401916504, 'learning_rate': 3.683333333333334e-05, 'epoch': 0.8}
{'loss': 1.4004, 'grad_norm': 2.920949935913086, 'learning_rate': 3.4166666666666666e-05, 'epoch': 0.96}
{'eval_loss': 1.387216567993164, 'eval_precision': 0.06697612732095491, 'eval_recall': 0.25, 'eval_f1': 0.10564853556485355, 'eval_accuracy': 0.26790450928381965, 'eval_runtime': 84.7773, 'eval_samples_per_second': 4.447, 'eval_steps_per_second': 1.121, 'epoch': 1.0}
{'loss': 1.404, 'grad_norm': 2.00425386428833, 'learning_rate': 3.15e-05, 'epoch': 1.12}
{'loss': 1.3717, 'grad_norm': 

## Segment Test

In [None]:
# Evaluate the fine-tuned DistilBERT trainer on the start/middle/end dev segments (no retraining).
eval_transformer_on_segments("DistilBERT", distilbert_trainer)

Map:   0%|          | 0/377 [00:00<?, ? examples/s]