In [6]:
import os
import re

# --- Tokenizer
word_tokenize_pattern = re.compile(r"(?u)\b\w\w+\b")


def word_tokenize(s):
    """Split ``s`` into lowercased word tokens.

    A token is a run of two or more word characters delimited by word
    boundaries, so single-character words are dropped.

    Args:
        s: Text to tokenize.

    Returns:
        list[str]: Tokens in order of appearance, each lowercased.
    """
    tokens = []
    for match in word_tokenize_pattern.finditer(s):
        tokens.append(match.group(0).lower())
    return tokens


# --- Metrics / print_results 
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

def print_results(gold_labels, predicted_labels):
    """Print macro-averaged precision, recall, F1 and the accuracy.

    Args:
        gold_labels: Reference labels.
        predicted_labels: Predicted labels, aligned with ``gold_labels``.

    Returns:
        None. The four scores are printed one per line, followed by a blank line.
    """
    precision, recall, f1, _support = precision_recall_fscore_support(
        gold_labels,
        predicted_labels,
        average='macro',
        zero_division=0,  # a class with no predictions scores 0 instead of warning
    )
    accuracy = accuracy_score(gold_labels, predicted_labels)

    report = (
        ("Precision: ", precision),
        ("Recall: ", recall),
        ("F1: ", f1),
        ("Accuracy: ", accuracy),
    )
    for heading, score in report:
        print(heading, score)
    print()



# Root folder of the dataset, given relative to the working directory.
# get_lyrics_and_labels() reads from DATASET_DIR/<emotion>/<split>.
DATASET_DIR = "NJU_MusicMood_v1.0"   
# Emotion classes. Each name is both a sub-folder of DATASET_DIR and the string
# label attached to its songs; the list order also fixes the integer ids that
# later cells derive with enumerate(EMOTIONS).
EMOTIONS = ["Angry", "Happy", "Relaxed", "Sad"]  

# Matches bracketed time tags such as "[01:23]" or "[01:23.45]".
timestamp_pattern = re.compile(r"\[\d{2}:\d{2}(?:\.\d{2})?\]")

def clean_lyrics(text: str) -> str:
    """Remove time tags and blank lines from raw lyrics text.

    Args:
        text: Raw lyrics, possibly containing ``[mm:ss]`` / ``[mm:ss.xx]`` tags.

    Returns:
        The remaining lines, each stripped of surrounding whitespace and
        joined with ``"\\n"``; lines that end up empty are dropped.
    """
    without_tags = timestamp_pattern.sub("", text)
    kept_lines = []
    for raw_line in without_tags.splitlines():
        stripped = raw_line.strip()
        if stripped:
            kept_lines.append(stripped)
    return "\n".join(kept_lines)

def get_lyrics(path):
    """Read the UTF-8 text file at ``path`` and return it cleaned.

    Args:
        path: Location of the lyrics file.

    Returns:
        str: The file contents after ``clean_lyrics`` has been applied.
    """
    with open(path, mode="r", encoding="utf-8") as lyrics_file:
        return clean_lyrics(lyrics_file.read())

        
# --- get Lyrics and emotions
def get_lyrics_and_labels(split: str):
    """Load the cleaned lyrics and emotion labels for one dataset split.

    For every emotion in ``EMOTIONS`` this reads each ``.txt`` file found in
    ``DATASET_DIR/<emotion>/<split>`` except ``info.txt``. Files whose cleaned
    text is empty are skipped.

    Args:
        split: Name of the split sub-folder, e.g. ``"Train"`` or ``"Test"``.

    Returns:
        tuple[list[str], list[str]]: ``(texts, labels)`` of equal length, where
        ``labels[i]`` is the emotion folder that ``texts[i]`` was read from.
        Examples are grouped by emotion in ``EMOTIONS`` order and sorted by
        file name within each emotion.
    """
    import warnings  # local import keeps this cell independent of other import cells

    texts, labels = [], []
    for emotion in EMOTIONS:
        folder = os.path.join(DATASET_DIR, emotion, split)
        if not os.path.isdir(folder):
            # Still best-effort, but no longer silent: a wrong DATASET_DIR or
            # split name would otherwise just yield an empty dataset.
            warnings.warn(f"Split folder not found, skipping: {folder}")
            continue
        # os.listdir() returns names in arbitrary order; sorting makes the
        # example order reproducible across machines and runs.
        for fname in sorted(os.listdir(folder)):
            if not fname.endswith(".txt"):
                continue
            if fname.lower() == "info.txt":
                # not a lyrics file
                continue
            txt = get_lyrics(os.path.join(folder, fname))
            if txt.strip():
                texts.append(txt)
                labels.append(emotion)
    return texts, labels

# Training and testing
# Load both splits. The dataset's "Test" folder is what the rest of the
# notebook calls the dev set.
train_texts, train_labels = get_lyrics_and_labels("Train")
dev_texts, dev_labels     = get_lyrics_and_labels("Test")

# Sanity checks. The loader skips missing folders, so a wrong DATASET_DIR would
# produce two empty (and trivially "aligned") splits; fail here with a clear
# message instead of later inside the model code.
assert train_texts, "no training lyrics found - check DATASET_DIR and the 'Train' folders"
assert dev_texts, "no dev lyrics found - check DATASET_DIR and the 'Test' folders"
assert len(train_texts) == len(train_labels), "train texts and labels are misaligned"
assert len(dev_texts) == len(dev_labels), "dev texts and labels are misaligned"

# --- Copy paste from a2
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

# Bag-of-words baseline: token counts, using word_tokenize as the analyzer.
count_vectorizer = CountVectorizer(analyzer=word_tokenize)

# The vocabulary is fitted on the training texts only; the dev texts are
# transformed with that same fitted vectorizer.
train_counts = count_vectorizer.fit_transform(train_texts)
dev_counts   = count_vectorizer.transform(dev_texts)

# Logistic regression on the count features, with a fixed random_state.
lr = LogisticRegression(max_iter=500, random_state=0)
lr_classifier = lr.fit(train_counts, train_labels)

lr_dev_predictions = lr_classifier.predict(dev_counts)

# Print macro precision/recall/F1 and accuracy on the dev split
print_results(dev_labels, lr_dev_predictions)

Precision:  0.3826099694894021
Recall:  0.3761659284450901
F1:  0.3761515331158733
Accuracy:  0.3660477453580902



In [7]:
import gensim.downloader
model = gensim.downloader.load('word2vec-google-news-300')

def vec_for_doc(tokenized_doc, embeddings=None):
    """Return the average word embedding of a tokenized document.

    Args:
        tokenized_doc: Iterable of token strings.
        embeddings: Object exposing ``vector_size``, ``key_to_index`` and
            ``embeddings[token]`` lookup. Defaults to the notebook-global
            ``model`` as bound at call time. Later cells rebind ``model`` to a
            transformer classifier, so pass the word vectors explicitly when
            calling this function after those cells have run.

    Returns:
        list[float]: ``vector_size`` values holding the element-wise mean of
        the vectors of all tokens present in ``key_to_index``; all zeros when
        no token is in the vocabulary.
    """
    import numpy as np  # local import: this cell has no numpy import of its own

    if embeddings is None:
        embeddings = model  # notebook-global word vectors

    found_vectors = [
        embeddings[token]
        for token in tokenized_doc
        if token in embeddings.key_to_index
    ]
    if not found_vectors:
        return [0.0] * embeddings.vector_size

    # One vectorized mean instead of a Python loop over every component;
    # accumulate in float64 and hand back plain Python floats.
    stacked = np.asarray(found_vectors, dtype=np.float64)
    return stacked.mean(axis=0).tolist()

In [8]:
word_tokenize_pattern = re.compile(r"(?u)\b\w\w+\b")
def word_tokenize(s, apply_case_folding=True):
    """Split ``s`` into word tokens of two or more word characters.

    Args:
        s: Text to tokenize.
        apply_case_folding: When true (the default) every token is lowercased;
            when false the tokens keep their original casing.

    Returns:
        list[str]: Tokens in order of appearance.
    """
    tokens = word_tokenize_pattern.findall(s)
    if apply_case_folding:
        return [token.lower() for token in tokens]
    return tokens

# One averaged word-embedding vector per document.
# NOTE(review): vec_for_doc reads the notebook-global `model`; a later cell
# rebinds that name to a transformer classifier, so run this cell before it.
train_vecs = [vec_for_doc(word_tokenize(x)) for x in train_texts]
dev_vecs = [vec_for_doc(word_tokenize(x)) for x in dev_texts]

# Train logistic regression, same as A2 (same settings as the bag-of-words
# baseline; only the features differ)
lr = LogisticRegression(max_iter=500,
                        random_state=0)
clf = lr.fit(train_vecs, train_labels)
dev_predictions = clf.predict(dev_vecs)

# Macro precision/recall/F1 and accuracy on the dev split.
print_results(dev_labels, dev_predictions)

Precision:  0.4596175291565
Recall:  0.47568620468212114
F1:  0.45246239781207337
Accuracy:  0.4509283819628647



In [9]:
# Benchmark 1 - Fine-Tune DistilGPT-2 for Emotion
from datasets import Dataset

# Map each emotion name to its position in EMOTIONS, and back.
label2id = {label: i for i, label in enumerate(EMOTIONS)}
id2label = {i: label for label, i in label2id.items()}

# Wrap the splits as datasets with a "text" column and an integer "label" column.
train_ds = Dataset.from_dict({
    "text": train_texts,
    "label": [label2id[l] for l in train_labels]
})

dev_ds = Dataset.from_dict({
    "text": dev_texts,
    "label": [label2id[l] for l in dev_labels]
})


In [10]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Checkpoint used for both the tokenizer and the classifier.
model_name = "distilgpt2"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token for padding.
# presumably this tokenizer defines no pad token of its own - TODO confirm
tokenizer.pad_token = tokenizer.eos_token

# Sequence-classification model with one output per emotion.
# NOTE(review): this rebinds the notebook-global `model`, which an earlier cell
# bound to the word2vec vectors that vec_for_doc reads. Re-running the
# word2vec cells after this one requires reloading those vectors first.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(EMOTIONS),
    label2id=label2id,
    id2label=id2label
)
# Keep the model's pad id in sync with the tokenizer's padding token.
model.config.pad_token_id = tokenizer.eos_token_id


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [11]:
def tokenize(batch):
    """Tokenize the ``"text"`` entries of a batch with the global ``tokenizer``.

    Every example is truncated and padded to exactly 256 tokens.

    Args:
        batch: Mapping with a ``"text"`` entry holding the texts to encode.

    Returns:
        Whatever ``tokenizer`` returns for those texts.
    """
    tokenizer_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 256,  # you can try 512 later if you want
    }
    return tokenizer(batch["text"], **tokenizer_options)

# Tokenize both splits in batches.
# NOTE(review): this cell overwrites train_ds/dev_ds and then drops "text", so
# it is presumably not re-runnable without rebuilding the datasets first.
train_ds = train_ds.map(tokenize, batched=True)
dev_ds   = dev_ds.map(tokenize, batched=True)

# The raw strings are no longer needed once the token columns exist.
train_ds = train_ds.remove_columns(["text"])
dev_ds   = dev_ds.remove_columns(["text"])

# Have the datasets return torch tensors.
train_ds.set_format(type="torch")
dev_ds.set_format(type="torch")


Map: 100%|██████████| 400/400 [00:00<00:00, 1029.35 examples/s]
Map: 100%|██████████| 377/377 [00:00<00:00, 1003.58 examples/s]


In [12]:
from transformers import TrainingArguments, Trainer
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

def compute_metrics(eval_pred):
    """Score a ``(logits, labels)`` pair for the Trainer.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class of each
            example is the argmax of ``logits`` over its last axis.

    Returns:
        dict: Macro-averaged ``"precision"``, ``"recall"`` and ``"f1"``, plus
        ``"accuracy"``.
    """
    logits, labels = eval_pred
    predicted_ids = logits.argmax(axis=-1)

    precision, recall, f1, _support = precision_recall_fscore_support(
        labels, predicted_ids, average="macro", zero_division=0
    )
    metrics = {"precision": precision, "recall": recall, "f1": f1}
    metrics["accuracy"] = accuracy_score(labels, predicted_ids)
    return metrics

# Hyperparameters for fine-tuning DistilGPT-2.
training_args = TrainingArguments(
    output_dir="./distilgpt2_output",  # checkpoints are written here
    # Per-epoch evaluation is currently switched off:
    # evaluation_strategy="epoch",
    save_strategy="epoch",  # save a checkpoint after every epoch
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_steps=10,  # log the training loss every 10 steps
    report_to="none",  # disable wandb etc
    dataloader_pin_memory=False

)


In [13]:
# Fine-tune the DistilGPT-2 classifier on the training split; the dev split is
# registered for evaluation and scored with compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=dev_ds,
    compute_metrics=compute_metrics
)

# Long-running step: runs the full num_train_epochs configured above.
trainer.train()


{'loss': 2.6383, 'grad_norm': 25.089111328125, 'learning_rate': 4.91e-05, 'epoch': 0.1}
{'loss': 1.8106, 'grad_norm': 17.266050338745117, 'learning_rate': 4.8100000000000004e-05, 'epoch': 0.2}
{'loss': 1.3926, 'grad_norm': 18.691471099853516, 'learning_rate': 4.71e-05, 'epoch': 0.3}
{'loss': 1.4673, 'grad_norm': 32.9845085144043, 'learning_rate': 4.61e-05, 'epoch': 0.4}
{'loss': 1.4723, 'grad_norm': 29.77958869934082, 'learning_rate': 4.5100000000000005e-05, 'epoch': 0.5}
{'loss': 1.5243, 'grad_norm': 24.03281021118164, 'learning_rate': 4.41e-05, 'epoch': 0.6}
{'loss': 1.5421, 'grad_norm': 17.158782958984375, 'learning_rate': 4.3100000000000004e-05, 'epoch': 0.7}
{'loss': 1.4777, 'grad_norm': 7.565113067626953, 'learning_rate': 4.21e-05, 'epoch': 0.8}
{'loss': 1.4395, 'grad_norm': 13.969622611999512, 'learning_rate': 4.11e-05, 'epoch': 0.9}
{'loss': 1.3929, 'grad_norm': 21.69252586364746, 'learning_rate': 4.0100000000000006e-05, 'epoch': 1.0}


KeyboardInterrupt: 

In [18]:
# Evaluate on the dataset registered as eval_dataset (the dev split).
# NOTE(review): the saved output of this cell reports epoch 5.0, while the
# saved training cell shows a KeyboardInterrupt and the execution counts jump
# from 13 to 18 - confirm the numbers with Restart & Run All.
results = trainer.evaluate()
print("DistilGPT-2 dev results:", results)

# Predict the dev split again to get per-example logits.
raw_pred = trainer.predict(dev_ds)
logits = raw_pred.predictions
pred_ids = logits.argmax(axis=-1)  # index of the highest logit per example

# Map class ids back to emotion names so print_results compares strings to strings.
distilgpt2_dev_predictions = [id2label[i] for i in pred_ids]
distilgpt2_dev_gold = dev_labels  # already string labels

print("DistilGPT-2 classification report:")
print_results(distilgpt2_dev_gold, distilgpt2_dev_predictions)


DistilGPT-2 dev results: {'eval_loss': 3.6030869483947754, 'eval_precision': 0.43744864493461966, 'eval_recall': 0.4627188303470262, 'eval_f1': 0.44662185643877167, 'eval_accuracy': 0.4376657824933687, 'eval_runtime': 12.1209, 'eval_samples_per_second': 31.103, 'eval_steps_per_second': 7.838, 'epoch': 5.0}
DistilGPT-2 classification report:
Precision:  0.43744864493461966
Recall:  0.4627188303470262
F1:  0.44662185643877167
Accuracy:  0.4376657824933687



In [None]:
# Setup (run once, outside the notebook):
#   pip install "transformers[torch]" datasets torch hf_xet
# Reference tutorial: https://www.geeksforgeeks.org/nlp/distilbert-in-natural-language-processing/


from transformers import AutoTokenizer
from datasets import Dataset
import pandas as pd

# Checkpoint used for both the DistilBERT tokenizer and the classifier below.
MODEL_NAME = ("distilbert/distilbert-base-uncased-finetuned-sst-2-english")


# NOTE(review): this rebinds the notebook-global `tokenizer` that the earlier
# tokenize() helper reads, so tokenize() uses this tokenizer from here on.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Map each emotion name to its position in EMOTIONS, and back.
EMOTION2ID = {e: i for i, e in enumerate(EMOTIONS)}
ID2EMOTION = {i: e for e, i in EMOTION2ID.items()}



# Two-column frames: the lyrics text and its integer emotion id.
train_df = pd.DataFrame({
    'text': train_texts,
    'labels': [EMOTION2ID[l] for l in train_labels]
})

# The evaluation frame is built from the dev split (the dataset's "Test" folder).
test_df = pd.DataFrame({
    'text': dev_texts,
    'labels': [EMOTION2ID[l] for l in dev_labels]
})

# Convert the frames to datasets for tokenization and training.
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

def preprocess_function(examples):
    """Tokenize a batch of texts with the global ``tokenizer`` and attach its labels.

    Texts are truncated to at most 256 tokens and padded with ``padding=True``.

    Args:
        examples: Mapping with ``'text'`` and ``'labels'`` entries.

    Returns:
        The tokenizer output for ``examples['text']`` with ``examples['labels']``
        stored under ``'labels'``.
    """
    batch_texts = examples['text']
    tokenized = tokenizer(
        batch_texts,
        max_length=256,  # try 512
        padding=True,
        truncation=True,
    )
    tokenized['labels'] = examples['labels']
    return tokenized


# Apply preprocess_function to both datasets in batches.
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_test = test_dataset.map(preprocess_function, batched=True)


from transformers import AutoModelForSequenceClassification


# Load DistilBERT with a classification head sized for the emotion classes.
# The checkpoint was fine-tuned on SST-2, so its saved head has 2 outputs while
# this task needs len(EMOTIONS). Without ignore_mismatched_sizes=True,
# from_pretrained raises a size-mismatch error; with it, the mismatched head is
# freshly initialized and learned during fine-tuning.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(EMOTIONS),
    id2label=ID2EMOTION,
    label2id=EMOTION2ID,
    ignore_mismatched_sizes=True,
)

from transformers import Trainer, TrainingArguments

# Hyperparameters for fine-tuning DistilBERT.
training_args = TrainingArguments(
    # Dedicated local output folder, mirroring "./distilgpt2_output" above.
    # Using MODEL_NAME here would create a local directory named like the Hub
    # repo id, which later from_pretrained(MODEL_NAME) calls would pick up as
    # a local path instead of the Hub checkpoint.
    output_dir="./distilbert_output",
    num_train_epochs=3,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    eval_strategy="epoch",  # evaluate on the eval dataset after every epoch
    disable_tqdm=False,
    logging_steps=16,
    log_level="error",
    report_to="none"
)

# Fine-tune DistilBERT on the tokenized training set and evaluate on the
# tokenized dev set. compute_metrics is passed, as for the DistilGPT-2 trainer,
# so the per-epoch evaluations and trainer.evaluate() report macro
# precision/recall/F1 and accuracy instead of the loss alone.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Long-running step: runs the full num_train_epochs configured above.
trainer.train()


# Evaluate on the dataset registered as eval_dataset (tokenized_test).
results = trainer.evaluate()
print(f"Evaluation Results: {results}")


import numpy as np

# Get predictions on the tokenized test set (built from dev_texts)
pred_output = trainer.predict(tokenized_test)

# Raw logits from the model
logits = pred_output.predictions   

# Predicted class IDs (0,1,2,3): index of the highest logit per example
pred_ids = np.argmax(logits, axis=-1)

# Convert IDs back to emotion strings so they can be compared with dev_labels
distilbert_pred_labels = [ID2EMOTION[i] for i in pred_ids]

# dev_labels holds the gold emotion names for the same texts, in the same order.
print("=== DistilBERT on Test Set ===")
print_results(dev_labels, distilbert_pred_labels)