# In [2]:
import os
import re

# --- Tokenizer ---
# Matches runs of two or more word characters delimited by word boundaries,
# so single-character tokens are never produced.
word_tokenize_pattern = re.compile(r"(?u)\b\w\w+\b")


def word_tokenize(s):
    """Split ``s`` into lowercase word tokens.

    Args:
        s: Text to tokenize.

    Returns:
        A list with every match of ``word_tokenize_pattern`` in ``s``, in
        order of appearance, each converted to lowercase.
    """
    return [
        match.group(0).lower()
        for match in word_tokenize_pattern.finditer(s)
    ]


# --- Metrics / print_results ---
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

def print_results(gold_labels, predicted_labels):
    """Print macro-averaged precision, recall and F1, plus accuracy.

    Args:
        gold_labels: Reference labels.
        predicted_labels: Labels produced by the classifier, aligned with
            ``gold_labels``.

    The four scores are written to stdout one per line, followed by a blank
    line. Nothing is returned.
    """
    # zero_division=0 scores a class with no predicted/true samples as 0
    # instead of emitting a warning.
    precision, recall, f1, _support = precision_recall_fscore_support(
        gold_labels,
        predicted_labels,
        average='macro',
        zero_division=0,
    )
    accuracy = accuracy_score(gold_labels, predicted_labels)

    report = (
        ("Precision: ", precision),
        ("Recall: ", recall),
        ("F1: ", f1),
        ("Accuracy: ", accuracy),
    )
    for caption, value in report:
        print(caption, value)
    print()



# Root directory of the dataset; lyrics files are looked up under
# <DATASET_DIR>/<emotion>/<split>/.
DATASET_DIR = "NJU_MusicMood_v1.0"

# Emotion classes. Each name is used both as a sub-folder of DATASET_DIR and
# as the label attached to the lyrics found there.
EMOTIONS = [
    "Angry",
    "Happy",
    "Relaxed",
    "Sad",
]


# --- Get Lyrics
def get_lyrics(path):
    """Return the full contents of the file at ``path``, decoded as UTF-8.

    Args:
        path: Location of the lyrics file to read.

    Returns:
        The file's text as a single string.
    """
    with open(path, mode="r", encoding="utf-8") as lyrics_file:
        contents = lyrics_file.read()
    return contents
        
# --- Get lyrics and their emotion labels
def get_lyrics_and_labels(split: str):
    """Load the lyrics of one dataset split together with their emotion labels.

    For every emotion in ``EMOTIONS`` the folder
    ``DATASET_DIR/<emotion>/<split>`` is scanned for ``.txt`` files. The file
    ``info.txt`` (in any letter case) is skipped, as are files whose content
    is empty or whitespace-only. Emotion folders that do not exist are
    skipped silently.

    Args:
        split: Name of the split sub-folder, e.g. ``"Train"`` or ``"Test"``.

    Returns:
        A ``(texts, labels)`` tuple of two equal-length lists: the raw lyrics
        strings and, at the same index, the emotion each one was found under.
        Items are ordered by emotion (in ``EMOTIONS`` order) and then by
        file name.
    """
    texts, labels = [], []
    for emotion in EMOTIONS:
        folder = os.path.join(DATASET_DIR, emotion, split)
        if not os.path.isdir(folder):
            continue
        # os.listdir() returns entries in arbitrary, filesystem-dependent
        # order; sorting keeps the sample order reproducible across runs
        # and machines.
        for fname in sorted(os.listdir(folder)):
            if not fname.endswith(".txt") or fname.lower() == "info.txt":
                continue
            txt = get_lyrics(os.path.join(folder, fname))
            if txt.strip():
                texts.append(txt)
                labels.append(emotion)
    return texts, labels

# Load the "Train" split for fitting and the "Test" split for evaluation.
# NOTE(review): the evaluation variables are named dev_* but are read from
# the "Test" folder — confirm this is the intended split.
train_texts, train_labels = get_lyrics_and_labels("Train")
dev_texts, dev_labels     = get_lyrics_and_labels("Test")

# Sanity checks: each split must have exactly one label per text.
assert len(train_texts) == len(train_labels)
assert len(dev_texts) == len(dev_labels)

# --- Bag-of-words + logistic regression baseline (copied from a2)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

# A callable analyzer makes CountVectorizer use word_tokenize to turn each
# raw document into its list of tokens.
count_vectorizer = CountVectorizer(analyzer=word_tokenize)

# The vocabulary is fitted on the training texts only; the evaluation texts
# are transformed with that same fitted vectorizer.
train_counts = count_vectorizer.fit_transform(train_texts)
dev_counts   = count_vectorizer.transform(dev_texts)

# Train the classifier on the training counts and emotion labels.
lr = LogisticRegression(max_iter=500, random_state=0)
lr_classifier = lr.fit(train_counts, train_labels)

# Predict an emotion label for every evaluation document.
lr_dev_predictions = lr_classifier.predict(dev_counts)

# Print macro-averaged precision/recall/F1 and accuracy on the evaluation split.
print_results(dev_labels, lr_dev_predictions)


# Output of the cell above (recorded run):
# Precision:  0.40099782663599426
# Recall:  0.39362557369210216
# F1:  0.39549680513806074
# Accuracy:  0.3925729442970822

