In [2]:
#Import necessary libraries
import os
import csv
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import wordnet as wn

In [3]:
# Root folder that holds one sub-folder per artist, and the average
# similarity a song pair must exceed to be written to the output CSV
base_dir: str = "pipeline/plain-lyrics-output"
threshold: float = 0.18

In [4]:
# Match POS tags to WordNet format
def get_pos(tag):
    """Translate a POS tag into the matching WordNet part-of-speech constant.

    Only the first letter of the tag is inspected: 'J' maps to adjective,
    'V' to verb, 'N' to noun and 'R' to adverb.

    Args:
        tag: Part-of-speech tag string, as produced by ``pos_tag``.

    Returns:
        ``wn.ADJ``, ``wn.VERB``, ``wn.NOUN`` or ``wn.ADV``; ``None`` when the
        tag starts with none of the four letters.
    """
    prefix_to_attr = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, attr in prefix_to_attr:
        if tag.startswith(prefix):
            # Look the constant up only on a match, like the original branches
            return getattr(wn, attr)
    return None

# Grab the title from the first line of the file
def get_title(path):
    """Return the song title declared on the first line of a lyrics file.

    The first line is expected to look like ``Title: <name>``; the prefix is
    matched case-insensitively. When the prefix is absent, the file's base
    name is returned instead.

    Args:
        path: Path to a lyrics text file.

    Returns:
        The text after the first colon of the title line with surrounding
        whitespace removed, or ``os.path.basename(path)`` as a fallback.
    """
    # 'utf-8-sig' reads plain UTF-8 unchanged but drops a leading byte-order
    # mark, which would otherwise sit in front of "title:" and make the
    # prefix check fail.
    with open(path, 'r', encoding='utf-8-sig') as f:
        first = f.readline().strip()
    if first.lower().startswith("title:"):
        return first.split(":", 1)[1].strip()
    return os.path.basename(path)

# Pull out the main synsets from lyrics (skips title line)
def get_synsets(path):
    """Collect one WordNet synset per recognised word of a lyrics file.

    The text is lower-cased, tokenized and POS-tagged. For every token whose
    tag maps to a WordNet part of speech (see ``get_pos``), the first synset
    returned by ``wn.synsets`` is kept. Tokens with no mapped tag or no
    synsets are skipped.

    Args:
        path: Path to a lyrics text file, optionally starting with a
            ``Title: <name>`` header line.

    Returns:
        A list of synsets in token order; repeated words give repeated
        entries.
    """
    # 'utf-8-sig' drops a leading byte-order mark so it cannot hide the
    # "title:" prefix checked below.
    with open(path, 'r', encoding='utf-8-sig') as f:
        lines = f.readlines()

    # Drop the first line only when it is the title header. Files without a
    # header are a valid case (the title then falls back to the file name),
    # and their first line is a lyric that must be kept.
    if lines and lines[0].strip().lower().startswith("title:"):
        lines = lines[1:]

    text = " ".join(lines).lower()
    tagged = pos_tag(word_tokenize(text))

    syns = []
    for word, tag in tagged:
        wn_tag = get_pos(tag)
        if wn_tag:
            candidates = wn.synsets(word, pos=wn_tag)
            if candidates:
                syns.append(candidates[0])
    return syns

# Average similarity between two songs
def compare(s1, s2):
    """Return the mean path similarity over every synset pair of two songs.

    Each synset in ``s1`` is scored against each synset in ``s2`` with
    ``path_similarity``. Falsy scores (such as ``None``) are left out of the
    average.

    Args:
        s1: Synsets of the first song.
        s2: Synsets of the second song.

    Returns:
        The average of the kept scores rounded to three decimals, or ``0.0``
        when no pair produced a usable score.
    """
    pair_scores = [left.path_similarity(right) for left in s1 for right in s2]
    usable = [score for score in pair_scores if score]
    if not usable:
        return 0.0
    return round(sum(usable) / len(usable), 3)

# Get all songs for one artist
def load_songs(artist):
    """Load the synsets of every song found under one artist's folder.

    The artist folder ``base_dir/<artist>`` is expected to contain one
    sub-folder per album, each holding ``.txt`` lyric files. Entries of the
    artist folder that are not directories are ignored, as are album entries
    that do not end in ``.txt``.

    Args:
        artist: Name of the artist's folder inside ``base_dir``.

    Returns:
        A dict mapping ``"<artist> - <title>"`` to the list of synsets
        returned by ``get_synsets`` for that song. Songs that resolve to the
        same label overwrite each other; the last one visited wins.
    """
    songs = {}
    artist_path = os.path.join(base_dir, artist)
    # os.listdir returns entries in arbitrary order; sorting makes the dict
    # order (and the duplicate-label winner) the same on every run.
    for album in sorted(os.listdir(artist_path)):
        album_path = os.path.join(artist_path, album)
        if not os.path.isdir(album_path):
            continue  # skip stray files such as .DS_Store
        for file in sorted(os.listdir(album_path)):
            if not file.endswith(".txt"):
                continue
            full = os.path.join(album_path, file)
            label = f"{artist} - {get_title(full)}"
            songs[label] = get_synsets(full)
    return songs

# Get songs
greta = load_songs("Greta")
zep = load_songs("Zeppelin")

# Score every Greta/Zeppelin song pair and record the ones whose average
# similarity is strictly above the threshold as rows of the output CSV
with open("song_similarity_network.csv", "w", newline='', encoding="utf-8") as out_file:
    edge_writer = csv.writer(out_file)
    edge_writer.writerow(["Source", "Target", "Similarity"])

    for g_label, g_syns in greta.items():
        for z_label, z_syns in zep.items():
            score = compare(g_syns, z_syns)
            # Debug: print every score, including those below the threshold
            print(f"{g_label} vs {z_label} = {score}")
            if score > threshold:
                edge_writer.writerow([g_label, z_label, score])

Greta - Frozen Light vs Zeppelin - Ramble On = 0.15
Greta - Frozen Light vs Zeppelin - Living Loving Maid (She's Just a Woman) = 0.139
Greta - Frozen Light vs Zeppelin - Thank You = 0.16
Greta - Frozen Light vs Zeppelin - Heartbreaker = 0.153
Greta - Frozen Light vs Zeppelin - Whole Lotta Love = 0.146
Greta - Frozen Light vs Zeppelin - What Is and What Should Never Be = 0.153
Greta - Frozen Light vs Zeppelin - The Lemon Song = 0.14
Greta - Frozen Light vs Zeppelin - Moby Dick = 0.132
Greta - Frozen Light vs Zeppelin - Bring It On Home = 0.149
Greta - Frozen Light vs Zeppelin - Communication Breakdown = 0.157
Greta - Frozen Light vs Zeppelin - Black Mountain Side = 0.127
Greta - Frozen Light vs Zeppelin - Dazed and Confused = 0.157
Greta - Frozen Light vs Zeppelin - Your Time Is Gonna Come = 0.16
Greta - Frozen Light vs Zeppelin - Good Times Bad Times = 0.158
Greta - Frozen Light vs Zeppelin - Babe I'm Gonna Leave You = 0.156
Greta - Frozen Light vs Zeppelin - You Shook Me = 0.143
Greta