In [1]:
#!/usr/bin/env python
# coding: utf-8

import os
import sys
import glob
import json
import numpy as np
import pandas as pd
from pprint import pprint

# Resolve the project root as the parent of the current working directory
# (assumes the notebook is launched from a sub-folder of the project -- TODO confirm).
working_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(working_dir, ".."))
src_path = os.path.join(project_root, "src")

# Put <project_root>/src at the front of the import path, once, so that the
# project's own modules (e.g. ``config``) can be imported below.
if sys.path.count(src_path) == 0:
    sys.path.insert(0, src_path)

from config import (
    PROCESSED_FEATURES_DIR, EMB_DISCOGS_MODEL_JSON, PROJECT_ROOT

)


# Create an output directory for our final descriptor files inside the apps folder.
output_dir = os.path.join(PROJECT_ROOT, "apps", "playlist_data")
os.makedirs(output_dir, exist_ok=True)
print("Output directory:", output_dir)


# Use the processed features directory from config.
processed_features_dir = PROCESSED_FEATURES_DIR

# Path to the discogs-effnet metadata JSON file (expected to list the genre classes).
discogs_metadata_file = EMB_DISCOGS_MODEL_JSON

# Number of genre classes the metadata file is expected to declare.
EXPECTED_NUM_GENRE_CLASSES = 400

# JSON text is UTF-8; state it explicitly instead of relying on the platform's
# default text encoding, which differs between operating systems.
with open(discogs_metadata_file, "r", encoding="utf-8") as f:
    discogs_metadata = json.load(f)

# A missing "classes" entry yields an empty list, which triggers the warning below.
genre_classes = discogs_metadata.get("classes", [])
if len(genre_classes) != EXPECTED_NUM_GENRE_CLASSES:
    print("Warning: Expected", EXPECTED_NUM_GENRE_CLASSES, "genre classes, but found", len(genre_classes))

def predict_genre(activation_vector, classes):
    """
    Return the genre label with the highest activation.

    Args:
        activation_vector: One-dimensional sequence (list, tuple or NumPy
            array) holding one activation per entry in ``classes``.
        classes: Sequence of genre labels, index-aligned with
            ``activation_vector``.

    Returns:
        The label at the position of the maximum activation, or ``None`` when
        ``activation_vector`` is ``None``, empty, not one-dimensional, or its
        length differs from ``len(classes)``.
    """
    # Compare against None explicitly: truth-testing a NumPy array with more
    # than one element raises ValueError.
    if activation_vector is None:
        return None

    n_classes = len(classes)
    if n_classes == 0 or len(activation_vector) != n_classes:
        return None

    # np.argmax flattens multi-dimensional input, so a 2-D input whose first
    # axis happens to match len(classes) would produce an out-of-range or
    # meaningless index. Reject anything that is not a flat vector.
    if np.ndim(activation_vector) != 1:
        return None

    return classes[int(np.argmax(activation_vector))]

def _nested_field(record, section, field):
    """Return ``record[section][field]`` when ``record[section]`` is a dict, else ``None``."""
    section_value = record.get(section)
    if isinstance(section_value, dict):
        return section_value.get(field)
    return None


def _build_descriptor_row(data, classes):
    """
    Flatten one processed-features record into a single descriptor row.

    Args:
        data: Parsed JSON object (dict) describing one track.
        classes: Genre labels used to decode ``"genre_activations"`` when the
            record has no usable ``"predicted_genre"``.

    Returns:
        dict with the descriptor fields. Values missing from ``data`` are
        ``None``, except ``"file"`` which defaults to an empty string.
    """
    row = {
        # File path (as stored in the JSON)
        "file": data.get("file", ""),
        # Tempo (global BPM)
        "tempo_global_bpm": data.get("tempo_global_bpm"),
    }

    # Voice/Instrumental: take the predicted class when the entry is a dict;
    # any other stored value is passed through unchanged.
    vi = data.get("voice_instrumental")
    row["voice_instrumental"] = vi.get("predicted_class") if isinstance(vi, dict) else vi

    # Danceability (signal-based)
    row["danceability_signal"] = _nested_field(data, "danceability_signal", "danceability")

    # Danceability (classifier): keep the first entry of the stored list
    # (presumably the probability for "danceable" -- verify against the extractor).
    dc_vals = _nested_field(data, "danceability_classifier", "danceability_classifier")
    row["danceability_classifier"] = dc_vals[0] if isinstance(dc_vals, list) and dc_vals else None

    # Emotion: arousal and valence
    row["arousal"] = _nested_field(data, "arousal_valence", "arousal")
    row["valence"] = _nested_field(data, "arousal_valence", "valence")

    # Key and scale (read from the "temperley_*" entries)
    row["key"] = data.get("temperley_key")
    row["scale"] = data.get("temperley_scale")

    # Genre: prefer a stored "predicted_genre"; when it is absent, null or
    # empty, fall back to decoding "genre_activations".
    genre = data.get("predicted_genre")
    if not genre:
        acts = data.get("genre_activations")
        genre = predict_genre(acts, classes) if acts else None
    row["predicted_genre"] = genre

    # Parent genre: the part before the first '---'. Only string labels can be
    # split; anything else gets no parent genre instead of raising.
    if isinstance(genre, str) and genre:
        row["parent_genre"] = genre.split("---")[0].strip()
    else:
        row["parent_genre"] = None

    return row


final_rows = []
json_files = glob.glob(os.path.join(processed_features_dir, "*.json"))
print(f"Found {len(json_files)} processed feature JSON files.")

for jf in json_files:
    # Best effort: an unreadable or malformed file is reported and skipped.
    try:
        with open(jf, "r", encoding="utf-8") as f:
            data = json.load(f)
    except Exception as e:
        print(f"Error reading {jf}: {e}")
        continue

    # Valid JSON is not necessarily an object; skip arrays/scalars rather than
    # aborting the whole run on the first ``.get`` call.
    if not isinstance(data, dict):
        print(f"Skipping {jf}: expected a JSON object but found {type(data).__name__}")
        continue

    # Do NOT include embeddings here.
    final_rows.append(_build_descriptor_row(data, genre_classes))

# Assemble the descriptor table (embeddings are intentionally left out); the
# column list fixes the order of the columns in the exported files.
descriptor_columns = [
    "file",
    "tempo_global_bpm",
    "voice_instrumental",
    "danceability_signal",
    "danceability_classifier",
    "arousal",
    "valence",
    "key",
    "scale",
    "parent_genre",
    "predicted_genre",
]
final_df = pd.DataFrame(final_rows, columns=descriptor_columns)

# Export the same table twice into the output directory: comma-separated and
# tab-separated, both without the index column.
csv_path = os.path.join(output_dir, "final_descriptors.csv")
tsv_path = os.path.join(output_dir, "final_descriptors.tsv")
for format_label, target_path, delimiter in (
    ("CSV", csv_path, ","),
    ("TSV", tsv_path, "\t"),
):
    final_df.to_csv(target_path, index=False, sep=delimiter)
    print(f"Final descriptors saved to {format_label}:", target_path)

# Show the first five rows as plain records for a quick sanity check.
print("Sample final descriptors:")
sample_records = final_df.head(5).to_dict(orient="records")
pprint(sample_records)


Output directory: /home/cepatinog/amplab/apps/playlist_data
Found 2100 processed feature JSON files.
Final descriptors saved to CSV: /home/cepatinog/amplab/apps/playlist_data/final_descriptors.csv
Final descriptors saved to TSV: /home/cepatinog/amplab/apps/playlist_data/final_descriptors.tsv
Sample final descriptors:
[{'arousal': 5.800262928009033,
  'danceability_classifier': 0.9986696243286133,
  'danceability_signal': 1.0441495180130005,
  'file': '/home/cepatinog/amplab/data/raw/audio_chunks/audio.003/3K/3KyMDgZ2FewZ6XyHuZWs0K.mp3',
  'key': 'F',
  'parent_genre': 'Hip Hop',
  'predicted_genre': 'Hip Hop---Hardcore Hip-Hop',
  'scale': 'major',
  'tempo_global_bpm': 186.0,
  'valence': 4.255828857421875,
  'voice_instrumental': 'voice'},
 {'arousal': 5.202495574951172,
  'danceability_classifier': 0.918759822845459,
  'danceability_signal': 1.1787617206573486,
  'file': '/home/cepatinog/amplab/data/raw/audio_chunks/audio.000/78/78TuGqFPPpC2rwCAewWeRW.mp3',
  'key': 'G',
  'parent_g

In [2]:
import os, glob, json
import pandas as pd

# Re-scan the same processed_features_dir to rebuild the json_files list.
json_files = glob.glob(os.path.join(processed_features_dir, "*.json"))


def _embedding_as_json(embedding):
    """Serialise an embedding to a JSON string; a missing embedding (``None``) stays ``None``."""
    return json.dumps(embedding) if embedding is not None else None


embedding_rows = []
for jf in json_files:
    # Best effort: an unreadable or malformed file is reported and skipped.
    try:
        with open(jf, "r", encoding="utf-8") as f:
            data = json.load(f)
    except Exception as e:
        print(f"Error reading {jf}: {e}")
        continue

    # Valid JSON is not necessarily an object; skip arrays/scalars rather than
    # aborting the whole run on the first ``.get`` call.
    if not isinstance(data, dict):
        print(f"Skipping {jf}: expected a JSON object but found {type(data).__name__}")
        continue

    # Each embedding is stored as a JSON string so it fits in a single CSV/TSV cell.
    row = {
        "file": data.get("file", ""),
        "emb_discogs": _embedding_as_json(data.get("emb_discogs")),
        "emb_msd": _embedding_as_json(data.get("emb_msd")),
    }
    embedding_rows.append(row)

# Collect both embeddings in one table, keyed by the track's file path.
embedding_columns = ["file", "emb_discogs", "emb_msd"]
embeddings_df = pd.DataFrame(embedding_rows, columns=embedding_columns)

# Export the table twice into the output directory: comma-separated and
# tab-separated, both without the index column.
emb_csv_path = os.path.join(output_dir, "final_embeddings.csv")
emb_tsv_path = os.path.join(output_dir, "final_embeddings.tsv")
for format_label, target_path, delimiter in (
    ("CSV", emb_csv_path, ","),
    ("TSV", emb_tsv_path, "\t"),
):
    embeddings_df.to_csv(target_path, index=False, sep=delimiter)
    print(f"Embeddings saved to {format_label}:", target_path)



Embeddings saved to CSV: /home/cepatinog/amplab/apps/playlist_data/final_embeddings.csv
Embeddings saved to TSV: /home/cepatinog/amplab/apps/playlist_data/final_embeddings.tsv
