In [13]:

import os
import json
import pandas as pd


In [None]:
# Directory scanned for the raw per-genre "*.json" playlist dumps.
RAW_DIR = "../data_raw/spotify_top_songs"
# Destination CSV for the flattened track table.
OUTPUT_FILE = "../data_clean/spotify_top_songs_clean.csv"
# Create the directory OUTPUT_FILE actually lives in ("../data_clean").
# A bare "data_clean" would be created inside the current working directory
# instead, leaving the real target missing when the CSV is written.
os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True)

In [None]:
def load_json(filepath):
    """Read the UTF-8 text file at ``filepath`` and return its decoded JSON content."""
    with open(filepath, mode="r", encoding="utf-8") as handle:
        parsed = json.loads(handle.read())
    return parsed

def extract_track_info(item, genre):
    """Flatten one raw playlist entry into a flat dict of track metadata.

    Args:
        item: Raw playlist entry; the track object is read from its "track" key.
        genre: Label stored unchanged under the "genre" key of the result.

    Returns:
        A dict with the keys "id", "name", "artists" (artist names joined with
        ", "), "album", "release_date", "duration_ms", "popularity",
        "external_url" and "genre". Fields missing from the input are None
        ("artists" is an empty string when there are no artists).
        Returns None when ``item`` is not a dict or carries no track object.
    """
    track = item.get("track") if isinstance(item, dict) else None
    if not track:
        return None

    # A key that is present with a JSON null decodes to None, which bypasses
    # the default of dict.get(); fall back to an empty container explicitly.
    album = track.get("album") or {}
    external_urls = track.get("external_urls") or {}
    # Skip null artist entries and null names so the join cannot raise TypeError.
    artist_names = [
        artist.get("name")
        for artist in (track.get("artists") or [])
        if artist and artist.get("name") is not None
    ]

    return {
        "id": track.get("id"),
        "name": track.get("name"),
        "artists": ", ".join(artist_names),
        "album": album.get("name"),
        "release_date": album.get("release_date"),
        "duration_ms": track.get("duration_ms"),
        "popularity": track.get("popularity"),
        "external_url": external_urls.get("spotify"),
        "genre": genre
    }

In [None]:
# Upper bound on the number of unique tracks kept from each genre file.
MAX_TRACKS_PER_GENRE = 50

all_tracks = []

# os.listdir() returns entries in arbitrary order; sorting keeps the row order
# of the result stable from run to run.
for filename in sorted(os.listdir(RAW_DIR)):
    if not filename.endswith(".json"):
        continue

    # The genre label is the file name with its "_top_songs.json" suffix removed.
    genre_name = filename.replace("_top_songs.json", "")
    print(f"Processing {genre_name}...")

    raw_tracks = load_json(os.path.join(RAW_DIR, filename))

    # IDs are tracked per file: duplicates are dropped within a genre, while
    # the same track may still be kept under several genres.
    seen_ids = set()
    genre_tracks = []

    for item in raw_tracks:
        if len(genre_tracks) >= MAX_TRACKS_PER_GENRE:
            break

        track_info = extract_track_info(item, genre_name)
        # Skip entries without a track object or without a track ID.
        if not track_info or not track_info["id"]:
            continue

        if track_info["id"] in seen_ids:
            continue

        seen_ids.add(track_info["id"])
        genre_tracks.append(track_info)

    print(f"Kept {len(genre_tracks)} unique tracks from {genre_name}")
    all_tracks.extend(genre_tracks)


Processing Country...
Kept 50 unique tracks from Country
Processing Hip-Hop...
Kept 50 unique tracks from Hip-Hop
Processing Jazz...
Kept 50 unique tracks from Jazz
Processing Pop...
Kept 50 unique tracks from Pop
Processing Rock...
Kept 50 unique tracks from Rock


In [None]:
# Turn the collected track dicts into a table and write it out without the index column.
df = pd.DataFrame(all_tracks)
df.to_csv(OUTPUT_FILE, index=False, encoding="utf-8")
# Report what was written so a re-run reproduces the cell's recorded output.
print(f"Saved {len(df)} total tracks (50 max per playlist) to {OUTPUT_FILE}")



Saved 250 total tracks (50 max per playlist) to ../data_clean/spotify_top_songs_clean.csv
