In [7]:
%pip install -q tqdm polars

Note: you may need to restart the kernel to use updated packages.


In [6]:
import os
import json
import pickle
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

# Initialize dictionaries to store song details and playlist data
song_details = {}
playlist_songs = {}

# Directory containing JSON files
json_directory = "data"

# Function to process a single JSON file
def process_json_file(json_file):
    global song_details, playlist_songs

    # Read the JSON file
    with open(os.path.join(json_directory, json_file), 'r') as f:
        data = json.load(f)

        # Extract playlist information
        for playlist in data.get("playlists", []):
            playlist_name = playlist["name"]
            song_ids = set()  # Store song IDs for this playlist

            for track in playlist.get("tracks", []):
                song_id = track["track_uri"]

                # Store song details
                song_details[song_id] = {
                    "artist_name": track["artist_name"],
                    "track_name": track["track_name"],
                    "album_name": track["album_name"],
                    "duration_ms": track["duration_ms"],
                    "album_uri": track["album_uri"]
                }

                # Add the song ID to the playlist's set of song IDs
                song_ids.add(song_id)

            # Store playlist data (set of song IDs)
            playlist_songs[playlist_name] = song_ids

# Get list of JSON files in the directory
json_files = [f for f in os.listdir(json_directory) if f.endswith('.json')]

# Using ThreadPoolExecutor for multithreading with tqdm progress bar
with ThreadPoolExecutor(max_workers=4) as executor:
    # Wrap the executor's map with tqdm for a progress bar
    list(tqdm(executor.map(process_json_file, json_files), total=len(json_files), desc="Processing JSON files"))

# Saving the data structures using pickle for fast retrieval
with open('song_details.pkl', 'wb') as f:
    pickle.dump(song_details, f)

with open('playlist_songs.pkl', 'wb') as f:
    pickle.dump(playlist_songs, f)

# Example of retrieving song details
song_id = 'spotify:track:0UaMYEvWZi0ZqiDOoHU3YI'
if song_id in song_details:
    print(song_details[song_id])

# Example of retrieving song IDs in a playlist
playlist_name = 'Throwbacks'
if playlist_name in playlist_songs:
    print(playlist_songs[playlist_name])


Processing JSON files: 100%|██████████| 1000/1000 [04:05<00:00,  4.08it/s]


{'artist_name': 'Missy Elliott', 'track_name': 'Lose Control (feat. Ciara & Fat Man Scoop)', 'album_name': 'The Cookbook', 'duration_ms': 226863, 'album_uri': 'spotify:album:6vV5UrXcfyQD1wu4Qo2I9K'}
{'spotify:track:478eJVTkk34oWr1abVMqfv', 'spotify:track:3ZFTkvIE7kyPt6Nu3PEa7V', 'spotify:track:3njpLvANriMsdv3dgADEad', 'spotify:track:03tqyYWC9Um2ZqU0ZN849H', 'spotify:track:2zOWuWdyuMqOoboNRKlJO0', 'spotify:track:6o3s08kk2fQI37vxGZDrJ1', 'spotify:track:3r9bgSJlJz2zlevcBRYXko', 'spotify:track:3oDFtOhcN08qeDPAK6MEQG', 'spotify:track:6lV2MSQmRIkycDScNtrBXO', 'spotify:track:4E5P1XyAFtrjpiIxkydly4', 'spotify:track:4P6BuLsqtg5uISdE77ypI9', 'spotify:track:6e8lA4dJYNRNcPAUYX4QOk', 'spotify:track:5OiLJ8tjUPFiPX2gVM8fxJ', 'spotify:track:06iMqWThw4w8fTFyccvOwr', 'spotify:track:3CcvahnsiArpTHYQEWV2Au', 'spotify:track:3JILPb8Uc6NKmHq1poT5ML', 'spotify:track:3f7gYMirBEKuc57218BjOY', 'spotify:track:4XHQyvbrBsQaaBUW1VvmsL', 'spotify:track:6Gn02ZC8juXwQ10Xk7ACXx', 'spotify:track:5KY7zgFeH2GWoL1zP9mME6', 

In [8]:
import pickle
import polars as pl
from collections import Counter
from tqdm import tqdm

# Load song metadata and playlist data
with open("song_details.pkl", "rb") as f:
    song_details = pickle.load(f)

with open("playlist_songs.pkl", "rb") as f:
    playlist_songs = pickle.load(f)

# Count occurrences of each track_uri across all playlists
track_counter = Counter()

for playlist, song_ids in tqdm(playlist_songs.items(), desc="Counting tracks"):
    track_counter.update(song_ids)

# Convert to Polars DataFrame
count_df = pl.DataFrame([
    {"track_uri": track_uri, "count": count}
    for track_uri, count in track_counter.items()
])

# Prepare song metadata as DataFrame
meta_df = pl.DataFrame([
    {
        "track_uri": uri,
        "track_name": meta["track_name"],
        "artist_name": meta["artist_name"],
    }
    for uri, meta in song_details.items()
])

# Join count with metadata
result_df = (
    count_df
    .join(meta_df, on="track_uri", how="left")
    .sort("count", descending=True)
    .head(100)
)

# Display results
print(result_df)

# Optional: Save to CSV
result_df.write_csv("top_100_songs.csv")


Counting tracks: 100%|██████████| 92944/92944 [00:01<00:00, 62630.11it/s]


shape: (100, 4)
┌─────────────────────────────────┬───────┬─────────────────────────────┬───────────────────┐
│ track_uri                       ┆ count ┆ track_name                  ┆ artist_name       │
│ ---                             ┆ ---   ┆ ---                         ┆ ---               │
│ str                             ┆ i64   ┆ str                         ┆ str               │
╞═════════════════════════════════╪═══════╪═════════════════════════════╪═══════════════════╡
│ spotify:track:1xznGGDReH1oQq0x… ┆ 3605  ┆ One Dance                   ┆ Drake             │
│ spotify:track:7KXjTSCq5nL1LoYt… ┆ 3588  ┆ HUMBLE.                     ┆ Kendrick Lamar    │
│ spotify:track:7BKLCZ1jbUBVqRi2… ┆ 3471  ┆ Closer                      ┆ The Chainsmokers  │
│ spotify:track:7yyRTcZmCiyzzJlN… ┆ 3182  ┆ Broccoli (feat. Lil Yachty) ┆ DRAM              │
│ spotify:track:3a1lNhkSLSkpJE4M… ┆ 3181  ┆ Congratulations             ┆ Post Malone       │
│ …                               ┆ …     ┆ 