- Pulls raw artist metadata from Spotify
- Adds ingestion timestamp
- Writes raw JSON to Bronze
- Creates a repeatable ingestion pattern
- Sets the foundation for Silver cleaning
This is exactly how a professional Databricks ingestion notebook should behave.


In [0]:
# NOTE(review): this cell holds only a bare string literal, so it evaluates the
# string and runs nothing. Presumably a `%run` of the configuration notebook was
# intended -- confirm, and confirm the path too, since the following cell uses
# "../utils/..." rather than "./utils/...".
"./utils/configuration"

In [0]:
%run "../utils/common_functions"

In [0]:
# Spotify API credentials are read from the "spotify-scope" Databricks secret
# scope instead of being hard-coded in the notebook.
# NOTE(review): the next cell reads the same scope under different key names
# ("spotify-api-client-id" / "spotify-api-client-secret"); confirm which pair
# of keys is the current one.
client_id = dbutils.secrets.get(scope="spotify-scope", key="spoti-client-id")
client_secret = dbutils.secrets.get(scope="spotify-scope", key="spoti-client-secret")

In [0]:
# Read the Spotify client ID and client secret from the "spotify-scope"
# Databricks secret scope.
# NOTE(review): the previous cell reads the same scope under the key names
# "spoti-client-id" / "spoti-client-secret"; confirm which pair is in use.
spotify_client_id = dbutils.secrets.get(
    scope="spotify-scope",
    key="spotify-api-client-id",
)
spotify_client_secret = dbutils.secrets.get(
    scope="spotify-scope",
    key="spotify-api-client-secret",
)

In [0]:
# Import the requests library for making HTTP requests to the Spotify API
import requests
# Import the base64 library for encoding client credentials
import base64
# Import the json library for working with JSON data
import json
# Import the datetime class for handling timestamps
from datetime import datetime


In [0]:
def spotify_get(url: str, token: str, params: dict = None, timeout: float = 30) -> dict:
    """Send an authenticated GET request and return the decoded JSON body.

    Args:
        url: Full URL of the endpoint to request.
        token: OAuth access token, sent as a Bearer token in the
            Authorization header.
        params: Optional query-string parameters.
        timeout: Seconds to wait for the server before giving up. requests
            applies no timeout by default, so without one a stalled
            connection would block the notebook indefinitely.

    Returns:
        The response body parsed from JSON.

    Raises:
        requests.exceptions.HTTPError: If the response status is 4xx/5xx.
        requests.exceptions.Timeout: If the server does not answer within
            ``timeout`` seconds.
    """
    headers = {"Authorization": f"Bearer {token}"}

    response = requests.get(url, headers=headers, params=params, timeout=timeout)

    # Surface unsuccessful status codes as exceptions instead of returning
    # an error payload to the caller.
    response.raise_for_status()

    return response.json()

In [0]:
def _spotify_get_all_items(url: str, token: str, params: dict = None) -> list:
    """Collect the ``items`` of every page of a paginated Spotify endpoint."""
    items = []
    while url:
        page = spotify_get(url, token, params=params)
        items.extend(page.get("items") or [])

        # A paging object carries the complete URL of the next page (query
        # string included) in "next", or null on the last page, so the
        # original params must not be sent again.
        url = page.get("next")
        params = None
    return items


def ingest_full_artist(artist_id: str, token: str) -> dict:
    """Collect an artist's metadata, albums, tracks, audio features and collaborators.

    All requests go through ``spotify_get`` using the single ``token`` passed
    in; the token is not refreshed here.

    Args:
        artist_id: Spotify ID of the artist to ingest.
        token: OAuth access token used for every request.

    Returns:
        A dict with the keys ``artist_id``, ``timestamp`` (ISO-8601 string of
        the ingestion time), ``artist``, ``albums``, ``tracks``,
        ``audio_features`` and ``collaborators``.

    Raises:
        Whatever ``spotify_get`` raises for the artist, album and track
        requests. Failures of audio-features batches are printed and skipped.
    """
    # 1. Main artist object.
    artist = spotify_get(f"https://api.spotify.com/v1/artists/{artist_id}", token)

    # 2. Every album, single, compilation and appearance. The endpoint returns
    #    at most 50 items per page, so follow the paging links to get them all.
    albums = _spotify_get_all_items(
        f"https://api.spotify.com/v1/artists/{artist_id}/albums",
        token,
        params={"limit": 50, "include_groups": "album,single,compilation,appears_on"},
    )

    # 3. Every track of every album, again following the paging links.
    all_tracks = []
    for album in albums:
        album_id = album["id"]

        tracks = _spotify_get_all_items(
            f"https://api.spotify.com/v1/albums/{album_id}/tracks",
            token,
            params={"limit": 50},
        )

        # Attach album context to each track so it is kept alongside the track.
        for t in tracks:
            t["album_id"] = album_id
            t["album_name"] = album.get("name")
            t["album_release_date"] = album.get("release_date")

        all_tracks.extend(tracks)

    # Tracks without an ID cannot be looked up for audio features.
    track_ids = [t["id"] for t in all_tracks if t.get("id")]

    # 4. Audio features, requested in batches of 100 track IDs.
    audio_features = []
    for i in range(0, len(track_ids), 100):
        batch = track_ids[i:i + 100]

        try:
            features = spotify_get(
                "https://api.spotify.com/v1/audio-features",
                token,
                params={"ids": ",".join(batch)},
            ).get("audio_features") or []
        except (requests.exceptions.RequestException, ValueError) as e:
            # Best effort: an HTTP error (e.g. 403), a network failure or an
            # undecodable body for one batch must not abort the ingestion.
            print(f"Skipping audio-features batch due to error: {e}")
            continue

        # Drop null entries returned for tracks that have no features.
        audio_features.extend(f for f in features if f)

    # 5. Artist objects listed on each track (duplicates are kept).
    collaborators = [a for t in all_tracks for a in t.get("artists", [])]

    return {
        "artist_id": artist_id,
        # Naive UTC timestamp (no offset suffix); kept in this form so the
        # format of the stored value does not change.
        "timestamp": datetime.utcnow().isoformat(),
        "artist": artist,
        "albums": albums,
        "tracks": all_tracks,
        "audio_features": audio_features,
        "collaborators": collaborators,
    }

In [0]:
# Spotify artist IDs of the Ghanaian artists to ingest, one entry per artist.
# Append further IDs at the end of the list to extend the ingestion.
ghana_artist_ids = [
    "01DTVE3KmoPogPZaOvMqO8",  # Sarkodie
    "2ayt5jDUuTCpoTG7sHSvuq",  # Stonebwoy
    "42q0rYXtR561ypg1Fcw1PI",  # Shatta Wale
    "1zO1FWFxxNUCqUuGATxZQZ",  # Gyakie
    "4tIKaxUmpXzshok2yCnwdf",  # King Promise
    "0pPz4oYqGp2Co2Sx7ORiYL",  # Medikal
    "2LiqbH7OhqP0yuaG8VL1wJ",  # Black Sherif
    "3WDXKsCKcxJhvrvpdg5IGI",  # Lasmid
    "5yrRN6GxtTSHbcw7qxPg4S",  # Wendy Shay
    "1dlInrJwE0KSP9hZ0ALsI6",  # Efya
    "0GGKrcPOlBkmBzQDf2Ogkl",  # Kuami Eugene
    "14PimM6ohO2gYftuwTam9V",  # KiDi
    "21UPYSRWFKwtqvSAnFnSvS",  # Amaarae
    "2hVWBpjLW4Q7fboYz2pVYK",  # Moliy
    "6TQW172m1l4Tf4Hp55ZdDm",  # Kofi Kinaata
    "52iM1kP5BpnLypZ0VtrpyY",  # Kwesi Arthur
]

In [0]:
# dbutils.fs.put("/mnt/musicstg/bronze/artists_full/test.json", '{"ok": true}', overwrite=True)

In [0]:
import json
import uuid

# Base DBFS directory that receives one raw JSON file per ingested artist.
bronze_path = "/mnt/musicstg/bronze/artists_full/"

for artist_id in ghana_artist_ids:
    # `token` is not defined in this cell; it must already exist in the
    # notebook session when this cell runs.
    record = ingest_full_artist(artist_id, token)

    # bronze_path already ends with "/", so strip it before joining; otherwise
    # the resulting path contains "artists_full//<file>".
    # The random UUID suffix gives every run its own file, so earlier
    # ingestions of the same artist are never replaced.
    file_path = f"{bronze_path.rstrip('/')}/{artist_id}_{uuid.uuid4().hex}.json"

    # Write the record as a single JSON document.
    dbutils.fs.put(file_path, json.dumps(record), overwrite=True)

print("Bronze ingestion complete.")

In [0]:
# List the entries currently under the Bronze artists_full directory.
display(dbutils.fs.ls("/mnt/musicstg/bronze/artists_full/"))

In [0]:
# Read every JSON file under the Bronze directory into a Spark DataFrame.
# The path must be the directory the ingestion cell writes to and the listing
# cell inspects (/mnt/musicstg/...); reading from /mnt/spotify/... would look
# at a different mount than the one just populated.
bronze_df = spark.read.json("/mnt/musicstg/bronze/artists_full/")

# Render the DataFrame as a table in the notebook.
display(bronze_df)