In [0]:
# NOTE(review): this cell is only a bare string literal; running it evaluates
# (and echoes) the string but does not execute any other notebook.
# Presumably it was meant to be `%run "./utils/configuration"` — confirm the
# relative path before restoring the magic.
"./utils/configuration"

In [0]:
# %run "../utils/common_functions"

In [0]:
# Load the raw artist JSON files from the bronze path into a DataFrame.
# `spark` is the session object supplied by the notebook runtime.
bronze_df = spark.read.format("json").load("/mnt/musicstg/bronze/artists_full/")

In [0]:
# Print the schema of bronze_df so the nested fields used by the
# transformation below (artist_id, albums, ...) can be checked by eye.
bronze_df.printSchema()

In [0]:
from pyspark.sql.functions import explode

# Build albums_silver: one row per distinct album found in bronze_df.

# Step 1 - fan each artist row out into one row per entry of its `albums`
# column (assumed to be an array of album structs - confirm against the
# printed schema), keeping the owning artist_id next to each album.
artist_album_pairs = bronze_df.select("artist_id", explode("albums").alias("album"))

# Step 2 - lift the album fields of interest out of the nested struct.
# Selecting by dotted path names each output column after its leaf field,
# so e.g. "album.external_urls.spotify" becomes a column called `spotify`.
album_field_paths = [
    "album.id",
    "album.name",
    "album.release_date",
    "album.total_tracks",
    "album.album_type",
    "album.external_urls.spotify",
    "album.images",
]
albums_flat = artist_album_pairs.select("artist_id", *album_field_paths)

# Step 3 - give the album key an unambiguous name, then keep a single row
# per album_id.
# NOTE(review): if the same album appears under several artists, nothing here
# controls which artist_id is kept for it - confirm that is acceptable.
albums_silver = albums_flat.withColumnRenamed("id", "album_id").dropDuplicates(["album_id"])

In [0]:
# Render albums_silver for visual inspection before it is written out.
# `display` is not defined in this file; presumably it is the notebook
# runtime's rich-display helper.
display(albums_silver)

In [0]:
# Persist albums_silver as Parquet at the silver path; "overwrite" mode
# replaces whatever data already exists at that location.
albums_silver.write.format("parquet").mode("overwrite").save("/mnt/musicstg/silver/albums")