In [0]:
# NOTE(review): this cell contains only a bare string literal, so it evaluates
# to that string and executes nothing. Presumably a `%run` of the configuration
# notebook was intended (compare the commented-out `%run` in the next cell) --
# TODO confirm.
"./utils/configuration"

In [0]:
 # %run "../utils/common_functions"

In [0]:
# Load the JSON files under the bronze artists_full directory into a DataFrame.
# No schema is supplied, so Spark infers it from the data.
bronze_df = spark.read.format("json").load("/mnt/musicstg/bronze/artists_full/")

In [0]:
# Print the schema of bronze_df in tree form so its fields can be inspected
# before the tracks are flattened.
bronze_df.printSchema()

In [0]:
from pyspark.sql.functions import explode

# Build the silver-layer tracks DataFrame: one flattened row per track, taken
# from the `tracks` array carried by each record in bronze_df.
# NOTE(review): dropDuplicates(["track_id"]) keeps one arbitrary row per
# track_id, so if the same track appears under several artists the surviving
# artist_id is not guaranteed to be stable between runs -- confirm this is
# acceptable.
tracks_silver = (
    bronze_df
    # One output row per element of `tracks`, paired with its artist_id.
    .select("artist_id", explode("tracks").alias("track"))
    # Pull the wanted fields out of the `track` struct. Each resulting column
    # is named after the last segment of its path (e.g. `spotify`).
    .select(
        "artist_id",
        *[
            f"track.{field}"
            for field in (
                "id",
                "name",
                "track_number",
                "disc_number",
                "duration_ms",
                "explicit",
                "external_urls.spotify",
                "album_id",
                "album_name",
                "album_release_date",
            )
        ],
    )
    # Give the track's `id` column an unambiguous name.
    .withColumnRenamed("id", "track_id")
    # Keep a single row for each distinct track_id.
    .dropDuplicates(["track_id"])
)

In [0]:
# Render tracks_silver in the notebook for a visual check before it is written out.
display(tracks_silver)

In [0]:
# Persist tracks_silver as Parquet, replacing any existing output at this path.
tracks_silver.write.parquet("/mnt/musicstg/silver/tracks", mode="overwrite")