In [0]:
"./utils/configuration"

In [0]:
# %run "../utils/common_functions"

In [0]:
# Load the raw artist records from the bronze layer. No schema is passed to the
# reader, so the column types come from whatever Spark derives from the JSON.
bronze_df = (
    spark.read
    .format("json")
    .load("/mnt/musicstg/bronze/artists_full/")
)

In [0]:
# Review the data schema: print the column names, data types and nesting of
# bronze_df as an indented tree.

bronze_df.printSchema()

In [0]:
from pyspark.sql.functions import explode


def _flatten_collaborators(artists_df):
    """Return one row per distinct (artist_id, collaborator_id) pair.

    The ``collaborators`` array of ``artists_df`` is exploded into one row per
    element, the element's fields are lifted to top-level columns, ``id`` is
    renamed to ``collaborator_id`` and duplicate pairs are removed.

    Args:
        artists_df: DataFrame with an ``artist_id`` column and a
            ``collaborators`` array column (elements are presumably structs
            with ``id``, ``name``, ``type``, ``href``, ``uri`` and
            ``external_urls.spotify`` — verify against the printed schema).

    Returns:
        DataFrame with columns ``artist_id``, ``collaborator_id``, ``name``,
        ``type``, ``href``, ``uri`` and ``spotify``.
    """
    # One output row per array element, kept as the single column `collab`.
    # explode() emits nothing for rows whose array is null or empty.
    exploded = artists_df.select(
        "artist_id",
        explode("collaborators").alias("collab"),
    )

    # Lift the nested fields to top-level columns. Each selected column is named
    # after the last segment of its path, so `collab.external_urls.spotify`
    # comes out as `spotify` and `collab.id` as `id`.
    flattened = exploded.select(
        "artist_id",
        "collab.id",
        "collab.name",
        "collab.type",
        "collab.href",
        "collab.uri",
        "collab.external_urls.spotify",
    )

    # A plain `id` next to `artist_id` is ambiguous; name it for what it holds.
    renamed = flattened.withColumnRenamed("id", "collaborator_id")

    # Keep a single row per artist/collaborator pair. When duplicates differ in
    # their other columns, which row is kept is not controlled here.
    return renamed.dropDuplicates(["artist_id", "collaborator_id"])


# Silver-layer collaborators table derived from the bronze artists data.
collaborators_silver = _flatten_collaborators(bronze_df)

In [0]:
# Render collaborators_silver in the notebook output for a visual check before
# it is written out.
# NOTE(review): `display` is not defined or imported in this notebook —
# presumably the notebook environment's built-in; confirm.
display(collaborators_silver)

In [0]:
# Persist the collaborators table to the silver layer as Parquet. "overwrite"
# replaces any data already at the target path, so re-running the notebook
# does not append duplicates.
(
    collaborators_silver.write
    .format("parquet")
    .mode("overwrite")
    .save("/mnt/musicstg/silver/collaborations/")
)