In [0]:
# Remove any widgets currently attached to the notebook before the parameter
# widgets are declared in a later cell.
dbutils.widgets.removeAll()

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import functions as F
from pyspark.sql.window import Window

In [0]:

# Notebook parameters as (widget name, default value) pairs, created in order.
for _widget_name, _widget_default in (
    ("catalog", "proyectofinal"),
    ("schema_source", "silver"),
    ("schema_sink", "golden"),
):
    dbutils.widgets.text(_widget_name, _widget_default)


In [0]:
# Read the current value of each parameter widget into a variable of the same name.
catalog, schema_source, schema_sink = (
    dbutils.widgets.get(widget_name)
    for widget_name in ("catalog", "schema_source", "schema_sink")
)

In [0]:
# Load the four input tables from the source schema of the selected catalog.
track = spark.read.table(f"{catalog}.{schema_source}.df_track_transformed")    # read by track_id below
audio = spark.read.table(f"{catalog}.{schema_source}.df_audio_transformed")    # read by track_id below
bridge = spark.read.table(f"{catalog}.{schema_source}.df_bridge_transformed")  # links track_id to artist_id
artist = spark.read.table(f"{catalog}.{schema_source}.df_artist_transformed")  # read by artist_id below

# Fact track

In [0]:
# Per-track artist rollup: one output row per track_id present in the bridge.
#   artist_count          - number of distinct artist_id values linked to the track
#   total_followers       - followers summed over the linked artists (null counts as 0)
#   avg_artist_popularity - mean artist_popularity over the linked artists
#   artists_concat        - distinct artist names, sorted, joined with ", "
artist_agg_by_track = (
    # Reduce the bridge to unique (track_id, artist_id) pairs first. sum/avg run
    # over the joined rows, so a repeated pair would inflate total_followers and
    # skew avg_artist_popularity, while countDistinct/collect_set would hide it.
    bridge.select("track_id", "artist_id").distinct()
    # Left join: pairs whose artist_id has no match in `artist` are kept, with
    # null artist attributes.
    .join(artist, on="artist_id", how="left")
    .groupBy("track_id")
    .agg(
        F.countDistinct("artist_id").alias("artist_count"),
        # coalesce so an artist without a followers value contributes 0
        F.sum(F.coalesce(F.col("followers"), F.lit(0))).alias("total_followers"),
        F.avg(F.col("artist_popularity")).alias("avg_artist_popularity"),
        # sort_array gives a stable ordering of the collected name set
        F.concat_ws(", ", F.sort_array(F.collect_set("artist_name"))).alias("artists_concat")
    )
)

# Enrich every track with its audio columns and the per-track artist rollup.
# Both joins are left joins on track_id, so rows of `track` are kept even when
# the other side has no match.
track_with_audio = track.join(audio, "track_id", "left")
df_fact_track = track_with_audio.join(artist_agg_by_track, "track_id", "left")

# Overwrite the Delta table in the sink schema with the result.
fact_track_table = f"{catalog}.{schema_sink}.fact_track_enriched"
df_fact_track.write.format("delta").mode("overwrite").saveAsTable(fact_track_table)


# Artist impact

In [0]:
# Artist attributes used as the grouping key; they become the leading columns
# of the result, in this order.
artist_dims = ["artist_id", "artist_name", "main_genre", "followers", "artist_popularity"]

# Track and audio metrics rolled up per artist.
artist_impact_metrics = [
    F.countDistinct("track_id").alias("tracks_count"),
    F.avg("track_popularity").alias("avg_track_popularity"),
    F.max("track_popularity").alias("max_track_popularity"),
    F.avg("energy").alias("avg_energy"),
    F.avg("danceability").alias("avg_danceability"),
    F.avg("valence").alias("avg_valence"),
    F.avg("tempo").alias("avg_tempo"),
]

# Start from the bridge and attach track, audio and artist attributes with
# left joins, so every bridge row is kept.
# NOTE(review): the averages are taken over joined rows; if the bridge can
# repeat a (track_id, artist_id) pair, that track is weighted more than once,
# while tracks_count (distinct) is unaffected - confirm the bridge is unique.
tracks_per_artist = (
    bridge.join(track, "track_id", "left")
    .join(audio, "track_id", "left")
    .join(artist.select(*artist_dims), "artist_id", "left")
)
df_artist_impact = tracks_per_artist.groupBy(*artist_dims).agg(*artist_impact_metrics)

# Overwrite the Delta table in the sink schema with the result.
artist_impact_table = f"{catalog}.{schema_sink}.artist_impact"
df_artist_impact.write.format("delta").mode("overwrite").saveAsTable(artist_impact_table)


# Genre artists

In [0]:
# Narrow each input to the columns selected for this summary before joining.
genre_track_cols = track.select("track_id", "genre_main", "release_year", "track_popularity")
genre_audio_cols = audio.select("track_id", "energy", "danceability", "valence")
genre_artist_cols = artist.select("artist_id", "artist_name", "followers")

# Grouping key: one output row per (genre_main, artist) combination.
genre_artist_keys = ["genre_main", "artist_id", "artist_name", "followers"]

# Metrics computed for each genre/artist combination.
genre_artist_metrics = [
    F.countDistinct("track_id").alias("tracks_count"),
    F.avg("track_popularity").alias("avg_track_popularity"),
    F.avg("energy").alias("avg_energy"),
    F.avg("danceability").alias("avg_danceability"),
    F.avg("valence").alias("avg_valence"),
]

# Left joins from the bridge keep every bridge row, matched or not.
# NOTE(review): the averages are taken over joined rows; a repeated
# (track_id, artist_id) pair in the bridge would weight that track more than
# once, while tracks_count (distinct) is unaffected - confirm the bridge is unique.
genre_artist_rows = (
    bridge.join(genre_track_cols, "track_id", "left")
    .join(genre_audio_cols, "track_id", "left")
    .join(genre_artist_cols, "artist_id", "left")
)
df_genre_artist = genre_artist_rows.groupBy(*genre_artist_keys).agg(*genre_artist_metrics)

# Overwrite the Delta table in the sink schema with the result.
genre_artist_table = f"{catalog}.{schema_sink}.genre_artist_summary"
df_genre_artist.write.format("delta").mode("overwrite").saveAsTable(genre_artist_table)
