# Imports

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Widgets

In [0]:
# Drop any widgets left on the notebook so the declarations in the next cell start clean.
dbutils.widgets.removeAll()

In [0]:
# Declare the notebook's text widgets as (name, default value) pairs.
for widget_name, default_value in (
    ("storageName", "adlsproyectofinalsd"),
    ("container", "raw"),
    ("catalog", "proyectofinal"),
    ("schema", "bronze"),
):
    dbutils.widgets.text(widget_name, default_value)


# Variables

In [0]:
# Read the current value of each widget into a variable of the same name.
storageName, container, catalog, schema = (
    dbutils.widgets.get(widget_name)
    for widget_name in ("storageName", "container", "catalog", "schema")
)

# Path

In [0]:
# abfss URI of songs.csv, built from the container and storage account widget values.
path_songs = "abfss://{}@{}.dfs.core.windows.net/songs.csv".format(container, storageName)

# Structures

In [0]:
# Explicit schema for the songs CSV: 24 columns, all nullable, in file order.
# List-like columns (artists, artist_ids, niche_genres) are declared as plain strings.
songs_schema = StructType(fields=[
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        # Track identity
        ("id", StringType()),                      # unique Spotify ID
        ("name", StringType()),                    # song title
        ("album_name", StringType()),              # album name
        ("artists", StringType()),                 # list of artist names

        # Audio features
        ("danceability", DoubleType()),            # 0.0 - 1.0
        ("energy", DoubleType()),                  # 0.0 - 1.0
        ("key", IntegerType()),                    # pitch class (0-11 typically)
        ("loudness", DoubleType()),                # dB
        ("mode", IntegerType()),                   # 1 major, 0 minor

        ("speechiness", DoubleType()),
        ("acousticness", DoubleType()),
        ("instrumentalness", DoubleType()),
        ("liveness", DoubleType()),
        ("valence", DoubleType()),

        ("tempo", DoubleType()),                   # BPM
        ("duration_ms", LongType()),               # ms
        ("lyrics", StringType()),                  # full lyrics
        ("year", IntegerType()),                   # release year

        # Classification and popularity
        ("genre", StringType()),                   # main genre (10 categories)
        ("popularity", IntegerType()),             # 0-100

        # Artist-level aggregates
        ("total_artist_followers", LongType()),    # sum of followers
        ("avg_artist_popularity", DoubleType()),   # average popularity

        ("artist_ids", StringType()),              # list of Spotify artist IDs
        ("niche_genres", StringType()),            # list of sub-genres
    )
])

In [0]:
# Load songs.csv with the explicit schema. The file has a header row, quoted
# fields may span several lines, and a double quote is also the escape character.
df_songs = (
    spark.read
    .schema(songs_schema)
    .options(
        header=True,
        quote='"',
        escape='"',
        multiLine="true",
        mode="PERMISSIVE",
    )
    .csv(path_songs)
)

In [0]:
# Render the loaded DataFrame for a visual check before it is written to the table.
display(df_songs)

In [0]:
# Persist the DataFrame as <catalog>.<schema>.songs, replacing any existing table contents.
(
    df_songs.write
    .saveAsTable(f"{catalog}.{schema}.songs", mode="overwrite")
)