### Configuration

```bash
brew install apache-spark
pip install pyspark
```

In [24]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, min, max, avg

### First script to run in order to make the queries run

In [18]:
# Create an entry point to the PySpark Application
spark = SparkSession.builder \
      .master("local") \
      .appName("SpotifyGenre") \
      .getOrCreate()
# master contains the URL of your remote spark instance or 'local'

### Create the Schema

In [42]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, BooleanType, FloatType

# Define the schema for your CSV data
schemaSpotifyGenre = StructType([
    StructField("id", IntegerType(), True),
    StructField("track_id", StringType(), True),
    StructField("artists", StringType(), True),
    StructField("track_name", StringType(), True),
    StructField("album_name", StringType(), True),
    StructField("popularity", IntegerType(), True),
    StructField("duration_ms", IntegerType(), True),
    StructField("explicit", BooleanType(), True),
    StructField("danceability", FloatType(), True),
    StructField("energy", FloatType(), True),
    StructField("key", IntegerType(), True),
    StructField("loudness", FloatType(), True),
    StructField("mode", IntegerType(), True),
    StructField("speechiness", FloatType(), True),
    StructField("acousticness", FloatType(), True),
    StructField("instrumentalness", FloatType(), True),
    StructField("liveness", FloatType(), True),
    StructField("valence", FloatType(), True),
    StructField("tempo", FloatType(), True),
    StructField("time_signature", IntegerType(), True),
    StructField("track_genre", StringType(), True)
])

# Display the schema
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- track_id: string (nullable = true)
 |-- artists: string (nullable = true)
 |-- track_name: string (nullable = true)
 |-- album_name: string (nullable = true)
 |-- popularity: integer (nullable = true)
 |-- duration_ms: integer (nullable = true)
 |-- explicit: boolean (nullable = true)
 |-- danceability: float (nullable = true)
 |-- energy: float (nullable = true)
 |-- key: integer (nullable = true)
 |-- loudness: float (nullable = true)
 |-- mode: integer (nullable = true)
 |-- speechiness: float (nullable = true)
 |-- acousticness: float (nullable = true)
 |-- instrumentalness: float (nullable = true)
 |-- liveness: float (nullable = true)
 |-- valence: float (nullable = true)
 |-- tempo: float (nullable = true)
 |-- time_signature: integer (nullable = true)
 |-- track_genre: string (nullable = true)



### Parse the Dataset

In [20]:
df = spark.read.csv('./data/spotify_tracks_genre.csv', header=True, schema=schemaSpotifyGenre)
df.show()  # To display the first few rows of the dataframe

+---+--------------------+--------------------+--------------------+--------------------+----------+-----------+--------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+--------------+-----------+
| id|            track_id|             artists|          track_name|          album_name|popularity|duration_ms|explicit|danceability|energy|key|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|  tempo|time_signature|track_genre|
+---+--------------------+--------------------+--------------------+--------------------+----------+-----------+--------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+--------------+-----------+
|  0|5SuOikwiRyPMVoIQD...|         Gen Hoshino|              Comedy|              Comedy|        73|     230666|   false|       0.676| 0.461|  1|  -6.746|   0|      0.143|      0.0322|         1.01E-6|   0.358|  0.715| 87.917|            

23/12/15 12:03:26 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: Unnamed: 0, track_id, artists, album_name, track_name, popularity, duration_ms, explicit, danceability, energy, key, loudness, mode, speechiness, acousticness, instrumentalness, liveness, valence, tempo, time_signature, track_genre
 Schema: id, track_id, artists, track_name, album_name, popularity, duration_ms, explicit, danceability, energy, key, loudness, mode, speechiness, acousticness, instrumentalness, liveness, valence, tempo, time_signature, track_genre
Expected: id but found: Unnamed: 0
CSV file: file:///Users/casarf/Documents/USI/Master/semester1/DataDesign/assignment3/spark_sus/data/spotify_tracks_genre.csv


### Python Query 1: Danceability Statistic for a Specific Author and Genre

In [26]:
# Filter for 'Jason Mraz' in the artist column and 'acoustic' in the genre column
filtered_df = df.filter(
    (col("artists").like("Bad Bunny")) & 
    (col("track_genre") == "acoustic")
)

# Select the 'danceability' column and calculate the average, minimum, and maximum
aggregated_df = filtered_df.agg(
    avg("danceability").alias("avg_danceability"),
    min("danceability").alias("min_danceability"),
    max("danceability").alias("max_danceability")
)

# Show the results
aggregated_df.show()

# Stop the SparkSession
spark.stop()

AssertionError: 

### Python Query 2: Explicit Authors
This query aims to find all the authors that have published at least an explicit song.\
They are then sorted by the number of explicit songs published.

In [29]:
from pyspark.sql.functions import col, count
# Put query here FRUAH
result = (
    df.filter(col('explicit') == True)  # Filter for explicit songs
    .groupBy('artists')  # Group by artists
    .agg(count('track_id').alias('explicit_count'))  # Count the number of explicit songs for each artist
    .orderBy(col('explicit_count').desc(), 'artists')  # Sort by explicit count in descending order, then alphabetically
)

result.show()

+--------------------+--------------+
|             artists|explicit_count|
+--------------------+--------------+
|         Jack Harlow|            71|
|    Jhayco;Bad Bunny|            64|
|      Marilyn Manson|            61|
|        XXXTENTACION|            59|
|Daddy Yankee;Bad ...|            58|
|         Vybz Kartel|            54|
|             Asspera|            51|
|              Jhayco|            46|
|     Alejo;Feid;ROBI|            44|
|                Feid|            43|
|                 Jax|            38|
|Five Finger Death...|            37|
|Bring Me The Horizon|            36|
|Chris Jedi;Anuel ...|            33|
|             KAROL G|            33|
|             Pantera|            32|
|           blink-182|            32|
|         Chronic Law|            31|
|          Burgerkill|            29|
|    Kathleen Madigan|            29|
+--------------------+--------------+
only showing top 20 rows



### Python Query 3: Query Name

In [None]:
# Put query here LOLLO

### Python Query 4: Best Danceable Dance Songs

This query aims to find all the danceable songs that are not explicit.\
It only shows the songs that have a danceability that's more than 0.8.


In [39]:
genre = 'dance'  # Replace 'dance' with your desired genre

result = (
    df.filter(
        (col('explicit') == False) & 
        (col('danceability') > 0.8) & 
        (col('track_genre').contains(genre))
    )  # Filter for non-explicit dance songs with danceability > 0.8 and specific genre
    .select('track_name', 'danceability')  # Select only 'track_name' and 'danceability' columns
    .orderBy('danceability', ascending=False)  # Sort by danceability in descending order
)

result.show(truncate=False)

+--------------------------------+------------+
|track_name                      |danceability|
+--------------------------------+------------+
|Mentally Free                   |0.975       |
|Afrowave 2                      |0.966       |
|Queda poco para la PAES         |0.965       |
|King of Parole                  |0.961       |
|Loaded                          |0.961       |
|Money Dance                     |0.958       |
|The Trinity                     |0.951       |
|The Trinity                     |0.951       |
|Temperature                     |0.951       |
|The Trinity                     |0.95        |
|No Reservations                 |0.948       |
|Bunx Up (feat. Marcy Chin)      |0.947       |
|Bunx Up [The Official Street LP]|0.944       |
|Banga                           |0.943       |
|Find It                         |0.94        |
|Play Tune (So so So)            |0.935       |
|Pa' mayores de edad             |0.931       |
|Retro 13 (Special Dj Greg C)    |0.931 

23/12/15 13:51:49 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: album_name, explicit, danceability, track_genre
 Schema: track_name, explicit, danceability, track_genre
Expected: track_name but found: album_name
CSV file: file:///Users/casarf/Documents/USI/Master/semester1/DataDesign/assignment3/spark_sus/data/spotify_tracks_genre.csv


### Python Query 5: Query Name

In [None]:
# Put query here LOLLO

### SQL Queries

### SQL Query 1: Top 5 Longest Albums in Dataset
This query aims to find the top 5 longest albums in the Dataset.\
At first the songs are grouped by their album name, then the duration time of each song is summed up.\
Then the top 5 are showed and sorted in a Descending order.

In [38]:
from pyspark.sql import SparkSession

# Register the DataFrame as a temporary SQL table
df.createOrReplaceTempView("spotify_genre_table")

# SQL query to identify albums with the longest total duration
sql_query = """
    SELECT album_name, SUM(duration_ms) AS total_duration
    FROM spotify_genre_table
    GROUP BY album_name
    ORDER BY total_duration DESC
    LIMIT 5
"""

# Execute the SQL query
result = spark.sql(sql_query)

result.show(truncate=False)

23/12/15 13:44:12 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: track_name, duration_ms
 Schema: album_name, duration_ms
Expected: album_name but found: track_name
CSV file: file:///Users/casarf/Documents/USI/Master/semester1/DataDesign/assignment3/spark_sus/data/spotify_tracks_genre.csv


+----------------------+--------------+
|album_name            |total_duration|
+----------------------+--------------+
|Run Rudolph Run       |24605335      |
|Christmas Time        |16785840      |
|Halloween             |16031062      |
|RUMBATÓN              |14926080      |
|CÓMO SE SIENTE - Remix|14561280      |
+----------------------+--------------+



### SQL Query 2: Query Name

In [None]:
# Put query here LOLLO

### SQL Query 3: Top 5 Popular Genres

This query aims to find the top 5 genres by popularity.\
At first it groups all the songs by genre and computes the average popularity.
Then genres are sorted by popularity in descending order.

In [40]:
# Put query here FRUAH
df.createOrReplaceTempView("spotify_genre_table")

# SQL query to identify albums with the longest total duration
sql_query = """
    SELECT track_genre, AVG(popularity) AS avg_popularity
    FROM spotify_genre_table
    GROUP BY track_genre
    ORDER BY avg_popularity DESC
    LIMIT 5
"""
# Execute the SQL query
result = spark.sql(sql_query)

result.show(truncate=False)


+-----------+------------------+
|track_genre|avg_popularity    |
+-----------+------------------+
|pop-film   |59.287575150300604|
|k-pop      |56.896            |
|chill      |53.651            |
|sad        |52.379            |
|grunge     |49.594            |
+-----------+------------------+

