In [8]:
import os
import sys
import math

import subprocess
import altair as alt
import pandas as pd

import plotly.express as px
import plotly.graph_objects as go

import pyspark.sql.functions as F
from prompt_toolkit.styles.style import default_priority
from pyspark.sql import SparkSession
from pyspark.sql import Window

In [9]:
# Point both the PySpark workers and the driver at the interpreter that is
# running this kernel, so they all use the same Python environment.
os.environ.update(
    PYSPARK_PYTHON=sys.executable,
    PYSPARK_DRIVER_PYTHON=sys.executable,
)

In [10]:
# Session options, grouped by concern. Keys and values are passed to the
# builder unchanged.
spark_options = {
    # Executor sizing and dynamic allocation.
    # NOTE(review): with a 'local[4]' master these executor / dynamic-allocation
    # settings presumably have no effect -- confirm before relying on them.
    'spark.executor.memory': '2g',
    'spark.executor.cores': '2',
    'spark.executor.memoryOverhead': '512m',
    'spark.dynamicAllocation.enabled': 'true',
    'spark.dynamicAllocation.minExecutors': '1',
    'spark.dynamicAllocation.maxExecutors': '4',
    'spark.dynamicAllocation.executorIdleTimeout': '60s',
    # Driver limits.
    'spark.driver.memory': '2g',
    'spark.driver.maxResultSize': '2g',
    # SQL engine: adaptive execution and broadcast-join threshold.
    'spark.sql.adaptive.enabled': 'true',
    'spark.sql.adaptive.coalescePartitions.enabled': 'true',
    'spark.sql.adaptive.advisoryPartitionSizeInBytes': '64mb',
    'spark.sql.autoBroadcastJoinThreshold': '512mb',
    # Serialization and console noise.
    'spark.serializer': 'org.apache.spark.serializer.KryoSerializer',
    'spark.ui.showConsoleProgress': 'false',
}

# Local session on four worker threads for the Spotify songs analysis.
spark_builder = (
    SparkSession.builder
    .appName('Spotify Songs Analysis')
    .master('local[4]')
)
for option_name, option_value in spark_options.items():
    spark_builder = spark_builder.config(option_name, option_value)

spark = spark_builder.getOrCreate()

# Only surface errors in the notebook output.
spark.sparkContext.setLogLevel("ERROR")

In [11]:
# Source CSV locations for the one-time CSV -> Parquet conversion. The conversion
# cells in this notebook are commented out; these paths are kept for reference only.
# df_path_songs = r'/Users/zygimantas/Documents/DataSets/Songs/songs.csv'
# df_path_artists = r'/Users/zygimantas/Documents/DataSets/Songs/artists.csv'

In [12]:
# Reuse the session that was configured earlier in this notebook.
# getOrCreate() returns the already-running session when one exists, so the
# former appName("CSVToParquet") could not rename the running application and
# only obscured which session is in use; it has been dropped.
spark = SparkSession.builder.getOrCreate()

In [13]:
# One-time load of the raw songs CSV (disabled; the Parquet copy is used instead).
# NOTE(review): the recorded `df_songs` preview in this notebook shows lyric text
# in the `id` column with the remaining fields NULL, and the recorded schema types
# every column as string. That looks like quoted multi-line fields being split
# into separate records -- presumably this read needs `multiLine=True` plus
# matching `quote`/`escape` options. Confirm against the raw CSV before
# regenerating the Parquet file.
# df_songs_csv = spark.read.csv(df_path_songs, header=True, inferSchema=True)

In [14]:
# df_songs_csv.write.parquet(r'/Users/zygimantas/Documents/DataSets/Songs/songs.parquet')

In [15]:
# df_artists_csv = spark.read.csv(df_path_artists, header=True, inferSchema=True)

In [16]:
# df_artists_csv.write.parquet(r'/Users/zygimantas/Documents/DataSets/Songs/artists.parquet')

In [17]:
# Directory holding the Parquet datasets. Set the SONGS_DATA_DIR environment
# variable to run the notebook on another machine; the default is the original
# author's local folder, so existing behaviour is unchanged.
DATA_DIR = os.environ.get('SONGS_DATA_DIR', r'/Users/zygimantas/Documents/DataSets/Songs')

df_artist_path = os.path.join(DATA_DIR, 'artists.parquet')
df_songs_path = os.path.join(DATA_DIR, 'songs.parquet')

In [18]:
# Load the artists dataset. Parquet files embed their own schema, so the
# CSV-only `header` / `inferSchema` reader options were removed.
df_artists = spark.read.parquet(df_artist_path)

In [19]:
# Load the songs dataset. Parquet files embed their own schema, so the
# CSV-only `header` / `inferSchema` reader options were removed.
df_songs = spark.read.parquet(df_songs_path)

In [20]:
# Preview the first five song rows without truncating cell contents.
# NOTE(review): in the recorded output the first row carries lyric text in `id`
# and NULL in the other columns -- looks like a parsing problem upstream; verify.
df_songs.limit(5).show(truncate=False)

+--------------------------------+----+----------+-------+------------+------+----+--------+----+-----------+------------+----------------+--------+-------+-----+-----------+------+----+-----+----------+----------------------+---------------------+----------+------------+
|id                              |name|album_name|artists|danceability|energy|key |loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|tempo|duration_ms|lyrics|year|genre|popularity|total_artist_followers|avg_artist_popularity|artist_ids|niche_genres|
+--------------------------------+----+----------+-------+------------+------+----+--------+----+-----------+------------+----------------+--------+-------+-----+-----------+------+----+-----+----------+----------------------+---------------------+----------+------------+
|But knew your kisses thrilled me|NULL|NULL      |NULL   |NULL        |NULL  |NULL|NULL    |NULL|NULL       |NULL        |NULL            |NULL    |NULL   |NULL |NULL       |NULL  |

In [21]:
# Preview the first five artist rows without truncating cell contents.
df_artists.limit(5).show(truncate=False)

+----------------------+-----------------+---------+----------+------------------------------+----------+
|id                    |name             |followers|popularity|genres                        |main_genre|
+----------------------+-----------------+---------+----------+------------------------------+----------+
|6YROFUbu5zRCHi2xkir5pk|Brian Hyland     |67223    |47        |[]                            |Pop       |
|5tFRohaO5yEsuJxmMnlCO9|Barns Courtney   |602647   |62        |[]                            |Electronic|
|3w1Q754jb31h5CXQCcnLNL|Capcom Sound Team|210392   |58        |['japanese vgm', 'soundtrack']|Electronic|
|3oDbviiivRWhXwIE8hxkVV|The Beach Boys   |5139194  |76        |['baroque pop']               |Classical |
|60zvRmhQHRxokEB1taAVpN|Beth Malone      |1569     |29        |['musicals']                  |Classical |
+----------------------+-----------------+---------+----------+------------------------------+----------+



In [22]:
# Print the artists column types. In the recorded output `followers` is a string
# while `popularity` is an integer, so `followers` would presumably need a cast
# before numeric use -- TODO confirm.
df_artists.printSchema()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- followers: string (nullable = true)
 |-- popularity: integer (nullable = true)
 |-- genres: string (nullable = true)
 |-- main_genre: string (nullable = true)



In [23]:
# Print the songs column types. In the recorded output every column, including
# the audio-feature and year/popularity fields, is typed as string.
df_songs.printSchema()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- album_name: string (nullable = true)
 |-- artists: string (nullable = true)
 |-- danceability: string (nullable = true)
 |-- energy: string (nullable = true)
 |-- key: string (nullable = true)
 |-- loudness: string (nullable = true)
 |-- mode: string (nullable = true)
 |-- speechiness: string (nullable = true)
 |-- acousticness: string (nullable = true)
 |-- instrumentalness: string (nullable = true)
 |-- liveness: string (nullable = true)
 |-- valence: string (nullable = true)
 |-- tempo: string (nullable = true)
 |-- duration_ms: string (nullable = true)
 |-- lyrics: string (nullable = true)
 |-- year: string (nullable = true)
 |-- genre: string (nullable = true)
 |-- popularity: string (nullable = true)
 |-- total_artist_followers: string (nullable = true)
 |-- avg_artist_popularity: string (nullable = true)
 |-- artist_ids: string (nullable = true)
 |-- niche_genres: string (nullable = true)

