In [17]:
import os

import altair as alt
import pandas as pd
import plotly.express as px

import pyspark.sql.functions as F
from bokeh.layouts import column
from holoviews.core.util import label_sanitizer
from pyspark.sql import SparkSession
from pyspark.sql import Window

In [2]:
os.environ['SPARK_HOME'] = r'C:\spark\spark-3.5.5-bin-hadoop3'
os.environ['PYSPARK_DRIVER_PYTHON'] = 'jupyter'
os.environ['PYSPARK_DRIVER_PYTHON_OPTS'] = 'lab'
os.environ['PYSPARK_PYTHON'] = 'python'

In [3]:
spark = (
    SparkSession.builder
    .appName('Airport Traffic')
    .master('local[20]')
    .config('spark.executor.memory', '12g')
    .config('spark.executor.cors', '4')
    .config("spark.dynamicAllocation.enabled", "true")
    .config("spark.dynamicAllocation.minExecutors", "2")
    .config("spark.dynamicAllocation.maxExecutors", "20")
    .config('spark.executors.memoryOverhead', '2g')
    .config("spark.driver.memory", "12g")
    .config("spark.driver.maxResultSize", "4g")
    .config('spark.sql.adaptive.enabled', 'true')
    .config('spark.sql.adaptive.coalescePartitions.enabled', 'true')
    .config('spark.sql.adaptive.advisoryPartitionSizeInBytes', '128mb')# 128 default
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config('spark.dynamicAllocation.executorIdleTimeout', '120')
    .config('spark.sql.autoBroadcastJoinThreshold', '512mb')
    .getOrCreate()
)

In [4]:
df_path = r"F:\Datasets\CSV datasets\Top_spotify_songs.csv"

In [5]:
df = (
    spark
    .read
    .format('csv')
    .option('header', True)
    .option('inferSchema', True)
    .load(df_path)
)

In [8]:
df.count()

1728382

In [13]:
null_count = df.select([
    F.sum(F.when(F.col(column).isNull(), 1).otherwise(0)).alias(column)
          for column in df.columns
]).toPandas()

In [23]:
null_df = pd.melt(null_count, value_vars=null_count.columns)
null_df.columns = ['Column', 'Null_Count']

In [26]:
fig = px.bar(
    null_df,
    x='Column',
    y='Null_Count',
    title='Null Count',
    color='Null_Count'
).update_layout(
    width=1000,
    height=500
)

fig.show()

In [28]:
df.printSchema()

root
 |-- spotify_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- artists: string (nullable = true)
 |-- daily_rank: string (nullable = true)
 |-- daily_movement: string (nullable = true)
 |-- weekly_movement: string (nullable = true)
 |-- country: string (nullable = true)
 |-- snapshot_date: string (nullable = true)
 |-- popularity: string (nullable = true)
 |-- is_explicit: string (nullable = true)
 |-- duration_ms: string (nullable = true)
 |-- album_name: string (nullable = true)
 |-- album_release_date: string (nullable = true)
 |-- danceability: string (nullable = true)
 |-- energy: string (nullable = true)
 |-- key: string (nullable = true)
 |-- loudness: string (nullable = true)
 |-- mode: string (nullable = true)
 |-- speechiness: string (nullable = true)
 |-- acousticness: double (nullable = true)
 |-- instrumentalness: double (nullable = true)
 |-- liveness: double (nullable = true)
 |-- valence: double (nullable = true)
 |-- tempo: double (nullable = t

In [31]:
df.show(truncate=False, n=5)

+----------------------+------------------+---------------------+----------+--------------+---------------+-------+-------------+----------+-----------+-----------+--------------------+------------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+--------------+
|spotify_id            |name              |artists              |daily_rank|daily_movement|weekly_movement|country|snapshot_date|popularity|is_explicit|duration_ms|album_name          |album_release_date|danceability|energy|key|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|tempo  |time_signature|
+----------------------+------------------+---------------------+----------+--------------+---------------+-------+-------------+----------+-----------+-----------+--------------------+------------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+--------------+
|2plbrEY59IikOBgBGLjaoe|D

In [36]:
df = df.withColumn(
    'snapshot_date', F.to_date('snapshot_date', format='yyyy-MM-dd')) \
    .withColumn(
    'album_release_date', F.to_date('album_release_date', format='yyyy-MM-dd')
)

In [40]:
df.select(
    F.min('snapshot_date'),
    F.max('snapshot_date')
).show()

+------------------+------------------+
|min(snapshot_date)|max(snapshot_date)|
+------------------+------------------+
|        2023-10-18|        2025-02-17|
+------------------+------------------+

