# Dynamic Partition Pruning
- Pruning partitions at runtime
- Problem Statement: Analyse the listening activity of users on the release date of a song

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [3]:
from pyspark.storagelevel import StorageLevel
from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.sql import SparkSession

In [4]:
# Start (or reuse) a local Spark session that runs on all available cores,
# with the driver memory setting at 10g.
spark = (
    SparkSession.builder
    .appName("6_1_dynamic_partition_pruning")
    .master("local[*]")
    .config("spark.driver.memory", "10g")
    .getOrCreate()
)

# Keep a handle on the SparkContext and only surface ERROR-level log messages.
sc = spark.sparkContext
sc.setLogLevel("ERROR")

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/11 18:11:22 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [6]:
# Load the raw listening-activity CSV: the first row is the header and the
# column types are inferred from the data.
df_listening_actv = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("../data/partitioning/raw/Spotify_Listening_Activity.csv")
)

# Rewrite the data as Parquet, partitioned on disk by the `listen_date` column,
# replacing any previous output at this path.
(
    df_listening_actv
    .write
    .mode("overwrite")
    .partitionBy("listen_date")
    .parquet("../data/partitioning/partitioned/listening_activity_pt")
)

                                                                                

In [7]:
# Read the partitioned dataset back and preview the first five rows untruncated.
df_listening_actv_pt = spark.read.parquet(
    "../data/partitioning/partitioned/listening_activity_pt"
)
df_listening_actv_pt.show(n=5, truncate=False)

                                                                                

+-----------+-------+---------------+--------------------------+
|activity_id|song_id|listen_duration|listen_date               |
+-----------+-------+---------------+--------------------------+
|7526       |60     |132            |2023-07-13 10:15:47.032387|
|7527       |88     |97             |2023-07-13 10:15:47.032387|
|7528       |2      |298            |2023-07-13 10:15:47.032387|
|7529       |18     |221            |2023-07-13 10:15:47.032387|
|7530       |98     |291            |2023-07-13 10:15:47.032387|
+-----------+-------+---------------+--------------------------+
only showing top 5 rows



In [9]:
# Load the songs CSV (header row present, column types inferred) and print the
# resulting schema.
df_songs = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("../data/partitioning/raw/Spotify_Songs.csv")
)
df_songs.printSchema()

root
 |-- song_id: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- artist_id: integer (nullable = true)
 |-- release_date: string (nullable = true)



In [10]:
# Keep the original timestamp string as `release_datetime` and derive a proper
# DATE column `release_date` from it.
# The guard makes the cell safe to re-run: `df_songs` is reassigned from itself,
# so a second unguarded run would rename the derived date column onto the
# already-existing `release_datetime` name and leave two columns with that name.
if "release_datetime" not in df_songs.columns:
    df_songs = (
        df_songs
        .withColumnRenamed("release_date", "release_datetime")
        .withColumn("release_date", F.to_date("release_datetime", "yyyy-MM-dd HH:mm:ss.SSSSSS"))
    )
df_songs.show(5, False)
df_songs.printSchema()

+-------+------+---------+--------------------------+------------+
|song_id|title |artist_id|release_datetime          |release_date|
+-------+------+---------+--------------------------+------------+
|1      |Song_1|2        |2021-10-15 10:15:47.006571|2021-10-15  |
|2      |Song_2|45       |2020-12-07 10:15:47.006588|2020-12-07  |
|3      |Song_3|25       |2022-07-11 10:15:47.006591|2022-07-11  |
|4      |Song_4|25       |2019-03-09 10:15:47.006593|2019-03-09  |
|5      |Song_5|26       |2019-09-07 10:15:47.006596|2019-09-07  |
+-------+------+---------+--------------------------+------------+
only showing top 5 rows

root
 |-- song_id: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- artist_id: integer (nullable = true)
 |-- release_datetime: string (nullable = true)
 |-- release_date: date (nullable = true)



In [11]:
# Pick songs released after 2019-12-31, i.e. from 2020 onwards
# (the filter has no upper bound, so later years are included too).
df_selected_songs = df_songs.filter(F.col("release_date") > F.lit("2019-12-31"))

# Inner-join listening activity to the selected songs on the song id and on
# listen date == release date. The join keys are taken from `df_selected_songs`,
# the frame that is actually being joined.
# `listen_date` is a string column, so Spark casts it to DATE for the comparison.
df_listening_actv_of_selected_songs = df_listening_actv_pt.join(
    df_selected_songs,
    on=(
        (df_selected_songs.release_date == df_listening_actv_pt.listen_date)
        & (df_selected_songs.song_id == df_listening_actv_pt.song_id)
    ),
    how="inner",
)

In [12]:
# Print the logical plans as well as the physical plan for the join.
df_listening_actv_of_selected_songs.explain(extended=True)

== Parsed Logical Plan ==
Join Inner, ((release_date#111 = cast(listen_date#36 as date)) AND (song_id#98 = song_id#34))
:- Relation [activity_id#33,song_id#34,listen_duration#35,listen_date#36] parquet
+- Filter (release_date#111 > cast(2019-12-31 as date))
   +- Project [song_id#98, title#99, artist_id#100, release_datetime#106, to_date('release_datetime, Some(yyyy-MM-dd HH:mm:ss.SSSSSS)) AS release_date#111]
      +- Project [song_id#98, title#99, artist_id#100, release_date#101 AS release_datetime#106]
         +- Relation [song_id#98,title#99,artist_id#100,release_date#101] csv

== Analyzed Logical Plan ==
activity_id: int, song_id: int, listen_duration: int, listen_date: string, song_id: int, title: string, artist_id: int, release_datetime: string, release_date: date
Join Inner, ((release_date#111 = cast(listen_date#36 as date)) AND (song_id#98 = song_id#34))
:- Relation [activity_id#33,song_id#34,listen_duration#35,listen_date#36] parquet
+- Filter (release_date#111 > cast(2019-1

In [13]:
# Run the join and print up to 20 rows, with long cell values truncated.
df_listening_actv_of_selected_songs.show(n=20, truncate=True)

+-----------+-------+---------------+--------------------+-------+-------+---------+--------------------+------------+
|activity_id|song_id|listen_duration|         listen_date|song_id|  title|artist_id|    release_datetime|release_date|
+-----------+-------+---------------+--------------------+-------+-------+---------+--------------------+------------+
|       9760|     89|             81|2023-07-24 10:15:...|     89|Song_89|       33|2023-07-24 10:15:...|  2023-07-24|
|       9768|     89|            295|2023-07-24 10:15:...|     89|Song_89|       33|2023-07-24 10:15:...|  2023-07-24|
|       9799|     89|            272|2023-07-24 10:15:...|     89|Song_89|       33|2023-07-24 10:15:...|  2023-07-24|
|       7322|     64|             95|2023-10-25 10:15:...|     64|Song_64|       32|2023-10-25 10:15:...|  2023-10-25|
+-----------+-------+---------------+--------------------+-------+-------+---------+--------------------+------------+

