# Optimization Techniques for PySpark ETL

In [0]:
# Input locations for the two CSV datasets used throughout this notebook
file_path = "/pyspark/video-streaming-data/module3-transform/optimization/complex_events.csv"
user_path = "/pyspark/video-streaming-data/module3-transform/joins_aggregations/users.csv"

# Both files are read with a header row and with schema inference enabled
df = spark.read.options(header="true", inferSchema="true").csv(file_path)
users_df = spark.read.options(header="true", inferSchema="true").csv(user_path)

#### OPTIMIZATION 1: BROADCAST JOINS

In [0]:
# 1. Regular join (will use shuffle)
# Automatic broadcasting is disabled first so the difference between the two
# join strategies is clearly visible in the plans.
print("Disabling automatic broadcast join optimization...")
_current_broadcast_threshold = spark.conf.get("spark.sql.autoBroadcastJoinThreshold")
# Keep this cell idempotent: if it is re-run after the threshold was already
# set to -1, the value read here is our own override, not the real original.
# In that case keep the value saved by the earlier run so that the later
# "restore" step does not leave automatic broadcasting disabled.
if _current_broadcast_threshold != "-1" or "original_broadcast_threshold" not in globals():
    original_broadcast_threshold = _current_broadcast_threshold
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", "-1")  # Disable automatic broadcasting

# Clear everything to ensure a clean demonstration
spark.catalog.clearCache()

print("\n1. REGULAR JOIN WITH SHUFFLE:")
print("-----------------------------")
regular_join = df.join(users_df, on="user_id")
print("Execution plan for regular join:")
regular_join.explain(mode="formatted")

# Look for Exchange hashpartitioning in the plan - this indicates shuffling

Disabling automatic broadcast join optimization...

1. REGULAR JOIN WITH SHUFFLE:
-----------------------------
Execution plan for regular join:
== Physical Plan ==
AdaptiveSparkPlan (11)
+- == Initial Plan ==
   Project (10)
   +- SortMergeJoin Inner (9)
      :- Sort (4)
      :  +- Exchange (3)
      :     +- Filter (2)
      :        +- Scan csv  (1)
      +- Sort (8)
         +- Exchange (7)
            +- Filter (6)
               +- Scan csv  (5)


(1) Scan csv 
Output [12]: [event_id#149, user_id#150, content_id#151, timestamp#152, duration_seconds#153, device_type#154, quality#155, buffering_count#156, error_type#157, ip_address#158, country#159, session_id#160]
Batched: false
Location: InMemoryFileIndex [dbfs:/pyspark/video-streaming-data/module3-transform/optimization/complex_events.csv]
PushedFilters: [IsNotNull(user_id)]
ReadSchema: struct<event_id:string,user_id:string,content_id:string,timestamp:timestamp,duration_seconds:int,device_type:string,quality:string,buffering_c

In [0]:
# 2. Explicit broadcast join
from pyspark.sql.functions import broadcast

print("\n2. BROADCAST JOIN:")
print("-----------------")

# Wrap the users DataFrame in an explicit broadcast hint before joining on user_id
broadcast_join = df.join(broadcast(users_df), "user_id")

print("Execution plan for broadcast join:")
broadcast_join.explain(mode="formatted")

# A BroadcastExchange or BroadcastHashJoin node in the plan indicates broadcasting



2. BROADCAST JOIN:
-----------------
Execution plan for broadcast join:
== Physical Plan ==
AdaptiveSparkPlan (8)
+- == Initial Plan ==
   Project (7)
   +- BroadcastHashJoin Inner BuildRight (6)
      :- Filter (2)
      :  +- Scan csv  (1)
      +- Exchange (5)
         +- Filter (4)
            +- Scan csv  (3)


(1) Scan csv 
Output [12]: [event_id#149, user_id#150, content_id#151, timestamp#152, duration_seconds#153, device_type#154, quality#155, buffering_count#156, error_type#157, ip_address#158, country#159, session_id#160]
Batched: false
Location: InMemoryFileIndex [dbfs:/pyspark/video-streaming-data/module3-transform/optimization/complex_events.csv]
PushedFilters: [IsNotNull(user_id)]
ReadSchema: struct<event_id:string,user_id:string,content_id:string,timestamp:timestamp,duration_seconds:int,device_type:string,quality:string,buffering_count:int,error_type:string,ip_address:string,country:string,session_id:string>

(2) Filter
Input [12]: [event_id#149, user_id#150, content_id

In [0]:
# Compare query times
import time

print("\nComparing performance:")
print("---------------------")

# NOTE(review): each join is timed once with a single count(); the first timed
# query may also absorb one-off warm-up costs, so treat the numbers as
# indicative rather than as a benchmark.
try:
    # Time the regular join (perf_counter is monotonic, so the interval
    # cannot be distorted by wall-clock adjustments)
    start_time = time.perf_counter()
    regular_count = regular_join.count()
    regular_time = time.perf_counter() - start_time
    print(f"Regular join time: {regular_time:.2f} seconds for {regular_count} records")

    # Time the broadcast join
    start_time = time.perf_counter()
    broadcast_count = broadcast_join.count()
    broadcast_time = time.perf_counter() - start_time
    print(f"Broadcast join time: {broadcast_time:.2f} seconds for {broadcast_count} records")

    # Guard the ratio so a zero-length interval cannot raise ZeroDivisionError
    if broadcast_time > 0:
        print(f"Speedup: {regular_time/broadcast_time:.2f}x faster with broadcast join")
    else:
        print("Speedup: not measurable (broadcast join finished below timer resolution)")
finally:
    # Restore original broadcast threshold even if one of the counts failed,
    # so later cells do not silently run with automatic broadcasting disabled
    spark.conf.set("spark.sql.autoBroadcastJoinThreshold", original_broadcast_threshold)
    print(f"\nRestored broadcast threshold to original value: {original_broadcast_threshold}")


Comparing performance:
---------------------
Regular join time: 2.65 seconds for 5054 records
Broadcast join time: 0.97 seconds for 5054 records
Speedup: 2.72x faster with broadcast join

Restored broadcast threshold to original value: -1


#### OPTIMIZATION 2: CACHING STRATEGIES

In [0]:
# Cache intermediate results that are used multiple times
import time


def _run_device_aggregations(events_df, label):
    """Run and show the two device_type aggregations, then print the elapsed time.

    Args:
        events_df: DataFrame with `device_type` and `duration_seconds` columns.
        label: Prefix for the timing line (e.g. "Non-cached" or "Cached").

    Returns:
        Tuple of (count per device_type, mean duration per device_type) DataFrames.
    """
    started = time.time()
    device_counts = events_df.groupBy("device_type").count()
    device_counts.show()

    device_mean_duration = events_df.groupBy("device_type").agg({"duration_seconds": "mean"})
    device_mean_duration.show()
    print(f"{label} execution time: {time.time() - started:.2f} seconds")
    return device_counts, device_mean_duration


print("Caching demonstration:")
# First, clear any existing cache
df.unpersist(blocking=True)

# Non-cached approach
result1, result2 = _run_device_aggregations(df, "Non-cached")

# Cached approach
df.cache()  # Cache the dataframe
try:
    df.count()  # Force cache evaluation before the timed section starts
    result1, result2 = _run_device_aggregations(df, "Cached")
finally:
    # Remember to unpersist when done, even if the cached run failed.
    # Calling it inside the block also keeps the returned DataFrame from
    # being echoed as the cell's last expression.
    df.unpersist()

Caching demonstration:
+-----------+-----+
|device_type|count|
+-----------+-----+
|         TV|12550|
|     Mobile|12328|
|     Tablet|12461|
|        Web|12661|
+-----------+-----+

+-----------+---------------------+
|device_type|avg(duration_seconds)|
+-----------+---------------------+
|         TV|    4352.044302788845|
|     Mobile|   4258.6459279688515|
|     Tablet|    4335.961399566648|
|        Web|    4215.865808387963|
+-----------+---------------------+

Non-cached execution time: 2.62 seconds
+-----------+-----+
|device_type|count|
+-----------+-----+
|         TV|12550|
|     Mobile|12328|
|     Tablet|12461|
|        Web|12661|
+-----------+-----+

+-----------+---------------------+
|device_type|avg(duration_seconds)|
+-----------+---------------------+
|         TV|    4352.044302788845|
|     Mobile|   4258.6459279688515|
|     Tablet|    4335.961399566648|
|        Web|    4215.865808387963|
+-----------+---------------------+

Cached execution time: 1.05 seconds


DataFrame[event_id: string, user_id: string, content_id: string, timestamp: timestamp, duration_seconds: int, device_type: string, quality: string, buffering_count: int, error_type: string, ip_address: string, country: string, session_id: string]

#### OPTIMIZATION 3: PARTITIONED WRITES

In [0]:
# Partition by columns that are frequently used in filters

# Destination for the partitioned parquet output
output_path = "/pyspark/video-streaming-data/module3-transform/optimization/optimized_output"

print("Partitioned Writes Demonstration:")
print("--------------------------------")

# Write one sub-directory per device_type value, replacing any existing output
print("Writing data partitioned by device_type...")
(
    df.write
    .mode("overwrite")
    .partitionBy("device_type")
    .parquet(output_path)
)

# List what was written at the destination
print(f"\nPartition structure created at {output_path}:")
display(dbutils.fs.ls(output_path))

Partitioned Writes Demonstration:
--------------------------------
Writing data partitioned by device_type...

Partition structure created at /pyspark/video-streaming-data/module3-transform/optimization/optimized_output:


path,name,size,modificationTime
dbfs:/pyspark/video-streaming-data/module3-transform/optimization/optimized_output/_SUCCESS,_SUCCESS,0,1744664659000
dbfs:/pyspark/video-streaming-data/module3-transform/optimization/optimized_output/_committed_6586057143507670899,_committed_6586057143507670899,35,1744499442000
dbfs:/pyspark/video-streaming-data/module3-transform/optimization/optimized_output/device_type=Mobile/,device_type=Mobile/,0,1744497246000
dbfs:/pyspark/video-streaming-data/module3-transform/optimization/optimized_output/device_type=TV/,device_type=TV/,0,1744497246000
dbfs:/pyspark/video-streaming-data/module3-transform/optimization/optimized_output/device_type=Tablet/,device_type=Tablet/,0,1744497247000
dbfs:/pyspark/video-streaming-data/module3-transform/optimization/optimized_output/device_type=Web/,device_type=Web/,0,1744497246000


In [0]:
# Summarise why partitioned output helps; one line is emitted per argument
print(
    "\nBenefits of partitioned writes:",
    "1. Enables partition pruning - Spark can skip irrelevant partitions",
    "2. Enables parallel reads - Different partitions can be read simultaneously",
    "3. Supports partition-aware queries - Filters on partition columns are much faster",
    sep="\n",
)