In [2]:
# Week 3 - Day 1: Advanced PySpark Optimizations
# **Objective:** Learn about partitioning, caching, and how Spark optimizes execution.

from pyspark.sql import SparkSession

# Obtain the Spark session for this notebook: getOrCreate() returns the
# already-running session if there is one, otherwise it starts a new one.
spark = (
    SparkSession.builder
    .appName("Week3_Day1_Optimizations")
    .getOrCreate()
)

print("✅ Spark Session Active")

✅ Spark Session Active


In [3]:
from pyspark.sql.functions import rand

# Generate a sample dataset with 1 million rows: an `id` column from
# spark.range plus a random `value` column.
# NOTE: rand() is called without a seed, so the generated values are not
# reproducible across sessions; pass a seed (e.g. rand(seed=42)) if stable
# values are needed.
df = spark.range(0, 1_000_000).withColumn("value", rand())

# Preview the first rows, then run a full count as a sanity check on the size.
df.show(5)
print("Total rows:", df.count())

+---+-------------------+
| id|              value|
+---+-------------------+
|  0| 0.5132871941508156|
|  1|0.20142071738666545|
|  2|0.09755448488351526|
|  3| 0.2506853995880579|
|  4| 0.2881690899273799|
+---+-------------------+
only showing top 5 rows

Total rows: 1000000


In [4]:
# Report how many partitions the DataFrame has before any repartitioning.
default_partition_count = df.rdd.getNumPartitions()
print("Default number of partitions:", default_partition_count)

Default number of partitions: 8


In [5]:
# Redistribute the rows across a fixed number of partitions, then confirm
# the resulting partition count.
target_partitions = 8
df_repartitioned = df.repartition(target_partitions)
print("After repartitioning:", df_repartitioned.rdd.getNumPartitions())

After repartitioning: 8


In [6]:
# cache() only marks the DataFrame for caching (it is lazy) and returns the
# same DataFrame; the count() action triggers the computation that actually
# populates the cache, and its result is displayed as the cell output.
df_repartitioned.cache().count()

1000000

In [7]:
# Print the physical execution plan only ("simple" is the default explain mode).
df_repartitioned.explain(mode="simple")

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=true
+- == Final Plan ==
   ShuffleQueryStage 0
   +- Exchange RoundRobinPartitioning(8), REPARTITION_BY_NUM, [plan_id=73]
      +- *(1) Project [id#0L, rand(1620440083077161892) AS value#2]
         +- *(1) Range (0, 1000000, step=1, splits=8)
+- == Initial Plan ==
   Exchange RoundRobinPartitioning(8), REPARTITION_BY_NUM, [plan_id=66]
   +- Project [id#0L, rand(1620440083077161892) AS value#2]
      +- Range (0, 1000000, step=1, splits=8)




In [8]:
# Persist the repartitioned data as Parquet; "overwrite" mode replaces any
# existing output at the target path.
output_path = "data/optimized_data.parquet"
parquet_writer = df_repartitioned.write.mode("overwrite")
parquet_writer.parquet(output_path)
print("✅ Optimized data written to parquet format")

✅ Optimized data written to parquet format
