In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Week3_Day3_PySpark_Storage_Optimization") \
    .getOrCreate()

print("✅ Spark Session Ready")

✅ Spark Session Ready


In [2]:
from pyspark.sql.functions import expr

df = spark.range(0, 1_000_000) \
    .withColumn("category", expr("CASE WHEN id % 5 = 0 THEN 'A' WHEN id % 5 = 1 THEN 'B' WHEN id % 5 = 2 THEN 'C' WHEN id % 5 = 3 THEN 'D' ELSE 'E' END")) \
    .withColumn("value", expr("rand() * 100"))

df.show(5)
print(f"Total rows: {df.count()}")

+---+--------+------------------+
| id|category|             value|
+---+--------+------------------+
|  0|       A| 77.59360960017673|
|  1|       B| 92.54651086956007|
|  2|       C| 94.80803080062599|
|  3|       D|32.060780506571675|
|  4|       E| 38.49879434646528|
+---+--------+------------------+
only showing top 5 rows

Total rows: 1000000


In [3]:
df.cache()
df.count()  # triggers caching
print("✅ Cached in memory")

# Try persistence (both memory and disk)
from pyspark import StorageLevel
df.persist(StorageLevel.MEMORY_AND_DISK)

✅ Cached in memory


DataFrame[id: bigint, category: string, value: double]

In [4]:
print("Partitions before:", df.rdd.getNumPartitions())

df_repart = df.repartition(10, "category")
print("Partitions after:", df_repart.rdd.getNumPartitions())

Partitions before: 8
Partitions after: 10


In [5]:
# Parquet
df_repart.write.mode("overwrite").parquet("output/day3_parquet")

# ORC
df_repart.write.mode("overwrite").orc("output/day3_orc")

# CSV
df_repart.write.mode("overwrite").csv("output/day3_csv", header=True)

In [6]:
import time

def read_time(path, format):
    start = time.time()
    _ = spark.read.format(format).load(path).count()
    print(f"{format.upper()} read time: {round(time.time() - start, 2)} sec")

read_time("output/day3_parquet", "parquet")
read_time("output/day3_orc", "orc")
read_time("output/day3_csv", "csv")

PARQUET read time: 1.3 sec
ORC read time: 1.08 sec
CSV read time: 2.1 sec


In [7]:
df_repart.write.mode("overwrite").partitionBy("category").parquet("output/day3_partitioned")