In [1]:
!pip install pyspark



# DATASET 1 — RIDES (Large Fact Table)

In [2]:
# Ride records; the tuple field order matches `rides_cols`:
# (ride_id, user_id, city, distance_km, duration_seconds, status)
rides_data = [
    ("R001", "U001", "Hyderabad", 12.5, 240, "Completed"),
    ("R002", "U002", "Delhi", 8.2, 180, "Completed"),
    ("R003", "U003", "Mumbai", 15.0, 300, "Cancelled"),
    ("R004", "U004", "Bangalore", 5.5, 120, "Completed"),
    ("R005", "U005", "Hyderabad", 20.0, 360, "Completed"),
    ("R006", "U006", "Delhi", 25.0, 420, "Completed"),
    ("R007", "U007", "Mumbai", 7.5, 150, "Completed"),
    ("R008", "U008", "Bangalore", 18.0, 330, "Completed"),
    ("R009", "U009", "Delhi", 6.0, 140, "Cancelled"),
    ("R010", "U010", "Hyderabad", 10.0, 200, "Completed"),
]

In [3]:
# Column names for the rides DataFrame, in the same order as the fields of each
# tuple in `rides_data`.
rides_cols = ["ride_id", "user_id", "city", "distance_km", "duration_seconds", "status"]

In [5]:
from pyspark.sql import SparkSession

# Get the active SparkSession, or create one named "Rides Example" if none exists.
spark = (
    SparkSession.builder
    .appName("Rides Example")
    .getOrCreate()
)

# Build the rides DataFrame. Only column names are supplied, so Spark infers the
# column types from the Python values in the tuples.
rides_df = spark.createDataFrame(data=rides_data, schema=rides_cols)

# DATASET 2 — CITY SURGE MULTIPLIERS (Small Lookup)

In [6]:
# City -> surge multiplier lookup, materialised as a list of (city, multiplier)
# tuples in insertion order.
surge_data = list(
    {
        "Hyderabad": 1.2,
        "Delhi": 1.5,
        "Mumbai": 1.8,
        "Bangalore": 1.3,
    }.items()
)

In [7]:
# Column names for the surge lookup, matching the (city, multiplier) tuple order.
surge_cols = [
    "city",
    "surge_multiplier",
]

In [8]:
# Small lookup DataFrame with one row per city and its surge multiplier.
surge_df = spark.createDataFrame(data=surge_data, schema=surge_cols)

# Create a transformation pipeline

- No action triggered: Spark uses lazy evaluation. That means it doesn’t actually run the query or touch the data until an action is called.


In [9]:
from pyspark.sql.functions import col

# Transformations only: this cell builds a plan (keep completed rides, project
# three columns) and calls no action, so no data is processed here.
transformed_rides = rides_df.where(col("status") == "Completed").select(
    "ride_id", "city", "distance_km"
)


# Trigger a single action on the pipeline

In [10]:
# `transformed_rides` was already defined in the previous cell, so it is reused
# here rather than rebuilt. show() is the action that makes Spark execute the
# plan; only the first 5 rows are printed.
transformed_rides.show(5)

+-------+---------+-----------+
|ride_id|     city|distance_km|
+-------+---------+-----------+
|   R001|Hyderabad|       12.5|
|   R002|    Delhi|        8.2|
|   R004|Bangalore|        5.5|
|   R005|Hyderabad|       20.0|
|   R006|    Delhi|       25.0|
+-------+---------+-----------+
only showing top 5 rows


# Create a transformation chain and run explain.

In [11]:
from pyspark.sql.functions import col

# The two filters are kept as separate steps: the parsed plan then shows two
# Filter nodes, and the optimized plan shows them merged into a single Filter.
pipeline_df = (
    rides_df
    .where(col("status") == "Completed")
    .where(col("distance_km") > 10)
    .select("ride_id", "city", "distance_km")
)

# extended=True prints the parsed, analyzed, optimized and physical plans.
pipeline_df.explain(extended=True)

== Parsed Logical Plan ==
'Project ['ride_id, 'city, 'distance_km]
+- Filter (distance_km#3 > cast(10 as double))
   +- Filter (status#5 = Completed)
      +- LogicalRDD [ride_id#0, user_id#1, city#2, distance_km#3, duration_seconds#4L, status#5], false

== Analyzed Logical Plan ==
ride_id: string, city: string, distance_km: double
Project [ride_id#0, city#2, distance_km#3]
+- Filter (distance_km#3 > cast(10 as double))
   +- Filter (status#5 = Completed)
      +- LogicalRDD [ride_id#0, user_id#1, city#2, distance_km#3, duration_seconds#4L, status#5], false

== Optimized Logical Plan ==
Project [ride_id#0, city#2, distance_km#3]
+- Filter ((isnotnull(status#5) AND isnotnull(distance_km#3)) AND ((status#5 = Completed) AND (distance_km#3 > 10.0)))
   +- LogicalRDD [ride_id#0, user_id#1, city#2, distance_km#3, duration_seconds#4L, status#5], false

== Physical Plan ==
*(1) Project [ride_id#0, city#2, distance_km#3]
+- *(1) Filter ((isnotnull(status#5) AND isnotnull(distance_km#3)) AND ((s

# Reorder transformations (filter after join vs before join)

In [18]:
# Variant A: join rides with the surge lookup first, then keep completed rides.
result_df = (
    rides_df
    .join(surge_df, on="city")
    .where(col("status") == "Completed")
    .select("ride_id", "city", "distance_km")
)

In [19]:
# Variant B: keep completed rides first, then join the reduced set with the
# surge lookup. (Variable renamed from the misspelled `filtered_rised`.)
completed_rides = rides_df.filter(col("status") == "Completed")

# NOTE: this rebinds `result_df`, replacing the join-then-filter frame that the
# previous cell assigned to the same name.
result_df = (
    completed_rides.join(surge_df, "city")
                   .select("ride_id", "city", "distance_km")
)

# PARTITIONS & SHUFFLE

In [20]:

# Report how many partitions back the rides DataFrame.
print(f"Partitions: {rides_df.rdd.getNumPartitions()}")

Partitions: 2


# Repartitions

In [21]:
# Redistribute the rides into exactly 4 partitions.
rides_repart = rides_df.repartition(numPartitions=4)

print(f"Partitions after repartition: {rides_repart.rdd.getNumPartitions()}")

Partitions after repartition: 4


# Coalesce

In [22]:

# Collapse the 4 repartitioned partitions into a single partition.
rides_coalesce = rides_repart.coalesce(numPartitions=1)

print(f"Partitions after coalesce: {rides_coalesce.rdd.getNumPartitions()}")

Partitions after coalesce: 1


In [23]:

# Write both variants as Parquet, replacing any existing output directory, so
# the files produced by the repartitioned and the coalesced frame can be compared.
rides_repart.write.parquet("output_repart", mode="overwrite")
rides_coalesce.write.parquet("output_coalesce", mode="overwrite")

# Repartition rides by city

In [24]:
from pyspark.sql.functions import col

# Repartition by the `city` column. No partition count is passed; the physical
# plan in the output shows a hash-partitioning Exchange on city.
rides_by_city = rides_df.repartition("city")

# extended=True prints the parsed, analyzed, optimized and physical plans.
rides_by_city.explain(extended=True)

== Parsed Logical Plan ==
'RepartitionByExpression ['city]
+- LogicalRDD [ride_id#0, user_id#1, city#2, distance_km#3, duration_seconds#4L, status#5], false

== Analyzed Logical Plan ==
ride_id: string, user_id: string, city: string, distance_km: double, duration_seconds: bigint, status: string
RepartitionByExpression [city#2]
+- LogicalRDD [ride_id#0, user_id#1, city#2, distance_km#3, duration_seconds#4L, status#5], false

== Optimized Logical Plan ==
RepartitionByExpression [city#2]
+- LogicalRDD [ride_id#0, user_id#1, city#2, distance_km#3, duration_seconds#4L, status#5], false

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Exchange hashpartitioning(city#2, 200), REPARTITION_BY_COL, [plan_id=160]
   +- Scan ExistingRDD[ride_id#0,user_id#1,city#2,distance_km#3,duration_seconds#4L,status#5]



# JOIN WITHOUT BROADCAST (BAD DAG)

In [25]:
from pyspark.sql.functions import col

# Plain inner join on `city`, with no join hint supplied.
joined_df = rides_df.join(surge_df, on="city")

# extended=True prints the parsed, analyzed, optimized and physical plans.
joined_df.explain(extended=True)

== Parsed Logical Plan ==
'Join UsingJoin(Inner, [city])
:- LogicalRDD [ride_id#0, user_id#1, city#2, distance_km#3, duration_seconds#4L, status#5], false
+- LogicalRDD [city#6, surge_multiplier#7], false

== Analyzed Logical Plan ==
city: string, ride_id: string, user_id: string, distance_km: double, duration_seconds: bigint, status: string, surge_multiplier: double
Project [city#2, ride_id#0, user_id#1, distance_km#3, duration_seconds#4L, status#5, surge_multiplier#7]
+- Join Inner, (city#2 = city#6)
   :- LogicalRDD [ride_id#0, user_id#1, city#2, distance_km#3, duration_seconds#4L, status#5], false
   +- LogicalRDD [city#6, surge_multiplier#7], false

== Optimized Logical Plan ==
Project [city#2, ride_id#0, user_id#1, distance_km#3, duration_seconds#4L, status#5, surge_multiplier#7]
+- Join Inner, (city#2 = city#6)
   :- Filter isnotnull(city#2)
   :  +- LogicalRDD [ride_id#0, user_id#1, city#2, distance_km#3, duration_seconds#4L, status#5], false
   +- Filter isnotnull(city#6)
    

# Apply a filter (`distance_km > 10`) before the join

In [26]:
from pyspark.sql.functions import col

# Drop rides of 10 km or less before joining, so fewer rows reach the join.
filtered_rides = rides_df.where(col("distance_km") > 10)

joined_df = filtered_rides.join(surge_df, on="city")

# extended=True prints the parsed, analyzed, optimized and physical plans.
joined_df.explain(extended=True)

== Parsed Logical Plan ==
'Join UsingJoin(Inner, [city])
:- Filter (distance_km#3 > cast(10 as double))
:  +- LogicalRDD [ride_id#0, user_id#1, city#2, distance_km#3, duration_seconds#4L, status#5], false
+- LogicalRDD [city#6, surge_multiplier#7], false

== Analyzed Logical Plan ==
city: string, ride_id: string, user_id: string, distance_km: double, duration_seconds: bigint, status: string, surge_multiplier: double
Project [city#2, ride_id#0, user_id#1, distance_km#3, duration_seconds#4L, status#5, surge_multiplier#7]
+- Join Inner, (city#2 = city#6)
   :- Filter (distance_km#3 > cast(10 as double))
   :  +- LogicalRDD [ride_id#0, user_id#1, city#2, distance_km#3, duration_seconds#4L, status#5], false
   +- LogicalRDD [city#6, surge_multiplier#7], false

== Optimized Logical Plan ==
Project [city#2, ride_id#0, user_id#1, distance_km#3, duration_seconds#4L, status#5, surge_multiplier#7]
+- Join Inner, (city#2 = city#6)
   :- Filter ((isnotnull(distance_km#3) AND (distance_km#3 > 10.0))

# BROADCAST JOIN (GOOD DAG)

## Apply a broadcast hint to `surge_df`

In [27]:
from pyspark.sql.functions import col
from pyspark.sql import DataFrame

from pyspark.sql.functions import broadcast

# Mark the small surge lookup with a broadcast hint before joining on `city`;
# the hint appears in the plan output as `ResolvedHint (strategy=broadcast)`.
joined_df = rides_df.join(broadcast(surge_df), on="city")

# extended=True prints the parsed, analyzed, optimized and physical plans.
joined_df.explain(extended=True)

== Parsed Logical Plan ==
'Join UsingJoin(Inner, [city])
:- LogicalRDD [ride_id#0, user_id#1, city#2, distance_km#3, duration_seconds#4L, status#5], false
+- ResolvedHint (strategy=broadcast)
   +- LogicalRDD [city#6, surge_multiplier#7], false

== Analyzed Logical Plan ==
city: string, ride_id: string, user_id: string, distance_km: double, duration_seconds: bigint, status: string, surge_multiplier: double
Project [city#2, ride_id#0, user_id#1, distance_km#3, duration_seconds#4L, status#5, surge_multiplier#7]
+- Join Inner, (city#2 = city#6)
   :- LogicalRDD [ride_id#0, user_id#1, city#2, distance_km#3, duration_seconds#4L, status#5], false
   +- ResolvedHint (strategy=broadcast)
      +- LogicalRDD [city#6, surge_multiplier#7], false

== Optimized Logical Plan ==
Project [city#2, ride_id#0, user_id#1, distance_km#3, duration_seconds#4L, status#5, surge_multiplier#7]
+- Join Inner, (city#2 = city#6), rightHint=(strategy=broadcast)
   :- Filter isnotnull(city#2)
   :  +- LogicalRDD [rid


# From the physical plan, identify all expensive operators and classify them as CPU, memory, or network heavy.

- Join operations → CPU heavy, because they require comparing keys and combining rows.
- Shuffle exchanges → Network heavy, because data must be moved across machines to align by keys.
- Sort operations → CPU and memory heavy, since they require ordering large datasets and holding them in memory buffers.
- Aggregation → CPU heavy, with memory overhead if intermediate results are large.


# Why does Spark default to SortMergeJoin?

Spark defaults to SortMergeJoin because it is the most scalable join strategy for large datasets. It sorts both sides of the join on the key and then merges them efficiently. This avoids broadcasting huge tables and handles cases where both inputs are large, making it a safe default for distributed systems.

# Create a long transformation pipeline without any action. Explain what Spark has done so far.

Spark has only recorded the sequence of steps in a logical plan. It has not processed any data yet. The system builds a blueprint of operations but delays execution until a result is explicitly requested.

# Trigger different actions (count, show, write) separately. Observe whether Spark recomputes the DAG and explain behavior.

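Each action triggers its own job that executes the pipeline's lineage. Unless an intermediate result is cached (`cache()` / `persist()`), Spark recomputes the DAG from the source for every action, although it may skip stages whose shuffle output is still available. This happens because transformations are lazy: a DataFrame holds only the plan, not the data, so Spark ensures correctness by re-running the plan whenever an action is invoked.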

# Why does broadcast remove shuffle from the DAG?

Broadcast removes shuffle because the smaller dataset is copied to all machines. This means the larger dataset can be processed locally without redistributing rows across the network.

# Why does repartition always introduce shuffle?

Repartition introduces shuffle because data must be redistributed to create new partition boundaries. Rows are moved across machines to balance or align with the new partitioning scheme.

# Why is coalesce cheaper than repartition?

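Coalesce is cheaper because it only reduces the number of partitions by merging existing ones. It avoids a full redistribution of data, so there is no network shuffle unless one is explicitly requested (for example `rdd.coalesce(n, shuffle=True)` on the RDD API). The trade-off is that the merged partitions can be unevenly sized.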

# Why does Spark delay execution until an action?

Spark delays execution to optimize performance. By waiting, it can combine multiple steps, eliminate redundancies, and minimize data movement. This lazy evaluation ensures resources are used efficiently and only when results are actually needed.

