In [9]:
%run nb0.spark-session.ipynb
from pyspark.sql.functions import col, lit, when, upper, avg

In [10]:
# Column names only; the column types are inferred from the rows in `data`.
schema = ["id", "product", "category", "price", "stock"]

# Sample product rows, one tuple per product, in the same order as `schema`.
data = [
    (1, "Laptop", "Electronics", 1200, 5),
    (2, "Smartphone", "Electronics", 800, 10),
    (3, "Coffee Maker", "Home", 150, 20),
    (4, "Desk Chair", "Furniture", 250, 15),
    (5, "Monitor", "Electronics", 300, 8),
]

# Build the DataFrame from the local rows. No action (show/count/collect) has
# been called yet, so no query has been executed at this point.
# NOTE: creating a DataFrame is not itself a transformation; the first
# transformations on `df` are chained in the next cell.
df = spark.createDataFrame(data, schema=schema)

print("DataFrame created (Transformations recorded).")

DataFrame created (Transformations recorded).


In [11]:
# Lazy pipeline over `df` (nothing is executed while it is being defined):
#   - keep only the rows whose category is "Electronics"
#   - derive total_value = price * stock
#   - replace product with its upper-cased form
transformed_df = (
    df.filter(col("category") == "Electronics")
    .withColumn("total_value", col("price") * col("stock"))
    .withColumn("product", upper(col("product")))
)

# explain() only prints the physical plan Spark has built for the pipeline;
# the query itself is still not run.
transformed_df.explain()

== Physical Plan ==
LocalTableScan [id#926L, product#934, category#928, price#929L, stock#930L, total_value#932L]




In [12]:
# show() is an action: it executes the plan and prints the resulting rows.
print("Executing Action: show()")
transformed_df.show()

# count() is a second, separate action. transformed_df is not cached in this
# section, so the pipeline is evaluated again to obtain the row count.
electronics_count = transformed_df.count()
print(f"Total Electronics items: {electronics_count}")

Executing Action: show()
+---+----------+-----------+-----+-----+-----------+
| id|   product|   category|price|stock|total_value|
+---+----------+-----------+-----+-----+-----------+
|  1|    LAPTOP|Electronics| 1200|    5|       6000|
|  2|SMARTPHONE|Electronics|  800|   10|       8000|
|  5|   MONITOR|Electronics|  300|    8|       2400|
+---+----------+-----------+-----+-----+-----------+

Total Electronics items: 3


In [13]:
# Average price per category, keeping only the categories whose average
# price is above 200 (a filter applied after the aggregation).
final_analysis = (
    df.groupBy("category")
    .agg(avg("price").alias("avg_price"))
    .filter(col("avg_price") > 200)
)

# show() is the action that actually runs the aggregation.
final_analysis.show()

+-----------+-----------------+
|   category|        avg_price|
+-----------+-----------------+
|Electronics|766.6666666666666|
|  Furniture|            250.0|
+-----------+-----------------+



In [14]:
# Print the physical plan Spark built for the grouped aggregation; the
# operators that appear in it are described in the notes that follow.
final_analysis.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Filter (isnotnull(avg_price#1024) AND (avg_price#1024 > 200.0))
   +- HashAggregate(keys=[category#1027], functions=[avg(price#1028L)])
      +- Exchange hashpartitioning(category#1027, 200), ENSURE_REQUIREMENTS, [plan_id=817]
         +- HashAggregate(keys=[category#1027], functions=[partial_avg(price#1028L)])
            +- LocalTableScan [category#1027, price#1028L]




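- **HashAggregate**: Groups rows by the grouping key using an in-memory hash table and computes the aggregation within each partition. It appears twice in this plan: once before the shuffle to compute partial averages (`partial_avg`) and once after it to merge those partial results into the final `avg`.  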
- **Exchange hashpartitioning**: The shuffle step. It redistributes rows across partitions by hashing the grouping key (`category`, into 200 partitions here) so that all rows sharing a key land in the same partition for the final aggregation.  
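- **LocalTableScan**: Scans rows from a local in-memory collection (here, the Python list passed to `createDataFrame`) rather than reading from an external source, so no file or network I/O is needed. It is the leaf of the plan, i.e. the point where the data enters the query.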