In [1]:
from pyspark.sql import SparkSession

# Build (or reuse) the SparkSession that every later cell in this notebook relies on.
spark = (
    SparkSession.builder
    .appName("Week2_Day3_Transformations")
    .getOrCreate()
)

# Confirmation message so the reader can see the session came up.
print("✅ Spark Session Created")

✅ Spark Session Created


In [6]:
# Read the sales CSV: the first line supplies the column names and
# column types are inferred from the data instead of defaulting to strings.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("../data/sales.csv")
)

# Sanity-check the parse by displaying the first five rows.
df.show(5)

+--------+-----------+----------+--------+--------+----------+
|order_id|customer_id|order_date| product|quantity|unit_price|
+--------+-----------+----------+--------+--------+----------+
|       1|       1001|2024-01-01|Widget A|       2|      9.99|
|       2|       1002|2024-01-03|Widget B|       1|     19.99|
|       3|       1001|2024-01-07|Widget C|       5|       4.5|
|       4|       1003|2024-02-10|Widget A|       3|      9.99|
|       5|       1004|2024-02-15|Widget B|       2|     19.99|
+--------+-----------+----------+--------+--------+----------+
only showing top 5 rows



In [7]:
# Expose the DataFrame as an RDD of Row objects for the RDD-API examples below.
rdd = df.rdd

# Pull just two rows to the driver to inspect the Row structure.
first_two_rows = rdd.take(2)
print(first_two_rows)

[Row(order_id=1, customer_id=1001, order_date=datetime.date(2024, 1, 1), product='Widget A', quantity=2, unit_price=9.99), Row(order_id=2, customer_id=1002, order_date=datetime.date(2024, 1, 3), product='Widget B', quantity=1, unit_price=19.99)]


In [9]:
# RDD pipeline: total revenue per product, restricted to order lines with quantity > 2.
# All three steps are transformations; the result is materialised by the collect() cell.

# Step 1 - keep only the rows whose quantity is strictly greater than 2.
filtered_rdd = rdd.filter(lambda row: row['quantity'] > 2)

# Step 2 - turn each remaining row into a (product, revenue) pair,
# where revenue = quantity * unit_price for that order line.
product_revenue_rdd = filtered_rdd.map(lambda row: (row['product'], row['quantity'] * row['unit_price']))

# Step 3 - reduceByKey adds up the revenue values that share the same product key.
revenue_by_product_rdd = product_revenue_rdd.reduceByKey(lambda a, b: a + b)

In [10]:
# Run the RDD pipeline and gather the per-product (product, revenue) pairs
# into `results`, then print them.
results = revenue_by_product_rdd.collect()
print(results)

[('Widget C', 22.5), ('Widget A', 129.87), ('Widget B', 79.96), ('Widget D', 89.97)]


In [11]:
from pyspark.sql.functions import col, sum

# DataFrame-API aggregation: compute revenue per order line, then total it per product.
# NOTE: `sum` here is pyspark.sql.functions.sum (from the import above), which
# shadows Python's builtin sum for the rest of the notebook.
# NOTE: unlike the RDD pipeline, no `quantity > 2` filter is applied, so these
# totals include every order line and will not match the RDD result.
revenue_by_product_df = (
    df.withColumn("revenue", col("quantity") * col("unit_price"))
    .groupBy("product")
    .agg(sum("revenue").alias("total_revenue"))
)
revenue_by_product_df.show()

+--------+------------------+
| product|     total_revenue|
+--------+------------------+
|Widget C|              31.5|
|Widget B|            139.93|
|Widget A|149.85000000000002|
|Widget D|            119.96|
+--------+------------------+

