In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Week2_Day5_Joins_Aggregations") \
    .getOrCreate()

print("✅ Spark Session Created")

✅ Spark Session Created


In [2]:
# Sales data
sales_df = spark.read.csv("../data/sales.csv", header=True, inferSchema=True)

# Customer data (create a sample customers.csv if not already there)
customers_df = spark.read.csv("../data/customers.csv", header=True, inferSchema=True)

sales_df.show(5)
customers_df.show(5)

+--------+-----------+----------+--------+--------+----------+
|order_id|customer_id|order_date| product|quantity|unit_price|
+--------+-----------+----------+--------+--------+----------+
|       1|       1001|2024-01-01|Widget A|       2|      9.99|
|       2|       1002|2024-01-03|Widget B|       1|     19.99|
|       3|       1001|2024-01-07|Widget C|       5|       4.5|
|       4|       1003|2024-02-10|Widget A|       3|      9.99|
|       5|       1004|2024-02-15|Widget B|       2|     19.99|
+--------+-----------+----------+--------+--------+----------+
only showing top 5 rows

+-----------+-------------+------+
|customer_id|customer_name|region|
+-----------+-------------+------+
|       1001|         Amit| North|
|       1002|         Neha| South|
|       1003|          Raj|  East|
|       1004|        Pooja|  West|
|       1005|        Karan| North|
+-----------+-------------+------+



In [3]:
# Inner Join
inner_join = sales_df.join(customers_df, on="customer_id", how="inner")
print("🔹 Inner Join Result:")
inner_join.show()

# Left Join
left_join = sales_df.join(customers_df, on="customer_id", how="left")
print("🔹 Left Join Result:")
left_join.show()

🔹 Inner Join Result:
+-----------+--------+----------+--------+--------+----------+-------------+------+
|customer_id|order_id|order_date| product|quantity|unit_price|customer_name|region|
+-----------+--------+----------+--------+--------+----------+-------------+------+
|       1001|       1|2024-01-01|Widget A|       2|      9.99|         Amit| North|
|       1002|       2|2024-01-03|Widget B|       1|     19.99|         Neha| South|
|       1001|       3|2024-01-07|Widget C|       5|       4.5|         Amit| North|
|       1003|       4|2024-02-10|Widget A|       3|      9.99|          Raj|  East|
|       1004|       5|2024-02-15|Widget B|       2|     19.99|        Pooja|  West|
|       1002|       6|2024-03-01|Widget D|       1|     29.99|         Neha| South|
|       1005|       7|2024-03-05|Widget A|      10|      9.99|        Karan| North|
|       1001|       8|2024-03-20|Widget B|       4|     19.99|         Amit| North|
+-----------+--------+----------+--------+--------+----

In [4]:
from pyspark.sql.functions import sum, avg, count, col

# Total revenue per customer
revenue_per_customer = inner_join.withColumn("revenue", col("quantity") * col("unit_price")) \
    .groupBy("customer_name") \
    .agg(sum("revenue").alias("total_revenue"))

revenue_per_customer.show()

# Average revenue per region
avg_revenue_region = inner_join.withColumn("revenue", col("quantity") * col("unit_price")) \
    .groupBy("region") \
    .agg(avg("revenue").alias("avg_revenue"))

avg_revenue_region.show()

+-------------+-------------+
|customer_name|total_revenue|
+-------------+-------------+
|        Pooja|        39.98|
|          Raj|        29.97|
|         Amit|       122.44|
|         Neha|        49.98|
|        Karan|         99.9|
+-------------+-------------+

+------+------------------+
|region|       avg_revenue|
+------+------------------+
| South|             24.99|
|  East|             29.97|
|  West|             39.98|
| North|55.584999999999994|
+------+------------------+

