In [1]:
from pyspark.sql import SparkSession

# Get the SparkSession used by every later cell. getOrCreate() hands back an
# already-active session when one exists, otherwise it starts a new one
# under the application name given here.
spark = (
    SparkSession.builder
    .appName("Week2_Day4_SQL")
    .getOrCreate()
)

print("✅ Spark Session Created")

✅ Spark Session Created


In [3]:
# Load the sales CSV into a DataFrame.
#   header      -> take column names from the first line of the file
#   inferSchema -> let Spark infer each column's type from the data
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("../data/sales.csv")
)

# Preview the first five rows as a quick sanity check on the load.
df.show(5)

+--------+-----------+----------+--------+--------+----------+
|order_id|customer_id|order_date| product|quantity|unit_price|
+--------+-----------+----------+--------+--------+----------+
|       1|       1001|2024-01-01|Widget A|       2|      9.99|
|       2|       1002|2024-01-03|Widget B|       1|     19.99|
|       3|       1001|2024-01-07|Widget C|       5|       4.5|
|       4|       1003|2024-02-10|Widget A|       3|      9.99|
|       5|       1004|2024-02-15|Widget B|       2|     19.99|
+--------+-----------+----------+--------+--------+----------+
only showing top 5 rows



In [4]:
# Register the loaded DataFrame as the temporary view "sales" so the
# spark.sql(...) cells below can refer to it by name. Re-running this cell
# simply replaces the existing view.
df.createOrReplaceTempView(name="sales")
print("✅ Temporary view 'sales' created")

✅ Temporary view 'sales' created


In [5]:
# Revenue per product: quantity * unit_price summed over every row of the
# "sales" view, one output row per product.
# The statement has no ORDER BY, so the row order of the result is not
# guaranteed to be the same from run to run.
revenue_by_product_sql = """
    SELECT product, SUM(quantity * unit_price) AS total_revenue
    FROM sales
    GROUP BY product
"""

query1 = spark.sql(revenue_by_product_sql)
query1.show()

+--------+------------------+
| product|     total_revenue|
+--------+------------------+
|Widget C|              31.5|
|Widget B|            139.93|
|Widget A|149.85000000000002|
|Widget D|            119.96|
+--------+------------------+



In [6]:
# Row-level filter: keep only the rows of "sales" whose quantity is greater
# than 2. Nothing is aggregated per customer, so a customer appears once for
# each of their rows that passes the filter.
large_quantity_orders_sql = """
    SELECT order_id, customer_id, product, quantity
    FROM sales
    WHERE quantity > 2
"""

query2 = spark.sql(large_quantity_orders_sql)
query2.show()

+--------+-----------+--------+--------+
|order_id|customer_id| product|quantity|
+--------+-----------+--------+--------+
|       3|       1001|Widget C|       5|
|       4|       1003|Widget A|       3|
|       7|       1005|Widget A|      10|
|       8|       1001|Widget B|       4|
|      10|       1007|Widget D|       3|
+--------+-----------+--------+--------+



In [9]:
# Monthly revenue trend: order_date is formatted as 'yyyy-MM' to form the
# month key, revenue (quantity * unit_price) is summed per key, and the
# result is sorted by that key. Sorting the 'yyyy-MM' strings also puts the
# months in chronological order.
monthly_revenue_sql = """
    SELECT date_format(order_date, 'yyyy-MM') AS order_month, SUM(quantity*unit_price) AS revenue
    FROM sales
    GROUP BY order_month
    ORDER BY order_month
"""

query3 = spark.sql(monthly_revenue_sql)
query3.show()

+-----------+------------------+
|order_month|           revenue|
+-----------+------------------+
|    2024-01|             62.47|
|    2024-02| 69.94999999999999|
|    2024-03|209.85000000000002|
|    2024-04|             98.97|
+-----------+------------------+



In [10]:
# DataFrame-API versions of the three Spark SQL queries above.
#
# Spark's `sum` is imported under the alias `spark_sum` so that Python's
# built-in `sum` is not shadowed for the rest of the notebook session.
from pyspark.sql.functions import col, date_format, sum as spark_sum

# Per-row revenue feeds both aggregations below, so derive it once instead of
# repeating the same withColumn(...) in each pipeline.
sales_with_revenue = df.withColumn("revenue", col("quantity") * col("unit_price"))

# 1) Total revenue by product (counterpart of query1).
(
    sales_with_revenue
    .groupBy("product")
    .agg(spark_sum("revenue").alias("total_revenue"))
    .show()
)

# 2) Rows with quantity greater than 2 (counterpart of query2).
(
    df.filter(col("quantity") > 2)
    .select("order_id", "customer_id", "product", "quantity")
    .show()
)

# 3) Monthly revenue, sorted by the 'yyyy-MM' month key (counterpart of query3).
(
    sales_with_revenue
    .withColumn("order_month", date_format(col("order_date"), "yyyy-MM"))
    .groupBy("order_month")
    .agg(spark_sum("revenue").alias("monthly_revenue"))
    .orderBy("order_month")
    .show()
)

+--------+------------------+
| product|     total_revenue|
+--------+------------------+
|Widget C|              31.5|
|Widget B|            139.93|
|Widget A|149.85000000000002|
|Widget D|            119.96|
+--------+------------------+

+--------+-----------+--------+--------+
|order_id|customer_id| product|quantity|
+--------+-----------+--------+--------+
|       3|       1001|Widget C|       5|
|       4|       1003|Widget A|       3|
|       7|       1005|Widget A|      10|
|       8|       1001|Widget B|       4|
|      10|       1007|Widget D|       3|
+--------+-----------+--------+--------+

+-----------+------------------+
|order_month|   monthly_revenue|
+-----------+------------------+
|    2024-01|             62.47|
|    2024-02| 69.94999999999999|
|    2024-03|209.85000000000002|
|    2024-04|             98.97|
+-----------+------------------+

