In [0]:
from pyspark.sql import SparkSession

# Bind the session explicitly instead of relying on a global injected by the
# notebook runtime: getOrCreate() hands back the already-active session when
# one exists and builds a new one otherwise, so the notebook also runs
# outside a managed notebook environment.
spark = SparkSession.builder.getOrCreate()
spark

In [0]:
#1. Ingest all 3 CSVs as Delta Tables
def _read_headered_csv(csv_path):
    """Load one CSV whose first row is the header, letting Spark infer column types."""
    return spark.read.options(header=True, inferSchema=True).csv(csv_path)


# Source CSVs -> DataFrames
df_orders = _read_headered_csv("file:/Workspace/Shared/orders.csv")
df_customers = _read_headered_csv("file:/Workspace/Shared/customers.csv")
df_products = _read_headered_csv("file:/Workspace/Shared/products.csv")

# DataFrames -> Delta tables (each run replaces the table contents)
for frame, delta_path in (
    (df_orders, "/Workspace/Shared/delta/orders"),
    (df_customers, "/Workspace/Shared/delta/customers"),
    (df_products, "/Workspace/Shared/delta/products"),
):
    frame.write.format("delta").mode("overwrite").save(delta_path)

# Preview what was loaded: customers, then products, then orders
for frame in (df_customers, df_products, df_orders):
    frame.show()

+----------+------------+------+----------+
|CustomerID|CustomerName|Region|SignupDate|
+----------+------------+------+----------+
|      C001|        Amit| North|2023-11-12|
|      C002|        Sara| South|2024-01-08|
|      C003|        John|  West|2023-06-20|
|      C004|       Priya|  East|2024-03-15|
+----------+------------+------+----------+

+---------+-----------+-----------+-----+------------+
|ProductID|ProductName|   Category|Stock|ReorderLevel|
+---------+-----------+-----------+-----+------------+
|    P1001|     Laptop|Electronics|    5|           2|
|    P1002|      Phone|Electronics|   10|           3|
|    P1003|     Tablet|Electronics|    7|           2|
|    P1004|   Keyboard|Accessories|   15|           5|
+---------+-----------+-----------+-----+------------+

+-------+----------+---------+--------+-----+----------+---------+
|OrderID|CustomerID|ProductID|Quantity|Price| OrderDate|   Status|
+-------+----------+---------+--------+-----+----------+---------+
|   3

In [0]:
#2. Total Revenue per Product
# Import Spark's aggregate `sum` under an alias: a bare `import sum` would
# replace Python's builtin sum() for the rest of the notebook session.
from pyspark.sql.functions import col, sum as spark_sum

# Revenue of an order line = Quantity * Price; totalled per ProductID.
df_revenue_product = (
    df_orders
    .withColumn("Revenue", col("Quantity") * col("Price"))
    .groupBy("ProductID")
    .agg(spark_sum("Revenue").alias("TotalRevenue"))
)

df_revenue_product.show()

+---------+------------+
|ProductID|TotalRevenue|
+---------+------------+
|    P1001|       75000|
|    P1002|      150000|
|    P1004|       30000|
|    P1003|       30000|
+---------+------------+



In [0]:
#3. Revenue by Region (Join Orders + Customers)
# Alias Spark's `sum` so Python's builtin sum() is not shadowed.
from pyspark.sql.functions import col, sum as spark_sum

# Joining on the column name (not an expression) keeps a single CustomerID
# column in the result; it is an inner join, so orders without a matching
# customer row are dropped.
df_joined = df_orders.join(df_customers, "CustomerID")

# Revenue of an order line = Quantity * Price; totalled per customer Region.
df_revenue_region = (
    df_joined
    .withColumn("Revenue", col("Quantity") * col("Price"))
    .groupBy("Region")
    .agg(spark_sum("Revenue").alias("TotalRevenue"))
)
df_revenue_region.show()

+------+------------+
|Region|TotalRevenue|
+------+------------+
| South|      100000|
|  East|       30000|
|  West|       30000|
| North|      125000|
+------+------------+



In [0]:
#4. Update Status of Pending Orders to 'Cancelled'
from delta.tables import DeltaTable
from pyspark.sql.functions import expr

# Handle on the orders Delta table; kept in `orders_table` for later cells.
orders_table = DeltaTable.forPath(spark, "/Workspace/Shared/delta/orders")

# Rows to touch, and the new value to write into them.
is_pending = expr("Status = 'Pending'")
cancelled_assignment = {"Status": expr("'Cancelled'")}

orders_table.update(condition=is_pending, set=cancelled_assignment)

# Re-read the table from storage to confirm the update landed.
spark.read.format('delta').load('/Workspace/Shared/delta/orders').show()

+-------+----------+---------+--------+-----+----------+---------+
|OrderID|CustomerID|ProductID|Quantity|Price| OrderDate|   Status|
+-------+----------+---------+--------+-----+----------+---------+
|   3005|      C004|    P1004|       3|10000|2024-05-05|Cancelled|
|   3001|      C001|    P1001|       1|75000|2024-05-01|Delivered|
|   3002|      C002|    P1002|       2|50000|2024-05-02| Returned|
|   3003|      C003|    P1003|       1|30000|2024-05-03|Delivered|
|   3004|      C001|    P1002|       1|50000|2024-05-04|Delivered|
+-------+----------+---------+--------+-----+----------+---------+



In [0]:
 #5. Merge a New Return Record into Orders
from pyspark.sql import Row

# Single-row source batch holding the return to be merged.
new_return = [
    Row(
        OrderID=3006,
        CustomerID='C002',
        ProductID='P1003',
        Quantity=1,
        Price=30000,
        OrderDate='2024-05-06',
        Status='Returned',
    )
]
df_new = spark.createDataFrame(new_return)

# Insert-only merge keyed on OrderID: a source row whose OrderID already
# exists in the target is left alone (no matched clause is defined).
merge_builder = orders_table.alias("target").merge(
    df_new.alias("source"),
    "target.OrderID = source.OrderID",
)
merge_builder.whenNotMatchedInsertAll().execute()

# Shows the source batch only, not the merged table.
df_new.show()

+-------+----------+---------+--------+-----+----------+--------+
|OrderID|CustomerID|ProductID|Quantity|Price| OrderDate|  Status|
+-------+----------+---------+--------+-----+----------+--------+
|   3006|      C002|    P1003|       1|30000|2024-05-06|Returned|
+-------+----------+---------+--------+-----+----------+--------+



In [0]:
#6. DLT Pipeline Simulation: Raw → Cleaned → Aggregated
# Alias Spark's `sum` so Python's builtin sum() is not shadowed, and keep the
# import at the top of the cell rather than between statements.
from pyspark.sql.functions import col, sum as spark_sum

# --- Raw: current contents of the orders Delta table
df_orders_raw = spark.read.format("delta").load("/Workspace/Shared/delta/orders")

# --- Cleaned: remove rows with any NULLs
df_cleaned = df_orders_raw.na.drop()
df_cleaned.show()
# Optionally register as a temp view in case SQL access is needed later
df_cleaned.createOrReplaceTempView("orders_cleaned")

# --- Aggregated: revenue per product category
# df_cleaned is used directly; reading it back from the temp view it was just
# registered as would only yield the same DataFrame again.
df_products = spark.read.format("delta").load("/Workspace/Shared/delta/products")

# Inner join on ProductID: orders without a matching product row are dropped.
df_joined = df_cleaned.join(df_products, on="ProductID")

df_category_revenue = (
    df_joined
    .withColumn("Revenue", col("Quantity") * col("Price"))
    .groupBy("Category")
    .agg(spark_sum("Revenue").alias("TotalRevenue"))
)

df_category_revenue.show()


+-------+----------+---------+--------+-----+----------+---------+
|OrderID|CustomerID|ProductID|Quantity|Price| OrderDate|   Status|
+-------+----------+---------+--------+-----+----------+---------+
|   3001|      C001|    P1001|       1|75000|2024-05-01|Delivered|
|   3002|      C002|    P1002|       2|50000|2024-05-02| Returned|
|   3003|      C003|    P1003|       1|30000|2024-05-03|Delivered|
|   3004|      C001|    P1002|       1|50000|2024-05-04|Delivered|
|   3005|      C004|    P1004|       3|10000|2024-05-05|  Pending|
|   3006|      C002|    P1003|       1|30000|2024-05-06| Returned|
+-------+----------+---------+--------+-----+----------+---------+

+-----------+------------+
|   Category|TotalRevenue|
+-----------+------------+
|Electronics|      285000|
|Accessories|       30000|
+-----------+------------+



In [0]:
#7. View Data Before the Status Update (Time Travel)
# Pin the read to table version 0.
# NOTE(review): version 0 is the pre-update snapshot only if the ingest cell
# wrote this table exactly once — confirm with the table history.
version_zero_reader = spark.read.format("delta").option("versionAsOf", 0)
df_v0 = version_zero_reader.load("/Workspace/Shared/delta/orders")

status_by_order = df_v0.select("OrderID", "Status")
status_by_order.show()

+-------+---------+
|OrderID|   Status|
+-------+---------+
|   3001|Delivered|
|   3002| Returned|
|   3003|Delivered|
|   3004|Delivered|
|   3005|  Pending|
+-------+---------+



In [0]:
#8. Restore to an Older Version (e.g., version 0)
from delta.tables import DeltaTable

orders_path = "/Workspace/Shared/delta/orders"

# Use Delta's native restore instead of reading the old snapshot and
# overwriting the table with it: the restore is a single metadata commit that
# points the table back at the version-0 files, needs no data rewrite, and is
# recorded as a RESTORE operation in the table history.
DeltaTable.forPath(spark, orders_path).restoreToVersion(0)

# Read the table as it stands now to confirm it matches version 0.
df_old = spark.read.format("delta").load(orders_path)
df_old.show()

+-------+----------+---------+--------+-----+----------+---------+
|OrderID|CustomerID|ProductID|Quantity|Price| OrderDate|   Status|
+-------+----------+---------+--------+-----+----------+---------+
|   3001|      C001|    P1001|       1|75000|2024-05-01|Delivered|
|   3002|      C002|    P1002|       2|50000|2024-05-02| Returned|
|   3003|      C003|    P1003|       1|30000|2024-05-03|Delivered|
|   3004|      C001|    P1002|       1|50000|2024-05-04|Delivered|
|   3005|      C004|    P1004|       3|10000|2024-05-05|  Pending|
+-------+----------+---------+--------+-----+----------+---------+



In [0]:
#9. VACUUM with Shortened Retention Period
# WARNING: RETAIN 0 HOURS deletes every data file not referenced by the
# current table version, so time travel to older versions stops working.
# Demo only.

retention_check_key = "spark.databricks.delta.retentionDurationCheck.enabled"

# Remember the current setting ("true" when it was never set) so the safety
# check is only bypassed for this one command.
previous_retention_check = spark.conf.get(retention_check_key, "true")
spark.conf.set(retention_check_key, "false")
try:
    vacuum_result = spark.sql("VACUUM delta.`/Workspace/Shared/delta/orders` RETAIN 0 HOURS")
finally:
    # Re-enable the check even if the VACUUM fails or the cell is cancelled;
    # otherwise it stays off for the rest of the session.
    spark.conf.set(retention_check_key, previous_retention_check)

vacuum_result

com.databricks.backend.common.rpc.CommandCancelledException
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$5(SequenceExecutionState.scala:136)
	at scala.Option.getOrElse(Option.scala:189)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3(SequenceExecutionState.scala:136)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3$adapted(SequenceExecutionState.scala:133)
	at scala.collection.immutable.Range.foreach(Range.scala:158)
	at com.databricks.spark.chauffeur.SequenceExecutionState.cancel(SequenceExecutionState.scala:133)
	at com.databricks.spark.chauffeur.ExecContextState.cancelRunningSequence(ExecContextState.scala:728)
	at com.databricks.spark.chauffeur.ExecContextState.$anonfun$cancel$1(ExecContextState.scala:446)
	at scala.Option.getOrElse(Option.scala:189)
	at com.databricks.spark.chauffeur.ExecContextState.cancel(ExecContextState.scala:446)
	at com.databricks.spark.chauffeur.ExecutionContextManagerV1.can

In [0]:
#10. Expectations (Validation)
from pyspark.sql.functions import col

df_orders = spark.read.format("delta").load("/Workspace/Shared/delta/orders")

# One named rule per expectation; a row is valid only if it satisfies all three.
has_positive_quantity = col("Quantity") > 0
has_positive_price = col("Price") > 0
has_order_date = col("OrderDate").isNotNull()

df_valid = df_orders.filter(has_positive_quantity & has_positive_price & has_order_date)

df_valid.show()

+-------+----------+---------+--------+-----+----------+---------+
|OrderID|CustomerID|ProductID|Quantity|Price| OrderDate|   Status|
+-------+----------+---------+--------+-----+----------+---------+
|   3001|      C001|    P1001|       1|75000|2024-05-01|Delivered|
|   3002|      C002|    P1002|       2|50000|2024-05-02| Returned|
|   3003|      C003|    P1003|       1|30000|2024-05-03|Delivered|
|   3004|      C001|    P1002|       1|50000|2024-05-04|Delivered|
|   3005|      C004|    P1004|       3|10000|2024-05-05|  Pending|
+-------+----------+---------+--------+-----+----------+---------+



In [0]:
#11. Bonus – OrderType Column with when-otherwise

# Import `col` here as well: the cell uses it, and should not depend on an
# earlier cell having imported it.
from pyspark.sql.functions import col, when

df_orders = spark.read.format("delta").load("/Workspace/Shared/delta/orders")

# "Return" for returned orders; every other status (including NULL, which
# fails the equality test) falls through to "Regular".
df_with_order_type = df_orders.withColumn(
    "OrderType",
    when(col("Status") == "Returned", "Return").otherwise("Regular")
)

df_with_order_type.select("OrderID", "Status", "OrderType").show()

+-------+---------+---------+
|OrderID|   Status|OrderType|
+-------+---------+---------+
|   3001|Delivered|  Regular|
|   3002| Returned|   Return|
|   3003|Delivered|  Regular|
|   3004|Delivered|  Regular|
|   3005|  Pending|  Regular|
+-------+---------+---------+

