Scenario 1: Inventory Alerting System

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
spark = SparkSession.builder.appName("RetailInventory").getOrCreate()
spark

# 1. Load data
inventory_df = spark.read.option("header", True).csv("/Volumes/workspace/default/shared/inventory_supply.csv")

# 2. Create NeedsReorder flag
inventory_df = inventory_df.withColumn("NeedsReorder", col("StockQty") < col("ReorderLevel"))

# 3. Create restocking view
restock_view = inventory_df.filter(col("NeedsReorder") == True)
restock_view.createOrReplaceTempView("NeedsRestocking")

# 4. Highlight problematic warehouses
warehouse_alerts = restock_view.groupBy("Warehouse").agg(count("*").alias("ItemsNeedingRestock")).filter(col("ItemsNeedingRestock") > 1)
display(warehouse_alerts)

Warehouse,ItemsNeedingRestock
WarehouseB,2


Scenario 2: Supplier Price Optimization

In [0]:
# 1. Group by supplier and compute average price
supplier_stats = inventory_df.groupBy("Supplie").agg(avg("UnitPrice").alias("AvgSupplierPrice"))
display(supplier_stats)

# 2. Find items below average price in their category
category_avg = inventory_df.groupBy("Category").agg(avg("UnitPrice").alias("CategoryAvgPrice"))
price_comparison = inventory_df.join(category_avg, "Category") \
.withColumn("BelowMarketPrice", col("UnitPrice") < col("CategoryAvgPrice"))
display(price_comparison)

# 3. Tag suppliers with "Good Deal"
supplier_deal_quality = price_comparison.groupBy("Supplie").agg((avg(when(col("BelowMarketPrice") == True, 1).otherwise(0)) * 100).alias("PercentBelowMarket")).withColumn("DealQuality",when(col("PercentBelowMarket") > 50, "Good Deal").otherwise("Standard"))
display(supplier_deal_quality)

Supplie,AvgSupplierPrice
AVTech,30000.0
FreezeIt,25000.0
TechWorld,70000.0
ChairCo,6000.0
PrintFast,8000.0


Category,ItemID,ItemName,Warehouse,StockQty,ReorderLevel,LastRestocked,UnitPrice,Supplie,NeedsReorder,CategoryAvgPrice,BelowMarketPrice
Electronics,I001,LED TV,WarehouseA,50,20,2024-03-15,30000,AVTech,False,36000.0,True
Electronics,I002,Laptop,WarehouseB,10,15,2024-04-01,70000,TechWorld,True,36000.0,False
Furniture,I003,Office Chair,WarehouseA,40,10,2024-03-25,6000,ChairCo,False,6000.0,False
Appliances,I004,Refrigerator,WarehouseC,5,10,2024-02-20,25000,FreezeIt,False,25000.0,False
Electronics,I005,Printer,WarehouseB,3,5,2024-03-30,8000,PrintFast,True,36000.0,True


Supplie,PercentBelowMarket,DealQuality
AVTech,100.0,Good Deal
FreezeIt,0.0,Standard
TechWorld,0.0,Standard
ChairCo,0.0,Standard
PrintFast,100.0,Good Deal


Scenario 3: Cost Forecasting

In [0]:
from pyspark.sql.functions import col

# 1. Calculate TotalStockValue
inventory_df = inventory_df.withColumn("TotalStockValue", col("StockQty").cast("float") * col("UnitPrice").cast("float"))

# 2. Identify top 3 highest-value items
top_items = inventory_df.orderBy(col("TotalStockValue").desc()).limit(3)

# 3. Export as Parquet
top_items.write.partitionBy("Warehouse") \
.mode("overwrite") \
.parquet("/Volumes/workspace/default/shared/top_value_items_parquet")
display(top_items)

ItemID,ItemName,Category,Warehouse,StockQty,ReorderLevel,LastRestocked,UnitPrice,Supplier,TotalStockValue
I001,LED TV,Electronics,WarehouseA,50.0,20.0,2024-03-15,30000.0,AVTech,1500000.0
I002,Laptop,Electronics,WarehouseB,10.0,15.0,2024-04-01,70000.0,TechWorld,700000.0
I003,Office Chair,Furniture,WarehouseA,40.0,10.0,2024-03-25,6000.0,ChairCo,240000.0


Scenario 4: Warehouse Utilization

In [0]:
# 1. Count items per warehouse
items_per_warehouse = inventory_df.groupBy("Warehouse").agg(count("*").alias("TotalItems"))
display(items_per_warehouse)

# 2. Average stock per category in each warehouse
category_stock = inventory_df.groupBy("Warehouse", "Category").agg(avg("StockQty").alias("AvgStockPerCategory"))
display(category_stock)

# 3. Identify underutilized warehouses
underutilized = inventory_df.groupBy("Warehouse").agg(sum("StockQty").alias("TotalStock")).filter(col("TotalStock") < 100)

display(underutilized)

Warehouse,TotalItems
WarehouseB,2
WarehouseA,2
WarehouseC,1


Warehouse,Category,AvgStockPerCategory
WarehouseB,Electronics,6.5
WarehouseA,Electronics,50.0
WarehouseA,Furniture,40.0
WarehouseC,Appliances,5.0


Warehouse,TotalStock
WarehouseB,13.0
WarehouseA,90.0
WarehouseC,5.0


Scenario 5: Delta Audit Trail

In [0]:
from delta.tables import DeltaTable
delta_table = DeltaTable.forPath(spark, "/Volumes/workspace/default/shared/retail_inventory")

# 1. Save as Delta table
inventory_df.write.format("delta").save("/Volumes/workspace/default/shared/retail_inventory")

#2. Update stock of 'Laptop' to 20.
delta_table.update(condition = "ItemName = 'Laptop'",set = {"StockQty": "20"})

# 3. Delete any item with StockQty = 0 .
delta_table.delete("StockQty = 0")

#4. Run DESCRIBE HISTORY and query VERSION AS OF previous state.
display(delta_table.history())
previous_version = spark.read.format("delta") \
    .option("versionAsOf", 0) \
    .load("/Volumes/workspace/default/shared/retail_inventory")
display(previous_version)

version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
3,2025-06-19T07:37:29.000Z,5873923491206719,ahmedashiq2k17@gmail.com,DELETE,"Map(predicate -> [""(StockQty#29919 = 0.0)""])",,,0619-053659-98ecxxfe-v2n,1.0,WriteSerializable,False,"Map(numRemovedFiles -> 0, numRemovedBytes -> 0, numCopiedRows -> 0, numDeletionVectorsAdded -> 0, numDeletionVectorsRemoved -> 0, numAddedChangeFiles -> 0, executionTimeMs -> 155, conflictDetectionTimeMs -> 32, numDeletionVectorsUpdated -> 0, numDeletedRows -> 0, scanTimeMs -> 155, numAddedFiles -> 0, numAddedBytes -> 0, rewriteTimeMs -> 0)",,Databricks-Runtime/16.4.x-photon-scala2.12
2,2025-06-19T07:37:28.000Z,5873923491206719,ahmedashiq2k17@gmail.com,OPTIMIZE,"Map(predicate -> [], auto -> true, clusterBy -> [], zOrderBy -> [], batchId -> 0)",,,0619-053659-98ecxxfe-v2n,1.0,SnapshotIsolation,False,"Map(numRemovedFiles -> 2, numRemovedBytes -> 5274, p25FileSize -> 2797, numDeletionVectorsRemoved -> 1, minFileSize -> 2797, numAddedFiles -> 1, maxFileSize -> 2797, p75FileSize -> 2797, p50FileSize -> 2797, numAddedBytes -> 2797)",,Databricks-Runtime/16.4.x-photon-scala2.12
1,2025-06-19T07:37:27.000Z,5873923491206719,ahmedashiq2k17@gmail.com,UPDATE,"Map(predicate -> [""(ItemName#29001 = Laptop)""])",,,0619-053659-98ecxxfe-v2n,0.0,WriteSerializable,False,"Map(numRemovedFiles -> 0, numRemovedBytes -> 0, numCopiedRows -> 0, numDeletionVectorsAdded -> 1, numDeletionVectorsRemoved -> 0, numAddedChangeFiles -> 0, executionTimeMs -> 1126, numDeletionVectorsUpdated -> 0, scanTimeMs -> 532, numAddedFiles -> 1, numUpdatedRows -> 1, numAddedBytes -> 2478, rewriteTimeMs -> 594)",,Databricks-Runtime/16.4.x-photon-scala2.12
0,2025-06-19T07:37:25.000Z,5873923491206719,ahmedashiq2k17@gmail.com,WRITE,"Map(mode -> ErrorIfExists, statsOnLoad -> false, partitionBy -> [])",,,0619-053659-98ecxxfe-v2n,,WriteSerializable,True,"Map(numFiles -> 1, numOutputRows -> 5, numOutputBytes -> 2796)",,Databricks-Runtime/16.4.x-photon-scala2.12


ItemID,ItemName,Category,Warehouse,StockQty,ReorderLevel,LastRestocked,UnitPrice,Supplier,TotalStockValue
I001,LED TV,Electronics,WarehouseA,50.0,20.0,2024-03-15,30000.0,AVTech,1500000.0
I002,Laptop,Electronics,WarehouseB,10.0,15.0,2024-04-01,70000.0,TechWorld,700000.0
I003,Office Chair,Furniture,WarehouseA,40.0,10.0,2024-03-25,6000.0,ChairCo,240000.0
I004,Refrigerator,Appliances,WarehouseC,5.0,10.0,2024-02-20,25000.0,FreezeIt,125000.0
I005,Printer,Electronics,WarehouseB,3.0,5.0,2024-03-30,8000.0,PrintFast,24000.0


Scenario 6: Alerts from Restock Logs

In [0]:
restock_logs = spark.read.option("header", True).csv("/Volumes/workspace/default/shared/restock_logs.csv")

#1. Join with inventory table to update StockQty.
restocked_items = inventory_df.join(restock_logs, "ItemID") \
    .withColumn("NewStockQty", col("StockQty") + col("QuantityAdded")) \
    .withColumn("RestockedRecently", lit(True))
display(restocked_items)

#2. Calculate new stock and flag RestockedRecently = true for updated items.
#3. Use MERGE INTO to update in Delta.
delta_table.alias("target").merge(restocked_items.alias("source"),"target.ItemID = source.ItemID") \
.whenMatchedUpdate(set = {"StockQty": "source.NewStockQty","LastRestocked": "current_date()"}).execute()

display(delta_table.toDF())

ItemID,ItemName,Category,Warehouse,StockQty,ReorderLevel,LastRestocked,UnitPrice,Supplier,TotalStockValue,RestockDate,QuantityAdded,NewStockQty,RestockedRecently
I001,LED TV,Electronics,WarehouseA,50.0,20.0,2024-03-15,30000.0,AVTech,1500000.0,2024-04-25,20,70.0,True
I002,Laptop,Electronics,WarehouseB,10.0,15.0,2024-04-01,70000.0,TechWorld,700000.0,2024-04-20,10,20.0,True
I005,Printer,Electronics,WarehouseB,3.0,5.0,2024-03-30,8000.0,PrintFast,24000.0,2024-04-22,5,8.0,True


ItemID,ItemName,Category,Warehouse,StockQty,ReorderLevel,LastRestocked,UnitPrice,Supplier,TotalStockValue
I003,Office Chair,Furniture,WarehouseA,40.0,10.0,2024-03-25,6000.0,ChairCo,240000.0
I004,Refrigerator,Appliances,WarehouseC,5.0,10.0,2024-02-20,25000.0,FreezeIt,125000.0
I002,Laptop,Electronics,WarehouseB,20.0,15.0,2025-06-19,70000.0,TechWorld,700000.0
I001,LED TV,Electronics,WarehouseA,70.0,20.0,2025-06-19,30000.0,AVTech,1500000.0
I005,Printer,Electronics,WarehouseB,8.0,5.0,2025-06-19,8000.0,PrintFast,24000.0


Scenario 7: Report Generation with SQL Views

In [0]:
#1. Create SQL view inventory_summary with:
inventory_df.createOrReplaceTempView("inventory_view")
##ItemName, Category, StockQty, NeedsReorder, TotalStockValue
spark.sql("""create or replace temp view inventory_summary as
select itemname,category,stockqty,(stockqty < reorderlevel) as needsreorder,(stockqty * unitprice) as totalstockvalue
from inventory_view""")

#2. Create view supplier_leaderboard sorted by average price
spark.sql("""create or replace temp view supplier_leaderboard as
select supplier,avg(unitprice) as avgprice,count(*) as itemssupplied
from inventory_view
group by supplier
order by avgprice desc""")

display(spark.sql("select * from inventory_summary"))
display(spark.sql("select * from supplier_leaderboard"))

itemname,category,stockqty,needsreorder,totalstockvalue
LED TV,Electronics,50.0,False,1500000.0
Laptop,Electronics,10.0,True,700000.0
Office Chair,Furniture,40.0,False,240000.0
Refrigerator,Appliances,5.0,True,125000.0
Printer,Electronics,3.0,True,24000.0


supplier,avgprice,itemssupplied
TechWorld,70000.0,1
AVTech,30000.0,1
FreezeIt,25000.0,1
PrintFast,8000.0,1
ChairCo,6000.0,1


Scenario 8: Advanced Filtering

In [0]:
#1. Use when / otherwise to categorize items:
##"Overstocked" (>2x ReorderLevel)
##"LowStock"
inventory_df = inventory_df.withColumn("StockStatus",when(col("StockQty") > 2 * col("ReorderLevel"), "Overstocked").when(col("StockQty") < col("ReorderLevel"), "LowStock").otherwise("Normal"))

#2. Use .filter() and .where() for the same and compare.
filter_result = inventory_df.filter(col("StockStatus") == "LowStock")
where_result = inventory_df.where(col("StockStatus") == "LowStock")

display(filter_result)
display(where_result)

ItemID,ItemName,Category,Warehouse,StockQty,ReorderLevel,LastRestocked,UnitPrice,Supplier,TotalStockValue,StockStatus
I002,Laptop,Electronics,WarehouseB,10.0,15.0,2024-04-01,70000.0,TechWorld,700000.0,LowStock
I004,Refrigerator,Appliances,WarehouseC,5.0,10.0,2024-02-20,25000.0,FreezeIt,125000.0,LowStock
I005,Printer,Electronics,WarehouseB,3.0,5.0,2024-03-30,8000.0,PrintFast,24000.0,LowStock


ItemID,ItemName,Category,Warehouse,StockQty,ReorderLevel,LastRestocked,UnitPrice,Supplier,TotalStockValue,StockStatus
I002,Laptop,Electronics,WarehouseB,10.0,15.0,2024-04-01,70000.0,TechWorld,700000.0,LowStock
I004,Refrigerator,Appliances,WarehouseC,5.0,10.0,2024-02-20,25000.0,FreezeIt,125000.0,LowStock
I005,Printer,Electronics,WarehouseB,3.0,5.0,2024-03-30,8000.0,PrintFast,24000.0,LowStock


Scenario 9: Feature Engineering

In [0]:
from pyspark.sql.functions import datediff, current_date

#1. Extract RestockMonth from LastRestocked .
inventory_df = inventory_df.withColumn("RestockMonth", month(col("LastRestocked")))

#2. Create feature: StockAge = CURRENT_DATE - LastRestocked
inventory_df = inventory_df.withColumn("StockAge", datediff(current_date(), col("LastRestocked")))

#3. Bucket StockAge into: New, Moderate, Stale
inventory_df = inventory_df.withColumn("StockFreshness",when(col("StockAge") < 30, "New").when(col("StockAge") < 90, "Moderate").otherwise("Stale"))

display(inventory_df.select("ItemName", "StockAge", "StockFreshness"))

ItemName,StockAge,StockFreshness
LED TV,461,Stale
Laptop,444,Stale
Office Chair,451,Stale
Refrigerator,485,Stale
Printer,446,Stale


Scenario 10: Export Options

In [0]:
#1. Write full DataFrame to:
# CSV for analysts
inventory_df.write.option("header", True) \
.mode("overwrite") \
.csv("/Volumes/workspace/default/shared/full_dataset_csv")

# JSON for integration
inventory_df.write.mode("overwrite") \
.json("/Volumes/workspace/default/shared/full_dataset_json")

# Delta for pipelines
inventory_df.write.format("delta") \
.mode("overwrite") \
.save("/Volumes/workspace/default/shared/full_dataset_delta")

# Partitioned export for stale items
inventory_df.filter(col("StockFreshness") == "Stale") \
.write.partitionBy("Warehouse") \
.mode("overwrite") \
.parquet("/Volumes/workspace/default/shared/stale_items")