In [0]:
from pyspark.sql import SparkSession
spark=SparkSession.builder.appName("June19Assignment3").getOrCreate()
spark

In [0]:
# Tasks:
# 1. Load the data using PySpark.
spark.conf.set("fs.azure.account.key.hestore.blob.core.windows.net","---------AccessKeyyy----------")

inventory_supply_df=spark.read.csv("wasbs://june19assignment3@hestore.blob.core.windows.net/inventory_supply.csv",header=True,inferSchema=True)
inventory_supply_df.show()
inventory_supply_df.printSchema()
# 2. Create a new column NeedsReorder = StockQty < ReorderLevel .
inventory_supply_df=inventory_supply_df.withColumn("NeedsReorder",inventory_supply_df["StockQty"]<inventory_supply_df["ReorderLevel"])
# 3. Create a view of all items that need restocking.
inventory_supply_df.createOrReplaceTempView("inventory_supply")
# 4. Highlight warehouses with more than 2 such items.
from pyspark.sql.functions import *

inventory_supply_df.groupBy("Warehouse").agg(sum(col("NeedsReorder").cast("int")).alias("ItemsNeedingRestock")).filter(col("ItemsNeedingRestock") > 2).show()


+------+------------+-----------+----------+--------+------------+-------------+---------+---------+
|ItemID|    ItemName|   Category| Warehouse|StockQty|ReorderLevel|LastRestocked|UnitPrice| Supplier|
+------+------------+-----------+----------+--------+------------+-------------+---------+---------+
|  I001|      LED TV|Electronics|WarehouseA|      50|          20|   2024-03-15|    30000|   AVTech|
|  I002|      Laptop|Electronics|WarehouseB|      10|          15|   2024-04-01|    70000|TechWorld|
|  I003|Office Chair|  Furniture|WarehouseA|      40|          10|   2024-03-25|     6000|  ChairCo|
|  I004|Refrigerator| Appliances|WarehouseC|       5|          10|   2024-02-20|    25000| FreezeIt|
|  I005|     Printer|Electronics|WarehouseB|       3|           5|   2024-03-30|     8000|PrintFast|
+------+------------+-----------+----------+--------+------------+-------------+---------+---------+

root
 |-- ItemID: string (nullable = true)
 |-- ItemName: string (nullable = true)
 |-- Ca

In [0]:
# Tasks:
# 1. Group items by Supplier and compute average price.
inventory_supply_df.groupBy("Supplier").avg("UnitPrice").show()
# 2. Find which suppliers offer items below average price in their category.
inventory_supply_df.groupBy("Supplier").agg({"UnitPrice":"avg"}).withColumnRenamed("avg(UnitPrice)","avg_UnitPrice").join(inventory_supply_df,"Supplier").filter("UnitPrice<avg_UnitPrice").groupBy("Supplier").count().show()
# 3. Tag suppliers with Good Deal if >50% of their items are below market average.
from pyspark.sql.functions import *
inventory_supply_df.groupBy("Supplier").agg({"UnitPrice":"avg"}).withColumnRenamed("avg(UnitPrice)","avg_UnitPrice").join(inventory_supply_df,"Supplier").filter("UnitPrice<avg_UnitPrice").groupBy("Supplier").count().withColumnRenamed("count","below_avg").join(inventory_supply_df.groupBy("Supplier").count().withColumnRenamed("count","total"),on="Supplier").withColumn("GoodDeal",col("below_avg")/col("total")>0.5).select("Supplier","GoodDeal").show()

+---------+--------------+
| Supplier|avg(UnitPrice)|
+---------+--------------+
|   AVTech|       30000.0|
|TechWorld|       70000.0|
|PrintFast|        8000.0|
| FreezeIt|       25000.0|
|  ChairCo|        6000.0|
+---------+--------------+

+--------+-----+
|Supplier|count|
+--------+-----+
+--------+-----+

+--------+--------+
|Supplier|GoodDeal|
+--------+--------+
+--------+--------+



In [0]:
# Tasks:
# 1. Calculate TotalStockValue = StockQty * UnitPrice .
inventory_supply_df=inventory_supply_df.withColumn("TotalStockValue",inventory_supply_df["StockQty"]*inventory_supply_df["UnitPrice"])
inventory_supply_df.show()
# 2. Identify top 3 highest-value items.
inventory_supply_df.orderBy(col("TotalStockValue").desc()).limit(3).show()
# 3. Export the result as a Parquet file partitioned by Warehouse .
inventory_supply_df.write.partitionBy("Warehouse").mode("overwrite").parquet("wasbs://june19assignment3@hestore.blob.core.windows.net/inventory_supply_parquet")

+------+------------+-----------+----------+--------+------------+-------------+---------+---------+------------+---------------+
|ItemID|    ItemName|   Category| Warehouse|StockQty|ReorderLevel|LastRestocked|UnitPrice| Supplier|NeedsReorder|TotalStockValue|
+------+------------+-----------+----------+--------+------------+-------------+---------+---------+------------+---------------+
|  I001|      LED TV|Electronics|WarehouseA|      50|          20|   2024-03-15|    30000|   AVTech|       false|        1500000|
|  I002|      Laptop|Electronics|WarehouseB|      10|          15|   2024-04-01|    70000|TechWorld|        true|         700000|
|  I003|Office Chair|  Furniture|WarehouseA|      40|          10|   2024-03-25|     6000|  ChairCo|       false|         240000|
|  I004|Refrigerator| Appliances|WarehouseC|       5|          10|   2024-02-20|    25000| FreezeIt|        true|         125000|
|  I005|     Printer|Electronics|WarehouseB|       3|           5|   2024-03-30|     8000|

In [0]:
# Tasks:
# 1. Count items stored per warehouse.
inventory_supply_df.groupBy("Warehouse").count().show()
# 2. Average stock per category in each warehouse.
inventory_supply_df.groupBy("Warehouse","Category").agg({"StockQty":"avg"}).withColumnRenamed("avg(StockQty)","avg_stock").show()
# 3. Determine underutilized warehouses ( total stock < 100 ).
inventory_supply_df.groupBy("Warehouse").agg({"StockQty":"sum"}).withColumnRenamed("sum(StockQty)","total_stock").filter("total_stock<100").show()

+----------+-----+
| Warehouse|count|
+----------+-----+
|WarehouseA|    2|
|WarehouseC|    1|
|WarehouseB|    2|
+----------+-----+

+----------+-----------+---------+
| Warehouse|   Category|avg_stock|
+----------+-----------+---------+
|WarehouseB|Electronics|      6.5|
|WarehouseA|  Furniture|     40.0|
|WarehouseC| Appliances|      5.0|
|WarehouseA|Electronics|     50.0|
+----------+-----------+---------+

+----------+-----------+
| Warehouse|total_stock|
+----------+-----------+
|WarehouseA|         90|
|WarehouseC|          5|
|WarehouseB|         13|
+----------+-----------+



In [0]:
# Task:
# 1. Save as Delta table retail_inventory .
inventory_supply_df.write.format("delta").mode("overwrite").save("wasbs://june19assignment3@hestore.blob.core.windows.net/inventory_supply_delta")
# 2. Update stock of 'Laptop' to 20.
inventory_supply_df.filter("Category='Laptop'").write.format("delta").mode("overwrite").save("wasbs://june19assignment3@hestore.blob.core.windows.net/inventory_supply_delta")
# 3. Delete any item with StockQty = 0 .
inventory_supply_df.filter("StockQty=0").write.format("delta").mode("overwrite").save("wasbs://june19assignment3@hestore.blob.core.windows.net/inventory_supply_delta")
# 4. Run DESCRIBE HISTORY and query VERSION AS OF previous state.
inventory_supply_df.filter("StockQty=0").write.format("delta").mode("overwrite").save("wasbs://june19assignment3@hestore.blob.core.windows.net/inventory_supply_delta")

In [0]:
restock_logs=spark.read.csv("wasbs://june19assignment3@hestore.blob.core.windows.net/restock_logs.csv",header=True,inferSchema=True)
restock_logs.show()
restock_logs.printSchema()
# Tasks:
# 1. Join with inventory table to update StockQty.
inventory_restock_df=inventory_supply_df.join(restock_logs,on="ItemID",how="left")
inventory_restock_df.show()
# 2. Calculate new stock and flag RestockedRecently = true for updated items.
inventory_restock_df=inventory_restock_df.withColumn("StockQty",inventory_restock_df["StockQty"]+inventory_restock_df["QuantityAdded"]).withColumn("RestockedRecently",lit(True)).withColumn("RestockedRecently",when(col("RestockedRecently").isNull(),lit(False)).otherwise(col("RestockedRecently"))).withColumn("RestockedRecently",when(col("RestockedRecently")==True,lit(True)).otherwise(lit(False)))
inventory_restock_df.show()
# 3. Use MERGE INTO to update in Delta.
inventory_restock_df.write.format("delta").option("mergeSchema", "true").mode("overwrite").save("wasbs://june19assignment3@hestore.blob.core.windows.net/inventory_supply_delta")

+------+-----------+-------------+
|ItemID|RestockDate|QuantityAdded|
+------+-----------+-------------+
|  I002| 2024-04-20|           10|
|  I005| 2024-04-22|            5|
|  I001| 2024-04-25|           20|
+------+-----------+-------------+

root
 |-- ItemID: string (nullable = true)
 |-- RestockDate: date (nullable = true)
 |-- QuantityAdded: integer (nullable = true)

+------+------------+-----------+----------+--------+------------+-------------+---------+---------+------------+---------------+-----------+-------------+
|ItemID|    ItemName|   Category| Warehouse|StockQty|ReorderLevel|LastRestocked|UnitPrice| Supplier|NeedsReorder|TotalStockValue|RestockDate|QuantityAdded|
+------+------------+-----------+----------+--------+------------+-------------+---------+---------+------------+---------------+-----------+-------------+
|  I001|      LED TV|Electronics|WarehouseA|      50|          20|   2024-03-15|    30000|   AVTech|       false|        1500000| 2024-04-25|           20|

In [0]:
# Tasks:
# 1. Create SQL view inventory_summary with:
# ItemName, Category, StockQty, NeedsReorder, TotalStockValue
inventory_supply_df.createOrReplaceTempView("inventory_supply")
# Save as a table so it becomes permanent in catalog
inventory_supply_df.write.mode("overwrite").saveAsTable("inventory_supply")
spark.sql("create or replace temporary view inventory_summary as select ItemName,Category,StockQty,NeedsReorder,TotalStockValue from inventory_supply")
# 2. Create view supplier_leaderboard sorted by average price
spark.sql("create or replace temporary view supplier_leaderboard as select Supplier,avg(UnitPrice) as avg_price from inventory_supply group by Supplier order by avg_price")

DataFrame[]

In [0]:

# Tasks:
# 1. Use when / otherwise to categorize items:
# "Overstocked" (>2x ReorderLevel)
# "LowStock"
from pyspark.sql.functions import col, when

inventory_supply_df = inventory_supply_df.withColumn("StockStatus",when(col("StockQty") > 2 * col("ReorderLevel"), "Overstocked").when(col("StockQty") < col("ReorderLevel"), "LowStock").otherwise("Normal"))

inventory_supply_df.select("ItemID", "ItemName", "StockQty", "ReorderLevel", "StockStatus").show()

# 2. Use .filter() and .where() for the same and compare.
#Filter overstocked items using .filter()
inventory_supply_df.filter(col("StockStatus") == "Overstocked").show()
#using where
inventory_supply_df.where(col("StockStatus") == "Overstocked").show()



+------+------------+--------+------------+-----------+
|ItemID|    ItemName|StockQty|ReorderLevel|StockStatus|
+------+------------+--------+------------+-----------+
|  I001|      LED TV|      50|          20|Overstocked|
|  I002|      Laptop|      10|          15|   LowStock|
|  I003|Office Chair|      40|          10|Overstocked|
|  I004|Refrigerator|       5|          10|   LowStock|
|  I005|     Printer|       3|           5|   LowStock|
+------+------------+--------+------------+-----------+

+------+------------+-----------+----------+--------+------------+-------------+---------+--------+------------+---------------+-----------+
|ItemID|    ItemName|   Category| Warehouse|StockQty|ReorderLevel|LastRestocked|UnitPrice|Supplier|NeedsReorder|TotalStockValue|StockStatus|
+------+------------+-----------+----------+--------+------------+-------------+---------+--------+------------+---------------+-----------+
|  I001|      LED TV|Electronics|WarehouseA|      50|          20|   202

In [0]:
# Tasks:
from pyspark.sql.functions import to_date, date_format
# 1. Extract RestockMonth from LastRestocked .
inventory_supply_df = inventory_supply_df.withColumn("LastRestocked", to_date("LastRestocked"))

inventory_supply_df = inventory_supply_df.withColumn("RestockMonth", date_format("LastRestocked", "MMMM"))
inventory_supply_df.show()
# 2. Create feature: StockAge = CURRENT_DATE - LastRestocked
from pyspark.sql.functions import current_date, datediff

inventory_supply_df = inventory_supply_df.withColumn("StockAge", datediff(current_date(), col("LastRestocked")))
inventory_supply_df.show()
# 3. Bucket StockAge into: New, Moderate, Stale
from pyspark.sql.functions import when

inventory_supply_df = inventory_supply_df.withColumn("StockAgeBucket",when(col("StockAge") <= 30, "New").when((col("StockAge") > 30) & (col("StockAge") <= 90), "Moderate").otherwise("Stale"))
inventory_supply_df.show()

+------+------------+-----------+----------+--------+------------+-------------+---------+---------+------------+---------------+-----------+------------+
|ItemID|    ItemName|   Category| Warehouse|StockQty|ReorderLevel|LastRestocked|UnitPrice| Supplier|NeedsReorder|TotalStockValue|StockStatus|RestockMonth|
+------+------------+-----------+----------+--------+------------+-------------+---------+---------+------------+---------------+-----------+------------+
|  I001|      LED TV|Electronics|WarehouseA|      50|          20|   2024-03-15|    30000|   AVTech|       false|        1500000|Overstocked|       March|
|  I002|      Laptop|Electronics|WarehouseB|      10|          15|   2024-04-01|    70000|TechWorld|        true|         700000|   LowStock|       April|
|  I003|Office Chair|  Furniture|WarehouseA|      40|          10|   2024-03-25|     6000|  ChairCo|       false|         240000|Overstocked|       March|
|  I004|Refrigerator| Appliances|WarehouseC|       5|          10|   2

In [0]:

# Tasks:
# 1. Write full DataFrame to:
# CSV for analysts
inventory_supply_df.write.format("csv").mode("overwrite").save("wasbs://june19assignment3@hestore.blob.core.windows.net/export/inventory")
# JSON for integration
inventory_supply_df.write.format("json").mode("overwrite").save("wasbs://june19assignment3@hestore.blob.core.windows.net/export/inventory")
# Delta for pipelines
inventory_supply_df.write.format("delta").mode("overwrite").save("wasbs://june19assignment3@hestore.blob.core.windows.net/export/inventory/delta/")
# 2. Save with meaningful file and partition names like
# /export/inventory/stale_items/
inventory_supply_df.filter(col("StockAgeBucket") == "Stale").write.format("delta").mode("overwrite").save("wasbs://june19assignment3@hestore.blob.core.windows.net/export/inventory/stale_items")