In [0]:
nf=spark.read.csv("/Volumes/hexaware_training_workspace_3/default/supply_chain_management_uncleaned/inventory.csv", header=True, inferSchema=True)
vf=spark.read.csv("/Volumes/hexaware_training_workspace_3/default/supply_chain_management_uncleaned/orders.csv", header=True, inferSchema=True)
sf=spark.read.csv("/Volumes/hexaware_training_workspace_3/default/supply_chain_management_uncleaned/suppliers.csv", header=True, inferSchema=True)
display(nf)
display(vf)
display(sf)
nf.printSchema()

item_id,supplier_id,stock_level,reorder_threshold
1,1,250,50
2,1,45,20
3,2,0,30
4,3,500,100
5,4,10,10
6,5,5,10
7,2,200,25
8,3,75,15
9,4,0,5
10,5,120,30


order_id,supplier_id,order_date,delivery_date,quantity,status
1,1,2025-07-01,,100,pending
2,2,2025-07-02,2025-07-06,200,fulfilled
3,3,2025-07-03,,150,cancelled
4,1,2025-07-05,2025-07-08,50,fulfilled
5,4,2025-07-07,,75,pending
6,5,2025-07-08,2025-07-15,20,fulfilled
7,2,2025-07-10,2025-07-12,300,fulfilled
8,3,2025-07-11,,50,pending
9,4,2025-07-12,,100,cancelled
10,5,2025-07-13,2025-07-14,60,fulfilled


supplier_id,name,contact_email
1,Alpha Components,alpha@components.com
2,Beta Industrial,beta@industrial.com
3,Catalyst Co.,contact@catalyst.co
4,Delta Supplies,info@deltasupplies.com
5,Epsilon Goods,epsilon@goods.net


root
 |-- item_id: integer (nullable = true)
 |-- supplier_id: integer (nullable = true)
 |-- stock_level: integer (nullable = true)
 |-- reorder_threshold: integer (nullable = true)



In [0]:
from pyspark.sql.functions import col,when,date_diff
nf=nf.dropna(subset=['item_id','supplier_id'])
nf=nf.fillna({'stock_level':0})
nf=nf.withColumn('reorder_threshold',when(col('reorder_threshold').isNull(),col('stock_level')*0.5).otherwise(col('reorder_threshold')))
nf.show()


+-------+-----------+-----------+-----------------+
|item_id|supplier_id|stock_level|reorder_threshold|
+-------+-----------+-----------+-----------------+
|      1|          1|        250|             50.0|
|      2|          1|         45|             20.0|
|      3|          2|          0|             30.0|
|      4|          3|        500|            100.0|
|      5|          4|         10|             10.0|
|      6|          5|          5|             10.0|
|      7|          2|        200|             25.0|
|      8|          3|         75|             15.0|
|      9|          4|          0|              5.0|
|     10|          5|        120|             30.0|
+-------+-----------+-----------+-----------------+



In [0]:
vf=vf.dropna(subset=['order_id','supplier_id'])
vf=vf.withColumn('order_date',col('order_date').cast('date'))
vf=vf.withColumn('delivery_date',col('delivery_date').cast('date'))
vf=vf.fillna({'quantity':0})
vf=vf.withColumn('delivery_date',when((col('status')=="pending")&(col('delivery_date').isNull()),col('order_date')+7).otherwise(col('delivery_date')))
vf=vf.withColumn('delivery_date',when(col('status')=="cancelled",col('order_date')).otherwise(col('delivery_date')))
vf=vf.withColumn('is_delayed',when(col('status')=='pending',1).otherwise(0))
vf=vf.withColumn('delayed_days',when(col('is_delayed')==1,date_diff(col('delivery_date'),col('order_date'))).otherwise(0))
vf.show(40)

+--------+-----------+----------+-------------+--------+---------+----------+------------+
|order_id|supplier_id|order_date|delivery_date|quantity|   status|is_delayed|delayed_days|
+--------+-----------+----------+-------------+--------+---------+----------+------------+
|       1|          1|2025-07-01|   2025-07-08|     100|  pending|         1|           7|
|       2|          2|2025-07-02|   2025-07-06|     200|fulfilled|         0|           0|
|       3|          3|2025-07-03|   2025-07-03|     150|cancelled|         0|           0|
|       4|          1|2025-07-05|   2025-07-08|      50|fulfilled|         0|           0|
|       5|          4|2025-07-07|   2025-07-14|      75|  pending|         1|           7|
|       6|          5|2025-07-08|   2025-07-15|      20|fulfilled|         0|           0|
|       7|          2|2025-07-10|   2025-07-12|     300|fulfilled|         0|           0|
|       8|          3|2025-07-11|   2025-07-18|      50|  pending|         1|           7|

In [0]:
sf=sf.dropna(subset=['supplier_id','name'])
sf.show()

+-----------+----------------+--------------------+
|supplier_id|            name|       contact_email|
+-----------+----------------+--------------------+
|          1|Alpha Components|alpha@components.com|
|          2| Beta Industrial| beta@industrial.com|
|          3|    Catalyst Co.| contact@catalyst.co|
|          4|  Delta Supplies|info@deltasupplie...|
|          5|   Epsilon Goods|   epsilon@goods.net|
+-----------+----------------+--------------------+



saving the cleaned and filtered data in delta format

In [0]:
sf.write.format("delta").mode("overwrite").save("dbfs:/FileStore/supply_chain_management_cleaned/suppliers")
nf.write.format("delta").mode("overwrite").save("dbfs:/FileStore/supply_chain_management_cleaned/inventory")
vf.write.format("delta").mode("overwrite").save("dbfs:/FileStore/supply_chain_management_cleaned/orders")


basic analysis and querying

In [0]:
#creating delta tables from locations
spark.sql("use catalog hive_metastore")
spark.sql("create database if not exists supplier_chain_management")
spark.sql("create table if not exists suppliers using delta location 'dbfs:/FileStore/supply_chain_management_cleaned/suppliers'")
spark.sql("create table if not exists inventory using delta location 'dbfs:/FileStore/supply_chain_management_cleaned/inventory'")
spark.sql("create table if not exists orders using delta location 'dbfs:/FileStore/supply_chain_management_cleaned/orders'")


DataFrame[]

In [0]:
spark.sql("select*from suppliers").show()


+-----------+----------------+--------------------+
|supplier_id|            name|       contact_email|
+-----------+----------------+--------------------+
|          1|Alpha Components|alpha@components.com|
|          2| Beta Industrial| beta@industrial.com|
|          3|    Catalyst Co.| contact@catalyst.co|
|          4|  Delta Supplies|info@deltasupplie...|
|          5|   Epsilon Goods|   epsilon@goods.net|
+-----------+----------------+--------------------+



In [0]:
#how many orders each supplier placed
spark.sql("""select s.supplier_id,name  as company_name,count(*) from orders as o join suppliers as s on o.supplier_id=s.supplier_id 
          group by s.supplier_id, s.name order by s.supplier_id""").show()

+-----------+----------------+--------+
|supplier_id|    company_name|count(*)|
+-----------+----------------+--------+
|          1|Alpha Components|       3|
|          2| Beta Industrial|       3|
|          3|    Catalyst Co.|       3|
|          4|  Delta Supplies|       3|
|          5|   Epsilon Goods|       3|
+-----------+----------------+--------+



In [0]:
spark.sql("""select contact_email as top_2_emails_of_orders_cancelled_companys from(select s.supplier_id,contact_email,count(*) as cancelled_orders from orders as o join suppliers as s on o.supplier_id=s.supplier_id where status='cancelled'
          group by s.supplier_id , contact_email
          order by cancelled_orders desc) as filtered """).show(2)

+-----------------------------------------+
|top_2_emails_of_orders_cancelled_companys|
+-----------------------------------------+
|                     info@deltasupplie...|
|                        epsilon@goods.net|
+-----------------------------------------+
only showing top 2 rows
