In [0]:
from pyspark.sql import SparkSession
# Obtain the Spark session (an already-active one is reused; otherwise a new
# one is created under the application name "pyspark").
spark = (
    SparkSession.builder
    .appName("pyspark")
    .getOrCreate()
)

# Read the source JSON into a DataFrame with the reader's "multiline" option
# switched on.
df = (
    spark.read
    .option("multiline", True)
    .format("json")
    .load("/Volumes/workspace/default/my_volume/chemical_supply_chain_100.json")
)
# Alternate input kept for reference (not loaded):
#   /Volumes/workspace/default/my_volume/chemical_supply_chain.json


In [0]:
# Print the schema of the loaded JSON DataFrame so the nested fields selected
# in the flattening step below can be checked against it.
df.printSchema()

In [0]:
from pyspark.sql.functions import *

# Dotted paths reach into the nested structs (batch, vendor, inventory,
# delivery, quality_inspection); each selected column is named after the
# last path segment.
flattened_columns = [
    # top-level material attributes
    "material_id",
    "material_name",
    "uom",
    # batch details
    "batch.batch_number",
    "batch.manufacturing_date",
    "batch.expiry_date",
    "batch.quality_status",
    # vendor details
    "vendor.vendor_name",
    "vendor.country",
    # stock levels
    "inventory.available_quantity",
    "inventory.blocked_quantity",
    # delivery and inspection outcome
    "delivery.delivery_note",
    "quality_inspection.result",
]

# Project the nested document into one flat row per record and preview it
# without truncating long cell values.
flat_df = df.select(*flattened_columns)
flat_df.show(truncate=False)

In [0]:
def _uom_description(uom_column):
    """Build a CASE-style column mapping unit-of-measure codes to labels.

    Codes are compared exactly as written ("L", "KG", "EA", "BOX", "PAL");
    any value that matches none of them yields "Unknown".
    """
    code_labels = (
        ("L", "Liter"),
        ("KG", "Kilogram"),
        ("EA", "Each"),
        ("BOX", "Box"),
        ("PAL", "Pallet"),
    )
    (first_code, first_label), *remaining = code_labels
    description = when(uom_column == first_code, first_label)
    for code, label in remaining:
        description = description.when(uom_column == code, label)
    return description.otherwise("Unknown")


# Rename every column to its upper-case form, keeping the column order.
df_upper = flat_df.toDF(*(name.upper() for name in flat_df.columns))

# Add a human-readable description next to the raw UOM code.
df_translated = df_upper.withColumn("UOM_DESC", _uom_description(col("UOM")))
display(df_translated)

In [0]:
# The raw UOM code is no longer needed once UOM_DESC has been derived from it,
# so drop it; the remaining columns are what gets merged into the table later.
df_removedcolumn=df_translated.drop("UOM")
display(df_removedcolumn)

In [0]:
from pyspark.sql.utils import AnalysisException
from pyspark.sql.types import StructType, StructField, StringType, LongType
# Name of the Delta table the MERGE cell writes into. It is defined here
# because this cell uses it in its messages and in saveAsTable; without a
# local definition a fresh-kernel run hits NameError inside the handler and
# the table is never created.
table_name = "chemical_supply_chain"

try:
    # Probe for the target table. A separate variable is used so the raw JSON
    # DataFrame bound to `df` by the loading cell is not overwritten.
    existing_table_df = spark.table(table_name)
    existing_table_df.show()
except AnalysisException:
    # NOTE(review): AnalysisException is treated as "table is missing"; other
    # analysis failures would also land here - confirm that is acceptable.
    print(f"{table_name} does not exist. Creating it.")

    # Column names mirror the flattened, upper-cased DataFrame after the UOM
    # column is dropped and UOM_DESC is added.
    # NOTE(review): quantities are declared as LongType and dates as
    # StringType - presumably matching what the JSON reader infers; verify
    # against the printed schema.
    schema = StructType([
        StructField('MATERIAL_ID', StringType(), True),
        StructField('MATERIAL_NAME', StringType(), True),
        StructField('BATCH_NUMBER', StringType(), True),
        StructField('MANUFACTURING_DATE', StringType(), True),
        StructField('EXPIRY_DATE', StringType(), True),
        StructField('QUALITY_STATUS', StringType(), True),
        StructField('VENDOR_NAME', StringType(), True),
        StructField('COUNTRY', StringType(), True),
        StructField('AVAILABLE_QUANTITY', LongType(), True),
        StructField('BLOCKED_QUANTITY', LongType(), True),
        StructField('DELIVERY_NOTE', StringType(), True),
        StructField('RESULT', StringType(), True),
        StructField('UOM_DESC', StringType(), True)
    ])

    # Create the table empty; rows are loaded by the MERGE cell further down.
    chemical_supply_chain_df = spark.createDataFrame([], schema)
    chemical_supply_chain_df.write.format("delta").saveAsTable(table_name)
except Exception as e:
    # Any failure of the probe that is not an AnalysisException is reported
    # and execution continues (best-effort, as in the original cell).
    print("Some other error occurred:", e)

In [0]:
# Expose the transformed DataFrame to SQL under a temporary view name so the
# %sql cells below (preview and MERGE) can use it as their source.
df_removedcolumn.createOrReplaceTempView("chemical_supply_chain_view")

In [0]:
%sql
-- Preview the rows exposed by the temporary view before they are merged.
select * from chemical_supply_chain_view

In [0]:

%sql
-- Upsert the view into the table, matching on material_id:
--   * a target row with the same material_id is overwritten with the source values;
--   * a source row with no match is inserted.
-- UPDATE SET * / INSERT * pair columns by name, so the view and the table
-- must expose the same column names.
-- NOTE(review): the source also carries BATCH_NUMBER; if one material_id can
-- appear in several source rows, this key is not unique - confirm whether the
-- match condition should include the batch as well.
MERGE INTO chemical_supply_chain AS target
USING chemical_supply_chain_view AS source
ON target.material_id = source.material_id
WHEN MATCHED THEN
  UPDATE SET *
WHEN NOT MATCHED THEN
  INSERT *

In [0]:
%sql
-- Show the table contents after the MERGE.
select * from chemical_supply_chain

In [0]:
%sql
-- Duplicate check: list every material_id that occurs more than once in the
-- table, with its occurrence count. An empty result means material_id is unique.
select material_id, count(material_id) 
from chemical_supply_chain 
group by material_id 
having count(material_id) > 1

In [0]:
%sql
-- Show the full table contents again (same query as the earlier post-MERGE cell).
select * from chemical_supply_chain 

In [0]:
# Record the table's size and file count before OPTIMIZE so they can be
# compared with the values captured afterwards.
table_name = 'chemical_supply_chain'
print(f"\n--- Table size BEFORE OPTIMIZE ({table_name}) ---")
initial_describe_detail = spark.sql(f"DESCRIBE DETAIL {table_name}")
initial_describe_detail.show(truncate=False)

# Collect both metrics in a single action: the original ran one collect per
# metric, re-executing DESCRIBE DETAIL each time and allowing the two values
# to come from different table snapshots.
initial_metrics = initial_describe_detail.select("sizeInBytes", "numFiles").collect()[0]
initial_size_in_bytes = initial_metrics["sizeInBytes"]
initial_num_files = initial_metrics["numFiles"]
print(f"Initial Size: {initial_size_in_bytes} bytes")
print(f"Initial Number of Files: {initial_num_files}")

In [0]:
%sql
-- Run Delta OPTIMIZE on the table; the cells before and after this one record
-- the table size and file count so the effect can be compared.
OPTIMIZE chemical_supply_chain;

In [0]:
# Record the table's size and file count after OPTIMIZE. Uses table_name as
# assigned in the "BEFORE OPTIMIZE" cell.
print(f"\n--- Table size AFTER OPTIMIZE ({table_name}) ---")
optimized_describe_detail = spark.sql(f"DESCRIBE DETAIL {table_name}")
optimized_describe_detail.show(truncate=False)

# Collect both metrics in a single action: the original ran one collect per
# metric, re-executing DESCRIBE DETAIL each time and allowing the two values
# to come from different table snapshots.
optimized_metrics = optimized_describe_detail.select("sizeInBytes", "numFiles").collect()[0]
optimized_size_in_bytes = optimized_metrics["sizeInBytes"]
optimized_num_files = optimized_metrics["numFiles"]
print(f"Optimized Size: {optimized_size_in_bytes} bytes")
print(f"Optimized Number of Files: {optimized_num_files}")