In [0]:
%python
# Databricks Notebook: silver_orders_processed
# Language: Python

# COMMAND ----------
# DBTITLE 1,Configuration
# Three-part table names used by the rest of this notebook:
# bronze_table_name is the table read as input, silver_table_name is the
# table that is created on the first run and upserted into afterwards.
bronze_table_name = "ordercatalog.bronze_schema.bronze_orders_raw"
silver_table_name = "ordercatalog.silver_schema.silver_orders_processed"

# COMMAND ----------
# DBTITLE 1,Read from Bronze Layer
# Load the bronze table in full (no filter is applied at read time);
# this DataFrame is the sole input of the silver transformation.
df_bronze = spark.table(bronze_table_name)

# COMMAND ----------
# DBTITLE 1,Data Cleaning and Transformation
from pyspark.sql.functions import col, to_timestamp, lit, upper, current_timestamp
from pyspark.sql.types import StringType, IntegerType, DoubleType, TimestampType

# Columns that are carried over under the same name with an explicit target type.
# The list order is the column order of the resulting DataFrame.
_typed_columns = [
    ("order_id", StringType()),
    ("customer_id", StringType()),
    ("product_id", StringType()),
    ("quantity", IntegerType()),
    ("price", DoubleType()),
    ("order_timestamp", TimestampType()),
]

df_silver = df_bronze.select(
    # Cast each pass-through column to its declared type, keeping its name.
    *[
        col(column_name).cast(target_type).alias(column_name)
        for column_name, target_type in _typed_columns
    ],
    # Normalise the status text to upper case and expose it as order_status.
    upper(col("status")).alias("order_status"),
    # Stamp every row with the time the silver processing query ran.
    current_timestamp().alias("processing_timestamp"),
)

# You can add more cleaning rules here:
# - Handle nulls (e.g., fillna, dropna)
# - Data type conversions if not done already
# - Deduplication within the current batch (MERGE only reconciles a batch against rows already in the target; duplicate keys inside one batch must be removed before merging)
# - Filtering out malformed records

# COMMAND ----------
# DBTITLE 1,Implement Upsert Logic (Merge Into)
from delta.tables import DeltaTable
from pyspark.sql.functions import col, row_number
from pyspark.sql.window import Window

# Primary key of the order data. It drives both the de-duplication of the
# incoming batch and the MERGE match condition.
merge_key = "order_id"

# Reduce the batch to exactly one row per key BEFORE any write:
#   * a Delta MERGE fails when several source rows match the same target row;
#   * the initial load would otherwise persist duplicate keys in the new table.
# processing_timestamp is current_timestamp() for every row of the batch, so it
# cannot rank rows; the most recent order_timestamp wins instead (NULLs last).
# NOTE(review): order_timestamp is the only per-row time column visible here -
# if bronze carries an ingestion timestamp, confirm whether it should rank instead.
# NOTE(review): rows whose key is NULL collapse to a single row here, but a NULL
# key never satisfies the merge condition, so that row is inserted on every run.
_latest_first = Window.partitionBy(merge_key).orderBy(
    col("order_timestamp").desc_nulls_last()
)
df_deduplicated = (
    df_silver
    .withColumn("_row_rank", row_number().over(_latest_first))
    .filter(col("_row_rank") == 1)
    .drop("_row_rank")
)

if not spark.catalog.tableExists(silver_table_name):
    # First run: create the silver table from the de-duplicated batch,
    # partitioned by order_status.
    (
        df_deduplicated.write
        .format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .partitionBy("order_status")
        .saveAsTable(silver_table_name)
    )
    print(f"Created silver table: {silver_table_name}")
else:
    # Table already exists: upsert the batch into it.
    deltaTable = DeltaTable.forName(spark, silver_table_name)

    # Diagnostic only; note that count() runs the transformation as its own Spark job.
    deduplicated_count = df_deduplicated.count()
    print(f"Deduplicated row count: {deduplicated_count}")

    (
        deltaTable.alias("target")
        .merge(
            # The de-duplicated batch is the merge source, so each target row
            # can be matched by at most one source row.
            df_deduplicated.alias("source"),
            f"target.{merge_key} = source.{merge_key}",
        )
        # Matched keys: overwrite every non-key column with the incoming values.
        .whenMatchedUpdate(set={
            "customer_id": "source.customer_id",
            "product_id": "source.product_id",
            "quantity": "source.quantity",
            "price": "source.price",
            "order_timestamp": "source.order_timestamp",
            "order_status": "source.order_status",
            "processing_timestamp": "source.processing_timestamp",
        })
        # Unseen keys: insert the full source row.
        .whenNotMatchedInsertAll()
        .execute()
    )
    print(f"Performed upsert on silver table: {silver_table_name}")

# COMMAND ----------
# DBTITLE 1,Verify Silver Table
# spark.sql(f"SELECT * FROM {silver_table_name} LIMIT 10").display()
# spark.sql(f"SELECT COUNT(*) FROM {silver_table_name}").display()
# spark.sql(f"DESCRIBE HISTORY {silver_table_name}").display() # See the history of changes

In [0]:
-- Show every column of every row currently in the silver table.
SELECT *
FROM ordercatalog.silver_schema.silver_orders_processed

In [0]:
-- List every order_id that occurs more than once in the silver table,
-- together with its number of rows.
-- The filter uses COUNT(*) (not COUNT(order_id), which skips NULLs) so that
-- duplicated rows with a NULL order_id are reported as well and the filter
-- agrees with the count that is displayed.
SELECT COUNT(*), order_id
FROM ordercatalog.silver_schema.silver_orders_processed
GROUP BY order_id
HAVING COUNT(*) > 1

In [0]:
-- Inspect all silver rows stored for one specific order id.
SELECT *
FROM ordercatalog.silver_schema.silver_orders_processed
WHERE order_id = 'ORD-76319'

In [0]:
-- DESTRUCTIVE: removes the silver table if it exists (no error when it is absent).
-- Running every cell of this notebook in order ends by dropping the table
-- that the first cell created or upserted into.
DROP TABLE IF EXISTS ordercatalog.silver_schema.silver_orders_processed