In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.types import *

# Explicit schema for the raw order JSON. Every field is nullable; only
# order_timestamp is parsed into a non-string type at read time.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("order_id", StringType()),
        ("customer_id", StringType()),
        ("product_id", StringType()),
        ("quantity", StringType()),
        ("price", StringType()),
        ("order_timestamp", TimestampType()),
        ("status", StringType()),
    ]
])


# Load the JSON documents from the external ADLS location, applying the
# schema above rather than letting Spark infer one.
df = (
    spark.read
    .option("multiLine", True)
    .schema(schema)
    .json("abfss://rawdata@adlsexternalfororders.dfs.core.windows.net/orders/")
)

# Persist the result as a Delta table, replacing any existing contents.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("ordercatalog.rawdata_schema.allorders")
)

In [0]:
%sql
-- Total number of rows in the raw orders table.
SELECT COUNT(*)
FROM ordercatalog.rawdata_schema.allorders

In [0]:
%sql
-- Show every column of the raw orders table.
SELECT *
FROM ordercatalog.rawdata_schema.allorders

In [0]:
# Databricks Notebook: bronze_orders_load
# Language: Python

# COMMAND ----------
# DBTITLE 1,Configuration
# Source folder containing the raw order JSON files, and the fully
# qualified name of the bronze Delta table this notebook loads.
raw_data_path = "abfss://rawdata@adlsexternalfororders.dfs.core.windows.net/orders/"
bronze_table_name = "ordercatalog.bronze_schema.bronze_orders_raw"

# COMMAND ----------
# DBTITLE 1,Read Raw Data
# Batch read of all JSON files under raw_data_path.
# Production alternative - incremental ingestion with Auto Loader:
# df = spark.readStream.format("cloudFiles") \
#   .option("cloudFiles.format", "json") \
#   .option("cloudFiles.schemaLocation", f"{raw_data_path}_schemas/orders_bronze") \
#   .load(raw_data_path)

# `schema` is not defined in this cell; it must already exist in the
# notebook session (the first cell of this notebook builds it).
df_raw = (
    spark.read
    .option("multiLine", True)
    .schema(schema)
    .json(raw_data_path)
)
# COMMAND ----------
# DBTITLE 1,Add Ingestion Metadata
from pyspark.sql.functions import current_timestamp, input_file_name

# Tag each row with the load time and the file it was read from.
df_bronze = (
    df_raw
    .withColumn("ingestion_timestamp", current_timestamp())
    .withColumn("source_file", input_file_name())
)

# COMMAND ----------
# DBTITLE 1,Read Raw Data (using Auto Loader for production)
from pyspark.sql.functions import col, current_timestamp, input_file_name, row_number
from pyspark.sql.window import Window

# For production, use Auto Loader for incremental ingestion and resilience.
# Note: for strict record-level deduplication directly in bronze,
# Auto Loader typically needs to be combined with MERGE INTO.
# This example simulates a batch load that is merged afterwards.

# This batch run re-reads every JSON file in the folder; in a real scenario
# Auto Loader's file tracking (via checkpointLocation) would limit the read
# to files that have not been processed yet.
# NOTE(review): unlike the batch read earlier in this notebook, this read
# neither sets multiLine nor applies the explicit schema, so column types are
# inferred here - confirm which form matches the source files.
df_raw_incoming = spark.read.format("json").load(raw_data_path)

# Add ingestion metadata: load time and originating file of each row.
df_bronze_stg = (
    df_raw_incoming
    .withColumn("ingestion_timestamp", current_timestamp())
    .withColumn("source_file", input_file_name())
)

# Deduplicate the incoming batch before merging, in case the batch contains
# several rows for the same merge key. Keep the row with the latest
# order_timestamp (nulls last) per order_id.
# A window + row_number() is used because orderBy() followed by
# dropDuplicates() does not guarantee which row of each group survives.
latest_per_order = (
    Window
    .partitionBy("order_id")
    .orderBy(col("order_timestamp").desc_nulls_last())
)
df_bronze_stg = (
    df_bronze_stg
    .withColumn("_row_rank", row_number().over(latest_per_order))
    .filter(col("_row_rank") == 1)
    .drop("_row_rank")  # helper column must not reach the bronze table
)


# COMMAND ----------
# DBTITLE 1,Implement Idempotent Load using MERGE INTO
from delta.tables import DeltaTable

# Unique key used to match incoming rows against the bronze table.
# For raw orders, 'order_id' is a common choice for logical uniqueness.
# If order_id can have multiple valid versions in the source, a composite key
# may be needed, or a decision on updates vs. new inserts (see merge notes below).
merge_key1 = "order_id"
# NOTE(review): merge_key2 is not used by the merge condition below; kept in
# case other cells rely on it - confirm and remove if unused.
merge_key2 = "price"

if not spark.catalog.tableExists(bronze_table_name):
    # First run: the table does not exist yet, so create it from the staged data.
    df_bronze_stg.write \
        .format("delta") \
        .mode("overwrite") \
        .option("overwriteSchema", "true") \
        .saveAsTable(bronze_table_name)
    # bronze_table_name is already fully qualified (catalog.schema.table).
    print(f"Created bronze table: {bronze_table_name}")
else:
    # Table exists: insert only the rows whose key is not present yet.
    deltaTable = DeltaTable.forName(spark, bronze_table_name)

    print(f"Performing MERGE INTO on bronze table: {bronze_table_name}")

    # Merge logic:
    # WHEN MATCHED: no clause is defined, so nothing happens. If an 'order_id'
    #               from the incoming data already exists in the bronze table it
    #               is not updated, so bronze keeps the FIRST instance of that
    #               order_id. To keep the latest version instead, add a
    #               `WHEN MATCHED THEN UPDATE SET *` clause; that is less common
    #               for a pure "raw immutable" bronze layer.
    # WHEN NOT MATCHED: INSERT ALL. Insert rows whose 'order_id' is not in bronze.
    deltaTable.alias("target") \
        .merge(
            df_bronze_stg.alias("source"),
            f"target.{merge_key1} = source.{merge_key1}"
        ) \
        .whenNotMatchedInsertAll() \
        .execute()
    print(f"MERGE INTO operation complete for {bronze_table_name}.")

# COMMAND ----------
# DBTITLE 1,Verify Bronze Table
# spark.sql(f"SELECT * FROM {bronze_table_name} LIMIT 10").display()
# spark.sql(f"SELECT COUNT(*) FROM {bronze_table_name}").display()
# spark.sql(f"SELECT COUNT(DISTINCT order_id) FROM {bronze_table_name}").display() # Should be equal to total count if successful

In [0]:
%sql
-- Show every column of the bronze orders table.
SELECT *
FROM ordercatalog.bronze_schema.bronze_orders_raw

In [0]:
%sql
-- Duplicate check: list each order_id that occurs more than once in bronze,
-- together with its number of occurrences.
SELECT
  order_id,
  COUNT(order_id) AS order_count
FROM
  ordercatalog.bronze_schema.bronze_orders_raw
GROUP BY
  order_id
HAVING
  COUNT(order_id) > 1;

In [0]:
%sql
-- Inspect all bronze rows for a single order.
SELECT *
FROM ordercatalog.bronze_schema.bronze_orders_raw
WHERE order_id = 'ORD-77697'

In [0]:
%sql
-- Destructive: removes the bronze rows for order ORD-77697 whose price is 155.81.
DELETE FROM ordercatalog.bronze_schema.bronze_orders_raw
WHERE order_id = 'ORD-77697'
  AND price = 155.81
    



In [0]:
# WARNING: destructive. The statement has no WHERE clause, so it deletes every
# row of the bronze table. Running the whole notebook top to bottom executes it.
spark.sql(
    "DELETE FROM ordercatalog.bronze_schema.bronze_orders_raw"
)

In [0]:
%sql
-- Duplicate check: row count per order_id, limited to ids that occur more than once.
SELECT COUNT(*), order_id
FROM ordercatalog.bronze_schema.bronze_orders_raw
GROUP BY order_id
HAVING COUNT(order_id) > 1