In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.types import *

# Explicit schema for the order JSON records. Each entry below is a
# (column name, Spark type) pair; every column is declared nullable.
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("order_id", StringType()),
        ("customer_id", StringType()),
        ("product_id", StringType()),
        ("quantity", StringType()),
        ("price", StringType()),
        ("order_timestamp", TimestampType()),
        ("status", StringType()),
    )
])


# Load the order JSON files from the external ADLS path using the explicit
# schema instead of inference. multiLine allows a JSON record to span
# several lines of a file.
df = (
    spark.read
    .option("multiLine", True)
    .schema(schema)
    .json("abfss://rawdata@adlsexternalfororders.dfs.core.windows.net/orders/")
)

# Persist as a managed Delta table (Unity Catalog or Hive Metastore),
# replacing whatever the table held before.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("ordercatalog.rawdata_schema.allorders")
)

In [0]:
%sql
-- Row count of the allorders table.
SELECT count(*)
FROM ordercatalog.rawdata_schema.allorders

In [0]:
%sql
-- Show every column of the allorders table for inspection.
SELECT *
FROM ordercatalog.rawdata_schema.allorders

In [0]:
# Databricks Notebook: bronze_orders_load
# Language: Python

# COMMAND ----------
# DBTITLE 1,Configuration
# Session database that is created (if missing) and made current below.
# NOTE(review): bronze_table_name is fully qualified with a different schema
# ("rawdata_schema"), so this database does not appear to affect where the
# bronze table is written - confirm whether it is still needed.
database_name = "data_mart" # Or your Unity Catalog schema/database

# Fully qualified (catalog.schema.table) name of the bronze target table.
bronze_table_name = "ordercatalog.rawdata_schema.bronze_orders_raw"

# ADLS folder that holds the raw order JSON files.
raw_data_path = "abfss://rawdata@adlsexternalfororders.dfs.core.windows.net/orders/"

# Make sure the session database exists, then switch the session to it.
spark.sql(
    f"CREATE DATABASE IF NOT EXISTS {database_name}"
)
spark.catalog.setCurrentDatabase(
    database_name
)

# COMMAND ----------
# DBTITLE 1,Read Raw Data
# Read all JSON files from the raw data path
# For production, consider using Auto Loader for incremental ingestion:
# df = spark.readStream.format("cloudFiles") \
#   .option("cloudFiles.format", "json") \
#   .option("cloudFiles.schemaLocation", f"{raw_data_path}_schemas/orders_bronze") \
#   .load(raw_data_path)

# Batch read for this example: load everything under raw_data_path in one go,
# applying the explicit schema and allowing records to span multiple lines.
df_raw = (
    spark.read
    .option("multiLine", True)
    .schema(schema)
    .json(raw_data_path)
)
# COMMAND ----------
# DBTITLE 1,Add Ingestion Metadata
from pyspark.sql.functions import current_timestamp, input_file_name

# Tag each raw row with the load time and the file it was read from.
# NOTE(review): Databricks documents input_file_name() as deprecated in favour
# of the _metadata.file_path column - confirm against the runtime in use.
df_bronze = (
    df_raw
    .withColumn("ingestion_timestamp", current_timestamp())
    .withColumn("source_file", input_file_name())
)

# COMMAND ----------
# DBTITLE 1,Write to Bronze Delta Table
# Bronze layer typically appends new data. mergeSchema lets the append add
# columns that the existing Delta table does not have yet.
# NOTE(review): df_bronze comes from a full batch read of the raw folder, so
# re-running this cell presumably appends the same files again - confirm
# whether duplicates are acceptable or incremental ingestion is needed.
(
    df_bronze.write
    .format("delta")
    .mode("append")
    .option("mergeSchema", "true")
    .saveAsTable(bronze_table_name)
)

# bronze_table_name is already fully qualified (catalog.schema.table), so it is
# reported as-is. Prefixing database_name produced a name
# ("data_mart.ordercatalog.rawdata_schema.bronze_orders_raw") that is not the
# table written above.
print(f"Successfully loaded raw data into bronze table: {bronze_table_name}")

# COMMAND ----------
# DBTITLE 1,Verify Bronze Table
# spark.sql(f"SELECT * FROM {bronze_table_name} LIMIT 10").display()
# spark.sql(f"SELECT COUNT(*) FROM {bronze_table_name}").display()

In [0]:
%sql
-- Show every column of the bronze orders table for inspection.
SELECT *
FROM ordercatalog.rawdata_schema.bronze_orders_raw