In [0]:
 %run ./00setupconfig


In [0]:
# Databricks notebook source
# MAGIC %md
# MAGIC # 01 - Bronze Layer: Direct Autoloader
# MAGIC **Goal:** Stream CSVs directly from Volume to Delta Bronze table.

# COMMAND ----------
# MAGIC %run ./00_setup_config

# COMMAND ----------
from pyspark.sql import functions as F

# Echo the source path and target table (names expected from 00_setup_config)
# so a wrong configuration is visible before any stream is started.
print("Ingesting from: {}".format(SOURCE_DATA_PATH))
print("Writing to:     {}.{}.{}".format(CATALOG, SCHEMA, BRONZE_TABLE))

# COMMAND ----------
# MAGIC %md
# MAGIC ## 1. Configure and Run Autoloader
# MAGIC This replaces manual batching with a continuous/scheduled stream.

# COMMAND ----------
# Define the Auto Loader stream that reads CSV files from SOURCE_DATA_PATH.
# NOTE(review): the schema location is built by plain string concatenation, so
# CHECKPOINT_LOCATION presumably ends with "/" -- confirm in 00_setup_config.
# The path is intentionally left unchanged so existing state is not orphaned.
bronze_stream = (spark.readStream
    .format("cloudFiles")
    .option("cloudFiles.format", "csv")
    # Autoloader handles schema inference and evolution automatically;
    # the inferred schema is tracked under this location.
    .option("cloudFiles.schemaLocation", f"{CHECKPOINT_LOCATION}schema_bronze")
    .option("cloudFiles.inferColumnTypes", "true")
    .option("header", "true")
    .load(SOURCE_DATA_PATH)
    # Add metadata for audit
    .withColumn("ingestion_timestamp", F.current_timestamp())
    # Take the file path from the hidden _metadata column instead of
    # input_file_name(), which is not supported on Unity Catalog
    # shared-access compute.
    .withColumn("source_file", F.col("_metadata.file_path"))
)

# COMMAND ----------
# MAGIC %md
# MAGIC ## 2. Write to Bronze Table
# MAGIC Uses `availableNow=True` for a straightforward "Trigger Once"-style load;
# MAGIC remove the trigger line to run a continuous background stream instead.

# COMMAND ----------
# Fully qualified target table, with each part backtick-quoted so names that
# contain characters such as "-" remain valid identifiers. Built once and
# reused for both the write and the verification read.
bronze_table_fqn = f"`{CATALOG}`.`{SCHEMA}`.`{BRONZE_TABLE}`"

# Append the stream to the Delta table; mergeSchema lets new source columns be
# added to the table instead of failing the write.
query = (
    bronze_stream.writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", f"{CHECKPOINT_LOCATION}bronze_stream")
    .option("mergeSchema", "true")
    .trigger(availableNow=True)
    .toTable(bronze_table_fqn)
)

# Wait for the streaming query to terminate before reporting and reading back.
query.awaitTermination()

print(f"✅ Data successfully loaded into {CATALOG}.{SCHEMA}.{BRONZE_TABLE}")

# Quick verification: show a small sample of the target table.
display(spark.table(bronze_table_fqn).limit(5))

In [0]:
%sql
-- Ad-hoc look at the bronze table contents.
-- NOTE(review): catalog/schema/table are hard-coded here; presumably they match
-- CATALOG / SCHEMA / BRONZE_TABLE from the setup notebook -- confirm.
SELECT *
FROM `na-dbxtraining`.biju_bronze.bronze_green_trips