In [0]:
# Databricks notebook source
# MAGIC %md
# MAGIC # 00 - Setup and Configuration 
# MAGIC 
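# MAGIC **Purpose:** Define the shared pipeline configuration (catalog, schemas, table names, storage paths), create the layer schemas, and set Spark session options.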

# COMMAND ----------

# MAGIC %md
# MAGIC ## Configuration Parameters

# COMMAND ----------

# Unity Catalog configuration.
# The catalog name contains a hyphen, so it has to be backtick-quoted whenever
# it is interpolated into a SQL statement.
CATALOG = "na-dbxtraining"

# One schema per medallion layer.
BRONZE_SCHEMA = "biju_bronze"
SILVER_SCHEMA = "biju_silver"
GOLD_SCHEMA = "biju_gold"

# Bronze layer table names.
BRONZE_TABLE = "bronze_green_trips"

# Silver layer table names (star schema: one fact table plus dimensions).
FACT_TABLE = "fact_green_trips"
DIM_VENDOR = "dim_vendor"
DIM_RATECODE = "dim_ratecode"
DIM_PAYMENT_TYPE = "dim_payment_type"
DIM_TRIP_TYPE = "dim_trip_type"
DIM_DATE = "dim_date"
DIM_LOCATION = "dim_location"

# Gold layer table names.
GOLD_DAILY_SUMMARY = "gold_daily_summary"
GOLD_PAYMENT_SUMMARY = "gold_payment_summary"

# Storage paths. Both live under the same volume directory; the catalog
# component is taken from CATALOG instead of being repeated as a literal so
# the two cannot drift apart.
_VOLUME_ROOT = f"/Volumes/{CATALOG}/biju_raw/biju_vol/greencabs"
CHECKPOINT_LOCATION = f"{_VOLUME_ROOT}/checkpoints/"
SOURCE_DATA_PATH = f"{_VOLUME_ROOT}/raw_data/"

# Liquid clustering configuration: columns used as clustering keys.
CLUSTER_COLUMNS = ["pickup_date", "vendor_id", "payment_type_id"]

# Broadcast join threshold in bytes (10 MB).
BROADCAST_THRESHOLD = 10 * 1024 * 1024

# Echo the key settings so the notebook output records what was loaded.
print("✅ Configuration loaded")
print(f"   Catalog: {CATALOG}")
print(f"   Bronze Schema: {BRONZE_SCHEMA}")
print(f"   Silver Schema: {SILVER_SCHEMA}")
print(f"   Gold Schema: {GOLD_SCHEMA}")
print(f"   Source Path: {SOURCE_DATA_PATH}")
print(f"   Checkpoint: {CHECKPOINT_LOCATION}")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Create Schemas

# COMMAND ----------

# Create one schema per medallion layer (no-op for schemas that already exist).
# Both identifiers are backtick-quoted: the catalog name contains a hyphen, and
# quoting the schema as well keeps the statement valid if a schema name is ever
# changed to include special characters too.
_layer_schemas = (
    ("Bronze", BRONZE_SCHEMA),
    ("Silver", SILVER_SCHEMA),
    ("Gold", GOLD_SCHEMA),
)

for _layer, _schema in _layer_schemas:
    spark.sql(f"CREATE SCHEMA IF NOT EXISTS `{CATALOG}`.`{_schema}`")

# Report the fully qualified schema names after all statements succeeded.
print("✅ Schemas created:")
for _layer, _schema in _layer_schemas:
    print(f"   {_layer}: `{CATALOG}`.{_schema}")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Spark Configurations

# COMMAND ----------

# Session-level Spark settings for the pipeline, applied in the order listed.
# Every value is passed as a string so all settings are handled uniformly.
_spark_settings = {
    # Broadcast join threshold (bytes)
    "spark.sql.autoBroadcastJoinThreshold": str(BROADCAST_THRESHOLD),
    # Adaptive Query Execution
    "spark.sql.adaptive.enabled": "true",
    "spark.sql.adaptive.coalescePartitions.enabled": "true",
    # Liquid Clustering
    "spark.databricks.delta.clusteredTable.enableClusteringTablePreview": "true",
    # Auto Optimize
    "spark.databricks.delta.optimizeWrite.enabled": "true",
    "spark.databricks.delta.autoCompact.enabled": "true",
}

for _conf_key, _conf_value in _spark_settings.items():
    spark.conf.set(_conf_key, _conf_value)

print("✅ Spark configurations set")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Save Configuration

# COMMAND ----------

# Collect every setting defined above into a single dictionary.
config = {
    "catalog": CATALOG,
    "bronze_schema": BRONZE_SCHEMA,
    "silver_schema": SILVER_SCHEMA,
    "gold_schema": GOLD_SCHEMA,
    "bronze_table": BRONZE_TABLE,
    "fact_table": FACT_TABLE,
    "dim_vendor": DIM_VENDOR,
    "dim_ratecode": DIM_RATECODE,
    "dim_payment_type": DIM_PAYMENT_TYPE,
    "dim_trip_type": DIM_TRIP_TYPE,
    "dim_date": DIM_DATE,
    "dim_location": DIM_LOCATION,
    "gold_daily_summary": GOLD_DAILY_SUMMARY,
    "gold_payment_summary": GOLD_PAYMENT_SUMMARY,
    "checkpoint_location": CHECKPOINT_LOCATION,
    "source_data_path": SOURCE_DATA_PATH,
    "cluster_columns": CLUSTER_COLUMNS,
    "broadcast_threshold": BROADCAST_THRESHOLD,
}

# Publish the configuration as a one-row temp view named "pipeline_config"
# (one column per key).
# NOTE(review): temp views are scoped to the Spark session, so another notebook
# only sees this view if it shares this session (for example by pulling this
# notebook in with %run) - confirm how the downstream notebooks load it.
spark.createDataFrame([config]).createOrReplaceTempView("pipeline_config")

print("✅ Configuration saved to pipeline_config temp view")
print("\n📋 Full Configuration:")
for key, value in config.items():
    print(f"   {key}: {value}")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Ready
# MAGIC 
# MAGIC Configuration complete. Run next notebook: `01_bronze_autoloader.py`