# Databricks notebook source
# MAGIC %md
# MAGIC # 00 - Setup and Configuration (Simplified)
# MAGIC 
# MAGIC **Purpose:** Configuration only - no data generation or quality checks
# MAGIC 
# MAGIC **Author:** Data Engineering Team  
# MAGIC **Last Updated:** December 2024

# COMMAND ----------

# MAGIC %md
# MAGIC ## Configuration Parameters

# COMMAND ----------

# Unity Catalog configuration.
# NOTE: the catalog name contains a hyphen, so it has to be backtick-quoted
# (`na-dbxtraining`) wherever it is interpolated into a SQL statement.
CATALOG = "na-dbxtraining"
SCHEMA = "biju_bronze"

# Table names (bare identifiers, not qualified with catalog/schema).
BRONZE_TABLE = "bronze_green_trips"
FACT_TABLE = "fact_green_trips"
DIM_VENDOR = "dim_vendor"
DIM_RATECODE = "dim_ratecode"
DIM_PAYMENT_TYPE = "dim_payment_type"
DIM_TRIP_TYPE = "dim_trip_type"
DIM_DATE = "dim_date"
DIM_LOCATION = "dim_location"

# Storage paths under a Unity Catalog volume. These repeat the catalog name
# literally, so they must be edited by hand if CATALOG ever changes.
CHECKPOINT_LOCATION = "/Volumes/na-dbxtraining/biju_raw/biju_vol/greencabs/checkpoints/"
SOURCE_DATA_PATH = "/Volumes/na-dbxtraining/biju_raw/biju_vol/greencabs/raw_data/"

# Liquid clustering configuration: column names to cluster on.
CLUSTER_COLUMNS = ["pickup_date", "vendor_id", "payment_type_id"]

# Broadcast join threshold: 10 MB expressed in bytes.
BROADCAST_THRESHOLD = 10 * 1024 * 1024

# Echo the key settings so the notebook output shows what was loaded.
# (The status marker was previously a mis-decoded UTF-8 sequence.)
print("✅ Configuration loaded")
print(f"   Catalog: {CATALOG}")
print(f"   Schema: {SCHEMA}")
print(f"   Source Path: {SOURCE_DATA_PATH}")
print(f"   Checkpoint: {CHECKPOINT_LOCATION}")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Spark Configurations

# COMMAND ----------

# Session-level Spark settings for the pipeline.
# `spark` is not defined in this file; presumably it is the SparkSession
# that the Databricks notebook runtime injects.

# Broadcast join threshold, in bytes (value comes from BROADCAST_THRESHOLD).
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", BROADCAST_THRESHOLD)

# Adaptive Query Execution, including post-shuffle partition coalescing.
spark.conf.set("spark.sql.adaptive.enabled", "true")
spark.conf.set("spark.sql.adaptive.coalescePartitions.enabled", "true")

# Liquid clustering preview flag.
# NOTE(review): may be unnecessary on runtimes where liquid clustering is
# generally available - confirm against the cluster's runtime version.
spark.conf.set("spark.databricks.delta.clusteredTable.enableClusteringTablePreview", "true")

# Delta auto optimize: optimized writes plus automatic compaction.
spark.conf.set("spark.databricks.delta.optimizeWrite.enabled", "true")
spark.conf.set("spark.databricks.delta.autoCompact.enabled", "true")

# (The status marker was previously a mis-decoded UTF-8 sequence.)
print("✅ Spark configurations set")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Save Configuration

# COMMAND ----------

# Collect every setting into one dictionary so it can be published as a
# single-row DataFrame and printed in full.
config = {
    "catalog": CATALOG,
    "schema": SCHEMA,
    "bronze_table": BRONZE_TABLE,
    "fact_table": FACT_TABLE,
    "dim_vendor": DIM_VENDOR,
    "dim_ratecode": DIM_RATECODE,
    "dim_payment_type": DIM_PAYMENT_TYPE,
    "dim_trip_type": DIM_TRIP_TYPE,
    "dim_date": DIM_DATE,
    "dim_location": DIM_LOCATION,
    "checkpoint_location": CHECKPOINT_LOCATION,
    "source_data_path": SOURCE_DATA_PATH,
    "cluster_columns": CLUSTER_COLUMNS,
    "broadcast_threshold": BROADCAST_THRESHOLD,
}

# Publish the settings as a one-row temp view named `pipeline_config`
# (one column per dictionary key).
# NOTE(review): temp views are scoped to the Spark session, so other
# notebooks can only read this view if they share this session (for example
# by pulling this notebook in with %run) - verify how downstream notebooks
# load the configuration.
spark.createDataFrame([config]).createOrReplaceTempView("pipeline_config")

# (The status markers were previously mis-decoded UTF-8 sequences.)
print("✅ Configuration saved to pipeline_config temp view")
print("\n📋 Full Configuration:")
for key, value in config.items():
    print(f"   {key}: {value}")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Ready
# MAGIC 
# MAGIC Configuration complete. Run next notebook: `01_bronze_autoloader.py`