![./ImageLab.png](./Images/ImageLab.png "./ImageLab.png")

# Data Engineering with Lakeflow, Jobs, AutoLoader and more

## Create Gold Layer

In [0]:
# Root folder for the streaming checkpoints: the value of the "param_location"
# notebook widget with "/checkpoint" appended.
_param_location = dbutils.widgets.get("param_location")
CHECKPOINT_LOCATION = f"{_param_location}/checkpoint"
from pyspark.sql import functions as F

Customer Dimension

In [0]:
from pyspark.sql import functions as F
from delta.tables import DeltaTable

# Configuration
# Fully qualified name of the Gold table that upsert_gold_batch creates/merges into.
TARGET_TABLE_GOLD = "medallion_autoloader.gold.dim_customer"
# Fully qualified name of the Silver table that is read as a stream below.
SOURCE_TABLE_SILVER = "medallion_autoloader.silver.dim_customer"

# --- Upsert Function (SCD Type 1) ---
def upsert_gold_batch(batch_df, batch_id):
    """Upsert one micro-batch into the Gold customer dimension (SCD Type 1).

    Meant to be used as a ``foreachBatch`` sink. When ``TARGET_TABLE_GOLD``
    does not exist yet it is created from the batch. Otherwise the batch is
    merged on ``customer_bk``: matched target rows are overwritten with the
    source values and unmatched source rows are inserted.

    Args:
        batch_df: Micro-batch DataFrame. It must contain ``customer_bk``;
            every one of its columns is written to the target.
        batch_id: Micro-batch id passed in by Structured Streaming (unused).

    Raises:
        Exception: Any table-creation error other than "table already
            exists", and any error raised by the merge itself.
    """
    # The source stream is read with "ignoreChanges", which can re-emit rows
    # that did not change. Exact duplicates inside one batch would either be
    # written twice (create / insert paths) or make the MERGE fail because
    # several source rows match one target row. Dropping fully identical rows
    # is safe: whichever copy is kept carries the same values.
    # Rows that share a customer_bk but differ in content are deliberately
    # left alone so the MERGE still fails loudly instead of silently picking
    # an arbitrary winner.
    batch_df = batch_df.dropDuplicates()

    # 1. If table does not exist, create it (with basic concurrency handling)
    if not spark.catalog.tableExists(TARGET_TABLE_GOLD):
        try:
            batch_df.write.format("delta").mode("error").saveAsTable(TARGET_TABLE_GOLD)
            return
        except Exception as e:
            # Another writer may have created the table between the existence
            # check and the write; only in that case fall through to the merge.
            message = str(e)
            if "DELTA_TABLE_ALREADY_EXISTS" not in message and "already exists" not in message:
                raise

    # 2. Define Merge Logic (Upsert)
    delta_gold = DeltaTable.forName(spark, TARGET_TABLE_GOLD)

    # Dynamic dictionary: maps all DF columns for the update operation
    # E.g.: "customer_name": "src.customer_name", "region": "src.region"...
    update_columns = {col: f"src.{col}" for col in batch_df.columns}

    (
        delta_gold.alias("tgt")
        .merge(
            source=batch_df.alias("src"),
            condition="tgt.customer_bk = src.customer_bk"  # Business Key is the unique link in Gold
        )
        # No change predicate: every matched row is overwritten with the source values.
        .whenMatchedUpdate(set=update_columns)
        .whenNotMatchedInsert(values=update_columns)  # Insert if new
        .execute()
    )

# --- Reading from Silver ---
# Open the Silver table as a streaming source with the "ignoreChanges"
# reader option enabled.
df_silver_stream = (
    spark.readStream
    .option("ignoreChanges", "true")
    .table(SOURCE_TABLE_SILVER)
)

# --- Transformation ---
# Keep only rows flagged as the current version, then remove the three
# versioning columns so they are not carried into Gold.
is_current_row = F.col("is_current") == F.lit(True)
df_gold_input = (
    df_silver_stream
    .where(is_current_row)
    .drop("valid_from", "valid_to", "is_current")
)

# --- Writing ---
# Each micro-batch is handed to upsert_gold_batch. The availableNow trigger
# is used and the cell blocks until the query terminates.
gold_checkpoint_path = CHECKPOINT_LOCATION + "/gold/dim_customer"
gold_stream_writer = df_gold_input.writeStream.foreachBatch(upsert_gold_batch)
gold_stream_writer = gold_stream_writer.option("checkpointLocation", gold_checkpoint_path)
gold_stream_writer = gold_stream_writer.trigger(availableNow=True)

query = gold_stream_writer.start()
query.awaitTermination()

# --- Recreate View ---
# The view is a plain SELECT * over the Gold table; it is (re)defined here,
# after the streaming query above has terminated.
view_ddl = f"""
  CREATE OR REPLACE VIEW medallion_autoloader.gold.vw_dim_customer AS
  SELECT * FROM {TARGET_TABLE_GOLD}
"""
spark.sql(view_ddl)