In [0]:
# Ad-hoc preview of the staged diagnosis CSV; the first line of the file is
# treated as the header row. Nothing is assigned, so later cells do not
# depend on this one.
display(
    spark.read
    .option("header", "true")
    .csv("abfss://data@anirvandecodesdata.dfs.core.windows.net/staging/diagnosis/diagnosis_raw.csv")
)

In [0]:
# Streaming source: read the bronze patient Delta table incrementally.
df_patient_bronze = spark.readStream.format("delta").load("/mnt/bronze/patient_raw")

# Cleaning stage: convert birth_date with to_date (no explicit format is
# passed) and keep a single row per patient_id.
# `to_date` is expected to be pyspark.sql.functions.to_date, imported outside
# this view.
# NOTE(review): no watermark is set before dropDuplicates, so the dedup state
# presumably grows without bound and later rows for an already-seen
# patient_id are dropped before they reach the sink - confirm this is intended.
df_patient_clean = df_patient_bronze.withColumn(
    "birth_date", to_date("birth_date")
).dropDuplicates(["patient_id"])


In [0]:
def merge_dim_patient(batch_df, batch_id):
    """Upsert one micro-batch of patient rows into the silver patient dimension.

    Intended to be passed to ``DataStreamWriter.foreachBatch``.

    Behaviour:
      * If no Delta table exists at the silver path, the batch is written as
        the initial table with a generated ``patient_sk`` surrogate key.
      * Otherwise the batch is merged on ``patient_id``: matched rows have
        their business columns refreshed while keeping the existing
        ``patient_sk``; unmatched rows are inserted with a new ``patient_sk``
        greater than every key already in the table.

    The previous implementation used ``whenMatchedUpdateAll`` /
    ``whenNotMatchedInsertAll`` with a source that has no ``patient_sk``
    column. Because the target table does have that column, Delta either
    rejects the merge with an analysis error or, when schema auto-merge is
    enabled, inserts NULL surrogate keys.

    Args:
        batch_df: Micro-batch DataFrame. It must contain ``patient_id`` and is
            expected to hold at most one row per ``patient_id`` (a MERGE fails
            when several source rows match the same target row).
        batch_id: Micro-batch identifier supplied by Structured Streaming.
            Unused here.

    Returns:
        None. The function only writes to the Delta table.
    """
    from delta.tables import DeltaTable
    from pyspark.sql import Window
    from pyspark.sql import functions as F

    silver_path = "/mnt/silver/dim_patient"

    # Use the session that owns the micro-batch rather than a notebook global,
    # so the function also works where the global is not visible.
    session = batch_df.sparkSession

    # If the table does not exist yet, create it from this batch.
    if not DeltaTable.isDeltaTable(session, silver_path):
        (
            batch_df.withColumn("patient_sk", F.monotonically_increasing_id())
                    .write
                    .format("delta")
                    .save(silver_path)
        )
        return

    # Load the existing Delta table.
    dim_patient = DeltaTable.forPath(session, silver_path)

    # Highest surrogate key handed out so far (None when the table is empty).
    max_sk = dim_patient.toDF().agg(F.max("patient_sk")).first()[0]
    sk_offset = -1 if max_sk is None else max_sk

    # Never let an incoming column overwrite the surrogate key.
    source = batch_df.drop("patient_sk")
    business_cols = source.columns

    # row_number over patient_id is deterministic for unique patient_ids, which
    # matters because MERGE may evaluate its source more than once. The
    # un-partitioned window pulls the batch into a single partition, which is
    # acceptable for micro-batch sized inputs. Keys computed for rows that end
    # up matched are simply discarded, so key gaps are possible.
    staged = source.withColumn(
        "patient_sk",
        F.lit(sk_offset).cast("long") + F.row_number().over(Window.orderBy("patient_id")),
    )

    update_set = {c: f"s.`{c}`" for c in business_cols}
    insert_values = {c: f"s.`{c}`" for c in staged.columns}

    (
        dim_patient.alias("t")
        .merge(staged.alias("s"), "t.patient_id = s.patient_id")
        .whenMatchedUpdate(set=update_set)
        .whenNotMatchedInsert(values=insert_values)
        .execute()
    )


In [0]:
# Start the silver patient-dimension load. Each micro-batch of
# df_patient_clean is handed to merge_dim_patient; progress is tracked in the
# checkpoint directory, and the availableNow trigger processes the data that
# is currently available and then stops the query.
# Left as a bare expression so the notebook cell still shows the returned
# query handle.
(
    df_patient_clean.writeStream
    .option("checkpointLocation", "/mnt/chk/silver/dim_patient")
    .trigger(availableNow=True)
    .outputMode("update")
    .format("delta")
    .foreachBatch(merge_dim_patient)
    .start()
)
