In [0]:
%sql

-- Create the Silver schema if it is missing so tables can be saved under it.
CREATE SCHEMA IF NOT EXISTS anirvandecodes.silver;

In [0]:
# Source (Bronze) and target (Silver) tables of the visit fact load.
bronze_table = "anirvandecodes.bronze.visit_raw"
silver_table = "anirvandecodes.silver.fact_visit"

# Checkpoint directory handed to the streaming write via `checkpointLocation`.
checkpoint_path = (
    "abfss://data@anirvandecodesdata.dfs.core.windows.net"
    "/silver/fact_visit/checkpoint/"
)

In [0]:

from pyspark.sql.functions import col, lag, to_date, datediff, current_timestamp
from pyspark.sql.window import Window
from delta.tables import DeltaTable

In [0]:
%sql

-- Preview the raw visit records in the Bronze table.
SELECT *
FROM anirvandecodes.bronze.visit_raw;

In [0]:
# Raw visit feed (same value as the earlier configuration cell).
bronze_table = "anirvandecodes.bronze.visit_raw"

# Silver dimension tables that the raw visits are joined against.
silver_diagnosis_table = "anirvandecodes.silver.dim_diagnosis"
silver_hospital_table = "anirvandecodes.silver.dim_hospital"
silver_patient_table = "anirvandecodes.silver.dim_patient"

In [0]:

# Load the three Silver dimension tables as static (batch) DataFrames.
df_patient, df_hospital, df_diagnosis = (
    spark.read.table(table_name)
    for table_name in (
        silver_patient_table,
        silver_hospital_table,
        silver_diagnosis_table,
    )
)

In [0]:
# Read the Bronze visit table as a stream so each run only sees new rows.
df_visit_bronze = spark.readStream.table(bronze_table)

In [0]:
# -------------------------
# Join Raw Visit with Dimension tables
# -------------------------
# Step 1: left-join each dimension so a visit is kept even when a lookup misses.
visits_with_dimensions = (
    df_visit_bronze
        .join(df_patient, on="patient_id", how="left")
        .join(df_hospital, on="hospital_id", how="left")
        .join(df_diagnosis, on="diagnosis_code", how="left")
)

# Step 2: convert both date columns with to_date and stamp the load time.
df_fact_new = (
    visits_with_dimensions
        .withColumn("admission_date", to_date(col("admission_date")))
        .withColumn("discharge_date", to_date(col("discharge_date")))
        .withColumn("load_timestamp", current_timestamp())
)


In [0]:
# -------------------------
# Combine with existing fact (to calculate prev_discharge)
# -------------------------

if spark.catalog.tableExists(silver_table):
    df_fact_existing = spark.read.table(silver_table)
    df_all = df_fact_existing.select("visit_id","patient_id","admission_date","discharge_date","cost",
                                     "hospital_id","diagnosis_code") \
                             .unionByName(df_fact_new.select("visit_id","patient_id","admission_date","discharge_date","cost",
                                                             "hospital_id","diagnosis_code"))
else:
    df_all = df_fact_new

In [0]:
# -------------------------
# Compute prev_discharge and readmission info
# -------------------------
window_patient = Window.partitionBy("patient_id").orderBy("admission_date")

df_fact_processed = (
    df_all
        .withColumn("prev_discharge", lag("discharge_date").over(window_patient))
        .withColumn("days_since_last_discharge", datediff(col("admission_date"), col("prev_discharge")))
        .withColumn("is_readmission_30d", (col("days_since_last_discharge") <= 30).cast("int"))
        .withColumn("load_timestamp", current_timestamp())
)


In [0]:
# -------------------------
# Merge into Silver fact_visit
# -------------------------
def merge_fact_visit(batch_df, batch_id):

    if not spark.catalog.tableExists(silver_table):
        batch_df.write.format("delta").mode("overwrite").saveAsTable(silver_table)
        return

    fact = DeltaTable.forName(spark, silver_table)
    fact.alias("t").merge(
        batch_df.alias("s"),
        "t.visit_id = s.visit_id"
    ) \
    .whenMatchedUpdateAll() \
    .whenNotMatchedInsertAll() \
    .execute()

In [0]:
# -------------------------
# Run as availableNow incremental
# -------------------------
(
    df_fact_processed.writeStream
        .foreachBatch(merge_fact_visit)
        .outputMode("update")
        .trigger(availableNow=True)
        .option("checkpointLocation", checkpoint_path)
        .start()
)

In [0]:
%sql

-- Inspect the Silver visit fact table.
SELECT *
FROM anirvandecodes.silver.fact_visit;