In [0]:
from pyspark import pipelines as pl
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Glob for every customer_data_*.json file in the schema_drift folder
# of the datastore volume; this is the path handed to the stream reader.
volume_path = (
    "/Volumes/workspace/damg7370/datastore/schema_drift/"
    "customer_data_*.json"
)


In [0]:
# ---------- Helper: handle DATATYPE changes from _rescued_data ----------
def process__rescue_data_datatype_change(df, target_schema: StructType):
    """Cast columns to their target types, preferring values held in ``_rescued_data``.

    ``_rescued_data`` is parsed as a JSON object into a string-to-string map.
    For every field in ``target_schema`` the output column is:

    * the rescued value cast to the field's type, when the map contains a key
      equal to the field name;
    * otherwise the existing column cast to the field's type;
    * otherwise (column absent from ``df``) a typed NULL.

    Args:
        df: DataFrame that has a ``_rescued_data`` column holding a JSON string
            (or NULL).
        target_schema: Fields to (re)type; only ``name`` and ``dataType`` of
            each field are used.

    Returns:
        DataFrame with every ``target_schema`` column cast to its target type
        and ``_rescued_data`` replaced by a NULL string column. The temporary
        map column is dropped.
    """
    rescued_map = "_rescued_data_modified"
    df = df.withColumn(
        rescued_map,
        from_json(col("_rescued_data"), MapType(StringType(), StringType()))
    )

    # Lower-cased because Spark resolves column names case-insensitively by
    # default; used to decide whether the fallback column can be referenced.
    present = {name.lower() for name in df.columns}

    for field in target_schema:
        column_name = field.name
        data_type = field.dataType

        # Same test as map_contains_key(), expressed with Column functions so
        # the field name is passed as a literal instead of being spliced into a
        # SQL string (which breaks on names containing a quote).
        key_condition = (
            col(rescued_map).isNotNull()
            & array_contains(map_keys(col(rescued_map)), column_name)
        )

        # Referencing a column that does not exist fails analysis, so fall
        # back to a typed NULL when the target column is not in the frame yet.
        if column_name.lower() in present:
            fallback = col(column_name).cast(data_type)
        else:
            fallback = lit(None).cast(data_type)

        # Both branches are already cast, so no second cast pass is needed.
        df = df.withColumn(
            column_name,
            when(
                key_condition,
                col(rescued_map).getItem(column_name).cast(data_type)
            ).otherwise(fallback)
        )
        present.add(column_name.lower())

    df = df.drop(rescued_map)
    # The rescued values have been folded into real columns; clear the source.
    df = df.withColumn("_rescued_data", lit(None).cast(StringType()))
    return df


# ---------- Helper: handle NEW FIELDS from _rescued_data ----------
def process__rescue_data_new_fields(df):
    """Promote keys found only in ``_rescued_data`` to string columns.

    ``_rescued_data`` is parsed as a JSON object into a string-to-string map,
    the distinct keys across all rows are collected to the driver, and one
    string column is added per key that is genuinely new.

    Keys are skipped when they are:

    * ``_file_path``;
    * already a column of ``df`` (compared case-insensitively). Overwriting
      such a column would null it out on every row that has no rescued value
      for it.

    Args:
        df: DataFrame that has a ``_rescued_data`` column holding a JSON string
            (or NULL).

    Returns:
        ``df`` plus one ``StringType`` column per new key, added in sorted key
        order. Rows whose map lacks a key get NULL in that column. The
        temporary map/key columns are dropped; ``_rescued_data`` is left as is.
    """
    # Snapshot before helper columns are added. Lower-cased because Spark
    # resolves column names case-insensitively by default.
    taken = {name.lower() for name in df.columns}

    df = df.withColumn(
        "_rescued_data_json_to_map",
        from_json(col("_rescued_data"), MapType(StringType(), StringType()))
    )

    df = df.withColumn(
        "_rescued_data_map_keys",
        map_keys(col("_rescued_data_json_to_map"))
    )

    # explode_outer yields a NULL key for rows whose _rescued_data is NULL;
    # a NULL is not a usable column name, so drop it before collecting.
    all_keys_df = (
        df.select(explode_outer(col("_rescued_data_map_keys")).alias("key"))
          .where(col("key").isNotNull())
          .distinct()
    )

    # collect() runs a Spark job right here and brings the keys to the driver.
    # NOTE(review): confirm an eager action is acceptable where this is called.
    # Sorted so the order of the added columns is stable between runs.
    new_keys = sorted(row["key"] for row in all_keys_df.collect())

    for key in new_keys:
        if key == "_file_path" or key.lower() in taken:
            continue
        # Also guards against two rescued keys that differ only by case.
        taken.add(key.lower())
        df = df.withColumn(
            key,
            col("_rescued_data_json_to_map").getItem(key).cast(StringType())
        )

    # Helper columns are scratch space only; keep them out of the result.
    return df.drop("_rescued_data_json_to_map", "_rescued_data_map_keys")


In [0]:
# ---------- BRONZE (RESCUE) ----------
pl.create_streaming_table("demo_cust_bronze_sd")

@pl.append_flow(
    target = "demo_cust_bronze_sd",
    name   = "demo_cust_bronze_sd_ingest_flow"
)
def demo_cust_bronze_sd_ingest_flow():
    """Stream the JSON files matched by ``volume_path`` into the bronze table.

    Reads with the ``cloudFiles`` source (JSON, inferred column types, schema
    evolution mode ``rescue``) and appends two audit columns:
    ``ingestion_datetime`` (current timestamp) and ``source_filename``
    (``_metadata.file_path`` of the source file).
    """
    reader_options = {
        "cloudFiles.format": "json",
        "cloudFiles.inferColumnTypes": "true",
        "cloudFiles.schemaEvolutionMode": "rescue",
    }
    raw_stream = (
        spark.readStream
            .format("cloudFiles")
            .options(**reader_options)
            .load(volume_path)
    )

    # Audit columns: when the row was ingested and which file it came from.
    stamped = raw_stream.withColumn("ingestion_datetime", current_timestamp())
    return stamped.withColumn("source_filename", col("_metadata.file_path"))


In [0]:
from pyspark.sql.types import StructType, StructField, DateType

# Target types for the silver table: each field here is re-cast by the
# datatype-change helper (nullable DATE for signupDate).
updated_datatypes = StructType().add("signupDate", DateType(), True)

@pl.table(
    name="demo_cust_silver_sd"
)
def demo_cust_silver_sd():
    """Build the silver table from bronze, folding rescued data back in.

    Steps:
      1. Batch-read ``demo_cust_bronze_sd`` and keep rows with a ``CustomerID``.
      2. Add columns for fields that only exist inside ``_rescued_data``.
      3. Re-type the columns listed in ``updated_datatypes``, preferring the
         rescued value where one exists.
      4. Keep only rows whose ``_rescued_data`` is NULL after processing.
    """
    # Batch read from Bronze streaming table
    df = spark.read.table("demo_cust_bronze_sd")

    # (Optional) mimic the professor's expectations with filters.
    # Only the CustomerID rule is applied up front: filtering on
    # `_rescued_data IS NULL` here would discard exactly the rows the two
    # helpers below are meant to repair, turning them into no-ops.
    df = df.filter("CustomerID IS NOT NULL")

    # Add new columns and fix datatypes from _rescued_data
    df = process__rescue_data_new_fields(df)
    df = process__rescue_data_datatype_change(
        df,
        updated_datatypes
    )

    # Checked after processing: the datatype helper clears `_rescued_data`,
    # so this acts as a post-condition rather than dropping drifted rows.
    return df.filter("_rescued_data IS NULL")