In [0]:
from pyspark import pipelines as pl
from pyspark.sql import functions as F
from pyspark.sql.types import (
    MapType, StringType, DateType, IntegerType, StructType, StructField
)

# Glob selecting every JSON file in the customer landing folder of the volume.
volume_path = "/Volumes/workspace/new_schema_drift/json_new/Customer_New_Json/*.json"

# Fixed read schema for the bronze layer: each known column is a nullable
# string, so no typing (not even for SignupDate) happens at ingestion time.
base_schema = StructType([
    StructField(column_name, StringType(), True)
    for column_name in (
        "CustomerID",
        "FullName",
        "Email",
        "PhoneNumber",
        "City",
        "SignupDate",
    )
])

# Target types for the silver layer; passed to
# process__rescue_data_datatype_change by the silver flow.
updated_datatypes = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("SignupDate", DateType()),
        ("Age", IntegerType()),
    )
])

# BRONZE TABLE
# Streaming table that receives the customer records as read from the files.
pl.create_streaming_table("workspace.new_schema_drift.demo_cust_bronze_rescue")

@pl.append_flow(
    target="workspace.new_schema_drift.demo_cust_bronze_rescue",
    name="demo_cust_bronze_rescue_ingest_flow"
)
def demo_cust_bronze_rescue_ingest_flow():
    """Stream the customer JSON files into the bronze table via Auto Loader.

    The files matched by ``volume_path`` are read with the fixed
    ``base_schema`` (all strings), column-type inference switched off and
    schema evolution mode ``rescue``. Two audit columns are appended:
    ``ingestion_datetime`` (current timestamp) and ``source_filename``
    (taken from ``_metadata.file_path``).

    ``spark`` is not defined in this file; it is presumably provided by the
    pipeline runtime.

    Returns:
        A streaming DataFrame appended to ``demo_cust_bronze_rescue``.
    """
    return (
        spark.readStream
            .format("cloudFiles")
            .option("cloudFiles.format", "json")
            .option("cloudFiles.inferColumnTypes", "false")
            .option("cloudFiles.schemaEvolutionMode", "rescue")
            # A schema is supplied explicitly, so name the rescued-data column
            # explicitly as well: the silver flow parses `_rescued_data` and
            # must find the column under exactly this name.
            .option("rescuedDataColumn", "_rescued_data")
            .schema(base_schema)
            .load(volume_path)
            .withColumn("ingestion_datetime", F.current_timestamp())
            .withColumn("source_filename", F.col("_metadata.file_path"))
    )

def process__rescue_data_datatype_change(df, target_schema):
    """Cast columns to their target types, preferring rescued values.

    ``_rescued_data`` is parsed as a JSON object into a string-to-string map.
    For every field of ``target_schema``:

    * if the map holds a non-null entry for the field, that entry cast to the
      field's type becomes the column value;
    * otherwise the existing column (when ``df`` has one) is cast to the
      field's type;
    * if ``df`` has no such column at all, the column is created from the
      rescued entry alone (null where nothing was rescued) instead of failing
      on an unresolved column reference.

    Args:
        df: DataFrame containing a ``_rescued_data`` column (expected to hold
            a JSON object string).
        target_schema: ``StructType`` whose fields give the column names and
            the data types to cast to.

    Returns:
        The DataFrame with one column per target field in the requested type;
        the temporary ``_rescued_map`` helper column is removed.
    """
    rescued_map_name = "_rescued_map"
    df = df.withColumn(
        rescued_map_name,
        F.from_json(F.col("_rescued_data"), MapType(StringType(), StringType()))
    )

    for field in target_schema.fields:
        rescued_value = F.col(rescued_map_name).getItem(field.name)
        rescued_typed = rescued_value.cast(field.dataType)

        if field.name in df.columns:
            typed_value = F.when(
                rescued_value.isNotNull(),
                rescued_typed
            ).otherwise(F.col(field.name).cast(field.dataType))
        else:
            # No regular column to fall back on: referencing it would raise,
            # so the rescued entry is the only possible source.
            typed_value = rescued_typed

        df = df.withColumn(field.name, typed_value)

    return df.drop(rescued_map_name)


def process__rescue_data_new_fields(df, fields=None):
    """Surface fields found only in ``_rescued_data`` as regular columns.

    ``_rescued_data`` is parsed as a JSON object into a string-to-string map.
    Each requested field that is not already a column of ``df`` is added,
    filled from the map entry of the same name (null when the entry is
    absent). Columns that already exist are left untouched. Added columns are
    strings, because the map's value type is string.

    Args:
        df: DataFrame containing a ``_rescued_data`` column (expected to hold
            a JSON object string).
        fields: Optional iterable of field names to extract. Defaults to
            ``("Age", "Gender", "LoyaltyStatus")``.

    Returns:
        The DataFrame with the missing fields added; the temporary
        ``_rescued_json`` helper column is removed.
    """
    if fields is None:
        fields = ("Age", "Gender", "LoyaltyStatus")

    df = df.withColumn(
        "_rescued_json",
        F.from_json(F.col("_rescued_data"), MapType(StringType(), StringType()))
    )

    for field_name in fields:
        if field_name not in df.columns:
            df = df.withColumn(
                field_name,
                F.col("_rescued_json").getItem(field_name)
            )

    return df.drop("_rescued_json")


# Silver table for the rescue-mode pipeline.
# Only a missing CustomerID removes a row. A populated `_rescued_data` is
# tracked as a (non-dropping) expectation: those are precisely the rows whose
# extra/retyped values the flow below recovers, so dropping them would discard
# everything the recovery step produced.
pl.create_streaming_table(
    name="workspace.new_schema_drift.demo_cust_silver_rescue",
    expect_all={
        "rescued_is_null": "_rescued_data IS NULL"
    },
    expect_all_or_drop={
        "valid_id": "CustomerID IS NOT NULL"
    }
)

@pl.append_flow(
    target="workspace.new_schema_drift.demo_cust_silver_rescue",
    name="demo_cust_silver_rescue_clean_flow"
)
def demo_cust_silver_rescue_clean_flow():
    """Build the silver stream from the rescue-mode bronze table.

    Reads the bronze table as a stream, materialises the fields that exist
    only in ``_rescued_data`` (``process__rescue_data_new_fields``) and then
    casts the columns listed in ``updated_datatypes`` to their target types
    (``process__rescue_data_datatype_change``).

    ``spark`` is not defined in this file; it is presumably provided by the
    pipeline runtime.

    Returns:
        A streaming DataFrame appended to ``demo_cust_silver_rescue``.
    """
    df = spark.readStream.table("workspace.new_schema_drift.demo_cust_bronze_rescue")
    # New fields first, so the type-casting step finds them as columns.
    df = process__rescue_data_new_fields(df)
    df = process__rescue_data_datatype_change(df, updated_datatypes)
    return df


In [0]:
from pyspark import pipelines as pl
from pyspark.sql import functions as F
from pyspark.sql.types import *

# Glob selecting every JSON file in the customer landing folder of the volume.
volume_path = "/Volumes/workspace/new_schema_drift/json_new/Customer_New_Json/*.json"

# ======================================================
# BRONZE - addNewColumns MODE
# ======================================================
pl.create_streaming_table("demo_cust_bronze_addnew")

@pl.append_flow(
    target="demo_cust_bronze_addnew",
    name="demo_cust_bronze_addnew_ingest_flow",
)
def demo_cust_bronze_addnew_ingest_flow():
    """Stream the customer JSON files into the addNewColumns bronze table.

    No schema is supplied: the ``cloudFiles`` reader is configured with
    column-type inference enabled and schema evolution mode
    ``addNewColumns``. Two audit columns are appended:
    ``ingestion_datetime`` (current timestamp) and ``source_filename``
    (taken from ``_metadata.file_path``).

    ``spark`` is not defined in this file; it is presumably provided by the
    pipeline runtime.

    Returns:
        A streaming DataFrame appended to ``demo_cust_bronze_addnew``.
    """
    reader = spark.readStream.format("cloudFiles")
    reader_options = (
        ("cloudFiles.format", "json"),
        ("cloudFiles.inferColumnTypes", "true"),
        ("cloudFiles.schemaEvolutionMode", "addNewColumns"),
    )
    for option_key, option_value in reader_options:
        reader = reader.option(option_key, option_value)

    raw_stream = reader.load(volume_path)
    with_ingest_time = raw_stream.withColumn(
        "ingestion_datetime", F.current_timestamp()
    )
    return with_ingest_time.withColumn(
        "source_filename", F.col("_metadata.file_path")
    )

# ======================================================
# SILVER - addNewColumns MODE
# ======================================================
# Rows without a CustomerID violate the only expectation and are dropped.
pl.create_streaming_table(
    name="demo_cust_silver_addnew",
    expect_all_or_drop={"valid_id": "CustomerID IS NOT NULL"},
)

@pl.append_flow(
    target="demo_cust_silver_addnew",
    name="demo_cust_silver_addnew_clean_flow",
)
def demo_cust_silver_addnew_clean_flow():
    """Build the silver stream from the addNewColumns bronze table.

    Reads ``demo_cust_bronze_addnew`` as a stream and converts
    ``SignupDate`` to a date with ``to_date``; every other column passes
    through unchanged.

    ``spark`` is not defined in this file; it is presumably provided by the
    pipeline runtime.

    Returns:
        A streaming DataFrame appended to ``demo_cust_silver_addnew``.
    """
    # IMPORTANT: read the bronze table using the same name it was created with.
    bronze_stream = spark.readStream.table("demo_cust_bronze_addnew")

    # Simple datatype cleanup.
    return bronze_stream.withColumn("SignupDate", F.to_date("SignupDate"))
