In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType
from pyspark.sql.functions import col
from delta.tables import DeltaTable
from datetime import date

# Incoming change set: one (customerId, address, effectiveDate) tuple per customer.
# Every sample change shares the same effective date, so only id and address are listed.
updates_data = [
    (customer_id, street, date(2023, 6, 17))
    for customer_id, street in (
        (1, 'kumar street'),
        (3, 'porur street'),
        (6, 'thuvarnkuruchi street'),
        (7, 'thiruvanmiyur street'),
    )
]

# Schema of the change set; the third argument (nullable) is False for every column
updates_schema = (
    StructType()
    .add("customerId", IntegerType(), False)
    .add("address", StringType(), False)
    .add("effectiveDate", DateType(), False)
)

# Materialise the change set as a Spark DataFrame with the explicit schema
updates_df = spark.createDataFrame(updates_data, schema=updates_schema)

# Name of the Delta table that stores the customer records
delta_table_name = "customers_scd1"

# First run only: seed the table when the catalog has no table under this name
if not spark.catalog.tableExists(delta_table_name):
    # updates_df already carries the target columns (customerId, address,
    # effectiveDate), so it is written as-is. Collecting the rows to the driver
    # and rebuilding a DataFrame from a second copy of the schema would produce
    # the same table while pulling all data through the driver.
    # mode("overwrite") is kept from the original; on this branch the table was
    # just reported as missing, so the write creates it.
    updates_df.write.format("delta").mode("overwrite").saveAsTable(delta_table_name)

# Handle to the existing Delta table; the merge below is issued through it
customersTable = DeltaTable.forName(spark, delta_table_name)

# Column assignments for the SCD Type 1 upsert.
# Matched rows get their address and effectiveDate replaced by the incoming values.
matched_assignments = {
    "address": col("updates.address"),
    "effectiveDate": col("updates.effectiveDate"),
}
# Unmatched incoming rows are inserted with all three columns taken from the update.
insert_assignments = {
    "customerId": col("updates.customerId"),
    "address": col("updates.address"),
    "effectiveDate": col("updates.effectiveDate"),
}

# Join target ("customers") and source ("updates") on customerId
upsert = customersTable.alias("customers").merge(
    updates_df.alias("updates"),
    "customers.customerId = updates.customerId",
)
upsert = upsert.whenMatchedUpdate(set=matched_assignments)
upsert = upsert.whenNotMatchedInsert(values=insert_assignments)
upsert.execute()

# Show the table contents after the merge has been applied
display(customersTable.toDF())

In [0]:
%sql
-- Read back every row of the customer table to inspect the result.
-- NOTE(review): this uses the fully qualified name workspace.default.customers_scd1,
-- while the Python cell refers to the unqualified customers_scd1; confirm both
-- resolve to the same table in the session this notebook runs in.
SELECT *
FROM workspace.default.customers_scd1