# Implementing Incremental Updates


In [0]:

#display(dbutils.fs.ls("abfss://etl1@dbstoragebbpbs73u57xmm.dfs.core.windows.net/VideoStreamingData/"))

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, current_timestamp, lit, concat, when
from delta.tables import DeltaTable
import datetime

# Every run writes to its own Delta location, suffixed with the run's start time,
# so repeated executions never collide with an earlier table.
import datetime
run_started_at = datetime.datetime.now()
timestamp = format(run_started_at, "%Y%m%d_%H%M%S")
delta_table_path = f"abfss://etl1@dbstoragebbpbs73u57xmm.dfs.core.windows.net/VideoStreamingData/delta_events_{timestamp}"
print(f"Using new Delta table path: {delta_table_path}")

# Create initial data instead of reading existing data
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType
from pyspark.sql.functions import current_timestamp

# Demo schema, declared as (column name, Spark type) pairs in table order.
# Every column is created nullable.
_event_columns = [
    ("event_id", StringType()),
    ("user_id", StringType()),
    ("content_id", StringType()),
    ("timestamp", TimestampType()),
    ("duration_seconds", IntegerType()),
    ("device_type", StringType()),
    ("quality", StringType()),
    ("buffering_count", IntegerType()),
    ("error_type", StringType()),
]
schema = StructType(
    [StructField(column_name, column_type, True) for column_name, column_type in _event_columns]
)

# Five hand-written demo rows matching `schema`. The timestamp of each row is
# taken at the moment that row's tuple is built.
_now = datetime.datetime.now
data = [
    ("EVT10001", "USR10123", "CON10456", _now(), 3600, "TV", "HD", 0, None),
    ("EVT10002", "USR10124", "CON10457", _now(), 1800, "Mobile", "SD", 2, None),
    ("EVT10003", "USR10125", "CON10458", _now(), 2400, "Web", "4K", 1, None),
    ("EVT10004", "USR10126", "CON10459", _now(), 5400, "TV", "4K", 0, None),
    ("EVT10005", "USR10127", "CON10460", _now(), 1200, "Tablet", "HD", 3, "network_error"),
]

# Baseline DataFrame that later cells reuse to reset the table
existing_df = spark.createDataFrame(data=data, schema=schema)
print(f"Created sample data with {existing_df.count()} records")

# Initialise the Delta table at the run-specific path with the baseline rows
existing_df.write.save(delta_table_path, format="delta", mode="overwrite")
print("Created initial Delta table with sample data")


# Read the table back from storage so the schema shown is the one Delta
# actually persisted (the one later writes have to match).
existing_delta = spark.read.load(delta_table_path, format="delta")
print("Existing Delta table schema:")
existing_delta.printSchema()

# Preview up to five stored rows
print("Sample of existing data:")
sample_preview = existing_delta.limit(5)
sample_preview.display()



#### METHOD 1: APPEND

In [0]:

# METHOD 1 demo: append two re-labelled copies of existing rows to the Delta
# table and report how much the row count grew.

# Row count before the append, used below to report how many rows were added
initial_count = spark.read.format("delta").load(delta_table_path).count()
print(f"Initial record count: {initial_count}")

# Borrow two existing rows as templates for the "new" records
sample_data = existing_delta.limit(2)
display(sample_data)

# Prefix event_id so the appended rows are distinguishable from their originals
new_records_df = sample_data.withColumn("event_id", concat(lit("APPEND_"), col("event_id")))
print("New records to append:")
display(new_records_df)

# Append new data (existing rows are left untouched)
new_records_df.write.format("delta").mode("append").save(delta_table_path)

# Get updated count
updated_count = spark.read.format("delta").load(delta_table_path).count()
print(f"Updated record count: {updated_count}")
print(f"Added {updated_count - initial_count} records")


In [0]:
# Show the current contents of the Delta table
spark.read.load(delta_table_path, format="delta").display()


#### METHOD 2: MERGE (UPSERT)

In [0]:
# Reset the table to the baseline rows for this method.
# overwriteSchema is set so the reset also restores the baseline schema: a plain
# overwrite would keep any extra columns a previous cell added to the table.
(
    existing_df.write.format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save(delta_table_path)
)
print("Created initial Delta table with sample data")
existing_df.display()

In [0]:
# METHOD 2 demo: build a two-row source (a modified copy of one existing row
# plus the same row under a brand-new event_id) and MERGE it into the table,
# so one row is updated and one row is inserted.

# Read the table as it is now instead of reusing a DataFrame handle created in
# an earlier cell, so this cell can be re-run on its own.
current_df = spark.read.format("delta").load(delta_table_path)

# Pick the row to update by key. limit() without an ordering does not promise
# which row comes back, and every display()/collect()/merge re-evaluates the
# lazy plan, so the id is resolved once and the row is then selected by that id.
first_row = current_df.orderBy("event_id").select("event_id").first()
if first_row is None:
    raise ValueError(f"Delta table at {delta_table_path} is empty; nothing to update")
update_id = first_row[0]

# Get a record to update and a record to insert
record_to_update = current_df.filter(current_df["event_id"] == update_id)
record_to_insert = record_to_update.withColumn("event_id", lit("MERGE_NEW_RECORD"))

record_to_update.display()
record_to_insert.display()

print(f"Event ID to update: {update_id}")

# Modify the record to update
update_df = (
    record_to_update
    .withColumn("duration_seconds", lit(9999))
    .withColumn("quality", lit("SUPER-HD"))
)

# Combine for the merge operation; match columns by name rather than position
incremental_df = update_df.unionByName(record_to_insert)
print("Records for merge operation:")
incremental_df.display()

# Perform MERGE operation
delta_table = DeltaTable.forPath(spark, delta_table_path)

# Only columns present on both sides are updated; the join key itself is
# never rewritten.
target_columns = set(delta_table.toDF().columns)
source_columns = set(incremental_df.columns)
common_updatable_cols = (target_columns & source_columns) - {"event_id"}

# Build the update dictionary dynamically based on common columns
update_dict = {col_name: f"source.{col_name}" for col_name in common_updatable_cols}

# Matched event_ids are updated in place; unmatched source rows are inserted
delta_table.alias("target").merge(
    incremental_df.alias("source"),
    "target.event_id = source.event_id",
).whenMatchedUpdate(
    set=update_dict
).whenNotMatchedInsertAll().execute()

In [0]:
# Show the current contents of the Delta table
merged_table_reader = spark.read.format("delta")
merged_table_reader.load(delta_table_path).display()

#### METHOD 3: SCHEMA EVOLUTION

In [0]:
# Reset the table to the baseline rows AND the baseline schema for this method.
# Without overwriteSchema, re-running this cell after the evolution step below
# would leave the added column on the table, so there would be nothing to evolve.
(
    existing_df.write.format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save(delta_table_path)
)

# View the table's current schema (read back from storage, not from existing_df)
spark.read.format("delta").load(delta_table_path).printSchema()

In [0]:
# METHOD 3 demo: append rows that carry a column the table does not have yet.

# Take two rows from the current table as templates
base_records = spark.read.load(delta_table_path, format="delta").limit(2)

# Add the new user_rating column and re-label event_id so the evolved rows
# can be told apart from their originals
evolved_df = (
    base_records
    .withColumn("user_rating", lit(4.5))
    .withColumn("event_id", concat(lit("EVOLVED_"), col("event_id")))
)

print("Records with new schema:")
evolved_df.printSchema()
evolved_df.display()

# mergeSchema lets the append add the extra column to the table's schema
evolved_writer = evolved_df.write.format("delta").mode("append").option("mergeSchema", "true")
evolved_writer.save(delta_table_path)

In [0]:
# Show the current contents of the Delta table
evolved_table_df = spark.read.format("delta").load(delta_table_path)
evolved_table_df.display()