In [0]:
# Define schema for employee data
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType

# Column layout shared by the employee batches built from this schema.
# Only employee_id is declared non-nullable; hire_date is carried as a string
# column here, not as a date.
employee_schema = (
    StructType()
    .add("employee_id", IntegerType(), nullable=False)
    .add("first_name", StringType(), nullable=True)
    .add("last_name", StringType(), nullable=True)
    .add("email", StringType(), nullable=True)
    .add("department", StringType(), nullable=True)
    .add("hire_date", StringType(), nullable=True)
)

In [0]:
# Seed rows for the first load (DataFrame 1), one tuple per employee.
# Field order: employee_id, first_name, last_name, email, department, hire_date.
initial_employee_data = [
    (
        101, "Alice", "Smith",
        "alice.s@example.com", "Sales", "2020-01-15",
    ),
    (
        102, "Bob", "Johnson",
        "bob.j@example.com", "Marketing", "2019-03-22",
    ),
    (
        103, "Charlie", "Brown",
        "charlie.b@example.com", "Engineering", "2021-06-01",
    ),
    (
        104, "Diana", "Prince",
        "diana.p@example.com", "HR", "2018-11-10",
    ),
]

In [0]:
from pyspark.sql.functions import to_date, current_timestamp
# Turn the seed rows into a DataFrame using employee_schema, then append the two
# audit columns, both populated from current_timestamp().
df_initial_employees = (
    spark.createDataFrame(initial_employee_data, schema=employee_schema)
    .withColumn("created_at", current_timestamp())
    .withColumn("updated_at", current_timestamp())
)

print("Initial Employee Data:")
df_initial_employees.display()

In [0]:
# Write the initial DataFrame to a Delta table.
# No "path" option is set here, so saveAsTable registers a managed table. To create
# an external table instead, add .option("path", "<storage location>") before
# saveAsTable.
# mode("overwrite") replaces any existing table contents, so this cell can be re-run.
# The USE statement makes unqualified table names in this session resolve to the
# silver schema; the write itself uses the fully qualified name.
spark.sql("use general_catalog.silver_schema")
(
    df_initial_employees.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("general_catalog.silver_schema.silver_employee")
)
    




In [0]:
%sql
-- Inspect the table right after the initial load. The name is fully qualified so
-- the query does not depend on the session's current catalog and schema.
SELECT * FROM general_catalog.silver_schema.silver_employee

In [0]:
# MAGIC %md
# MAGIC ## 3. Simulate Incremental Data - First Batch (DataFrame 2)
# MAGIC
# MAGIC This batch includes:
# MAGIC - A new employee (ID 105).
# MAGIC - An update to an existing employee (ID 101 - email and department change).
# MAGIC - An update to an existing employee (ID 102 - email change).

# COMMAND ----------

# Incremental employee data (DataFrame 2)

from pyspark.sql.functions import to_date, current_timestamp
# Select the catalog and schema used to resolve unqualified table names.
spark.sql("use general_catalog.silver_schema")

# First incremental batch (DataFrame 2): two rows for existing employee IDs and one new ID.
incremental_employee_data_1 = [
    # Existing employee 101: email and department differ from the initial load.
    (101, "Alice", "Smith", "alice.smith.new@example.com", "Marketing", "2020-01-15"),
    # Existing employee 102: email differs from the initial load.
    (102, "Bob", "Johnson", "bobjohnson@example.com", "Marketing", "2019-03-22"),
    # Employee 105 is not present in the initial load.
    (105, "Emily", "Clark", "emily.c@example.com", "Finance", "2022-09-01"),
]

# NOTE: the DataFrame name is spelled "incrimental"; the merge cell refers to it by
# this exact name, so it is kept as-is here.
df_incrimental_employee_data_1 = (
    spark.createDataFrame(incremental_employee_data_1, schema=employee_schema)
    .withColumn("created_at", current_timestamp())
    .withColumn("updated_at", current_timestamp())
)

df_incrimental_employee_data_1.show()
                           

In [0]:
# Get the DeltaTable object for the target table
from delta.tables import DeltaTable
print("Performing first MERGE operation...")
delta_table = DeltaTable.forName(spark, "general_catalog.silver_schema.silver_employee")

# Attributes copied from the source batch. employee_id is the merge key and is
# therefore only written on insert.
tracked_columns = ["first_name", "last_name", "email", "department", "hire_date"]

# A matched row is rewritten only when at least one tracked attribute differs.
# <=> is null-safe equality, so NULL vs. non-NULL counts as a change while
# NULL vs. NULL does not. Without this guard every matched row would get a new
# updated_at even when nothing changed, and re-running the same batch would
# rewrite rows needlessly.
row_changed = " OR ".join(
    f"NOT (target.{name} <=> source.{name})" for name in tracked_columns
)

# On update: overwrite the tracked attributes and refresh updated_at.
# created_at is deliberately absent so the original creation time is preserved.
update_assignments = {name: f"source.{name}" for name in tracked_columns}
update_assignments["updated_at"] = "source.updated_at"

# On insert: same assignments plus the key; created_at is taken from the
# source's updated_at so both audit columns start out equal.
insert_assignments = {
    "employee_id": "source.employee_id",
    **update_assignments,
    "created_at": "source.updated_at",
}

# Upsert the batch into the target table, matching rows on employee_id.
(
    delta_table.alias("target")
    .merge(
        df_incrimental_employee_data_1.alias("source"),
        "target.employee_id = source.employee_id",
    )
    .whenMatchedUpdate(condition=row_changed, set=update_assignments)
    .whenNotMatchedInsert(values=insert_assignments)
    .execute()
)

print("First MERGE operation completed.")


In [0]:
%sql
-- Show the full table contents after the merge.
SELECT *
FROM general_catalog.silver_schema.silver_employee