In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from delta import *

from delta import configure_spark_with_delta_pip

# Create a Spark session with Delta Lake enabled.
#
# The Delta jar is not pinned by hand here. The previous setting,
# "io.delta:delta-spark_2.12:2.4.0", mixes the artifact name used from Delta 3.0
# onward (delta-spark) with a 2.x version (published as delta-core), so it does
# not resolve. configure_spark_with_delta_pip sets spark.jars.packages to the
# artifact that matches the pip-installed Delta package instead.
builder = (
    SparkSession.builder
    .appName("Merge Practice")
    .master("local[*]")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.sql.adaptive.enabled", "true")
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true")
)

spark = configure_spark_with_delta_pip(builder).getOrCreate()


In [10]:
from delta import configure_spark_with_delta_pip
from pyspark.sql import SparkSession

# Local-mode session builder for the merge exercises. The catalog and extension
# options point Spark SQL at the Delta classes named in the strings below.
# NOTE(review): confirm that the "...defaults.enableColumnDefaults" key is one
# Delta recognises; the CREATE TABLE later in the notebook sets
# 'delta.feature.allowColumnDefaults' on the table explicitly.
builder = (
    SparkSession.builder
    .master("local[*]")
    .appName("Merge Practice")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.databricks.delta.properties.defaults.enableColumnDefaults", "true")
)

# The builder is passed through configure_spark_with_delta_pip before the
# session is obtained.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [11]:
# First batch of source data, exposed as the temporary view `action1`:
# a single 'Customer1' row with insert_dt = current_timestamp() and amount 100.00.
# CREATE OR REPLACE means re-running this cell simply redefines the view.
sql = """
CREATE OR REPLACE TEMPORARY VIEW action1
AS
SELECT 'Customer1' as customer, current_timestamp() as insert_dt, 100.00 as amount
"""
# Execute the SQL command
spark.sql(sql)

DataFrame[]

In [12]:
# Display every column of the action1 view.
spark.table("action1").show()

+---------+--------------------+------+
| customer|           insert_dt|amount|
+---------+--------------------+------+
|Customer1|2025-07-10 15:19:...|100.00|
+---------+--------------------+------+



## Create Table and Merge

In [7]:
# Ask the session which catalog is currently active and print its name, to
# confirm where the tables created below will be registered.
current_catalog = spark.catalog.currentCatalog()
print(f"Current Catalog: {current_catalog}")

Current Catalog: spark_catalog


In [21]:
# Start from a clean slate so the plain CREATE TABLE below succeeds on a re-run.
# The table this notebook creates and merges into is new_table2; new_table
# (presumably an earlier name for it) is still dropped in case it is left over.
spark.sql("drop table if exists new_table")
spark.sql("drop table if exists new_table2")


DataFrame[]

In [25]:
# Create the Delta table that the MERGE statements below write into.
#   - update_dt and updt_cnt declare DEFAULT values (current_timestamp() and 0);
#     the statement also sets 'delta.feature.allowColumnDefaults' to 'supported'.
#   - The line starting with "--" inside the statement is a SQL comment: the
#     avg_updt_time_seconds generated column is currently disabled.
# This is a plain CREATE TABLE (no IF NOT EXISTS / OR REPLACE), so new_table2
# must not already exist when the cell runs -- presumably it raises otherwise.
create_sql = """
CREATE TABLE new_table2 (
    customer STRING,
    insert_dt TIMESTAMP,
    update_dt TIMESTAMP DEFAULT current_timestamp(),
    amount DECIMAL(10, 2),
    updt_cnt INT DEFAULT 0
    --, avg_updt_time_seconds FLOAT GENERATED ALWAYS as (to_unix_timestamp(update_dt) - to_unix_timestamp(insert_dt)) 
) USING DELTA
TBLPROPERTIES('delta.feature.allowColumnDefaults' = 'supported')
"""
spark.sql(create_sql)



DataFrame[]

In [26]:
# Display the freshly created table without truncating column values.
spark.table("new_table2").show(truncate=False)

+--------+---------+---------+------+--------+
|customer|insert_dt|update_dt|amount|updt_cnt|
+--------+---------+---------+------+--------+
+--------+---------+---------+------+--------+



### Merge Command
The `WHEN` clauses of a `MERGE` statement must appear in this order:
1. MATCHED
2. NOT MATCHED BY TARGET
3. NOT MATCHED BY SOURCE 

In [38]:
# Upsert the action1 view into new_table2, matching rows on customer.
#   - Matched rows: set update_dt to current_timestamp(), take the source amount
#     and increment updt_cnt by one.
#   - Source rows with no match in the target: insert customer, insert_dt and
#     amount only. update_dt and updt_cnt are not listed in the INSERT, so they
#     presumably fall back to the column DEFAULTs declared on the table.
# Every execution that finds a match increments updt_cnt, so re-running this
# cell changes the table contents.
merge_sql = """
MERGE INTO new_table2 as t
USING action1 as s
ON t.customer = s.customer
WHEN MATCHED
    THEN UPDATE SET
        update_dt = current_timestamp(),
        t.amount = s.amount,
        t.updt_cnt = t.updt_cnt + 1
WHEN NOT MATCHED BY TARGET
    THEN INSERT (customer, insert_dt, amount)
    VALUES (s.customer, s.insert_dt, s.amount)
"""
spark.sql(merge_sql)

DataFrame[num_affected_rows: bigint, num_updated_rows: bigint, num_deleted_rows: bigint, num_inserted_rows: bigint]

In [39]:
# Display the table after merging action1, without truncating column values.
spark.table("new_table2").show(truncate=False)


+---------+--------------------------+--------------------------+------+--------+
|customer |insert_dt                 |update_dt                 |amount|updt_cnt|
+---------+--------------------------+--------------------------+------+--------+
|Customer1|2025-07-10 15:36:42.171487|2025-07-10 15:43:01.369167|100.00|3       |
+---------+--------------------------+--------------------------+------+--------+



## Action 2

In [32]:
# Second batch of source data, exposed as the temporary view `action2`.
# It holds four rows: one for Customer1, one for Customer2 and two for
# Customer3 (amounts 44.44 and 55.55). UNION ALL keeps both Customer3 rows, so
# the view contains a duplicate value of the key used by the merge's ON clause.
sql = """
CREATE OR REPLACE TEMPORARY VIEW action2
AS
SELECT 'Customer1' as customer, current_timestamp() as insert_dt, 522.00 as amount
UNION ALL
SELECT 'Customer2' as customer, current_timestamp() as insert_dt, 33.22 as amount
UNION ALL
SELECT 'Customer3' as customer, current_timestamp() as insert_dt, 44.44 as amount
UNION ALL
SELECT 'Customer3' as customer, current_timestamp() as insert_dt, 55.55 as amount


"""
# Execute the SQL command
spark.sql(sql)

DataFrame[]

In [40]:
# Upsert the action2 view into new_table2, matching rows on customer.
#   - Matched rows: set update_dt to current_timestamp(), take the source amount
#     and increment updt_cnt by one.
#   - Source rows with no match in the target: insert customer, insert_dt and
#     amount only; update_dt and updt_cnt are left to the table's column DEFAULTs.
# The target is spelled new_table2 (all lower case) and the SET targets carry the
# t. alias, matching the rest of the notebook.
# NOTE(review): action2 holds two Customer3 rows. Neither has a match on the
# first run, so both are inserted. On a later run several source rows would
# match the same target row, which Delta's MERGE rejects -- de-duplicate the
# source view if duplicates are not intended.
merge2_sql = """
MERGE INTO new_table2 as t
USING action2 as s
ON t.customer = s.customer
WHEN MATCHED
    THEN UPDATE SET
        t.update_dt = current_timestamp(),
        t.amount = s.amount,
        t.updt_cnt = t.updt_cnt + 1
WHEN NOT MATCHED BY TARGET
    THEN INSERT (customer, insert_dt, amount)
    VALUES (s.customer, s.insert_dt, s.amount)
"""
spark.sql(merge2_sql)

DataFrame[num_affected_rows: bigint, num_updated_rows: bigint, num_deleted_rows: bigint, num_inserted_rows: bigint]

In [41]:
# Display the table after merging action2, without truncating column values.
spark.table("new_table2").show(truncate=False)


+---------+--------------------------+-------------------------+------+--------+
|customer |insert_dt                 |update_dt                |amount|updt_cnt|
+---------+--------------------------+-------------------------+------+--------+
|Customer1|2025-07-10 15:36:42.171487|2025-07-10 15:44:31.32122|522.00|4       |
|Customer2|2025-07-10 15:44:31.32122 |2025-07-10 15:44:31.32022|33.22 |0       |
|Customer3|2025-07-10 15:44:31.32122 |2025-07-10 15:44:31.32022|44.44 |0       |
|Customer3|2025-07-10 15:44:31.32122 |2025-07-10 15:44:31.32022|55.55 |0       |
+---------+--------------------------+-------------------------+------+--------+



In [None]:
sql = """
CREATE OR REPLACE TEMPORARY VIEW action3
AS
SELECT 'Customer1' as customer
UNION
SELECT 'Customer2' as customer
UNION ALL
SELECT 'Customer3' as customer
UNION ALL
SELECT 'Customer3' as customer