In [1]:
import os
import random

from faker import Faker

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, year, month, dayofmonth

### Spark Setup

In [3]:
# Maven coordinates handed to Spark through `spark.jars.packages`.
# Spark expects a single comma-separated string, so the list is joined below.
maven_coordinates = [
    # Delta Lake
    "io.delta:delta-spark_2.12:3.2.0",
    "io.delta:delta-hive_2.12:3.2.0",
    # Hive
    "org.apache.hive:hive-metastore:3.1.3",
    "org.apache.hive:hive-exec:3.1.3",
    # S3A filesystem support
    "org.apache.hadoop:hadoop-aws:3.3.4",
    "com.amazonaws:aws-java-sdk-bundle:1.12.262",
]
spark_jar_packages = ",".join(maven_coordinates)

In [4]:
# MinIO credentials are read from the environment so real secrets never have
# to be committed with the notebook. The fallbacks are the values that were
# previously hard-coded here, so a local playground keeps working unchanged.
minio_access_key = os.environ.get("MINIO_ROOT_USER", "minioadmin")
minio_secret_key = os.environ.get("MINIO_ROOT_PASSWORD", "minioadmin")

# Build (or reuse, via getOrCreate) the session used by every cell below.
spark = (
    SparkSession.builder
    .appName("delta-hive-playground")
    # Extra jars resolved at session start-up.
    .config("spark.jars.packages", spark_jar_packages)

    # Delta-Hive Integration
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.sql.catalog.spark_catalog.type", "hive")
    .config("spark.sql.catalogImplementation", "hive")
    # NOTE(review): 10000 is the usual HiveServer2 port; a standalone Hive
    # metastore normally listens on 9083 — confirm the local port mapping.
    .config("hive.metastore.uris", "thrift://localhost:10000")

    # S3 (MinIO Integration)
    .config("spark.hadoop.fs.s3a.endpoint", "http://localhost:9000")
    .config("spark.hadoop.fs.s3a.access.key", minio_access_key)
    .config("spark.hadoop.fs.s3a.secret.key", minio_secret_key)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
    .config("spark.hadoop.fs.s3a.region", "us-east-1")

    .getOrCreate()
)

25/01/10 16:38:01 WARN Utils: Your hostname, baptvit resolves to a loopback address: 127.0.1.1; using 192.168.2.129 instead (on interface wlp4s0)
25/01/10 16:38:01 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Ivy Default Cache set to: /home/baptvit/.ivy2/cache
The jars for the packages stored in: /home/baptvit/.ivy2/jars
io.delta#delta-spark_2.12 added as a dependency
io.delta#delta-hive_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-a2b809c4-813d-4f6b-be8d-5ff21090b631;1.0
	confs: [default]


:: loading settings :: url = jar:file:/opt/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


	found io.delta#delta-spark_2.12;3.2.0 in central
	found io.delta#delta-storage;3.2.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in local-m2-cache
	found io.delta#delta-hive_2.12;3.2.0 in central
	found io.delta#delta-standalone_2.12;3.2.0 in central
	found com.chuusai#shapeless_2.12;2.3.4 in central
	found org.scala-lang.modules#scala-collection-compat_2.12;2.4.3 in central
:: resolution report :: resolve 150ms :: artifacts dl 9ms
	:: modules in use:
	com.chuusai#shapeless_2.12;2.3.4 from central in [default]
	io.delta#delta-hive_2.12;3.2.0 from central in [default]
	io.delta#delta-spark_2.12;3.2.0 from central in [default]
	io.delta#delta-standalone_2.12;3.2.0 from central in [default]
	io.delta#delta-storage;3.2.0 from central in [default]
	org.antlr#antlr4-runtime;4.9.3 from local-m2-cache in [default]
	org.scala-lang.modules#scala-collection-compat_2.12;2.4.3 from central in [default]
	---------------------------------------------------------------------
	|                  

### Dataset Generation

In [5]:
def generate_entry(faker: Faker, country_codes: list):
    """Build one fake account record as a plain dict.

    Args:
        faker: Faker instance used for every generated field except
            ``country_code``.
        country_codes: candidate codes; one is picked with ``random.choice``
            (Python's global RNG, not the Faker instance).

    Returns:
        dict with the keys ``id``, ``name``, ``email``, ``passport``,
        ``country_code``, ``iban``, ``swift`` and ``created_at``, where
        ``created_at`` is a ``'%Y-%m-%d'`` string taken from
        ``faker.past_date(start_date='-90d')``.
    """
    # Fields are filled in a fixed order so the sequence of generator calls
    # (and therefore the values produced for a given seed) stays stable.
    record = {}
    record["id"] = faker.unique.uuid4()
    record["name"] = faker.name()
    record["email"] = faker.email()
    record["passport"] = faker.passport_number()
    record["country_code"] = random.choice(country_codes)
    record["iban"] = faker.iban()
    record["swift"] = faker.swift11()

    created_on = faker.past_date(start_date='-90d')
    record["created_at"] = created_on.strftime('%Y-%m-%d')
    return record

In [6]:
def generate_dataset(num: int, seed: int):
    """Generate ``num`` fake account records reproducibly from ``seed``.

    Args:
        num: number of records to generate.
        seed: seed applied to Faker's shared generator and to Python's global
            ``random`` module.

    Returns:
        list of ``num`` dicts, each produced by ``generate_entry``.

    Note:
        Re-seeds the global ``random`` module as a side effect.
    """
    # NOTE(review): 'GE' is the ISO 3166-1 code for Georgia and 'UK' is not an
    # ISO alpha-2 code ('GB' is) — confirm these are the intended values.
    country_codes = ['US', 'CA', 'JP', 'KR', 'FR', 'GE', 'UK', 'BR', 'AR']
    Faker.seed(seed)
    # generate_entry picks country_code with random.choice, which draws from
    # the global `random` module. Faker.seed does not seed that module, so it
    # is seeded here too; otherwise country_code differs on every run.
    random.seed(seed)
    faker = Faker()
    return [generate_entry(faker, country_codes) for _ in range(num)]

In [7]:
# Initial batch of fake accounts; the fixed seed keeps the notebook repeatable.
dataset = generate_dataset(seed=739, num=5)

In [8]:
# Turn the generated records into a DataFrame and derive year / month / day
# from `created_at`; year and month are used as partition columns when the
# table is written.
df = (
    spark.createDataFrame(dataset)
    .withColumn("year", year(col("created_at")))
    .withColumn("month", month(col("created_at")))
    .withColumn("day", dayofmonth(col("created_at")))
)

In [9]:
# Peek at a single row to sanity-check the schema and derived columns.
df.show(n=1)

+------------+----------+--------------------+--------------------+--------------------+-----------+---------+-----------+----+-----+---+
|country_code|created_at|               email|                iban|                  id|       name| passport|      swift|year|month|day|
+------------+----------+--------------------+--------------------+--------------------+-----------+---------+-----------+----+-----+---+
|          JP|2024-10-17|powelljason@examp...|GB77AKMZ560580635...|5a424412-b127-4f8...|Cody Taylor|895549199|INSEGB5PR6S|2024|   10| 17|
+------------+----------+--------------------+--------------------+--------------------+-----------+---------+-----------+----+-----+---+
only showing top 1 row



### Delta-Hive Integration

In [10]:
# Create the `deltalake_raw` database backed by a local-filesystem location.
# IF NOT EXISTS makes the statement a no-op when the database already exists.
# NOTE(review): the location is an absolute path on one developer's machine —
# adjust it before running elsewhere.
spark.sql("""
    CREATE DATABASE IF NOT EXISTS deltalake_raw
    LOCATION 'file:///home/baptvit/Documents/github/lakehouse-labs'
""")

DataFrame[]

In [None]:
# Alternative to the local-filesystem cell above: create `deltalake_raw` on
# the S3A (MinIO) bucket instead.
# Both cells use IF NOT EXISTS with the same database name, so only the first
# one executed takes effect; run one or the other, not both.
spark.sql("""
    CREATE DATABASE IF NOT EXISTS deltalake_raw
    LOCATION 's3a://lakehouse-raw/delta/'
""")

In [None]:
# Parquet variant of the table write: replaces `deltalake_raw.accounts`,
# partitioned by year and month.
# NOTE(review): `overwriteSchema` and `delta.enableChangeDataFeed` look like
# Delta-specific options and are presumably ignored by the parquet writer —
# confirm. This cell targets the same table name as the Delta write cell
# that follows it.
(
    df.write.format("parquet")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .option("delta.enableChangeDataFeed", "true")
    .partitionBy("year", "month")
    .saveAsTable("deltalake_raw.accounts")
)

In [11]:
# Write the accounts as a Delta table, replacing any previous contents and
# schema, partitioned by year and month. The change data feed option is set
# here because the "Delta Metadata (Changes)" section queries table_changes
# on this table.
(
    df.write.format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .option("delta.enableChangeDataFeed", "true")
    .partitionBy("year", "month")
    .saveAsTable("deltalake_raw.accounts")
)

                                                                                

### Upsert Dataset

In [None]:
# Records to upsert: a few rows that were already written to the table plus
# some freshly generated ones.
# Positions in `dataset` to re-send as "existing" rows. Positions beyond the
# end of `dataset` are skipped instead of raising IndexError (the dataset
# above is generated with only a handful of rows).
existing_indices = (2, 4, 7, 11)
entries = [
    # Existing entries — copied so that editing `entries` afterwards does not
    # silently change the dicts held by `dataset`.
    *(dict(dataset[i]) for i in existing_indices if i < len(dataset)),
    # New entries
    *generate_dataset(4, seed=1037),
]

In [None]:
# Give every upsert record a new email derived from its name
# (lower-cased, spaces turned into dots), so the change is visible after the merge.
for entry in entries:
    username = ".".join(entry['name'].lower().split(" "))
    entry.update(email=f"{username}@domain.com")

In [None]:
# DataFrame of the upsert records, with the same derived year / month / day
# columns as the base DataFrame.
upsert_df = (
    spark.createDataFrame(entries)
    .withColumn("year", year(col("created_at")))
    .withColumn("month", month(col("created_at")))
    .withColumn("day", dayofmonth(col("created_at")))
)

In [None]:
# Show up to eight upsert rows with full (untruncated) column values.
upsert_df.show(n=8, truncate=False)

In [None]:
# Register the upsert rows as the temp view `upsert_data` so the SQL MERGE
# statement can reference them by name.
upsert_df.createOrReplaceTempView("upsert_data")

### Upsert Strategy

In [None]:
# Upsert `upsert_data` into `deltalake_raw.accounts`, matching rows on `id`.
# - Matched rows: country_code, email, name, iban, swift and passport are
#   overwritten; created_at and the year / month / day columns are left as-is.
# - Unmatched rows: inserted with all source columns (INSERT *).
spark.sql("""
    MERGE INTO deltalake_raw.accounts AS target
    USING upsert_data AS source ON 
        target.id = source.id
    WHEN MATCHED THEN UPDATE SET
        target.country_code = source.country_code,
        target.email = source.email,
        target.name = source.name,
        target.iban = source.iban,
        target.swift = source.swift,
        target.passport = source.passport
    WHEN NOT MATCHED THEN INSERT *
""")

### Delta Metadata (Changes)

In [12]:
# Read the change feed of `deltalake_raw.accounts` starting at commit
# version 0, keeping only inserted rows and the post-update image of updated
# rows. The result carries the extra _change_type, _commit_version and
# _commit_timestamp columns shown in the output.
spark.sql("""
    select 
        *
    from
        table_changes('deltalake_raw.accounts', 0)
    where
        _change_type in ('insert', 'update_postimage')
""").show(truncate=False)

+------------+----------+--------------------------+----------------------+------------------------------------+----------------+---------+-----------+----+-----+---+------------+---------------+-----------------------+
|country_code|created_at|email                     |iban                  |id                                  |name            |passport |swift      |year|month|day|_change_type|_commit_version|_commit_timestamp      |
+------------+----------+--------------------------+----------------------+------------------------------------+----------------+---------+-----------+----+-----+---+------------+---------------+-----------------------+
|GE          |2024-12-05|harrisondeanna@example.com|GB96EZYO16306776005871|bef2df38-4a7a-4b73-904d-c949dc8140c6|Destiny Jimenez |F75210547|WSHOGBQ55I9|2024|12   |5  |insert      |0              |2025-01-10 16:38:18.768|
|JP          |2024-12-05|brightthomas@example.com  |GB56NVYS60885944026604|189b84f0-9527-45e4-a64b-c3f710a24fb9|Brittany

### Delta Time Travel (by version)

In [13]:
# Time travel: read one account as it was at table version 0 (the initial
# write).
# NOTE(review): the id is hard-coded from a previous run — confirm it still
# exists in the generated dataset.
spark.sql("""
    select 
        *
    from
        deltalake_raw.accounts version as of 0
    where
        id = '0daad7bc-25b6-4469-8a2f-2ba767f86791'
""").show(truncate=False)

25/01/10 16:38:46 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+------------+----------+---------------------+----------------------+------------------------------------+----------------+---------+-----------+----+-----+---+
|country_code|created_at|email                |iban                  |id                                  |name            |passport |swift      |year|month|day|
+------------+----------+---------------------+----------------------+------------------------------------+----------------+---------+-----------+----+-----+---+
|FR          |2024-12-06|derrick15@example.com|GB14AYNQ55188150393152|0daad7bc-25b6-4469-8a2f-2ba767f86791|Cassidy Jones MD|595954695|VTHYGBZMNOI|2024|12   |6  |
+------------+----------+---------------------+----------------------+------------------------------------+----------------+---------+-----------+----+-----+---+



In [14]:
# Time travel: read the same account at table version 1 for comparison with
# version 0.
# Version 1 only exists after a second commit to the table — presumably the
# MERGE in the "Upsert Strategy" section. The recorded output shows an
# AnalysisException because only version 0 was available when this was run.
spark.sql("""
    select 
        *
    from
        deltalake_raw.accounts version as of 1
    where
        id = '0daad7bc-25b6-4469-8a2f-2ba767f86791'
""").show(truncate=False)

AnalysisException: Cannot time travel Delta table to version 1. Available versions: [0, 0].