In [1]:
import random
from pathlib import Path

from faker import Faker

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, year, month, dayofmonth

### Spark Setup

In [3]:
# Maven coordinates handed to Spark through `spark.jars.packages`.
# Entries left commented out are optional extras that are currently disabled.
spark_jar_coordinates = [
    "org.apache.hudi:hudi-spark3.3-bundle_2.12:1.0.0",
    "org.apache.hudi:hudi-utilities-slim-bundle_2.12:1.0.0",
    "org.apache.hudi:hudi-datahub-sync-bundle:1.0.0",
    "org.apache.thrift:libthrift:0.14.0",
    # "org.slf4j:slf4j-api:1.7.36",
    # "org.apache.logging.log4j:log4j-slf4j-impl:2.24.3",
    # "org.apache.hive:hive-metastore:3.1.3",
    # "org.apache.hive:hive-exec:3.1.3",
    # "org.apache.hadoop:hadoop-aws:3.3.4",
    # "com.amazonaws:aws-java-sdk-bundle:1.12.262",
]

# The setting expects a single comma-separated string.
spark_jar_packages = ",".join(spark_jar_coordinates)

In [4]:
# Session-level settings, grouped so each concern can be reviewed on its own.
spark_session_conf = {
    # Extra jars resolved from Maven when the session starts.
    "spark.jars.packages": spark_jar_packages,

    # Hudi runtime: Kryo serialization with Hudi's registrar and the Hudi SQL extension.
    "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
    "spark.kryo.registrator": "org.apache.spark.HoodieSparkKryoRegistrar",
    "spark.sql.extensions": "org.apache.spark.sql.hudi.HoodieSparkSessionExtension",

    # Metadata sync: use the DataHub sync tool, emitting to the server on localhost:8080.
    "hoodie.meta.sync.client.tool.class": "org.apache.hudi.sync.datahub.DataHubSyncTool",
    "hoodie.meta.sync.datahub.emitter.server": "http://localhost:8080",
    "hoodie.datasource.meta.sync.enable": "true",
    # "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.hudi.catalog.HoodieCatalog",

    # S3 (MinIO) integration -- currently disabled.
    # "spark.hadoop.fs.s3a.endpoint": "http://localhost:9000",
    # "spark.hadoop.fs.s3a.access.key": "minioadmin",
    # "spark.hadoop.fs.s3a.secret.key": "minioadmin",
    # "spark.hadoop.fs.s3a.path.style.access": "true",
    # "spark.hadoop.fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
    # "spark.hadoop.fs.s3a.aws.credentials.provider": "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider",
    # "spark.hadoop.fs.s3a.region": "us-east-1",
}

# Local session using every available core; settings are applied in the order listed above.
session_builder = SparkSession.builder.master("local[*]").appName("HudiWithDataHub")
for conf_key, conf_value in spark_session_conf.items():
    session_builder = session_builder.config(conf_key, conf_value)

spark = session_builder.getOrCreate()

Ivy Default Cache set to: /home/baptvit/.ivy2/cache
The jars for the packages stored in: /home/baptvit/.ivy2/jars
org.apache.hudi#hudi-spark3.3-bundle_2.12 added as a dependency
org.apache.hudi#hudi-utilities-slim-bundle_2.12 added as a dependency
org.apache.hudi#hudi-datahub-sync-bundle added as a dependency
org.apache.thrift#libthrift added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-b59f8909-3c93-40ed-8c73-edc9b40efb16;1.0
	confs: [default]


25/01/09 19:05:08 WARN Utils: Your hostname, baptvit resolves to a loopback address: 127.0.1.1; using 192.168.2.129 instead (on interface wlp4s0)
25/01/09 19:05:08 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
:: loading settings :: url = jar:file:/opt/spark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


	found org.apache.hudi#hudi-spark3.3-bundle_2.12;1.0.0 in central
	found org.apache.hive#hive-storage-api;2.8.1 in central
	found org.slf4j#slf4j-api;1.7.36 in central
	found org.apache.hudi#hudi-utilities-slim-bundle_2.12;1.0.0 in central
	found org.apache.hudi#hudi-spark-client;1.0.0 in central
	found org.scala-lang.modules#scala-collection-compat_2.12;2.8.1 in central
	found org.apache.hudi#hudi-client-common;1.0.0 in central
	found org.apache.hudi#hudi-common;1.0.0 in central
	found org.apache.hudi#hudi-io;1.0.0 in central
	found com.google.protobuf#protobuf-java;3.25.5 in central
	found io.airlift#aircompressor;0.27 in central
	found org.openjdk.jol#jol-core;0.16 in central
	found com.fasterxml.jackson.core#jackson-annotations;2.10.0 in central
	found com.fasterxml.jackson.core#jackson-databind;2.10.0 in central
	found com.fasterxml.jackson.core#jackson-core;2.10.0 in central
	found com.fasterxml.jackson.datatype#jackson-datatype-jsr310;2.10.0 in central
	found com.fasterxml.jacks

25/01/09 19:05:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


25/01/09 19:05:10 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/01/09 19:05:10 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [5]:
# Leave the session as the cell's last expression so the notebook renders its repr.
spark

In [6]:
# Hudi write options for the industry table, including the database/table
# names handed to the metadata sync tool.
hudi_configs = {
    # `hoodie.table.name` is the documented required table-name key and the one
    # used by the other Hudi writes in this notebook; it is set alongside the
    # datasource-level key so both carry the same value.
    "hoodie.table.name": "mytable",
    "hoodie.datasource.write.table.name": "mytable",
    # Names under which the table is registered by the sync tool.
    "hoodie.datasource.hive_sync.database": "my_hudi_db",
    "hoodie.datasource.hive_sync.table": "my_hudi_table",
    # Record identity and ordering.
    "hoodie.datasource.write.recordkey.field": "Industry",
    "hoodie.datasource.write.precombine.field": "ts",
    # NOTE(review): the frame written with these options shows only the columns
    # `Industry` and `ts`; there is no `partition` column -- confirm the intended
    # partition field.
    "hoodie.datasource.write.partitionpath.field": "partition",
    "hoodie.datasource.write.operation": "upsert",
}


In [7]:
from pyspark.sql.functions import current_timestamp

# Read the industry CSV (first row is the header, column types are inferred).
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("industry.csv")
)

# Stamp every row with the load time; `ts` is the precombine field in hudi_configs.
df_with_ts = df.withColumn("ts", current_timestamp())

25/01/09 19:05:15 WARN DFSPropertiesConfiguration: Properties file file:/etc/hudi/conf/hudi-defaults.conf not found. Ignoring to load props file
25/01/09 19:05:15 WARN DFSPropertiesConfiguration: Cannot find HUDI_CONF_DIR, please set it as the dir of hudi-defaults.conf


In [8]:
# Preview the first 20 rows with long values truncated (the defaults, spelled out).
df_with_ts.show(n=20, truncate=True)

+--------------------+--------------------+
|            Industry|                  ts|
+--------------------+--------------------+
|  Accounting/Finance|2025-01-09 19:05:...|
|Advertising/Publi...|2025-01-09 19:05:...|
|  Aerospace/Aviation|2025-01-09 19:05:...|
|Arts/Entertainmen...|2025-01-09 19:05:...|
|          Automotive|2025-01-09 19:05:...|
|    Banking/Mortgage|2025-01-09 19:05:...|
|Business Development|2025-01-09 19:05:...|
|Business Opportunity|2025-01-09 19:05:...|
|Clerical/Administ...|2025-01-09 19:05:...|
|Construction/Faci...|2025-01-09 19:05:...|
|      Consumer Goods|2025-01-09 19:05:...|
|    Customer Service|2025-01-09 19:05:...|
|  Education/Training|2025-01-09 19:05:...|
|    Energy/Utilities|2025-01-09 19:05:...|
|         Engineering|2025-01-09 19:05:...|
| Government/Military|2025-01-09 19:05:...|
|               Green|2025-01-09 19:05:...|
|          Healthcare|2025-01-09 19:05:...|
|  Hospitality/Travel|2025-01-09 19:05:...|
|     Human Resources|2025-01-09

In [9]:
# Resolve the table location against the current working directory instead of
# hard-coding one user's home directory, so the notebook runs on any checkout.
# `as_uri()` yields the `file:///...` form expected by the writer.
hudi_table_uri = Path("path/to/hudi/table").resolve().as_uri()

# Upsert the timestamped industry rows into the Hudi table; "append" keeps any
# existing table in place.
(
    df_with_ts.write.format("hudi")
    .options(**hudi_configs)
    .mode("append")
    .save(hudi_table_uri)
)

25/01/09 19:05:19 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-hbase.properties,hadoop-metrics2.properties


### Dataset Generation

In [None]:
def generate_entry(faker: Faker, country_codes: list):
    """Build one synthetic account record.

    Args:
        faker: Faker instance used to generate every field.
        country_codes: Candidate values for the ``country_code`` field.

    Returns:
        A dict with the keys ``id``, ``name``, ``email``, ``passport``,
        ``country_code``, ``iban``, ``swift`` and ``created_at`` (a
        ``YYYY-MM-DD`` string no more than 90 days in the past).
    """
    return {
        "id": faker.unique.uuid4(),
        "name": faker.name(),
        "email": faker.email(),
        "passport": faker.passport_number(),
        # Draw from Faker's own RNG rather than the global `random` module:
        # `Faker.seed` does not seed the global module, so `random.choice`
        # made this field differ between runs with the same seed.
        "country_code": faker.random_element(country_codes),
        "iban": faker.iban(),
        "swift": faker.swift11(),
        "created_at": faker.past_date(start_date='-90d').strftime('%Y-%m-%d')
    }

In [None]:
def generate_dataset(num: int, seed: int):
    """Create ``num`` synthetic account records.

    Seeds Faker through ``Faker.seed(seed)``, then builds each record with
    ``generate_entry`` using a fixed list of country codes.

    Args:
        num: Number of records to generate.
        seed: Value passed to ``Faker.seed``.

    Returns:
        A list of ``num`` dicts as produced by ``generate_entry``.
    """
    Faker.seed(seed)
    generator = Faker()
    supported_countries = ['US', 'CA', 'JP', 'KR', 'FR', 'GE', 'UK', 'BR', 'AR']

    records = []
    for _ in range(num):
        records.append(generate_entry(generator, supported_countries))
    return records

In [None]:
# Build 100,000 synthetic entries; the seed is forwarded to generate_dataset.
dataset = generate_dataset(num=100_000, seed=739)

In [None]:
# Turn the generated records into a DataFrame and derive calendar columns from
# `created_at`; `year` and `month` are used for partitioning when writing.
created_at_col = col("created_at")
df = (
    spark.createDataFrame(dataset)
    .withColumn("year", year(created_at_col))
    .withColumn("month", month(created_at_col))
    .withColumn("day", dayofmonth(created_at_col))
)

### Hudi-Hive Integration

In [None]:
# Options for the initial load of `hudi_raw.accounts`: a copy-on-write table
# keyed by `id`, synced through the metastore at thrift://localhost:9083 and
# partitioned by year/month in Hive style.
accounts_write_options = {
    "hoodie.database.name": "hudi_raw",
    "hoodie.table.name": "accounts",
    "hoodie.datasource.write.recordkey.field": "id",
    "hoodie.datasource.write.precombine.field": "created_at",
    "hoodie.datasource.write.table.type": "COPY_ON_WRITE",
    "hoodie.datasource.write.operation": "upsert",
    "hoodie.datasource.meta.sync.enable": "true",
    "hoodie.datasource.hive_sync.mode": "hms",
    "hoodie.datasource.hive_sync.metastore.uris": "thrift://localhost:9083",
    "hoodie.datasource.hive_sync.partition_fields": "year,month",
    "hoodie.datasource.hive_sync.partition_extractor_class": "org.apache.hudi.hive.MultiPartKeysValueExtractor",
    "hoodie.datasource.write.hive_style_partitioning": "true",
}

# "overwrite" replaces whatever is already stored at the target path.
(
    df.write.format("hudi")
    .options(**accounts_write_options)
    .partitionBy("year", "month")
    .mode("overwrite")
    .save("s3a://lakehouse-raw/hudi/accounts")
)

### Upsert Dataset

In [None]:
# Batch used to demonstrate an upsert: four records that already exist in
# `dataset` plus four newly generated ones.
entries = [
    # Existing entries -- copied so that editing them in later cells does not
    # silently modify the dicts held by `dataset`.
    *(dict(dataset[index]) for index in (2, 4, 7, 11)),
    # New entries
    *generate_dataset(4, seed=1037),
]

In [None]:
# Replace each email with "<lower-cased name, spaces turned into dots>@domain.com".
for record in entries:
    local_part = ".".join(record['name'].lower().split(" "))
    record['email'] = f"{local_part}@domain.com"

In [None]:
# DataFrame for the upsert batch, with the same year/month/day columns derived
# from `created_at` as the initial load.
upsert_created_at = col("created_at")
upsert_df = (
    spark.createDataFrame(entries)
    .withColumn("year", year(upsert_created_at))
    .withColumn("month", month(upsert_created_at))
    .withColumn("day", dayofmonth(upsert_created_at))
)

In [None]:
# Show up to eight rows of the upsert batch without truncating values.
upsert_df.show(n=8, truncate=False)

In [None]:
# Expose the upsert batch to Spark SQL under the temporary view name "upsert_data".
upsert_df.createOrReplaceTempView("upsert_data")

### Upsert Strategy

In [None]:
# Same table settings as the initial accounts load; only the save mode differs.
accounts_upsert_options = {
    "hoodie.database.name": "hudi_raw",
    "hoodie.table.name": "accounts",
    "hoodie.datasource.write.recordkey.field": "id",
    "hoodie.datasource.write.precombine.field": "created_at",
    "hoodie.datasource.write.table.type": "COPY_ON_WRITE",
    "hoodie.datasource.write.operation": "upsert",
    "hoodie.datasource.meta.sync.enable": "true",
    "hoodie.datasource.hive_sync.mode": "hms",
    "hoodie.datasource.hive_sync.metastore.uris": "thrift://localhost:9083",
    "hoodie.datasource.hive_sync.partition_fields": "year,month",
    "hoodie.datasource.hive_sync.partition_extractor_class": "org.apache.hudi.hive.MultiPartKeysValueExtractor",
    "hoodie.datasource.write.hive_style_partitioning": "true",
}

# "append" keeps the existing table and applies the batch as an upsert on `id`.
(
    upsert_df.write.format("hudi")
    .options(**accounts_upsert_options)
    .partitionBy("year", "month")
    .mode("append")
    .save("s3a://lakehouse-raw/hudi/accounts")
)

### Hudi Metadata (WIP)

In [None]:
# Call Hudi's `show_commits` procedure for hudi_raw.accounts.
# NOTE(review): confirm that `from_commit` is an argument accepted by
# `show_commits` in the Hudi version in use.
show_commits_sql = """
    call show_commits (
        table => 'hudi_raw.accounts',
        from_commit => '0'
    )    
"""
spark.sql(show_commits_sql).show(truncate=False)

In [None]:
# Incremental query over the accounts table, starting from instant time 0.
incremental_read_options = {
    "hoodie.datasource.query.type": "incremental",
    "hoodie.datasource.read.begin.instanttime": 0,
}
hudi_changes = (
    spark.read.format("hudi")
    .options(**incremental_read_options)
    .load("s3a://lakehouse-raw/hudi/accounts")
)

In [None]:
# Preview the first 20 rows returned by the incremental query (defaults spelled out).
hudi_changes.show(n=20, truncate=True)