In [1]:
import os
import random

from faker import Faker

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, year, month, dayofmonth

### Spark Setup

In [7]:
# Maven coordinates resolved by Spark at session start-up.
# Coordinates that were tried but are currently disabled:
#   org.apache.thrift:libthrift:0.14.0
#   org.slf4j:slf4j-api:1.7.36
#   org.apache.logging.log4j:log4j-slf4j-impl:2.24.3
#   org.apache.hive:hive-metastore:2.3.9
#   org.apache.hive:hive-exec:2.3.9
#   org.apache.hadoop:hadoop-aws:3.3.4
#   com.amazonaws:aws-java-sdk-bundle:1.12.262
required_packages = [
    "org.apache.hudi:hudi-spark3.5-bundle_2.12:1.0.0",
    "org.apache.spark:spark-hive_2.12:3.5.3",
    "com.google.code.findbugs:jsr305:3.0.2",
]

# `spark.jars.packages` expects a single comma-separated string.
spark_jar_packages = ",".join(required_packages)

In [8]:
from pyspark.sql import SparkSession

# Build (or reuse) a local Spark session with Hudi and Hive support.
# The original chain set `spark.serializer` twice with the same value; the
# duplicate is dropped, and the chain is parenthesised so no line depends on a
# trailing backslash.
spark = (
    SparkSession.builder
    .appName("HudiWithHive")
    .master("local[*]")
    # Packages (Hudi bundle, spark-hive, jsr305) are resolved from Maven.
    .config("spark.jars.packages", spark_jar_packages)
    # Kryo serialization with Hudi's registrar.
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.kryo.registrator", "org.apache.spark.HoodieSparkKryoRegistrar")
    # Hudi SQL extension and catalog.
    .config("spark.sql.extensions", "org.apache.spark.sql.hudi.HoodieSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.hudi.catalog.HoodieCatalog")
    # Use the Hive metastore reachable at localhost:9083 as the catalog.
    .config("spark.sql.catalogImplementation", "hive")
    .config("spark.hadoop.hive.metastore.uris", "thrift://localhost:9083")
    .enableHiveSupport()
    .getOrCreate()
)


25/01/10 12:21:42 WARN Utils: Your hostname, baptvit resolves to a loopback address: 127.0.1.1; using 192.168.2.129 instead (on interface wlp4s0)
25/01/10 12:21:42 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Ivy Default Cache set to: /home/baptvit/.ivy2/cache
The jars for the packages stored in: /home/baptvit/.ivy2/jars
org.apache.hudi#hudi-spark3.5-bundle_2.12 added as a dependency
org.apache.spark#spark-hive_2.12 added as a dependency
com.google.code.findbugs#jsr305 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-e36ae98c-2d53-471f-b04d-8fc35203c0a6;1.0
	confs: [default]


:: loading settings :: url = jar:file:/opt/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


	found org.apache.hudi#hudi-spark3.5-bundle_2.12;1.0.0 in central
	found org.apache.hive#hive-storage-api;2.8.1 in central
	found org.slf4j#slf4j-api;1.7.36 in central
	found org.apache.spark#spark-hive_2.12;3.5.3 in central
	found org.apache.hive#hive-common;2.3.9 in central
	found commons-cli#commons-cli;1.5.0 in central
	found commons-lang#commons-lang;2.6 in local-m2-cache
	found org.apache.commons#commons-lang3;3.12.0 in central
	found jline#jline;2.14.6 in central
	found joda-time#joda-time;2.12.5 in central
	found org.apache.commons#commons-compress;1.23.0 in central
	found com.tdunning#json;1.8 in local-m2-cache
	found io.dropwizard.metrics#metrics-core;4.2.19 in central
	found io.dropwizard.metrics#metrics-jvm;4.2.19 in central
	found io.dropwizard.metrics#metrics-json;4.2.19 in central
	found com.fasterxml.jackson.core#jackson-core;2.15.2 in central
	found com.fasterxml.jackson.core#jackson-databind;2.15.2 in central
	found com.fasterxml.jackson.core#jackson-annotations;2.15.

In [9]:
# Echo the session object so the notebook renders its repr.
spark

In [20]:
# List the databases visible through the session's catalog.
databases_df = spark.sql("SHOW DATABASES;")
databases_df.show()

+---------+
|namespace|
+---------+
|  default|
+---------+



In [14]:
# Target database/table and the storage location of the Hudi table.
database_name, table_name = "default", "hudi_cow"
base_path = "file:///home/baptvit/Documents/github/lakehouse-labs/ranger/path/to/hudi/table"

# Three sample rows; only rowId, partitionId and name differ between them,
# every other column carries the same value.
data = [
    (row_id, partition_id, 0, name, "v_0", "toBeDel0", 0, 1000000)
    for row_id, partition_id, name in (
        ("row_1", "2021/01/01", "bob"),
        ("row_2", "2021/01/01", "john"),
        ("row_3", "2021/01/02", "tom"),
    )
]

In [15]:
from pyspark.sql.types import StructType, StructField, StringType, LongType, IntegerType

# Column name -> Spark type, in the same order as the tuples in `data`.
column_types = [
    ("rowId", StringType()),
    ("partitionId", StringType()),
    ("preComb", LongType()),
    ("name", StringType()),
    ("versionId", StringType()),
    ("toBeDeletedStr", StringType()),
    ("intToLong", IntegerType()),
    ("longToInt", LongType()),
]

# Every column is declared nullable.
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in column_types
])

# Materialise the sample rows with the explicit schema.
df = spark.createDataFrame(data, schema)

In [16]:
# Preview the sample DataFrame built from `data` and `schema`.
df.show()

+-----+-----------+-------+----+---------+--------------+---------+---------+
|rowId|partitionId|preComb|name|versionId|toBeDeletedStr|intToLong|longToInt|
+-----+-----------+-------+----+---------+--------------+---------+---------+
|row_1| 2021/01/01|      0| bob|      v_0|      toBeDel0|        0|  1000000|
|row_2| 2021/01/01|      0|john|      v_0|      toBeDel0|        0|  1000000|
|row_3| 2021/01/02|      0| tom|      v_0|      toBeDel0|        0|  1000000|
+-----+-----------+-------+----+---------+--------------+---------+---------+



In [19]:
# Write data as a Hudi table
df.write.format("hudi") \
    .option("hoodie.datasource.write.precombine.field", "preComb") \
    .option("hoodie.datasource.write.recordkey.field", "rowId") \
    .option("hoodie.datasource.write.partitionpath.field", "partitionId") \
    .option("hoodie.database.name", database_name) \
    .option("hoodie.table.name", table_name) \
    .option("hoodie.datasource.write.table.type", "COPY_ON_WRITE") \
    .option("hoodie.datasource.write.operation", "upsert") \
    .option("hoodie.datasource.write.hive_style_partitioning", "true") \
    .option("hoodie.datasource.hive_sync.enable", "true") \
    .option("hoodie.datasource.hive_sync.mode", "jdbc") \
    .option("hoodie.datasource.hive_sync.jdbcurl", "jdbc:hive2://localhost:10000") \
    .option("hoodie.datasource.hive_sync.username", "hive") \
    .option("hoodie.datasource.hive_sync.password", "rangerR0cks!") \
    .mode("overwrite") \
    .save(base_path)

Py4JJavaError: An error occurred while calling o154.save.
: org.apache.hudi.exception.HoodieMetaSyncException: Could not sync using the meta sync class org.apache.hudi.hive.HiveSyncTool
	at org.apache.hudi.sync.common.util.SyncUtilHelpers.runHoodieMetaSync(SyncUtilHelpers.java:106)
	at org.apache.hudi.sync.common.util.SyncUtilHelpers.runHoodieMetaSync(SyncUtilHelpers.java:72)
	at org.apache.hudi.HoodieSparkSqlWriterInternal.$anonfun$metaSync$2(HoodieSparkSqlWriter.scala:925)
	at scala.collection.mutable.HashSet.foreach(HashSet.scala:79)
	at org.apache.hudi.HoodieSparkSqlWriterInternal.metaSync(HoodieSparkSqlWriter.scala:923)
	at org.apache.hudi.HoodieSparkSqlWriterInternal.commitAndPerformPostOperations(HoodieSparkSqlWriter.scala:1022)
	at org.apache.hudi.HoodieSparkSqlWriterInternal.writeInternal(HoodieSparkSqlWriter.scala:534)
	at org.apache.hudi.HoodieSparkSqlWriterInternal.$anonfun$write$1(HoodieSparkSqlWriter.scala:190)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:201)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:108)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:66)
	at org.apache.spark.sql.adapter.BaseSpark3Adapter.sqlExecutionWithNewExecutionId(BaseSpark3Adapter.scala:105)
	at org.apache.hudi.HoodieSparkSqlWriterInternal.write(HoodieSparkSqlWriter.scala:212)
	at org.apache.hudi.HoodieSparkSqlWriter$.write(HoodieSparkSqlWriter.scala:127)
	at org.apache.hudi.DefaultSource.createRelation(DefaultSource.scala:170)
	at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:48)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:75)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:73)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.executeCollect(commands.scala:84)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:107)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:201)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:108)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:66)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:107)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:98)
	at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:461)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(origin.scala:76)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:461)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:437)
	at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:85)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:83)
	at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:142)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:869)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:391)
	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:364)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:243)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:568)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:840)
Caused by: org.apache.hudi.exception.HoodieException: Got runtime exception when hive syncing hudi_cow
	at org.apache.hudi.hive.HiveSyncTool.syncHoodieTable(HiveSyncTool.java:180)
	at org.apache.hudi.sync.common.util.SyncUtilHelpers.runHoodieMetaSync(SyncUtilHelpers.java:104)
	... 57 more
Caused by: org.apache.hudi.hive.HoodieHiveSyncException: failed to sync the table hudi_cow
	at org.apache.hudi.hive.HiveSyncTool.syncHoodieTable(HiveSyncTool.java:272)
	at org.apache.hudi.hive.HiveSyncTool.doSync(HiveSyncTool.java:189)
	at org.apache.hudi.hive.HiveSyncTool.syncHoodieTable(HiveSyncTool.java:177)
	... 58 more
Caused by: org.apache.hudi.hive.HoodieHiveSyncException: Failed in executing SQL CREATE EXTERNAL TABLE IF NOT EXISTS `default`.`hudi_cow`( `_hoodie_commit_time` string, `_hoodie_commit_seqno` string, `_hoodie_record_key` string, `_hoodie_partition_path` string, `_hoodie_file_name` string, `rowId` string, `preComb` bigint, `name` string, `versionId` string, `toBeDeletedStr` string, `intToLong` int, `longToInt` bigint) PARTITIONED BY (`partitionId` string) ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' WITH SERDEPROPERTIES ('hoodie.query.as.ro.table'='false','path'='file:/home/baptvit/Documents/github/lakehouse-labs/ranger/path/to/hudi/table') STORED AS INPUTFORMAT 'org.apache.hudi.hadoop.HoodieParquetInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' LOCATION 'file:/home/baptvit/Documents/github/lakehouse-labs/ranger/path/to/hudi/table' TBLPROPERTIES('spark.sql.sources.schema.numPartCols'='1','spark.sql.sources.schema.part.0'='{"type":"struct","fields":[{"name":"_hoodie_commit_time","type":"string","nullable":true,"metadata":{}},{"name":"_hoodie_commit_seqno","type":"string","nullable":true,"metadata":{}},{"name":"_hoodie_record_key","type":"string","nullable":true,"metadata":{}},{"name":"_hoodie_partition_path","type":"string","nullable":true,"metadata":{}},{"name":"_hoodie_file_name","type":"string","nullable":true,"metadata":{}},{"name":"rowId","type":"string","nullable":true,"metadata":{}},{"name":"preComb","type":"long","nullable":true,"metadata":{}},{"name":"name","type":"string","nullable":true,"metadata":{}},{"name":"versionId","type":"string","nullable":true,"metadata":{}},{"name":"toBeDeletedStr","type":"string","nullable":true,"metadata":{}},{"name":"intToLong","type":"integer","nullable":true,"metadata":{}},{"name":"longToInt","type":"long","nullable":true,"metadata":{}},{"name":"partitionId","type":"string","nullable":true,"metadata":{}}]}','spark.sql.sources.schema.partCol.0'='parti
tionId','spark.sql.sources.schema.numParts'='1','spark.sql.sources.provider'='hudi','spark.sql.create.version'='3.5.3')
	at org.apache.hudi.hive.ddl.JDBCExecutor.runSQL(JDBCExecutor.java:70)
	at org.apache.hudi.hive.ddl.QueryBasedDDLExecutor.createTable(QueryBasedDDLExecutor.java:93)
	at org.apache.hudi.hive.HoodieHiveSyncClient.createTable(HoodieHiveSyncClient.java:284)
	at org.apache.hudi.hive.HiveSyncTool.syncFirstTime(HiveSyncTool.java:398)
	at org.apache.hudi.hive.HiveSyncTool.syncHoodieTable(HiveSyncTool.java:256)
	... 60 more
Caused by: org.apache.hive.service.cli.HiveSQLException: Error while compiling statement: FAILED: SemanticException Cannot find class 'org.apache.hudi.hadoop.HoodieParquetInputFormat'
	at org.apache.hive.jdbc.Utils.verifySuccess(Utils.java:267)
	at org.apache.hive.jdbc.Utils.verifySuccessWithInfo(Utils.java:253)
	at org.apache.hive.jdbc.HiveStatement.runAsyncOnServer(HiveStatement.java:313)
	at org.apache.hive.jdbc.HiveStatement.execute(HiveStatement.java:253)
	at org.apache.hudi.hive.ddl.JDBCExecutor.runSQL(JDBCExecutor.java:68)
	... 64 more
Caused by: org.apache.hive.service.cli.HiveSQLException: Error while compiling statement: FAILED: SemanticException Cannot find class 'org.apache.hudi.hadoop.HoodieParquetInputFormat'
	at org.apache.hive.service.cli.operation.Operation.toSQLException(Operation.java:335)
	at org.apache.hive.service.cli.operation.SQLOperation.prepare(SQLOperation.java:199)
	at org.apache.hive.service.cli.operation.SQLOperation.runInternal(SQLOperation.java:260)
	at org.apache.hive.service.cli.operation.Operation.run(Operation.java:247)
	at org.apache.hive.service.cli.session.HiveSessionImpl.executeStatementInternal(HiveSessionImpl.java:541)
	at org.apache.hive.service.cli.session.HiveSessionImpl.executeStatementAsync(HiveSessionImpl.java:527)
	at org.apache.hive.service.cli.CLIService.executeStatementAsync(CLIService.java:312)
	at org.apache.hive.service.cli.thrift.ThriftCLIService.ExecuteStatement(ThriftCLIService.java:562)
	at org.apache.hive.service.rpc.thrift.TCLIService$Processor$ExecuteStatement.getResult(TCLIService.java:1557)
	at org.apache.hive.service.rpc.thrift.TCLIService$Processor$ExecuteStatement.getResult(TCLIService.java:1542)
	at org.apache.thrift.ProcessFunction.process(ProcessFunction.java:39)
	at org.apache.thrift.TBaseProcessor.process(TBaseProcessor.java:39)
	at org.apache.hive.service.auth.TSetIpAddressProcessor.process(TSetIpAddressProcessor.java:56)
	at org.apache.thrift.server.TThreadPoolServer$WorkerProcess.run(TThreadPoolServer.java:286)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:750)
Caused by: org.apache.hadoop.hive.ql.parse.SemanticException: Cannot find class 'org.apache.hudi.hadoop.HoodieParquetInputFormat'
	at org.apache.hadoop.hive.ql.parse.ParseUtils.ensureClassExists(ParseUtils.java:260)
	at org.apache.hadoop.hive.ql.parse.StorageFormat.fillStorageFormat(StorageFormat.java:57)
	at org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.analyzeCreateTable(SemanticAnalyzer.java:13004)
	at org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genResolvedParseTree(SemanticAnalyzer.java:11974)
	at org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.analyzeInternal(SemanticAnalyzer.java:12129)
	at org.apache.hadoop.hive.ql.parse.CalcitePlanner.analyzeInternal(CalcitePlanner.java:330)
	at org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.analyze(BaseSemanticAnalyzer.java:285)
	at org.apache.hadoop.hive.ql.Driver.compile(Driver.java:659)
	at org.apache.hadoop.hive.ql.Driver.compileInternal(Driver.java:1826)
	at org.apache.hadoop.hive.ql.Driver.compileAndRespond(Driver.java:1773)
	at org.apache.hadoop.hive.ql.Driver.compileAndRespond(Driver.java:1768)
	at org.apache.hadoop.hive.ql.reexec.ReExecDriver.compileAndRespond(ReExecDriver.java:126)
	at org.apache.hive.service.cli.operation.SQLOperation.prepare(SQLOperation.java:197)
	... 15 more
Caused by: java.lang.ClassNotFoundException: org.apache.hudi.hadoop.HoodieParquetInputFormat
	at java.net.URLClassLoader.findClass(URLClassLoader.java:387)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:418)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:351)
	at java.lang.Class.forName0(Native Method)
	at java.lang.Class.forName(Class.java:348)
	at org.apache.hadoop.hive.ql.parse.ParseUtils.ensureClassExists(ParseUtils.java:258)
	... 27 more


In [None]:
from pyspark.sql.functions import current_timestamp

# Read industry.csv, treating the first line as a header and letting Spark
# infer the column types.
df = spark.read.options(header=True, inferSchema=True).csv("industry.csv")

# Same data with an extra `ts` column holding the current timestamp.
df_with_ts = df.withColumn("ts", current_timestamp())

In [None]:
# Preview the CSV data together with the added `ts` column.
df_with_ts.show()

In [None]:
# Target database/table and storage location for the CSV-based Hudi table.
database_name, table_name = "default", "hudi_table"
base_path = "file:///home/baptvit/Documents/github/lakehouse-labs/ranger/path/to/hudi/table"

In [None]:
# Write data as a Hudi table
df.write.format("hudi") \
    .option("hoodie.table.name", table_name) \
    .option("hoodie.datasource.write.recordkey.field", "Industry") \
    .option("hoodie.datasource.write.table.name", table_name) \
    .option("hoodie.datasource.write.operation", "upsert") \
    .option("hoodie.datasource.write.precombine.field", "Industry") \
    .option("hoodie.datasource.write.hive_style_partitioning", "true") \
    .option("hoodie.datasource.hive_sync.enable", "true") \
    .option("hoodie.database.name", database_name) \
    .option("hoodie.datasource.hive_sync.database", database_name) \
    .option("hoodie.datasource.hive_sync.table", table_name) \
    .mode("overwrite") \
    .save(base_path)

In [None]:
# Write data as a Hudi table
df.write.format("hudi") \
    .option("hoodie.table.name", table_name) \
    .option("hoodie.datasource.write.recordkey.field", "Industry") \
    .option("hoodie.datasource.write.table.name", table_name) \
    .option("hoodie.datasource.write.operation", "upsert") \
    .option("hoodie.datasource.write.precombine.field", "Industry") \
    .option("hoodie.datasource.write.hive_style_partitioning", "true") \
    .option("hoodie.datasource.hive_sync.partition_fields", "ts") \
    .option("hoodie.datasource.hive_sync.enable", "true") \
    .option("hoodie.database.name", database_name) \
    .option("hoodie.datasource.hive_sync.database", database_name) \
    .option("hoodie.datasource.hive_sync.table", table_name) \
    .option("hoodie.datasource.hive_sync.jdbcurl", "jdbc:hive2://ranger-hive:10000/default") \
    .option("hoodie.datasource.hive_sync.username", "hive") \
    .option("hoodie.datasource.hive_sync.password", "rangerR0cks!") \
    .mode("overwrite") \
    .save(base_path)

In [None]:
# `hudi_configs` was used here without being defined anywhere in the visible
# notebook, so this cell raised NameError on a fresh kernel. It is defined next
# to its use and mirrors the options of the first "hudi_table" write cell, so
# the append targets the same table configuration.
# NOTE(review): confirm these are the options the append was meant to use.
hudi_configs = {
    "hoodie.table.name": table_name,
    "hoodie.datasource.write.recordkey.field": "Industry",
    "hoodie.datasource.write.table.name": table_name,
    "hoodie.datasource.write.operation": "upsert",
    "hoodie.datasource.write.precombine.field": "Industry",
    "hoodie.datasource.write.hive_style_partitioning": "true",
    "hoodie.datasource.hive_sync.enable": "true",
    "hoodie.database.name": database_name,
    "hoodie.datasource.hive_sync.database": database_name,
    "hoodie.datasource.hive_sync.table": table_name,
}

# Append the timestamped rows to the table at the fixed local path.
(
    df_with_ts.write.format("hudi")
    .options(**hudi_configs)
    .mode("append")
    .save("file:///home/baptvit/Documents/github/lakehouse-labs/ranger/path/to/hudi/table")
)

### Dataset Generation

In [None]:
def generate_entry(faker: Faker, country_codes: list):
    """Build one synthetic account record.

    Args:
        faker: Faker instance that produces every generated value.
        country_codes: Candidate values for the ``country_code`` field.

    Returns:
        A dict with the keys ``id``, ``name``, ``email``, ``passport``,
        ``country_code``, ``iban``, ``swift`` and ``created_at``; the latter
        is a date within the past 90 days formatted as ``YYYY-MM-DD``.
    """
    return {
        # `unique` keeps ids distinct across calls on the same Faker instance.
        "id": faker.unique.uuid4(),
        "name":  faker.name(),
        "email": faker.email(),
        "passport": faker.passport_number(),
        # Draw through the Faker instance rather than the stdlib `random`
        # module: Faker.seed() seeds Faker's own generator, not the global
        # one, so random.choice() made this field differ between runs even
        # with a fixed seed.
        "country_code": faker.random_element(elements=country_codes),
        "iban": faker.iban(),
        "swift": faker.swift11(),
        "created_at": faker.past_date(start_date='-90d').strftime('%Y-%m-%d')
    }

In [None]:
def generate_dataset(num: int, seed: int):
    """Generate ``num`` synthetic account records.

    Args:
        num: Number of records to produce.
        seed: Value passed to ``Faker.seed`` before any record is generated.

    Returns:
        A list of ``num`` dicts, each produced by ``generate_entry``.
    """
    Faker.seed(seed)
    generator = Faker()
    allowed_countries = ['US', 'CA', 'JP', 'KR', 'FR', 'GE', 'UK', 'BR', 'AR']

    records = []
    for _ in range(num):
        records.append(generate_entry(generator, allowed_countries))
    return records

In [None]:
# Generate the synthetic dataset: 100,000 records with a fixed seed (739).
dataset = generate_dataset(num=100_000, seed=739)

In [None]:
# Turn the generated records into a DataFrame and derive the calendar parts
# of `created_at` (used further down as partition columns).
df = (
    spark.createDataFrame(dataset)
    .withColumn("year", year("created_at"))
    .withColumn("month", month("created_at"))
    .withColumn("day", dayofmonth("created_at"))
)

### Hudi-Hive Integration

In [None]:
# Options for the initial load of hudi_raw.accounts: copy-on-write upsert
# keyed by `id`, with metastore sync in "hms" mode against localhost:9083.
accounts_write_options = {
    "hoodie.database.name": "hudi_raw",
    "hoodie.table.name": "accounts",
    "hoodie.datasource.write.recordkey.field": "id",
    "hoodie.datasource.write.precombine.field": "created_at",
    "hoodie.datasource.write.table.type": "COPY_ON_WRITE",
    "hoodie.datasource.write.operation": "upsert",
    "hoodie.datasource.meta.sync.enable": "true",
    "hoodie.datasource.hive_sync.mode": "hms",
    "hoodie.datasource.hive_sync.metastore.uris": "thrift://localhost:9083",
    "hoodie.datasource.hive_sync.partition_fields": "year,month",
    "hoodie.datasource.hive_sync.partition_extractor_class": "org.apache.hudi.hive.MultiPartKeysValueExtractor",
    "hoodie.datasource.write.hive_style_partitioning": "true",
}

# Overwrite the table, partitioned by year and month.
(
    df.write.format("hudi")
    .options(**accounts_write_options)
    .partitionBy("year", "month")
    .mode("overwrite")
    .save("s3a://lakehouse-raw/hudi/accounts")
)

### Upsert Dataset

In [None]:
# Positions in `dataset` of the records that the upsert will update.
existing_indices = (2, 4, 7, 11)

# Upsert batch: four existing records followed by four new ones.
# The existing records are copied, because a later cell rewrites `email` in
# place and would otherwise silently modify the dicts held by `dataset`.
entries = [
    # Existing entries (shallow copies; all values are plain strings)
    *(dict(dataset[index]) for index in existing_indices),
    # New entries
    *generate_dataset(4, seed=1037),
]

In [None]:
# Give every record in the batch an email derived from its name:
# lower-cased, with each space replaced by a dot, at domain.com.
for record in entries:
    local_part = ".".join(record['name'].lower().split(" "))
    record['email'] = f"{local_part}@domain.com"

In [None]:
# Build the upsert DataFrame with the same derived date columns as the
# initial load.
upsert_df = (
    spark.createDataFrame(entries)
    .withColumn("year", year("created_at"))
    .withColumn("month", month("created_at"))
    .withColumn("day", dayofmonth("created_at"))
)

In [None]:
# Print up to 8 rows of the upsert batch without truncating column values.
upsert_df.show(8, truncate=False)

In [None]:
# Register the upsert batch as the temp view `upsert_data` for Spark SQL.
# NOTE(review): no later cell in the visible notebook queries this view.
upsert_df.createOrReplaceTempView("upsert_data")

### Upsert Strategy

In [None]:
# Options for upserting into hudi_raw.accounts; identical to the initial
# load so that records are matched on `id` and synced the same way.
accounts_upsert_options = {
    "hoodie.database.name": "hudi_raw",
    "hoodie.table.name": "accounts",
    "hoodie.datasource.write.recordkey.field": "id",
    "hoodie.datasource.write.precombine.field": "created_at",
    "hoodie.datasource.write.table.type": "COPY_ON_WRITE",
    "hoodie.datasource.write.operation": "upsert",
    "hoodie.datasource.meta.sync.enable": "true",
    "hoodie.datasource.hive_sync.mode": "hms",
    "hoodie.datasource.hive_sync.metastore.uris": "thrift://localhost:9083",
    "hoodie.datasource.hive_sync.partition_fields": "year,month",
    "hoodie.datasource.hive_sync.partition_extractor_class": "org.apache.hudi.hive.MultiPartKeysValueExtractor",
    "hoodie.datasource.write.hive_style_partitioning": "true",
}

# Append mode: the batch is merged into the existing table rather than
# replacing it.
(
    upsert_df.write.format("hudi")
    .options(**accounts_upsert_options)
    .partitionBy("year", "month")
    .mode("append")
    .save("s3a://lakehouse-raw/hudi/accounts")
)

### Hudi Metadata (WIP)

In [None]:
# Call the `show_commits` procedure for hudi_raw.accounts through Spark SQL
# and print the result without truncating column values.
# NOTE(review): confirm that `from_commit` is an accepted parameter of
# `show_commits` in the Hudi version in use.
spark.sql("""
    call show_commits (
        table => 'hudi_raw.accounts',
        from_commit => '0'
    )    
""").show(truncate=False)

In [None]:
# Incremental query over the accounts table, starting from instant time 0.
incremental_read_options = {
    "hoodie.datasource.query.type": "incremental",
    "hoodie.datasource.read.begin.instanttime": 0,
}

hudi_changes = (
    spark.read.format("hudi")
    .options(**incremental_read_options)
    .load("s3a://lakehouse-raw/hudi/accounts")
)

In [None]:
# Show the rows returned by the incremental read.
hudi_changes.show()