In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql import functions as F
from pyspark.sql import Window

# https://hudi.apache.org/docs/quick-start-guide/#spark-3-support-matrix

def _build_hudi_session():
    """Create (or reuse) the SparkSession used by this notebook.

    The session is named 'HudiTable'. It requests the Hudi Spark bundle via
    ``spark.jars.packages`` and sets the serializer, catalog, SQL extension
    and Kryo registrar options listed below.
    """
    # (key, value) pairs applied to the builder in this order.
    settings = [
        ("spark.jars.packages", "org.apache.hudi:hudi-spark3.3-bundle_2.12:0.14.0"),
        ("spark.serializer", "org.apache.spark.serializer.KryoSerializer"),
        ("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.hudi.catalog.HoodieCatalog"),
        ("spark.sql.extensions", "org.apache.spark.sql.hudi.HoodieSparkSessionExtension"),
        ("spark.kryo.registrator", "org.apache.spark.HoodieSparkKryoRegistrar"),
    ]

    builder = SparkSession.builder.appName('HudiTable')
    for key, value in settings:
        builder = builder.config(key, value)
    return builder.getOrCreate()


spark = _build_hudi_session()


In [2]:
# Print the kernel's current working directory via a shell escape.
!pwd

/home/jovyan


## Create Table

In [19]:
# Create the Hudi table used by the SQL examples, partitioned by `city` and
# stored at an explicit LOCATION.
# IF NOT EXISTS keeps the cell re-runnable: without it a second execution
# fails because the table is already registered. The trailing ';' is dropped
# because spark.sql() takes a single statement and needs no terminator.
spark.sql("""
CREATE TABLE IF NOT EXISTS hudi_table3 (
    ts BIGINT,
    uuid STRING,
    rider STRING,
    driver STRING,
    fare DOUBLE,
    city STRING
) USING HUDI
PARTITIONED BY (city)
LOCATION '/home/jovyan/hudi2'
"""
)

DataFrame[]

In [22]:
# Load the Hudi table directly from its storage path (the LOCATION used in
# the CREATE TABLE statement) and preview its rows.
df = (
    spark.read
    .format("hudi")
    .load('/home/jovyan/hudi2')
)

df.show()

+-------------------+--------------------+--------------------+----------------------+--------------------+-------------+--------------------+-------+--------+-----+-------------+
|_hoodie_commit_time|_hoodie_commit_seqno|  _hoodie_record_key|_hoodie_partition_path|   _hoodie_file_name|           ts|                uuid|  rider|  driver| fare|         city|
+-------------------+--------------------+--------------------+----------------------+--------------------+-------------+--------------------+-------+--------+-----+-------------+
|  20240427165953591|20240427165953591...|20240427165953591...|    city=san_francisco|0738651c-aec1-447...|1695159649087|334e26e9-8355-45c...|rider-A|driver-K| 19.1|san_francisco|
|  20240427165953591|20240427165953591...|20240427165953591...|    city=san_francisco|0738651c-aec1-447...|1695091554788|e96c4396-3fad-413...|rider-C|driver-M| 27.7|san_francisco|
|  20240427165953591|20240427165953591...|20240427165953591...|    city=san_francisco|0738651c-aec1-

In [12]:
# pyspark
# Storage path and name of the Hudi table written through the DataFrame API.
basePath = '/home/jovyan/hudi'
table_name = 'hudi_table2'

# Sample trip records; every tuple follows the order given in `columns`.
columns = ["ts", "uuid", "rider", "driver", "fare", "city"]
data = [
    (1695159649087, "334e26e9-8355-45cc-97c6-c31daf0df330", "rider-A", "driver-K", 19.10, "san_francisco"),
    (1695091554788, "e96c4396-3fad-413a-a942-4cb36106d721", "rider-C", "driver-M", 27.70, "san_francisco"),
    (1695046462179, "9909a8b1-2d15-4d3d-8ec9-efc48c536a00", "rider-D", "driver-L", 33.90, "san_francisco"),
    (1695516137016, "e3cf430c-889d-4015-bc98-59bdce1e530c", "rider-F", "driver-P", 34.15, "sao_paulo"),
    (1695115999911, "c8abbe79-8d89-47ea-b4ce-4d224bae5bfa", "rider-J", "driver-T", 17.85, "chennai"),
]

# Column types are inferred from the tuples; the names come from `columns`.
df = spark.createDataFrame(data, schema=columns)

# Writer options shared with the later update cell: the table name and the
# column used as the partition path.
hudi_options = {
    'hoodie.table.name': table_name,
    'hoodie.datasource.write.partitionpath.field': 'city',
}

# Write with save mode "overwrite" to basePath.
df.write.format("hudi").options(**hudi_options).mode("overwrite").save(basePath)

In [None]:
# Read back the Hudi table stored at basePath and preview its rows.
df = spark.read.load(basePath, format="hudi")

df.show()

## Insert

In [21]:
# Insert eight sample trips into hudi_table3. Values are positional and follow
# the table's column order: ts, uuid, rider, driver, fare, city.
# The stray ';;' terminator was removed: spark.sql() takes a single statement
# and needs none.
# Note: INSERT INTO adds rows on every execution of this cell.
spark.sql("""
INSERT INTO hudi_table3
VALUES
(1695159649087,'334e26e9-8355-45cc-97c6-c31daf0df330','rider-A','driver-K',19.10,'san_francisco'),
(1695091554788,'e96c4396-3fad-413a-a942-4cb36106d721','rider-C','driver-M',27.70,'san_francisco'),
(1695046462179,'9909a8b1-2d15-4d3d-8ec9-efc48c536a00','rider-D','driver-L',33.90,'san_francisco'),
(1695332066204,'1dced545-862b-4ceb-8b43-d2a568f6616b','rider-E','driver-O',93.50,'san_francisco'),
(1695516137016,'e3cf430c-889d-4015-bc98-59bdce1e530c','rider-F','driver-P',34.15,'sao_paulo'),
(1695376420876,'7a84095f-737f-40bc-b62f-6b69664712d2','rider-G','driver-Q',43.40,'sao_paulo'),
(1695173887231,'3eeb61f7-c2b0-4636-99bd-5d7a5a1d2c04','rider-I','driver-S',41.06,'chennai'),
(1695115999911,'c8abbe79-8d89-47ea-b4ce-4d224bae5bfa','rider-J','driver-T',17.85,'chennai')
"""
)

DataFrame[]

In [24]:
# Look the table up by name (instead of by path) and preview its rows.
df = (
    spark.read
    .format("hudi")
    .table('hudi_table3')
)

df.show()

+-------------------+--------------------+--------------------+----------------------+--------------------+-------------+--------------------+-------+--------+-----+-------------+
|_hoodie_commit_time|_hoodie_commit_seqno|  _hoodie_record_key|_hoodie_partition_path|   _hoodie_file_name|           ts|                uuid|  rider|  driver| fare|         city|
+-------------------+--------------------+--------------------+----------------------+--------------------+-------------+--------------------+-------+--------+-----+-------------+
|  20240427165953591|20240427165953591...|20240427165953591...|    city=san_francisco|0738651c-aec1-447...|1695159649087|334e26e9-8355-45c...|rider-A|driver-K| 19.1|san_francisco|
|  20240427165953591|20240427165953591...|20240427165953591...|    city=san_francisco|0738651c-aec1-447...|1695091554788|e96c4396-3fad-413...|rider-C|driver-M| 27.7|san_francisco|
|  20240427165953591|20240427165953591...|20240427165953591...|    city=san_francisco|0738651c-aec1-

In [17]:
# Expose the table at basePath to Spark SQL under the name "trips_table".
df = spark.read.load(basePath, format="hudi")
df.createOrReplaceTempView("trips_table")

# Keep only trips whose fare is above 20.0, selecting the business columns.
expensive_trips = spark.sql("""
SELECT uuid, fare, ts, rider, driver, city 
FROM trips_table 
WHERE fare > 20.0
""")
expensive_trips.show()


+--------------------+-----+-------------+-------+--------+-------------+
|                uuid| fare|           ts|  rider|  driver|         city|
+--------------------+-----+-------------+-------+--------+-------------+
|e96c4396-3fad-413...| 27.7|1695091554788|rider-C|driver-M|san_francisco|
|9909a8b1-2d15-4d3...|339.0|1695046462179|rider-D|driver-L|san_francisco|
|e3cf430c-889d-401...|34.15|1695516137016|rider-F|driver-P|    sao_paulo|
+--------------------+-----+-------------+-------+--------+-------------+



## Update

In [14]:
# Read the current table, pick rider-D's rows and multiply their fare by 10.
# The rows are derived from the table's current contents, so each execution
# of this cell multiplies the stored fare again.
df = spark.read.format("hudi").load(basePath)

rider_d_rows = df.filter("rider == 'rider-D'")
df_update = rider_d_rows.withColumn("fare", F.col("fare") * 10)

# Write the modified rows back to the same table path using "append" mode and
# the same writer options as the initial write.
df_update.write.format("hudi").options(**hudi_options).mode("append").save(basePath)

In [15]:
# Reload the table at basePath to inspect it after the update write.
df = spark.read.load(basePath, format="hudi")

df.show()

+-------------------+--------------------+--------------------+----------------------+--------------------+-------------+--------------------+-------+--------+-----+-------------+
|_hoodie_commit_time|_hoodie_commit_seqno|  _hoodie_record_key|_hoodie_partition_path|   _hoodie_file_name|           ts|                uuid|  rider|  driver| fare|         city|
+-------------------+--------------------+--------------------+----------------------+--------------------+-------------+--------------------+-------+--------+-----+-------------+
|  20240427163123273|20240427163123273...|20240427163123273...|         san_francisco|33aa4b27-b1c2-451...|1695159649087|334e26e9-8355-45c...|rider-A|driver-K| 19.1|san_francisco|
|  20240427163123273|20240427163123273...|20240427163123273...|         san_francisco|33aa4b27-b1c2-451...|1695091554788|e96c4396-3fad-413...|rider-C|driver-M| 27.7|san_francisco|
|  20240427163545966|20240427163545966...|20240427163123273...|         san_francisco|33aa4b27-b1c2-

In [18]:
# Register the table at basePath as the temp view "df_table" and query it.
# createOrReplaceTempView() returns None, so it must not be chained into the
# assignment; otherwise `df` is bound to None instead of the DataFrame.
df = spark.read.format("hudi").load(basePath)
df.createOrReplaceTempView("df_table")

spark.sql("""
SELECT * FROM df_table
"""
).show()

Unnamed: 0,_hoodie_commit_time,_hoodie_commit_seqno,_hoodie_record_key,_hoodie_partition_path,_hoodie_file_name,ts,uuid,rider,driver,fare,city
0,20240427163123273,20240427163123273_0_1,20240427163123273_0_0,san_francisco,33aa4b27-b1c2-451a-9384-a69a4d6e4d11-0_0-53-70...,1695159649087,334e26e9-8355-45cc-97c6-c31daf0df330,rider-A,driver-K,19.1,san_francisco
1,20240427163123273,20240427163123273_0_2,20240427163123273_0_1,san_francisco,33aa4b27-b1c2-451a-9384-a69a4d6e4d11-0_0-53-70...,1695091554788,e96c4396-3fad-413a-a942-4cb36106d721,rider-C,driver-M,27.7,san_francisco
2,20240427163545966,20240427163545966_0_2,20240427163123273_0_2,san_francisco,33aa4b27-b1c2-451a-9384-a69a4d6e4d11-0_0-53-70...,1695046462179,9909a8b1-2d15-4d3d-8ec9-efc48c536a00,rider-D,driver-L,339.0,san_francisco
3,20240427163123273,20240427163123273_0_4,20240427163123273_0_3,sao_paulo,33aa4b27-b1c2-451a-9384-a69a4d6e4d11-1_0-39-0_...,1695516137016,e3cf430c-889d-4015-bc98-59bdce1e530c,rider-F,driver-P,34.15,sao_paulo
4,20240427163123273,20240427163123273_0_5,20240427163123273_0_4,chennai,33aa4b27-b1c2-451a-9384-a69a4d6e4d11-2_0-39-0_...,1695115999911,c8abbe79-8d89-47ea-b4ce-4d224bae5bfa,rider-J,driver-T,17.85,chennai


## Merge

In [None]:
# Reference: https://hudi.apache.org/docs/quick-start-guide/

## Delete

## Time Travel

## Incremental

## Change Data Capture