# Apache Hudi Core Conceptions (5) - MOR + Compaction

## 1. Configuration

In [1]:
%%configure -f
{
    "conf": {
        "spark.jars": "hdfs:///tmp/hudi-spark-bundle.jar",
        "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
        "spark.sql.extensions": "org.apache.spark.sql.hudi.HoodieSparkSessionExtension",
        "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.hudi.catalog.HoodieCatalog"
    }
}

ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
103,application_1677377031637_0142,spark,idle,Link,Link,,
104,application_1677377031637_0143,spark,idle,Link,Link,,
105,application_1677377031637_0144,spark,idle,Link,Link,,
106,application_1677377031637_0145,spark,idle,Link,Link,,
107,application_1677377031637_0146,spark,idle,Link,Link,,


In [2]:
%%sh
# Copy the Hudi Spark bundle from the local install to HDFS; this is the path
# the session configuration refers to through spark.jars.
hdfs dfs -copyFromLocal -f /usr/lib/hudi/hudi-spark-bundle.jar /tmp/hudi-spark-bundle.jar
hdfs dfs -ls /tmp/hudi-spark-bundle.jar
# Download hudi-stat.sh, the utility script the later cells call to print the
# table's timeline, commits, compactions and storage.
# Output is discarded with POSIX redirection (not the bash-only `&>`) so the
# download always finishes before chmod runs, whichever shell backs /bin/sh.
wget https://github.com/bluishglc/hudi-core-conceptions/releases/download/v1.0/hudi-stat.sh -O ~/hudi-stat.sh >/dev/null 2>&1
chmod a+x ~/hudi-stat.sh
ls ~/hudi-stat.sh

-rw-r--r--   1 emr-notebook hdfsadmingroup   61421977 2023-03-04 09:20 /tmp/hudi-spark-bundle.jar
/home/emr-notebook/hudi-stat.sh


In [3]:
%%html
<style>
/* Float every rendered table to the left of the output area. */
table {
    float: left;
}
</style>

## 2. Sync Compaction: Inline Schedule + Inline Execute

### 2.1. Create Table

In [4]:
%%sql
set TABLE_NAME=reviews_mor_simple_1

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
108,application_1677377031637_0150,spark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

VBox(children=(HBox(children=(HTML(value='Type:'), Button(description='Table', layout=Layout(width='70px'), st…

Output()

In [5]:
%%sql
set TABLE_PATH=s3://glc-examples/hudi-core-conceptions/reviews_mor_simple_1

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

VBox(children=(HBox(children=(HTML(value='Type:'), Button(description='Table', layout=Layout(width='70px'), st…

Output()

In [6]:
%env TABLE_NAME=reviews_mor_simple_1

env: TABLE_NAME=reviews_mor_simple_1


In [7]:
%env TABLE_PATH=s3://glc-examples/hudi-core-conceptions/reviews_mor_simple_1

env: TABLE_PATH=s3://glc-examples/hudi-core-conceptions/reviews_mor_simple_1


In [8]:
%%sh
# Reset state left over from a previous run of this section.
# Abort if either variable is unset or empty: `rm -rf ~/${TABLE_NAME}` would
# otherwise expand to `rm -rf ~/` and delete the notebook user's home directory.
: "${TABLE_PATH:?TABLE_PATH must be set (run the %env cells first)}"
: "${TABLE_NAME:?TABLE_NAME must be set (run the %env cells first)}"
# Print the last path component as a visual check of which table is being reset.
basename "$TABLE_PATH"
# Remove everything under the table location; output and errors are discarded.
aws s3 rm "$TABLE_PATH" --recursive >/dev/null 2>&1
rm -rf ~/"${TABLE_NAME}"
# Brief pause before the cells that follow (presumably to let S3 settle - TODO confirm).
sleep 3

reviews_mor_simple_1


In [9]:
%%sql
drop table if exists ${TABLE_NAME}

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

VBox(children=(HBox(), EncodingWidget(children=(VBox(children=(HTML(value='Encoding:'), Dropdown(description='…

Output()

In [10]:
%%sql
drop table if exists ${TABLE_NAME}_ro

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

VBox(children=(HBox(), EncodingWidget(children=(VBox(children=(HTML(value='Encoding:'), Dropdown(description='…

Output()

In [11]:
%%sql
drop table if exists ${TABLE_NAME}_rt

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

VBox(children=(HBox(), EncodingWidget(children=(VBox(children=(HTML(value='Encoding:'), Dropdown(description='…

Output()

In [12]:
%%sql
-- Merge-on-read table used by this section (inline schedule + inline execute).
CREATE TABLE IF NOT EXISTS ${TABLE_NAME} (
    review_id   string,
    star_rating long,
    review_body string,
    review_date date,
    year        long,
    timestamp   long,
    parity      int
)
USING hudi
LOCATION '${TABLE_PATH}'
PARTITIONED BY (parity)
OPTIONS (
    type = 'mor',
    primaryKey = 'review_id',
    preCombineField = 'timestamp',
    hoodie.copyonwrite.record.size.estimate = '175',
    -- compaction is both scheduled and executed as part of the write
    hoodie.compact.inline = 'true',
    -- hoodie.compact.schedule.inline = 'false',
    -- attempt a compaction once 3 delta commits have accumulated
    hoodie.compact.inline.max.delta.commits = '3'
);

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

VBox(children=(HBox(), EncodingWidget(children=(VBox(children=(HTML(value='Encoding:'), Dropdown(description='…

Output()

### 2.2. Batch 1 - Insert

In [13]:
%%sql
-- Batch 1: load the 1999 rows of the existing `reviews` table.
-- `timestamp` is the load time in epoch seconds; `parity` (crc32 of the id
-- modulo 2) is the partition column.
INSERT INTO ${TABLE_NAME}
SELECT
    review_id,
    star_rating,
    review_body,
    review_date,
    year,
    unix_timestamp(current_timestamp()) AS timestamp,
    mod(crc32(review_id), 2) AS parity
FROM reviews
WHERE year = 1999

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

VBox(children=(HBox(), EncodingWidget(children=(VBox(children=(HTML(value='Encoding:'), Dropdown(description='…

Output()

In [14]:
%%sh
~/hudi-stat.sh $TABLE_PATH timeline commits compactions storage


[ TIMELINE ]

╔═════╤═══════════════════╤═════════════╤═══════════╤═════════════╤═════════════╤═════════════╗
║ No. │ Instant           │ Action      │ State     │ Requested   │ Inflight    │ Completed   ║
║     │                   │             │           │ Time        │ Time        │ Time        ║
╠═════╪═══════════════════╪═════════════╪═══════════╪═════════════╪═════════════╪═════════════╣
║ 0   │ 20230304092052102 │ deltacommit │ COMPLETED │ 03-04 09:20 │ 03-04 09:21 │ 03-04 09:21 ║
╚═════╧═══════════════════╧═════════════╧═══════════╧═════════════╧═════════════╧═════════════╝

[ COMMITS ]

╔═══════════════════╤═════════════════════╤═══════════════════╤═════════════════════╤══════════════════════════╤═══════════════════════╤══════════════════════════════╤══════════════╗
║ CommitTime        │ Total Bytes Written │ Total Files Added │ Total Files Updated │ Total Partitions Written │ Total Records Written │ Total Update Records Written │ Total Errors ║
╠═══════════════════╪════════

### 2.3. Batch 2 - Update

In [15]:
%%sql
-- Batch 2: replace review_body with ten concatenated UUIDs and refresh
-- `timestamp` for every review dated 1999-01-01.
UPDATE ${TABLE_NAME}
SET
    review_body = concat(uuid(), uuid(), uuid(), uuid(), uuid(), uuid(), uuid(), uuid(), uuid(), uuid()),
    timestamp = unix_timestamp(current_timestamp())
WHERE review_date = '1999-01-01'
;

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

VBox(children=(HBox(), EncodingWidget(children=(VBox(children=(HTML(value='Encoding:'), Dropdown(description='…

Output()

In [16]:
%%sh
~/hudi-stat.sh $TABLE_PATH timeline commits compactions storage


[ TIMELINE ]

╔═════╤═══════════════════╤═════════════╤═══════════╤═════════════╤═════════════╤═════════════╗
║ No. │ Instant           │ Action      │ State     │ Requested   │ Inflight    │ Completed   ║
║     │                   │             │           │ Time        │ Time        │ Time        ║
╠═════╪═══════════════════╪═════════════╪═══════════╪═════════════╪═════════════╪═════════════╣
║ 0   │ 20230304092052102 │ deltacommit │ COMPLETED │ 03-04 09:20 │ 03-04 09:21 │ 03-04 09:21 ║
╟─────┼───────────────────┼─────────────┼───────────┼─────────────┼─────────────┼─────────────╢
║ 1   │ 20230304092203550 │ deltacommit │ COMPLETED │ 03-04 09:22 │ 03-04 09:22 │ 03-04 09:22 ║
╚═════╧═══════════════════╧═════════════╧═══════════╧═════════════╧═════════════╧═════════════╝

[ COMMITS ]

╔═══════════════════╤═════════════════════╤═══════════════════╤═════════════════════╤══════════════════════════╤═══════════════════════╤══════════════════════════════╤══════════════╗
║ CommitTime        

### 2.4. Batch 3 - Update

In [17]:
%%sql
-- Batch 3: replace review_body with ten concatenated UUIDs and refresh
-- `timestamp` for every review dated 1999-01-02.
UPDATE ${TABLE_NAME}
SET
    review_body = concat(uuid(), uuid(), uuid(), uuid(), uuid(), uuid(), uuid(), uuid(), uuid(), uuid()),
    timestamp = unix_timestamp(current_timestamp())
WHERE review_date = '1999-01-02'
;

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

VBox(children=(HBox(), EncodingWidget(children=(VBox(children=(HTML(value='Encoding:'), Dropdown(description='…

Output()

In [18]:
%%sh
~/hudi-stat.sh $TABLE_PATH timeline commits compactions storage


[ TIMELINE ]

╔═════╤═══════════════════╤═════════════╤═══════════╤═════════════╤═════════════╤═════════════╗
║ No. │ Instant           │ Action      │ State     │ Requested   │ Inflight    │ Completed   ║
║     │                   │             │           │ Time        │ Time        │ Time        ║
╠═════╪═══════════════════╪═════════════╪═══════════╪═════════════╪═════════════╪═════════════╣
║ 0   │ 20230304092052102 │ deltacommit │ COMPLETED │ 03-04 09:20 │ 03-04 09:21 │ 03-04 09:21 ║
╟─────┼───────────────────┼─────────────┼───────────┼─────────────┼─────────────┼─────────────╢
║ 1   │ 20230304092203550 │ deltacommit │ COMPLETED │ 03-04 09:22 │ 03-04 09:22 │ 03-04 09:22 ║
╟─────┼───────────────────┼─────────────┼───────────┼─────────────┼─────────────┼─────────────╢
║ 2   │ 20230304092248714 │ deltacommit │ COMPLETED │ 03-04 09:22 │ 03-04 09:22 │ 03-04 09:23 ║
╟─────┼───────────────────┼─────────────┼───────────┼─────────────┼─────────────┼─────────────╢
║ 3   │ 20230304092301764

## 3. Async Compaction: Inline Schedule + Offline Execute

### 3.1. Create Table

In [19]:
%%sql
set TABLE_NAME=reviews_mor_simple_2

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

VBox(children=(HBox(children=(HTML(value='Type:'), Button(description='Table', layout=Layout(width='70px'), st…

Output()

In [20]:
%%sql
set TABLE_PATH=s3://glc-examples/hudi-core-conceptions/reviews_mor_simple_2

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

VBox(children=(HBox(children=(HTML(value='Type:'), Button(description='Table', layout=Layout(width='70px'), st…

Output()

In [21]:
%env TABLE_NAME=reviews_mor_simple_2

env: TABLE_NAME=reviews_mor_simple_2


In [22]:
%env TABLE_PATH=s3://glc-examples/hudi-core-conceptions/reviews_mor_simple_2

env: TABLE_PATH=s3://glc-examples/hudi-core-conceptions/reviews_mor_simple_2


In [23]:
%%sh
# Reset state left over from a previous run of this section.
# Abort if either variable is unset or empty: `rm -rf ~/${TABLE_NAME}` would
# otherwise expand to `rm -rf ~/` and delete the notebook user's home directory.
: "${TABLE_PATH:?TABLE_PATH must be set (run the %env cells first)}"
: "${TABLE_NAME:?TABLE_NAME must be set (run the %env cells first)}"
# Print the last path component as a visual check of which table is being reset.
basename "$TABLE_PATH"
# Remove everything under the table location; output and errors are discarded.
aws s3 rm "$TABLE_PATH" --recursive >/dev/null 2>&1
rm -rf ~/"${TABLE_NAME}"
# Brief pause before the cells that follow (presumably to let S3 settle - TODO confirm).
sleep 3

reviews_mor_simple_2


In [24]:
%%sql
drop table if exists ${TABLE_NAME}

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

VBox(children=(HBox(), EncodingWidget(children=(VBox(children=(HTML(value='Encoding:'), Dropdown(description='…

Output()

In [25]:
%%sql
drop table if exists ${TABLE_NAME}_ro

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

VBox(children=(HBox(), EncodingWidget(children=(VBox(children=(HTML(value='Encoding:'), Dropdown(description='…

Output()

In [26]:
%%sql
drop table if exists ${TABLE_NAME}_rt

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

VBox(children=(HBox(), EncodingWidget(children=(VBox(children=(HTML(value='Encoding:'), Dropdown(description='…

Output()

In [27]:
%%sql
-- Merge-on-read table used by this section (inline schedule + offline execute).
CREATE TABLE IF NOT EXISTS ${TABLE_NAME} (
    review_id   string,
    star_rating long,
    review_body string,
    review_date date,
    year        long,
    timestamp   long,
    parity      int
)
USING hudi
LOCATION '${TABLE_PATH}'
PARTITIONED BY (parity)
OPTIONS (
    type = 'mor',
    primaryKey = 'review_id',
    preCombineField = 'timestamp',
    hoodie.copyonwrite.record.size.estimate = '175',
    -- the writer does not execute compaction itself ...
    hoodie.compact.inline = 'false',
    -- ... it only schedules the compaction plan as part of the write
    hoodie.compact.schedule.inline = 'true',
    -- attempt scheduling once 3 delta commits have accumulated
    hoodie.compact.inline.max.delta.commits = '3'
);

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

VBox(children=(HBox(), EncodingWidget(children=(VBox(children=(HTML(value='Encoding:'), Dropdown(description='…

Output()

### 3.2. Batch 1 - Insert

In [28]:
%%sql
-- Batch 1: load the 1999 rows of the existing `reviews` table.
-- `timestamp` is the load time in epoch seconds; `parity` (crc32 of the id
-- modulo 2) is the partition column.
INSERT INTO ${TABLE_NAME}
SELECT
    review_id,
    star_rating,
    review_body,
    review_date,
    year,
    unix_timestamp(current_timestamp()) AS timestamp,
    mod(crc32(review_id), 2) AS parity
FROM reviews
WHERE year = 1999

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

VBox(children=(HBox(), EncodingWidget(children=(VBox(children=(HTML(value='Encoding:'), Dropdown(description='…

Output()

In [29]:
%%sh
~/hudi-stat.sh $TABLE_PATH timeline commits compactions storage


[ TIMELINE ]

╔═════╤═══════════════════╤═════════════╤═══════════╤═════════════╤═════════════╤═════════════╗
║ No. │ Instant           │ Action      │ State     │ Requested   │ Inflight    │ Completed   ║
║     │                   │             │           │ Time        │ Time        │ Time        ║
╠═════╪═══════════════════╪═════════════╪═══════════╪═════════════╪═════════════╪═════════════╣
║ 0   │ 20230304092402455 │ deltacommit │ COMPLETED │ 03-04 09:24 │ 03-04 09:24 │ 03-04 09:24 ║
╚═════╧═══════════════════╧═════════════╧═══════════╧═════════════╧═════════════╧═════════════╝

[ COMMITS ]

╔═══════════════════╤═════════════════════╤═══════════════════╤═════════════════════╤══════════════════════════╤═══════════════════════╤══════════════════════════════╤══════════════╗
║ CommitTime        │ Total Bytes Written │ Total Files Added │ Total Files Updated │ Total Partitions Written │ Total Records Written │ Total Update Records Written │ Total Errors ║
╠═══════════════════╪════════

### 3.3. Batch 2 - Update

In [30]:
%%sql
-- Batch 2: update the reviews dated 1999-01-01 - replace review_body with ten
-- concatenated UUIDs and refresh `timestamp`.
UPDATE ${TABLE_NAME}
SET
    review_body = concat(uuid(), uuid(), uuid(), uuid(), uuid(), uuid(), uuid(), uuid(), uuid(), uuid()),
    timestamp = unix_timestamp(current_timestamp())
WHERE review_date = '1999-01-01'
;

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

VBox(children=(HBox(), EncodingWidget(children=(VBox(children=(HTML(value='Encoding:'), Dropdown(description='…

Output()

In [31]:
%%sh
~/hudi-stat.sh $TABLE_PATH timeline commits compactions storage


[ TIMELINE ]

╔═════╤═══════════════════╤═════════════╤═══════════╤═════════════╤═════════════╤═════════════╗
║ No. │ Instant           │ Action      │ State     │ Requested   │ Inflight    │ Completed   ║
║     │                   │             │           │ Time        │ Time        │ Time        ║
╠═════╪═══════════════════╪═════════════╪═══════════╪═════════════╪═════════════╪═════════════╣
║ 0   │ 20230304092402455 │ deltacommit │ COMPLETED │ 03-04 09:24 │ 03-04 09:24 │ 03-04 09:24 ║
╟─────┼───────────────────┼─────────────┼───────────┼─────────────┼─────────────┼─────────────╢
║ 1   │ 20230304092500900 │ deltacommit │ COMPLETED │ 03-04 09:25 │ 03-04 09:25 │ 03-04 09:25 ║
╚═════╧═══════════════════╧═════════════╧═══════════╧═════════════╧═════════════╧═════════════╝

[ COMMITS ]

╔═══════════════════╤═════════════════════╤═══════════════════╤═════════════════════╤══════════════════════════╤═══════════════════════╤══════════════════════════════╤══════════════╗
║ CommitTime        

### 3.4. Batch 3 - Update

In [32]:
%%sql
-- Batch 3: update the reviews dated 1999-01-02 - replace review_body with ten
-- concatenated UUIDs and refresh `timestamp`.
UPDATE ${TABLE_NAME}
SET
    review_body = concat(uuid(), uuid(), uuid(), uuid(), uuid(), uuid(), uuid(), uuid(), uuid(), uuid()),
    timestamp = unix_timestamp(current_timestamp())
WHERE review_date = '1999-01-02'
;

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

VBox(children=(HBox(), EncodingWidget(children=(VBox(children=(HTML(value='Encoding:'), Dropdown(description='…

Output()

In [33]:
%%sh
~/hudi-stat.sh $TABLE_PATH timeline commits compactions storage


[ TIMELINE ]

╔═════╤═══════════════════╤═════════════╤═══════════╤═════════════╤═════════════╤═════════════╗
║ No. │ Instant           │ Action      │ State     │ Requested   │ Inflight    │ Completed   ║
║     │                   │             │           │ Time        │ Time        │ Time        ║
╠═════╪═══════════════════╪═════════════╪═══════════╪═════════════╪═════════════╪═════════════╣
║ 0   │ 20230304092402455 │ deltacommit │ COMPLETED │ 03-04 09:24 │ 03-04 09:24 │ 03-04 09:24 ║
╟─────┼───────────────────┼─────────────┼───────────┼─────────────┼─────────────┼─────────────╢
║ 1   │ 20230304092500900 │ deltacommit │ COMPLETED │ 03-04 09:25 │ 03-04 09:25 │ 03-04 09:25 ║
╟─────┼───────────────────┼─────────────┼───────────┼─────────────┼─────────────┼─────────────╢
║ 2   │ 20230304092545066 │ deltacommit │ COMPLETED │ 03-04 09:25 │ 03-04 09:25 │ 03-04 09:25 ║
╟─────┼───────────────────┼─────────────┼───────────┼─────────────┼─────────────┼─────────────╢
║ 3   │ 20230304092557623

### 3.5. Async Execute Compaction

In [34]:
%%sh
# Run HoodieCompactor in 'execute' mode against the table as the hadoop user.
# The current user (emr-notebook) must have sudo permission for this to work.
# stdout and stderr both go to the log file in the home directory, so nothing
# is printed in the notebook but the job output can be inspected afterwards.
# (`> file &>/dev/null` would leave that log file empty.)
sudo -u hadoop spark-submit \
  --jars '/usr/lib/hudi/hudi-spark-bundle.jar' \
  --class "org.apache.hudi.utilities.HoodieCompactor" \
  /usr/lib/hudi/hudi-utilities-bundle.jar \
  --spark-memory '4g' \
  --mode 'execute' \
  --base-path "$TABLE_PATH" \
  --table-name "$TABLE_NAME" > ~/"${TABLE_NAME}.compaction.execute.out" 2>&1

In [35]:
%%sh
~/hudi-stat.sh $TABLE_PATH timeline commits compactions storage


[ TIMELINE ]

╔═════╤═══════════════════╤═════════════╤═══════════╤═════════════╤═════════════╤═════════════╗
║ No. │ Instant           │ Action      │ State     │ Requested   │ Inflight    │ Completed   ║
║     │                   │             │           │ Time        │ Time        │ Time        ║
╠═════╪═══════════════════╪═════════════╪═══════════╪═════════════╪═════════════╪═════════════╣
║ 0   │ 20230304092402455 │ deltacommit │ COMPLETED │ 03-04 09:24 │ 03-04 09:24 │ 03-04 09:24 ║
╟─────┼───────────────────┼─────────────┼───────────┼─────────────┼─────────────┼─────────────╢
║ 1   │ 20230304092500900 │ deltacommit │ COMPLETED │ 03-04 09:25 │ 03-04 09:25 │ 03-04 09:25 ║
╟─────┼───────────────────┼─────────────┼───────────┼─────────────┼─────────────┼─────────────╢
║ 2   │ 20230304092545066 │ deltacommit │ COMPLETED │ 03-04 09:25 │ 03-04 09:25 │ 03-04 09:25 ║
╟─────┼───────────────────┼─────────────┼───────────┼─────────────┼─────────────┼─────────────╢
║ 3   │ 20230304092557623

## 4. Async Compaction: Offline Schedule + Offline Execute

### 4.1. Create Table

In [36]:
%%sql
set TABLE_NAME=reviews_mor_simple_3

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

VBox(children=(HBox(children=(HTML(value='Type:'), Button(description='Table', layout=Layout(width='70px'), st…

Output()

In [37]:
%%sql
set TABLE_PATH=s3://glc-examples/hudi-core-conceptions/reviews_mor_simple_3

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

VBox(children=(HBox(children=(HTML(value='Type:'), Button(description='Table', layout=Layout(width='70px'), st…

Output()

In [38]:
%env TABLE_NAME=reviews_mor_simple_3

env: TABLE_NAME=reviews_mor_simple_3


In [39]:
%env TABLE_PATH=s3://glc-examples/hudi-core-conceptions/reviews_mor_simple_3

env: TABLE_PATH=s3://glc-examples/hudi-core-conceptions/reviews_mor_simple_3


In [40]:
%%sh
# Reset state left over from a previous run of this section.
# Abort if either variable is unset or empty: `rm -rf ~/${TABLE_NAME}` would
# otherwise expand to `rm -rf ~/` and delete the notebook user's home directory.
: "${TABLE_PATH:?TABLE_PATH must be set (run the %env cells first)}"
: "${TABLE_NAME:?TABLE_NAME must be set (run the %env cells first)}"
# Print the last path component as a visual check of which table is being reset.
basename "$TABLE_PATH"
# Remove everything under the table location; output and errors are discarded.
aws s3 rm "$TABLE_PATH" --recursive >/dev/null 2>&1
rm -rf ~/"${TABLE_NAME}"
# Brief pause before the cells that follow (presumably to let S3 settle - TODO confirm).
sleep 3

reviews_mor_simple_3


In [41]:
%%sql
drop table if exists ${TABLE_NAME}

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

VBox(children=(HBox(), EncodingWidget(children=(VBox(children=(HTML(value='Encoding:'), Dropdown(description='…

Output()

In [42]:
%%sql
drop table if exists ${TABLE_NAME}_ro

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

VBox(children=(HBox(), EncodingWidget(children=(VBox(children=(HTML(value='Encoding:'), Dropdown(description='…

Output()

In [43]:
%%sql
drop table if exists ${TABLE_NAME}_rt

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

VBox(children=(HBox(), EncodingWidget(children=(VBox(children=(HTML(value='Encoding:'), Dropdown(description='…

Output()

In [44]:
%%sql
-- Merge-on-read table used by this section (offline schedule + offline execute).
CREATE TABLE IF NOT EXISTS ${TABLE_NAME} (
    review_id   string,
    star_rating long,
    review_body string,
    review_date date,
    year        long,
    timestamp   long,
    parity      int
)
USING hudi
LOCATION '${TABLE_PATH}'
PARTITIONED BY (parity)
OPTIONS (
    type = 'mor',
    primaryKey = 'review_id',
    preCombineField = 'timestamp',
    hoodie.copyonwrite.record.size.estimate = '175',
    -- both inline switches are left unset here, so the writer neither
    -- schedules nor executes compaction:
    -- hoodie.compact.inline = 'false',
    -- hoodie.compact.schedule.inline = 'false',
    -- maximum size of a log file, in bytes
    hoodie.logfile.max.size = '512000',
    hoodie.compact.inline.max.delta.commits = '3'
);

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

VBox(children=(HBox(), EncodingWidget(children=(VBox(children=(HTML(value='Encoding:'), Dropdown(description='…

Output()

### 4.2. Batch 1 - Insert

In [45]:
%%sql
-- Batch 1: load the 1999 rows of the existing `reviews` table.
-- `timestamp` is the load time in epoch seconds; `parity` (crc32 of the id
-- modulo 2) is the partition column.
INSERT INTO ${TABLE_NAME}
SELECT
    review_id,
    star_rating,
    review_body,
    review_date,
    year,
    unix_timestamp(current_timestamp()) AS timestamp,
    mod(crc32(review_id), 2) AS parity
FROM reviews
WHERE year = 1999

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

VBox(children=(HBox(), EncodingWidget(children=(VBox(children=(HTML(value='Encoding:'), Dropdown(description='…

Output()

In [46]:
%%sh
~/hudi-stat.sh $TABLE_PATH timeline commits compactions storage


[ TIMELINE ]

╔═════╤═══════════════════╤═════════════╤═══════════╤═════════════╤═════════════╤═════════════╗
║ No. │ Instant           │ Action      │ State     │ Requested   │ Inflight    │ Completed   ║
║     │                   │             │           │ Time        │ Time        │ Time        ║
╠═════╪═══════════════════╪═════════════╪═══════════╪═════════════╪═════════════╪═════════════╣
║ 0   │ 20230304092758774 │ deltacommit │ COMPLETED │ 03-04 09:28 │ 03-04 09:28 │ 03-04 09:28 ║
╚═════╧═══════════════════╧═════════════╧═══════════╧═════════════╧═════════════╧═════════════╝

[ COMMITS ]

╔═══════════════════╤═════════════════════╤═══════════════════╤═════════════════════╤══════════════════════════╤═══════════════════════╤══════════════════════════════╤══════════════╗
║ CommitTime        │ Total Bytes Written │ Total Files Added │ Total Files Updated │ Total Partitions Written │ Total Records Written │ Total Update Records Written │ Total Errors ║
╠═══════════════════╪════════

### 4.3. Batch 2 - Update

In [47]:
%%sql
-- Batch 2: update the reviews dated 1999-01-01 - replace review_body with ten
-- concatenated UUIDs and refresh `timestamp`.
UPDATE ${TABLE_NAME}
SET
    review_body = concat(uuid(), uuid(), uuid(), uuid(), uuid(), uuid(), uuid(), uuid(), uuid(), uuid()),
    timestamp = unix_timestamp(current_timestamp())
WHERE review_date = '1999-01-01'
;

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

VBox(children=(HBox(), EncodingWidget(children=(VBox(children=(HTML(value='Encoding:'), Dropdown(description='…

Output()

In [48]:
%%sh
~/hudi-stat.sh $TABLE_PATH timeline commits compactions storage


[ TIMELINE ]

╔═════╤═══════════════════╤═════════════╤═══════════╤═════════════╤═════════════╤═════════════╗
║ No. │ Instant           │ Action      │ State     │ Requested   │ Inflight    │ Completed   ║
║     │                   │             │           │ Time        │ Time        │ Time        ║
╠═════╪═══════════════════╪═════════════╪═══════════╪═════════════╪═════════════╪═════════════╣
║ 0   │ 20230304092758774 │ deltacommit │ COMPLETED │ 03-04 09:28 │ 03-04 09:28 │ 03-04 09:28 ║
╟─────┼───────────────────┼─────────────┼───────────┼─────────────┼─────────────┼─────────────╢
║ 1   │ 20230304092912844 │ deltacommit │ COMPLETED │ 03-04 09:29 │ 03-04 09:29 │ 03-04 09:29 ║
╚═════╧═══════════════════╧═════════════╧═══════════╧═════════════╧═════════════╧═════════════╝

[ COMMITS ]

╔═══════════════════╤═════════════════════╤═══════════════════╤═════════════════════╤══════════════════════════╤═══════════════════════╤══════════════════════════════╤══════════════╗
║ CommitTime        

### 4.4. Batch 3 - Update

In [49]:
%%sql
-- Batch 3: update every 5-star review (not a single review_date as in the
-- previous batch) - replace review_body with ten concatenated UUIDs and
-- refresh `timestamp`.
update
    ${TABLE_NAME}
set             
    review_body = concat(uuid(),uuid(),uuid(),uuid(),uuid(),uuid(),uuid(),uuid(),uuid(),uuid()),
    timestamp = unix_timestamp(current_timestamp())
where
    star_rating = 5
;

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

VBox(children=(HBox(), EncodingWidget(children=(VBox(children=(HTML(value='Encoding:'), Dropdown(description='…

Output()

In [50]:
%%sh
~/hudi-stat.sh $TABLE_PATH timeline commits compactions storage


[ TIMELINE ]

╔═════╤═══════════════════╤═════════════╤═══════════╤═════════════╤═════════════╤═════════════╗
║ No. │ Instant           │ Action      │ State     │ Requested   │ Inflight    │ Completed   ║
║     │                   │             │           │ Time        │ Time        │ Time        ║
╠═════╪═══════════════════╪═════════════╪═══════════╪═════════════╪═════════════╪═════════════╣
║ 0   │ 20230304092758774 │ deltacommit │ COMPLETED │ 03-04 09:28 │ 03-04 09:28 │ 03-04 09:28 ║
╟─────┼───────────────────┼─────────────┼───────────┼─────────────┼─────────────┼─────────────╢
║ 1   │ 20230304092912844 │ deltacommit │ COMPLETED │ 03-04 09:29 │ 03-04 09:29 │ 03-04 09:29 ║
╟─────┼───────────────────┼─────────────┼───────────┼─────────────┼─────────────┼─────────────╢
║ 2   │ 20230304092956914 │ deltacommit │ COMPLETED │ 03-04 09:29 │ 03-04 09:30 │ 03-04 09:30 ║
╚═════╧═══════════════════╧═════════════╧═══════════╧═════════════╧═════════════╧═════════════╝

[ COMMITS ]

╔══════════

### 4.5. Async Schedule Compaction

In [51]:
%%sh
# Run HoodieCompactor in 'schedule' mode against the table as the hadoop user.
# The current user (emr-notebook) must have sudo permission for this to work.
# stdout and stderr both go to the log file in the home directory, so nothing
# is printed in the notebook but the job output can be inspected afterwards.
# (`> file &>/dev/null` would leave that log file empty.)
sudo -u hadoop spark-submit \
  --jars '/usr/lib/hudi/hudi-spark-bundle.jar' \
  --class 'org.apache.hudi.utilities.HoodieCompactor' \
  /usr/lib/hudi/hudi-utilities-bundle.jar \
  --spark-memory '4g' \
  --mode 'schedule' \
  --base-path "$TABLE_PATH" \
  --table-name "$TABLE_NAME" \
  --hoodie-conf "hoodie.compact.inline.max.delta.commits=3" > ~/"${TABLE_NAME}.compaction.schedule.out" 2>&1

In [52]:
%%sh
~/hudi-stat.sh $TABLE_PATH timeline commits compactions storage


[ TIMELINE ]

╔═════╤═══════════════════╤═════════════╤═══════════╤═════════════╤═════════════╤═════════════╗
║ No. │ Instant           │ Action      │ State     │ Requested   │ Inflight    │ Completed   ║
║     │                   │             │           │ Time        │ Time        │ Time        ║
╠═════╪═══════════════════╪═════════════╪═══════════╪═════════════╪═════════════╪═════════════╣
║ 0   │ 20230304092758774 │ deltacommit │ COMPLETED │ 03-04 09:28 │ 03-04 09:28 │ 03-04 09:28 ║
╟─────┼───────────────────┼─────────────┼───────────┼─────────────┼─────────────┼─────────────╢
║ 1   │ 20230304092912844 │ deltacommit │ COMPLETED │ 03-04 09:29 │ 03-04 09:29 │ 03-04 09:29 ║
╟─────┼───────────────────┼─────────────┼───────────┼─────────────┼─────────────┼─────────────╢
║ 2   │ 20230304092956914 │ deltacommit │ COMPLETED │ 03-04 09:29 │ 03-04 09:30 │ 03-04 09:30 ║
╟─────┼───────────────────┼─────────────┼───────────┼─────────────┼─────────────┼─────────────╢
║ 3   │ 20230304093117717

### 4.6. Async Execute Compaction

In [53]:
%%sh
# Run HoodieCompactor in 'execute' mode against the table as the hadoop user.
# The current user (emr-notebook) must have sudo permission for this to work.
# stdout and stderr both go to the log file in the home directory, so nothing
# is printed in the notebook but the job output can be inspected afterwards.
# (`> file &>/dev/null` would leave that log file empty.)
sudo -u hadoop spark-submit \
  --jars '/usr/lib/hudi/hudi-spark-bundle.jar' \
  --class "org.apache.hudi.utilities.HoodieCompactor" \
  /usr/lib/hudi/hudi-utilities-bundle.jar \
  --spark-memory '4g' \
  --mode 'execute' \
  --base-path "$TABLE_PATH" \
  --table-name "$TABLE_NAME" > ~/"${TABLE_NAME}.compaction.execute.out" 2>&1

In [54]:
%%sh
~/hudi-stat.sh $TABLE_PATH timeline commits compactions storage


[ TIMELINE ]

╔═════╤═══════════════════╤═════════════╤═══════════╤═════════════╤═════════════╤═════════════╗
║ No. │ Instant           │ Action      │ State     │ Requested   │ Inflight    │ Completed   ║
║     │                   │             │           │ Time        │ Time        │ Time        ║
╠═════╪═══════════════════╪═════════════╪═══════════╪═════════════╪═════════════╪═════════════╣
║ 0   │ 20230304092758774 │ deltacommit │ COMPLETED │ 03-04 09:28 │ 03-04 09:28 │ 03-04 09:28 ║
╟─────┼───────────────────┼─────────────┼───────────┼─────────────┼─────────────┼─────────────╢
║ 1   │ 20230304092912844 │ deltacommit │ COMPLETED │ 03-04 09:29 │ 03-04 09:29 │ 03-04 09:29 ║
╟─────┼───────────────────┼─────────────┼───────────┼─────────────┼─────────────┼─────────────╢
║ 2   │ 20230304092956914 │ deltacommit │ COMPLETED │ 03-04 09:29 │ 03-04 09:30 │ 03-04 09:30 ║
╟─────┼───────────────────┼─────────────┼───────────┼─────────────┼─────────────┼─────────────╢
║ 3   │ 20230304093117717