# Unit 10: Upsert with MOR tables

In this unit, we will learn how upsert operations work against Merge-On-Read (MOR) tables, and what happens under the hood.<br>


This unit takes about 5 minutes to complete.

In [1]:
from datetime import datetime
from functools import reduce

import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.functions import lit
from pyspark.sql.types import LongType

# Build (or attach to) the Spark session for this unit.
# The Kryo serializer, the Hudi catalog and the Hudi SQL extensions are configured here;
# if a session already exists, getOrCreate() returns it instead of creating a new one.
spark = (
    SparkSession.builder
    .appName("Hudi-Learning-Unit-10-PySpark")
    .master("yarn")
    .enableHiveSupport()
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.hudi.catalog.HoodieCatalog")
    .config("spark.sql.extensions", "org.apache.spark.sql.hudi.HoodieSparkSessionExtension")
    .getOrCreate()
)

# Display the session summary as the cell output.
spark

23/08/02 15:52:37 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


### Variables

In [2]:
PROJECT_ID_OUTPUT=!gcloud config get-value core/project
PROJECT_ID=PROJECT_ID_OUTPUT[0]
PROJECT_NBR_OUTPUT=!gcloud projects describe $PROJECT_ID --format="value(projectNumber)"
PROJECT_NBR=PROJECT_NBR_OUTPUT[0]
LOCATION = "us-central1" #Replace with your GCP region

HUDI_MOR_BASE_GCS_URI = f"gs://gaia_data_bucket-{PROJECT_NBR}/nyc-taxi-trips-hudi-mor"
TRIP_DATE="2020-01-30"
DATAPROC_METASTORE_THRIFT_URI_LIST=!gcloud metastore services list --location $LOCATION | grep thrift | cut -d' ' -f11
DATAPROC_METASTORE_THRIFT_URI=DATAPROC_METASTORE_THRIFT_URI_LIST[0]

print(f"Project ID is {PROJECT_ID}")
print(f"Project number is {PROJECT_NBR}")
print(f"Project location is is {LOCATION}")
print(f"Dataproc Metastore Service thrift URI is {DATAPROC_METASTORE_THRIFT_URI}")
print(f"Trip date partition we will insert into is {TRIP_DATE}")

Project ID is apache-hudi-lab
Project number is 623600433888
Project location is is us-central1
Dataproc Metastore Service thrift URI is thrift://10.60.192.28:9080
Trip date partition we will insert into is 2020-01-30


**Note**: Ensure you have the right URI for Dataproc Metastore

## [HUDI UPSERT FEATURE] Upsert into MoR table

### 1. Generate trips to upsert

In [3]:
# CREATE A RECORD THAT CAN BE INSERTED
# Approach: clone an existing morning trip (trip_hour < 12) from the target partition,
# shift its pickup/dropoff times by 5 hours, and assign a trip_id one past the
# partition's current maximum so the record key is new.

print("----------------INSERT--------------")
INSERT_TRIP_DATE='2019-03-10'
INSERT_CLONE_CANDIDATE_TRIP_ID=spark.sql(
    f"select trip_id  from taxi_db.nyc_taxi_trips_hudi_mor_rt WHERE trip_date='{INSERT_TRIP_DATE}' AND trip_hour < 12 LIMIT 1"
).collect()[0][0]
print(f"INSERT_CLONE_CANDIDATE_TRIP_ID: {INSERT_CLONE_CANDIDATE_TRIP_ID}")
TO_BE_INSERTED_TRIP_ID=spark.sql(
    f"SELECT max(trip_id) FROM taxi_db.nyc_taxi_trips_hudi_mor_rt WHERE trip_date='{INSERT_TRIP_DATE}'"
).collect()[0][0] + 1
print(f"Unique Trip ID generated for the trip to be inserted is: {TO_BE_INSERTED_TRIP_ID}")

# Generate insert record
insertCandidateTripDFMoR=spark.sql(
    f"SELECT * FROM taxi_db.nyc_taxi_trips_hudi_mor_rt WHERE trip_date='{INSERT_TRIP_DATE}' and trip_id={INSERT_CLONE_CANDIDATE_TRIP_ID}"
)
# Replace columns in place (withColumn keeps column positions) and strip the Hudi
# metadata columns so the frame contains only the table's data columns.
insertTripDFMoR = (
    insertCandidateTripDFMoR
    .withColumn('pickup_datetime', insertCandidateTripDFMoR.pickup_datetime + F.expr('INTERVAL 5 HOURS'))
    .withColumn('dropoff_datetime', insertCandidateTripDFMoR.dropoff_datetime + F.expr('INTERVAL 5 HOURS'))
    .withColumn('trip_hour', insertCandidateTripDFMoR.trip_hour + 5)
    .withColumn('trip_id', lit(TO_BE_INSERTED_TRIP_ID))
    .drop(
        "_hoodie_commit_time",
        "_hoodie_commit_seqno",
        "_hoodie_record_key",
        "_hoodie_partition_path",
        "_hoodie_file_name",
    )
)

# Preview a handful of fields of the record to be inserted.
insertTripDFMoR.select(
    "trip_id","taxi_type","vendor_id","pickup_datetime","dropoff_datetime",
    "pickup_location_id","dropoff_location_id","trip_date",
).show(truncate=False)

----------------INSERT--------------


ivysettings.xml file not found in HIVE_HOME or HIVE_CONF_DIR,/etc/hive/conf.dist/ivysettings.xml will be used
23/08/02 15:52:44 WARN GhfsStorageStatistics: Detected potential high latency for operation op_open. latencyMs=106; previousMaxLatencyMs=0; operationCount=1; context=gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-mor/.hoodie/hoodie.properties
23/08/02 15:52:44 WARN GhfsStorageStatistics: Detected potential high latency for operation stream_read_operations. latencyMs=160; previousMaxLatencyMs=96; operationCount=4; context=gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-mor/.hoodie/20230731220137856.deltacommit
                                                                                

INSERT_CLONE_CANDIDATE_TRIP_ID: 764504188335


                                                                                

Unique Trip ID generated for the trip to be inserted is: 1786706865009


                                                                                

+-------------+---------+---------+-------------------+-------------------+------------------+-------------------+----------+
|trip_id      |taxi_type|vendor_id|pickup_datetime    |dropoff_datetime   |pickup_location_id|dropoff_location_id|trip_date |
+-------------+---------+---------+-------------------+-------------------+------------------+-------------------+----------+
|1786706865009|yellow   |2        |2019-03-10 14:44:09|2019-03-10 14:59:03|141               |112                |2019-03-10|
+-------------+---------+---------+-------------------+-------------------+------------------+-------------------+----------+



                                                                                

In [4]:
# DETERMINE A RECORD THAT CAN BE UPDATED AND MODIFY IT
# Approach: pick an existing morning trip (trip_hour < 12) from the target partition and
# shift its pickup/dropoff times by 1 hour while keeping the same trip_id.
print("----------------UPDATE--------------")
UPDATE_TRIP_DATE='2019-01-15'
UPDATE_CANDIDATE_TRIP_ID=spark.sql(
    f"select trip_id  from taxi_db.nyc_taxi_trips_hudi_mor_rt WHERE trip_date='{UPDATE_TRIP_DATE}' AND trip_hour < 12 LIMIT 1"
).collect()[0][0]
print(f"UPDATE_CLONE_CANDIDATE_TRIP_ID: {UPDATE_CANDIDATE_TRIP_ID}")

updateCandidateTripDFMoR=spark.sql(
    f"SELECT * FROM taxi_db.nyc_taxi_trips_hudi_mor_rt WHERE trip_date='{UPDATE_TRIP_DATE}' AND trip_id={UPDATE_CANDIDATE_TRIP_ID}"
)
# Replace columns in place (withColumn keeps column positions) and strip the Hudi
# metadata columns so the frame contains only the table's data columns.
updateTripDFMoR = (
    updateCandidateTripDFMoR
    .withColumn('pickup_datetime', updateCandidateTripDFMoR.pickup_datetime + F.expr('INTERVAL 1 HOURS'))
    .withColumn('dropoff_datetime', updateCandidateTripDFMoR.dropoff_datetime + F.expr('INTERVAL 1 HOURS'))
    .withColumn('trip_hour', updateCandidateTripDFMoR.trip_hour + 1)
    .drop(
        "_hoodie_commit_time",
        "_hoodie_commit_seqno",
        "_hoodie_record_key",
        "_hoodie_file_name",
        "_hoodie_partition_path",
    )
)

print("Original trip details")
updateCandidateTripDFMoR.select(
    "_hoodie_commit_time","trip_id","taxi_type","vendor_id","pickup_datetime","dropoff_datetime",
    "pickup_location_id","dropoff_location_id","trip_date",
).show(truncate=False)

print("Updated trip details")
updateTripDFMoR.select(
    "trip_id","taxi_type","vendor_id","pickup_datetime","dropoff_datetime",
    "pickup_location_id","dropoff_location_id","trip_date",
).show(truncate=False)

----------------UPDATE--------------
UPDATE_CLONE_CANDIDATE_TRIP_ID: 455266534229
Original trip details


                                                                                

+-------------------+------------+---------+---------+-------------------+-------------------+------------------+-------------------+----------+
|_hoodie_commit_time|trip_id     |taxi_type|vendor_id|pickup_datetime    |dropoff_datetime   |pickup_location_id|dropoff_location_id|trip_date |
+-------------------+------------+---------+---------+-------------------+-------------------+------------------+-------------------+----------+
|20230731212147518  |455266534229|yellow   |1        |2019-01-15 09:45:50|2019-01-15 10:01:42|162               |230                |2019-01-15|
+-------------------+------------+---------+---------+-------------------+-------------------+------------------+-------------------+----------+

Updated trip details


                                                                                

+------------+---------+---------+-------------------+-------------------+------------------+-------------------+----------+
|trip_id     |taxi_type|vendor_id|pickup_datetime    |dropoff_datetime   |pickup_location_id|dropoff_location_id|trip_date |
+------------+---------+---------+-------------------+-------------------+------------------+-------------------+----------+
|455266534229|yellow   |1        |2019-01-15 10:45:50|2019-01-15 11:01:42|162               |230                |2019-01-15|
+------------+---------+---------+-------------------+-------------------+------------------+-------------------+----------+



### 1.2. Prepare to upsert

#### 1.2.1. Union the dataframes

In [5]:
# Let's union the dataframes.
# unionByName matches columns by name rather than by position, so the result stays
# correct even if the two frames ever end up with a different column order.
upsertTripDFMoR = insertTripDFMoR.unionByName(updateTripDFMoR)

# Quick visual
print("Upsert DF - just a few fields")
upsertTripDFMoR.select(
    "trip_id","taxi_type","vendor_id","pickup_datetime","dropoff_datetime",
    "pickup_location_id","dropoff_location_id","trip_date",
).show(truncate=False)

Upsert DF - just a few fields
+-------------+---------+---------+-------------------+-------------------+------------------+-------------------+----------+
|trip_id      |taxi_type|vendor_id|pickup_datetime    |dropoff_datetime   |pickup_location_id|dropoff_location_id|trip_date |
+-------------+---------+---------+-------------------+-------------------+------------------+-------------------+----------+
|1786706865009|yellow   |2        |2019-03-10 14:44:09|2019-03-10 14:59:03|141               |112                |2019-03-10|
|455266534229 |yellow   |1        |2019-01-15 10:45:50|2019-01-15 11:01:42|162               |230                |2019-01-15|
+-------------+---------+---------+-------------------+-------------------+------------------+-------------------+----------+



In [6]:
# Full-width view of the combined upsert frame: every column, values not truncated.
print("Upsert DF - all fields")
upsertTripDFMoR.show(n=20, truncate=False)

Upsert DF - all fields


23/08/02 15:53:24 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.

+---------+---------+----------+--------+---------+-----------+---------+-------------------+-------------------+-----------------+---------+------------------+-------------------+---------------+-------------+------------+-----------+-----------+-----------+------------+---------------------+------------+-----------------+--------------------+---------+---------+--------------+------------------------+--------------------+-------------+----------+
|taxi_type|trip_year|trip_month|trip_day|trip_hour|trip_minute|vendor_id|pickup_datetime    |dropoff_datetime   |store_and_forward|rate_code|pickup_location_id|dropoff_location_id|passenger_count|trip_distance|fare_amount |surcharge  |mta_tax    |tip_amount |tolls_amount|improvement_surcharge|total_amount|payment_type_code|congestion_surcharge|trip_type|ehail_fee|partition_date|distance_between_service|time_between_service|trip_id      |trip_date |
+---------+---------+----------+--------+---------+-----------+---------+-------------------+-

                                                                                

#### 1.2.2. Before insert

In [7]:
# Capture GCS parquet file listing prior to insert
!gsutil ls -alh $HUDI_MOR_BASE_GCS_URI/trip_date=2019-03-10

     373 B  2023-08-02T15:31:49Z  gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-mor/trip_date=2019-03-10/.hoodie_partition_metadata.parquet#1690990309721298  metageneration=1
  4.26 MiB  2023-08-02T15:31:49Z  gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-mor/trip_date=2019-03-10/2003d935-a739-4848-8ca6-3f70a72df12e-0_868-19-9830_20230731212147518.parquet#1690990309717971  metageneration=1
   4.3 MiB  2023-08-02T15:31:49Z  gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-mor/trip_date=2019-03-10/b9dff0c4-aa56-40dc-a60c-713637f65090-0_867-19-9829_20230731212147518.parquet#1690990309722408  metageneration=1
TOTAL: 3 objects, 8973022 bytes (8.56 MiB)


In [8]:
# Capture record count before insert from RT table (partition taken from INSERT_TRIP_DATE)
TRIP_COUNT_BEFORE_UPS_INSERT_RT=spark.sql(
    f"select count(*)  from taxi_db.nyc_taxi_trips_hudi_mor_rt WHERE trip_date='{INSERT_TRIP_DATE}'"
).collect()[0][0]
print(f"Trip count before insert of RT table: {TRIP_COUNT_BEFORE_UPS_INSERT_RT}")

Trip count before insert of RT table: 245530


In [9]:
# Capture record count before insert from RO table (partition taken from INSERT_TRIP_DATE)
TRIP_COUNT_BEFORE_UPS_INSERT_RO=spark.sql(
    f"select count(*)  from taxi_db.nyc_taxi_trips_hudi_mor_ro WHERE trip_date='{INSERT_TRIP_DATE}'"
).collect()[0][0]
print(f"Trip count before insert of RO table: {TRIP_COUNT_BEFORE_UPS_INSERT_RO}")

                                                                                

Trip count before insert of RO table: 245530


In [17]:
# Baseline for the insert: snapshot (real-time) count of the insert partition, read
# straight from GCS through the Hudi DataSource API. The count is the cell's output.
print("Trip count before insert from snapshot/RT with DataSource API:")
(
    spark.read
    .format('hudi')
    .option('hoodie.datasource.query.type','snapshot')
    .load(f"{HUDI_MOR_BASE_GCS_URI}/trip_date={INSERT_TRIP_DATE}")
    .count()
)



Trip count before insert from snapshot/RT with DataSource API:


                                                                                

245530

In [18]:
# Baseline for the insert: read-optimized count of the insert partition, read straight
# from GCS through the Hudi DataSource API. The count is the cell's output.
print("Trip count before insert from RO with DataSource API:")
(
    spark.read
    .format('hudi')
    .option('hoodie.datasource.query.type','read_optimized')
    .load(f"{HUDI_MOR_BASE_GCS_URI}/trip_date={INSERT_TRIP_DATE}")
    .count()
)

Trip count before insert from RO with DataSource API:


                                                                                

245530

#### 1.2.3. Before update

In [11]:
# Capture GCS parquet file listing prior to update
!gsutil ls -alh $HUDI_MOR_BASE_GCS_URI/trip_date=2019-01-15

     373 B  2023-08-02T15:31:48Z  gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-mor/trip_date=2019-01-15/.hoodie_partition_metadata.parquet#1690990308002198  metageneration=1
  4.22 MiB  2023-08-02T15:31:48Z  gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-mor/trip_date=2019-01-15/91a5e682-73f9-495d-b081-a0115014632b-0_477-19-9439_20230731212147518.parquet#1690990308009617  metageneration=1
  4.22 MiB  2023-08-02T15:31:48Z  gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-mor/trip_date=2019-01-15/e374ddde-d33d-4542-885a-1e4152cc88d1-0_478-19-9440_20230731212147518.parquet#1690990308002879  metageneration=1
  1.69 MiB  2023-08-02T15:31:48Z  gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-mor/trip_date=2019-01-15/e41b3e39-14f4-4379-a321-6e08589a20c3-0_479-19-9441_20230731212147518.parquet#1690990308008118  metageneration=1
TOTAL: 4 objects, 10623024 bytes (10.13 MiB)


In [12]:
# Capture record count before update from RO table (partition taken from UPDATE_TRIP_DATE)
TRIP_COUNT_BEFORE_UPS_UPDATE_RO=spark.sql(
    f"select count(*)  from taxi_db.nyc_taxi_trips_hudi_mor_ro WHERE trip_date='{UPDATE_TRIP_DATE}'"
).collect()[0][0]
print(f"Trip count before update of RO table: {TRIP_COUNT_BEFORE_UPS_UPDATE_RO}")

Trip count before update of RO table: 289823


In [13]:
# Capture record count before update from RT table (partition taken from UPDATE_TRIP_DATE)
TRIP_COUNT_BEFORE_UPS_UPDATE_RT=spark.sql(
    f"select count(*)  from taxi_db.nyc_taxi_trips_hudi_mor_rt WHERE trip_date='{UPDATE_TRIP_DATE}'"
).collect()[0][0]
print(f"Trip count before update of RT table: {TRIP_COUNT_BEFORE_UPS_UPDATE_RT}")

Trip count before update of RT table: 289823


In [19]:
# Baseline for the update: snapshot (real-time) count of the update partition, read
# straight from GCS through the Hudi DataSource API. The count is the cell's output.
print("Trip count before update from snapshot/RT with DataSource API:")
(
    spark.read
    .format('hudi')
    .option('hoodie.datasource.query.type','snapshot')
    .load(f"{HUDI_MOR_BASE_GCS_URI}/trip_date={UPDATE_TRIP_DATE}")
    .count()
)

Trip count before update from snapshot/RT with DataSource API:


                                                                                

289823

In [20]:
# Baseline for the update: read-optimized count of the update partition, read straight
# from GCS through the Hudi DataSource API. The count is the cell's output.
print("Trip count before update from RO with DataSource API:")
(
    spark.read
    .format('hudi')
    .option('hoodie.datasource.query.type','read_optimized')
    .load(f"{HUDI_MOR_BASE_GCS_URI}/trip_date={UPDATE_TRIP_DATE}")
    .count()
)

Trip count before update from RO with DataSource API:


                                                                                

289823

In [15]:
# Capture original record prior to update.
# Includes _hoodie_commit_time and _hoodie_file_name so the record's commit and
# file can be compared with the post-upsert lookups later in the notebook.
spark.sql(
    f"SELECT _hoodie_commit_time,_hoodie_file_name,trip_id,taxi_type,vendor_id,pickup_datetime,dropoff_datetime,pickup_location_id,dropoff_location_id,trip_date "
    f" FROM taxi_db.nyc_taxi_trips_hudi_mor_rt "
    f" WHERE trip_date='{UPDATE_TRIP_DATE}' AND trip_id={UPDATE_CANDIDATE_TRIP_ID}"
).show(truncate=False)

+-------------------+----------------------------------------------------------------------------+------------+---------+---------+-------------------+-------------------+------------------+-------------------+----------+
|_hoodie_commit_time|_hoodie_file_name                                                           |trip_id     |taxi_type|vendor_id|pickup_datetime    |dropoff_datetime   |pickup_location_id|dropoff_location_id|trip_date |
+-------------------+----------------------------------------------------------------------------+------------+---------+---------+-------------------+-------------------+------------------+-------------------+----------+
|20230731212147518  |e374ddde-d33d-4542-885a-1e4152cc88d1-0_478-19-9440_20230731212147518.parquet|455266534229|yellow   |1        |2019-01-15 09:45:50|2019-01-15 10:01:42|162               |230                |2019-01-15|
+-------------------+----------------------------------------------------------------------------+------------+-

#### 1.2.4. Hudi options

In [16]:
# HUDI options for the upsert operation
hudi_upsert_options = {
    # --- Table identity ---
    'hoodie.database.name': 'taxi_db',
    'hoodie.table.name': 'nyc_taxi_trips_hudi_mor',
    'hoodie.datasource.write.table.name': 'nyc_taxi_trips_hudi_mor',
    'hoodie.datasource.write.table.type': 'MERGE_ON_READ',

    # --- Record key, partitioning and precombine (ordering) field ---
    'hoodie.datasource.write.recordkey.field': 'trip_id',
    'hoodie.datasource.write.partitionpath.field': 'trip_date',
    'hoodie.datasource.write.precombine.field': 'pickup_datetime',
    'hoodie.datasource.write.hive_style_partitioning': 'true',
    'hoodie.partition.metafile.use.base.format': 'true',
    # NOTE(review): the Spark SQL query on the RT table at the end of this notebook does not
    # return the updated row (see the TODO there); whether dropping the partition column from
    # the written data contributes to that is unverified - worth checking first.
    'hoodie.datasource.write.drop.partition.columns': 'true',

    # --- Write operation ---
    'hoodie.datasource.write.operation': 'upsert',

    # --- Metastore sync (HMS mode against the Dataproc Metastore thrift endpoint) ---
    'hoodie.datasource.hive_sync.enable': 'true',
    'hoodie.meta.sync.client.tool.class': 'org.apache.hudi.hive.HiveSyncTool',
    'hoodie.datasource.hive_sync.mode': 'hms',
    'hoodie.datasource.hive_sync.metastore.uris': DATAPROC_METASTORE_THRIFT_URI,
    'hoodie.datasource.hive_sync.auto_create_database': 'true',
    'hoodie.datasource.hive_sync.database': 'taxi_db',
    'hoodie.datasource.hive_sync.table': 'nyc_taxi_trips_hudi_mor',
    'hoodie.datasource.hive_sync.partition_fields': 'trip_date',
    'hoodie.datasource.hive_sync.partition_extractor_class': 'org.apache.hudi.hive.MultiPartKeysValueExtractor',
    'hoodie.datasource.hive_sync.use_jdbc': 'false',
    'hoodie.datasource.hive_sync.support_timestamp': 'true',
}

#### 1.2.5. Timeline before upsert

In [None]:
# List the table's .hoodie directory before the upsert, as a baseline to compare
# with the same listing taken after the upsert.
!gsutil ls -al $HUDI_MOR_BASE_GCS_URI/.hoodie

### 1.3. Upsert in action

#### 1.3.1. Execute the upsert

In [21]:
# Append to dataset in GCS.
# The write operation itself (upsert) is set by hudi_upsert_options; mode("append")
# adds to the existing table at the base path rather than replacing it.
(
    upsertTripDFMoR.write
    .format("hudi")
    .options(**hudi_upsert_options)
    .mode("append")
    .save(HUDI_MOR_BASE_GCS_URI)
)

23/08/02 15:57:38 WARN GhfsStorageStatistics: Detected potential high latency for operation stream_read_operations. latencyMs=176; previousMaxLatencyMs=160; operationCount=16591; context=gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-mor/.hoodie/metadata/.hoodie/hoodie.properties
23/08/02 15:57:38 WARN GhfsStorageStatistics: Detected potential high latency for operation stream_write_close_operations. latencyMs=140; previousMaxLatencyMs=0; operationCount=1; context=gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-mor/.hoodie/20230802155736547.deltacommit.requested
23/08/02 15:57:57 WARN GhfsStorageStatistics: Detected potential high latency for operation stream_write_close_operations. latencyMs=241; previousMaxLatencyMs=140; operationCount=8; context=gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-mor/.hoodie/metadata/.hoodie/20230802155736547.deltacommit.requested
23/08/02 15:58:01 WARN GhfsStorageStatistics: Detected potential high latency for operation op_delete.

#### 1.3.2. Commits

In [22]:
# List the tables registered in the taxi_db database.
(
    spark.sql("SHOW tables IN taxi_db;")
    .show(truncate=False)
)

+---------+--------------------------+-----------+
|namespace|tableName                 |isTemporary|
+---------+--------------------------+-----------+
|taxi_db  |nyc_taxi_trips_hudi_cow   |false      |
|taxi_db  |nyc_taxi_trips_hudi_mor_ro|false      |
|taxi_db  |nyc_taxi_trips_hudi_mor_rt|false      |
+---------+--------------------------+-----------+



In [23]:
# Show up to 100 commits for the table, addressed through its RO name.
(
    spark.sql("call show_commits(table => 'taxi_db.nyc_taxi_trips_hudi_mor_ro', limit => 100);")
    .show(100, truncate=False)
)

+-----------------+-------------------+-----------------+-------------------+------------------------+---------------------+----------------------------+------------+
|commit_time      |total_bytes_written|total_files_added|total_files_updated|total_partitions_written|total_records_written|total_update_records_written|total_errors|
+-----------------+-------------------+-----------------+-------------------+------------------------+---------------------+----------------------------+------------+
|20230802155736547|4470068            |0                |2                  |2                       |122148               |1                           |0           |
|20230731220137856|1414387506         |438              |0                  |337                     |37023925             |0                           |0           |
|20230731215336903|1223009119         |405              |0                  |365                     |31972637             |0                           |0           

In [24]:
# Show up to 100 commits for the table, addressed through its RT name.
(
    spark.sql("call show_commits(table => 'taxi_db.nyc_taxi_trips_hudi_mor_rt', limit => 100);")
    .show(100, truncate=False)
)

+-----------------+-------------------+-----------------+-------------------+------------------------+---------------------+----------------------------+------------+
|commit_time      |total_bytes_written|total_files_added|total_files_updated|total_partitions_written|total_records_written|total_update_records_written|total_errors|
+-----------------+-------------------+-----------------+-------------------+------------------------+---------------------+----------------------------+------------+
|20230802155736547|4470068            |0                |2                  |2                       |122148               |1                           |0           |
|20230731220137856|1414387506         |438              |0                  |337                     |37023925             |0                           |0           |
|20230731215336903|1223009119         |405              |0                  |365                     |31972637             |0                           |0           

#### 1.3.3. Timeline
Notice the latest deltacommit

In [25]:
# List the table's .hoodie directory after the upsert; look for the newest
# *.deltacommit entries.
!gsutil ls -al $HUDI_MOR_BASE_GCS_URI/.hoodie

   1606987  2023-08-02T15:31:47Z  gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-mor/.hoodie/20230731212147518.deltacommit#1690990307440364  metageneration=1
    459360  2023-08-02T15:31:47Z  gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-mor/.hoodie/20230731212147518.deltacommit.inflight#1690990307475910  metageneration=1
         0  2023-08-02T15:31:47Z  gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-mor/.hoodie/20230731212147518.deltacommit.requested#1690990307422703  metageneration=1
    799752  2023-08-02T15:31:47Z  gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-mor/.hoodie/20230731214644879.deltacommit#1690990307823565  metageneration=1
    459971  2023-08-02T15:31:47Z  gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-mor/.hoodie/20230731214644879.deltacommit.inflight#1690990307845856  metageneration=1
         0  2023-08-02T15:31:47Z  gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-mor/.hoodie/20230731214644879.deltacommit.requested#1

#### 1.3.4. Commit log review

In [28]:
# Pick the last *.deltacommit file from the listing and print its contents.
# `tail -n 1` takes the final line of the gsutil listing; this presumably is the most recent
# commit because the file names start with the commit timestamp - verify against the
# show_commits output. LOG_FILE_LIST[0] raises IndexError if no deltacommit file is found.
LOG_FILE_LIST=!gsutil ls $HUDI_MOR_BASE_GCS_URI/.hoodie/*.deltacommit | tail -n 1 
LOG_FILE=LOG_FILE_LIST[0]
print(f"Log file FQP is {LOG_FILE} and it correlates with a commit time in the table above")

# Dump the commit file itself.
!gsutil cat $LOG_FILE

Log file FQP is gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-mor/.hoodie/20230802155736547.deltacommit and it correlates with a commit time in the table above
{
  "partitionToWriteStats" : {
    "trip_date=2019-01-15" : [ {
      "fileId" : "e374ddde-d33d-4542-885a-1e4152cc88d1-0",
      "path" : "trip_date=2019-01-15/.e374ddde-d33d-4542-885a-1e4152cc88d1-0_20230731212147518.log.1_0-97-18514",
      "prevCommit" : "20230731212147518",
      "numWrites" : 1,
      "numDeletes" : 0,
      "numUpdateWrites" : 1,
      "numInserts" : 0,
      "totalWriteBytes" : 4851,
      "totalWriteErrors" : 0,
      "tempPath" : null,
      "partitionPath" : "trip_date=2019-01-15",
      "totalLogRecords" : 0,
      "totalLogFilesCompacted" : 0,
      "totalLogSizeCompacted" : 0,
      "totalUpdatedRecordsCompacted" : 0,
      "totalLogBlocks" : 0,
      "totalCorruptLogBlock" : 0,
      "totalRollbackBlocks" : 0,
      "fileSizeInBytes" : 4851,
      "minEventTime" : null,
      "maxEventTim

#### 1.3.5. Files in DFS

In [26]:
# DFS partition where we inserted
!gsutil ls -alh $HUDI_MOR_BASE_GCS_URI/trip_date=2019-03-10

     373 B  2023-08-02T15:31:49Z  gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-mor/trip_date=2019-03-10/.hoodie_partition_metadata.parquet#1690990309721298  metageneration=1
  4.26 MiB  2023-08-02T15:57:55Z  gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-mor/trip_date=2019-03-10/2003d935-a739-4848-8ca6-3f70a72df12e-0_1-97-18515_20230802155736547.parquet#1690991875134725  metageneration=1
  4.26 MiB  2023-08-02T15:31:49Z  gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-mor/trip_date=2019-03-10/2003d935-a739-4848-8ca6-3f70a72df12e-0_868-19-9830_20230731212147518.parquet#1690990309717971  metageneration=1
   4.3 MiB  2023-08-02T15:31:49Z  gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-mor/trip_date=2019-03-10/b9dff0c4-aa56-40dc-a60c-713637f65090-0_867-19-9829_20230731212147518.parquet#1690990309722408  metageneration=1
TOTAL: 4 objects, 13438239 bytes (12.82 MiB)


In [27]:
# DFS partition where we updated
!gsutil ls -alh $HUDI_MOR_BASE_GCS_URI/trip_date=2019-01-15

  4.74 KiB  2023-08-02T15:57:52Z  gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-mor/trip_date=2019-01-15/.e374ddde-d33d-4542-885a-1e4152cc88d1-0_20230731212147518.log.1_0-97-18514#1690991872057873  metageneration=1
     373 B  2023-08-02T15:31:48Z  gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-mor/trip_date=2019-01-15/.hoodie_partition_metadata.parquet#1690990308002198  metageneration=1
  4.22 MiB  2023-08-02T15:31:48Z  gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-mor/trip_date=2019-01-15/91a5e682-73f9-495d-b081-a0115014632b-0_477-19-9439_20230731212147518.parquet#1690990308009617  metageneration=1
  4.22 MiB  2023-08-02T15:31:48Z  gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-mor/trip_date=2019-01-15/e374ddde-d33d-4542-885a-1e4152cc88d1-0_478-19-9440_20230731212147518.parquet#1690990308002879  metageneration=1
  1.69 MiB  2023-08-02T15:31:48Z  gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-mor/trip_date=2019-01-15/e41b3e39-14f4-4379-a321-6e

**Notice the DELTA LOG file created** above

#### 1.3.6. Record counts - RO table | Update focus

In [29]:
# After the upsert: read-optimized count of the updated partition, read straight from
# GCS through the Hudi DataSource API. The count is the cell's output.
print("MOR_RO count - DataSource API:")

(
    spark.read
    .format('hudi')
    .option('hoodie.datasource.query.type','read_optimized')
    .load(f"{HUDI_MOR_BASE_GCS_URI}/trip_date={UPDATE_TRIP_DATE}")
    .count()
)

MOR_RO count - DataSource API:


                                                                                

289823

In [30]:
# Run a count to ensure that the record count is the same for the update against MOR-RO table
AFTER_UPDATE_TRIP_COUNT_RO=spark.sql(
    f"SELECT COUNT(*) FROM taxi_db.nyc_taxi_trips_hudi_mor_ro WHERE trip_date='{UPDATE_TRIP_DATE}'"
).collect()[0][0]
print(f"MOR_RO - SparkSQl - DPMS: Trip count before update on RO table was: {TRIP_COUNT_BEFORE_UPS_UPDATE_RO} and latest trip count is {AFTER_UPDATE_TRIP_COUNT_RO}")

                                                                                

MOR_RO - SparkSQl - DPMS: Trip count before update on RO table was: 289823 and latest trip count is 289823


#### 1.3.7. Record counts - RT table | Update focus

In [31]:
# Run a count against the MOR-RT table after the update. An update replaces an existing
# record, so the count is expected to match the pre-update count, not increase.
AFTER_UPDATE_TRIP_COUNT_RT=spark.sql(
    f"SELECT COUNT(*) FROM taxi_db.nyc_taxi_trips_hudi_mor_rt WHERE trip_date='{UPDATE_TRIP_DATE}'"
).collect()[0][0]
print(f"MOR_RT: Trip count before update on RT table was: {TRIP_COUNT_BEFORE_UPS_UPDATE_RT} and latest trip count is {AFTER_UPDATE_TRIP_COUNT_RT}")

23/08/02 16:06:23 WARN GhfsStorageStatistics: Detected potential high latency for operation stream_write_operations. latencyMs=382; previousMaxLatencyMs=37; operationCount=9561; context=gs://dataproc-temp-us-central1-623600433888-ojsvfynx/92fae947-ff2e-4e99-9649-afae6a74a071/spark-job-history/application_1690608046061_0165.inprogress

MOR_RO: Trip count before update on RT table was: 289823 and latest trip count is 289822


                                                                                

In [32]:
# After the upsert: snapshot (real-time) count of the updated partition, read straight
# from GCS through the Hudi DataSource API. The count is the cell's output.
print("MOR_RT count - DataSource API:")

(
    spark.read
    .format('hudi')
    .option('hoodie.datasource.query.type','snapshot')
    .load(f"{HUDI_MOR_BASE_GCS_URI}/trip_date={UPDATE_TRIP_DATE}")
    .count()
)


MOR_RT count - DataSource API:


                                                                                

289823

In [34]:
# Read optimized: Has old record
# Looks up the updated trip through the RO table, including its commit time and file name.
spark.sql(
    f"SELECT _hoodie_commit_time,_hoodie_file_name,trip_id,taxi_type,vendor_id,pickup_datetime,dropoff_datetime,pickup_location_id,dropoff_location_id,trip_date "
    f" FROM taxi_db.nyc_taxi_trips_hudi_mor_ro "
    f" WHERE trip_date='{UPDATE_TRIP_DATE}' AND trip_id={UPDATE_CANDIDATE_TRIP_ID}"
).show(truncate=False)

+-------------------+----------------------------------------------------------------------------+------------+---------+---------+-------------------+-------------------+------------------+-------------------+----------+
|_hoodie_commit_time|_hoodie_file_name                                                           |trip_id     |taxi_type|vendor_id|pickup_datetime    |dropoff_datetime   |pickup_location_id|dropoff_location_id|trip_date |
+-------------------+----------------------------------------------------------------------------+------------+---------+---------+-------------------+-------------------+------------------+-------------------+----------+
|20230731212147518  |e374ddde-d33d-4542-885a-1e4152cc88d1-0_478-19-9440_20230731212147518.parquet|455266534229|yellow   |1        |2019-01-15 09:45:50|2019-01-15 10:01:42|162               |230                |2019-01-15|
+-------------------+----------------------------------------------------------------------------+------------+-

In [35]:
# Real Time: Is missing the record altogether
# Looks up the updated trip through the RT table. The recorded run returned no rows;
# the cause is still open (see the TODO following this cell).
spark.sql(
    f"SELECT _hoodie_commit_time,_hoodie_file_name,trip_id,taxi_type,vendor_id,pickup_datetime,dropoff_datetime,pickup_location_id,dropoff_location_id,trip_date "
    f" FROM taxi_db.nyc_taxi_trips_hudi_mor_rt "
    f" WHERE trip_date='{UPDATE_TRIP_DATE}' AND trip_id={UPDATE_CANDIDATE_TRIP_ID}"
).show(truncate=False)

                                                                                

+-------------------+-----------------+-------+---------+---------+---------------+----------------+------------------+-------------------+---------+
|_hoodie_commit_time|_hoodie_file_name|trip_id|taxi_type|vendor_id|pickup_datetime|dropoff_datetime|pickup_location_id|dropoff_location_id|trip_date|
+-------------------+-----------------+-------+---------+---------+---------------+----------------+------------------+-------------------+---------+
+-------------------+-----------------+-------+---------+---------+---------------+----------------+------------------+-------------------+---------+



ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/usr/lib/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/lib/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/lib/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving


# TODO: Troubleshoot why the update record is missing from RT view 

**Notice that the RT table shows the record count update, whereas the RO table does not**, since the RO table includes only data in parquet, not any delta logs

This concludes the lab unit, please proceed to the next notebook.