# Unit 4: UPSERT into COW tables
In this unit, we will learn upsert operations into COW tables.<br>
This unit takes about 5 minutes to complete.


### Initialize Spark Session

In [1]:
from datetime import datetime
from functools import reduce

import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit
from pyspark.sql.types import LongType


# Build the Spark session for this unit; getOrCreate() returns the already-running
# session when the kernel has one.
# SparkSession is imported explicitly above so this cell also works on a fresh kernel
# that does not pre-define it.
spark = SparkSession.builder \
  .appName("Hudi-Learning-Unit-04-PySpark") \
  .master("yarn") \
  .enableHiveSupport() \
  .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
  .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.hudi.catalog.HoodieCatalog") \
  .config("spark.sql.extensions", "org.apache.spark.sql.hudi.HoodieSparkSessionExtension") \
  .getOrCreate()

# Display the session summary as the cell output
spark

23/08/01 03:27:24 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


### Declare & define variables

In [2]:
PROJECT_ID_OUTPUT=!gcloud config get-value core/project 
PROJECT_ID=PROJECT_ID_OUTPUT[0]
PROJECT_NBR_OUTPUT=!gcloud projects describe $PROJECT_ID --format="value(projectNumber)"
PROJECT_NBR=PROJECT_NBR_OUTPUT[0]
LOCATION="us-central1" #Update as needed
HUDI_COW_BASE_GCS_URI = f"gs://gaia_data_bucket-{PROJECT_NBR}/nyc-taxi-trips-hudi-cow"
DATAPROC_METASTORE_THRIFT_URI_LIST=!gcloud metastore services list --location $LOCATION | grep thrift | cut -d' ' -f11
DATAPROC_METASTORE_THRIFT_URI=DATAPROC_METASTORE_THRIFT_URI_LIST[0]

print(f"Project ID is {PROJECT_ID}")
print(f"Project number is {PROJECT_NBR}")
print(f"Project location is {LOCATION}")
print(f"Hudi Base Cow Table GCS URI is {HUDI_COW_BASE_GCS_URI}")
print(f"Dataproc Metastore Service thrift URI is {DATAPROC_METASTORE_THRIFT_URI}")


Project ID is apache-hudi-lab
Project number is 623600433888
Project location is us-central1
Hudi Base Cow Table GCS URI is gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-cow
Dataproc Metastore Service thrift URI is thrift://10.60.192.28:9080


## 1. Upsert into Hudi

Here we will learn how to insert a new record and update existing record(s) in a single write via the upsert operation.<br>
Just like we did in the previous exercise, we will take existing records, shift their timestamps by a few hours, and use the resulting records in this lab unit.<br>

Insert candidate trip_date to clone & morph: '2019-03-10' <br>
Update candidate trip_date to clone & morph: '2019-01-15'<br>

### 1.1. Trips to clone and use for the lab unit


In [3]:
# Partition (trip_date) from which one trip is picked as the template for the INSERT.
# The query uses the constant so the date is declared in exactly one place.
INSERT_TRIP_DATE='2019-03-10'
INSERT_CLONE_CANDIDATE_TRIP_ID=spark.sql(f"select trip_id  from taxi_db.nyc_taxi_trips_hudi_cow WHERE trip_date='{INSERT_TRIP_DATE}' AND trip_hour < 12 LIMIT 1").collect()[0][0]
print(f"INSERT_CLONE_CANDIDATE_TRIP_ID: {INSERT_CLONE_CANDIDATE_TRIP_ID}")

# Partition (trip_date) from which one existing trip is picked as the target of the UPDATE
UPDATE_TRIP_DATE='2019-01-15'
UPDATE_CANDIDATE_TRIP_ID=spark.sql(f"select trip_id  from taxi_db.nyc_taxi_trips_hudi_cow WHERE trip_date='{UPDATE_TRIP_DATE}' AND trip_hour < 12 LIMIT 1").collect()[0][0]
print(f"UPDATE_CLONE_CANDIDATE_TRIP_ID: {UPDATE_CANDIDATE_TRIP_ID}")

ivysettings.xml file not found in HIVE_HOME or HIVE_CONF_DIR,/etc/hive/conf.dist/ivysettings.xml will be used
23/08/01 03:27:30 WARN GhfsStorageStatistics: Detected potential high latency for operation op_open. latencyMs=126; previousMaxLatencyMs=0; operationCount=1; context=gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-cow/.hoodie/hoodie.properties
                                                                                

INSERT_CLONE_CANDIDATE_TRIP_ID: 764504188335
UPDATE_CLONE_CANDIDATE_TRIP_ID: 695784702201


### 1.2. Generate unique Trip ID for the insert 

In [4]:
# New trip_id = highest trip_id currently in the insert partition + 1, so it does not
# collide with any existing record key in that partition.
TO_BE_INSERTED_TRIP_ID=spark.sql(f"SELECT max(trip_id) FROM taxi_db.nyc_taxi_trips_hudi_cow WHERE trip_date='{INSERT_TRIP_DATE}'").collect()[0][0] + 1
print(f"Unique Trip ID generated for the trip to be inserted is: {TO_BE_INSERTED_TRIP_ID}")



Unique Trip ID generated for the trip to be inserted is: 1786706865009


                                                                                

### 1.3. Insert dataframe creation

In [5]:
# Original record: the full row of the trip that will be cloned for the insert
insertCandidateTripDFCow=spark.sql(f"SELECT * FROM taxi_db.nyc_taxi_trips_hudi_cow WHERE trip_date='{INSERT_TRIP_DATE}' and trip_id={INSERT_CLONE_CANDIDATE_TRIP_ID}")
insertCandidateTripDFCow.show()


23/08/01 03:27:59 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

+-------------------+--------------------+------------------+----------------------+--------------------+---------+---------+----------+--------+---------+-----------+---------+-------------------+-------------------+-----------------+---------+------------------+-------------------+---------------+-------------+------------+-----------+-----------+-----------+------------+---------------------+------------+-----------------+--------------------+---------+---------+--------------+------------------------+--------------------+------------+----------+
|_hoodie_commit_time|_hoodie_commit_seqno|_hoodie_record_key|_hoodie_partition_path|   _hoodie_file_name|taxi_type|trip_year|trip_month|trip_day|trip_hour|trip_minute|vendor_id|    pickup_datetime|   dropoff_datetime|store_and_forward|rate_code|pickup_location_id|dropoff_location_id|passenger_count|trip_distance| fare_amount|  surcharge|    mta_tax| tip_amount|tolls_amount|improvement_surcharge|total_amount|payment_type_code|congestion_surc

In [6]:
# Derive the record to insert from the cloned trip: move pickup/dropoff timestamps and
# trip_hour forward by 5 hours, assign the freshly generated trip_id, and strip the Hudi
# metadata columns that came back with the table read.
five_hours = F.expr('INTERVAL 5 HOURS')
hoodie_meta_columns = [
    "_hoodie_commit_time",
    "_hoodie_commit_seqno",
    "_hoodie_record_key",
    "_hoodie_partition_path",
    "_hoodie_file_name",
]

insertTripDFCow = (
    insertCandidateTripDFCow
    .withColumn('pickup_datetime', insertCandidateTripDFCow['pickup_datetime'] + five_hours)
    .withColumn('dropoff_datetime', insertCandidateTripDFCow['dropoff_datetime'] + five_hours)
    .withColumn('trip_hour', insertCandidateTripDFCow['trip_hour'] + 5)
    .withColumn('trip_id', lit(TO_BE_INSERTED_TRIP_ID))
    .drop(*hoodie_meta_columns)
)

insertTripDFCow.show(truncate=False)


                                                                                

+---------+---------+----------+--------+---------+-----------+---------+-------------------+-------------------+-----------------+---------+------------------+-------------------+---------------+-------------+------------+-----------+-----------+-----------+------------+---------------------+------------+-----------------+--------------------+---------+---------+--------------+------------------------+--------------------+-------------+----------+
|taxi_type|trip_year|trip_month|trip_day|trip_hour|trip_minute|vendor_id|pickup_datetime    |dropoff_datetime   |store_and_forward|rate_code|pickup_location_id|dropoff_location_id|passenger_count|trip_distance|fare_amount |surcharge  |mta_tax    |tip_amount |tolls_amount|improvement_surcharge|total_amount|payment_type_code|congestion_surcharge|trip_type|ehail_fee|partition_date|distance_between_service|time_between_service|trip_id      |trip_date |
+---------+---------+----------+--------+---------+-----------+---------+-------------------+-

In [7]:
# Original (template) record with just a few fields, for comparison with the record to insert
spark.sql(
    "SELECT trip_id,taxi_type,vendor_id,pickup_datetime,dropoff_datetime,pickup_location_id,dropoff_location_id,trip_date "
    " FROM taxi_db.nyc_taxi_trips_hudi_cow "
    f" WHERE trip_date='{INSERT_TRIP_DATE}' and trip_id={INSERT_CLONE_CANDIDATE_TRIP_ID}"
).show(truncate=False)


                                                                                

+------------+---------+---------+-------------------+-------------------+------------------+-------------------+----------+
|trip_id     |taxi_type|vendor_id|pickup_datetime    |dropoff_datetime   |pickup_location_id|dropoff_location_id|trip_date |
+------------+---------+---------+-------------------+-------------------+------------------+-------------------+----------+
|764504188335|yellow   |2        |2019-03-10 09:44:09|2019-03-10 09:59:03|141               |112                |2019-03-10|
+------------+---------+---------+-------------------+-------------------+------------------+-------------------+----------+



In [8]:
# Preview of the record to be inserted, limited to a readable set of columns -
# its trip_id, pickup_datetime and dropoff_datetime differ from the template record
insert_preview_columns = [
    "trip_id",
    "taxi_type",
    "vendor_id",
    "pickup_datetime",
    "dropoff_datetime",
    "pickup_location_id",
    "dropoff_location_id",
    "trip_date",
]
insertTripDFCow.select(*insert_preview_columns).show(truncate=False)


+-------------+---------+---------+-------------------+-------------------+------------------+-------------------+----------+
|trip_id      |taxi_type|vendor_id|pickup_datetime    |dropoff_datetime   |pickup_location_id|dropoff_location_id|trip_date |
+-------------+---------+---------+-------------------+-------------------+------------------+-------------------+----------+
|1786706865009|yellow   |2        |2019-03-10 14:44:09|2019-03-10 14:59:03|141               |112                |2019-03-10|
+-------------+---------+---------+-------------------+-------------------+------------------+-------------------+----------+



### 1.4. Update record generation

In [9]:
# Original record: the full row of the existing trip that will be updated
updateCandidateTripDFCow=spark.sql(f"SELECT * FROM taxi_db.nyc_taxi_trips_hudi_cow WHERE trip_date='{UPDATE_TRIP_DATE}' AND trip_id={UPDATE_CANDIDATE_TRIP_ID}")
updateCandidateTripDFCow.show()


+-------------------+--------------------+------------------+----------------------+--------------------+---------+---------+----------+--------+---------+-----------+---------+-------------------+-------------------+-----------------+---------+------------------+-------------------+---------------+-------------+------------+-----------+-----------+-----------+------------+---------------------+------------+-----------------+--------------------+---------+---------+--------------+------------------------+--------------------+------------+----------+
|_hoodie_commit_time|_hoodie_commit_seqno|_hoodie_record_key|_hoodie_partition_path|   _hoodie_file_name|taxi_type|trip_year|trip_month|trip_day|trip_hour|trip_minute|vendor_id|    pickup_datetime|   dropoff_datetime|store_and_forward|rate_code|pickup_location_id|dropoff_location_id|passenger_count|trip_distance| fare_amount|  surcharge|    mta_tax| tip_amount|tolls_amount|improvement_surcharge|total_amount|payment_type_code|congestion_surc

                                                                                

In [10]:
# Generate the dataframe that updates the above record: same trip_id (record key), with
# pickup/dropoff timestamps and trip_hour moved forward by the same number of hours.
# The shift is defined once so trip_hour cannot drift from pickup_datetime
# (previously the timestamps moved by 1 hour while trip_hour moved by 5).
UPDATE_SHIFT_HOURS = 1
update_shift_interval = F.expr(f'INTERVAL {UPDATE_SHIFT_HOURS} HOURS')

updateTripDFCow = updateCandidateTripDFCow.withColumn('pickup_datetime', updateCandidateTripDFCow.pickup_datetime + update_shift_interval) \
                                    .withColumn('dropoff_datetime', updateCandidateTripDFCow.dropoff_datetime + update_shift_interval) \
                                    .withColumn('trip_hour', updateCandidateTripDFCow.trip_hour + UPDATE_SHIFT_HOURS) \
                                    .drop("_hoodie_commit_time") \
                                    .drop("_hoodie_commit_seqno") \
                                    .drop("_hoodie_record_key") \
                                    .drop("_hoodie_file_name") \
                                    .drop("_hoodie_partition_path")

# The full record we will update
updateTripDFCow.show(truncate=False)


                                                                                

+---------+---------+----------+--------+---------+-----------+---------+-------------------+-------------------+-----------------+---------+------------------+-------------------+---------------+-------------+------------+-----------+-----------+-----------+------------+---------------------+------------+-----------------+--------------------+---------+---------+--------------+------------------------+--------------------+------------+----------+
|taxi_type|trip_year|trip_month|trip_day|trip_hour|trip_minute|vendor_id|pickup_datetime    |dropoff_datetime   |store_and_forward|rate_code|pickup_location_id|dropoff_location_id|passenger_count|trip_distance|fare_amount |surcharge  |mta_tax    |tip_amount |tolls_amount|improvement_surcharge|total_amount|payment_type_code|congestion_surcharge|trip_type|ehail_fee|partition_date|distance_between_service|time_between_service|trip_id     |trip_date |
+---------+---------+----------+--------+---------+-----------+---------+-------------------+---

                                                                                

In [11]:
# Original record prior to update - just a few columns for readability, plus the Hudi
# metadata columns that identify the commit and file currently holding the record
spark.sql(
    "SELECT trip_id,taxi_type,vendor_id,pickup_datetime,dropoff_datetime,pickup_location_id,dropoff_location_id,trip_date "
    ",_hoodie_commit_time,_hoodie_commit_seqno,_hoodie_file_name"
    " FROM taxi_db.nyc_taxi_trips_hudi_cow "
    f" WHERE trip_date='{UPDATE_TRIP_DATE}' AND trip_id={UPDATE_CANDIDATE_TRIP_ID}"
).show(truncate=False)


+------------+---------+---------+-------------------+-------------------+------------------+-------------------+----------+-------------------+-----------------------+----------------------------------------------------------------------------+
|trip_id     |taxi_type|vendor_id|pickup_datetime    |dropoff_datetime   |pickup_location_id|dropoff_location_id|trip_date |_hoodie_commit_time|_hoodie_commit_seqno   |_hoodie_file_name                                                           |
+------------+---------+---------+-------------------+-------------------+------------------+-------------------+----------+-------------------+-----------------------+----------------------------------------------------------------------------+
|695784702201|yellow   |1        |2019-01-15 10:00:30|2019-01-15 10:49:46|68                |141                |2019-01-15|20230731203923454  |20230731203923454_478_1|ab386b38-4ae1-4e57-b044-46bd2f79c14e-0_478-19-9440_20230731203923454.parquet|
+------------+--

In [12]:
# Preview of the updated version of the record, limited to a readable set of columns -
# same trip_id as the original, different pickup_datetime and dropoff_datetime
update_preview_columns = [
    "trip_id",
    "taxi_type",
    "vendor_id",
    "pickup_datetime",
    "dropoff_datetime",
    "pickup_location_id",
    "dropoff_location_id",
    "trip_date",
]
updateTripDFCow.select(*update_preview_columns).show(truncate=False)


+------------+---------+---------+-------------------+-------------------+------------------+-------------------+----------+
|trip_id     |taxi_type|vendor_id|pickup_datetime    |dropoff_datetime   |pickup_location_id|dropoff_location_id|trip_date |
+------------+---------+---------+-------------------+-------------------+------------------+-------------------+----------+
|695784702201|yellow   |1        |2019-01-15 11:00:30|2019-01-15 11:49:46|68                |141                |2019-01-15|
+------------+---------+---------+-------------------+-------------------+------------------+-------------------+----------+



### 1.5. Prepare to upsert

In [13]:
# Combine the record to insert and the record to update into a single dataframe.
# unionByName matches columns by name (union matches by position), so a change in column
# order or a missing column in either dataframe fails loudly instead of silently writing
# values into the wrong fields.
upsertTripDFCow = insertTripDFCow.unionByName(updateTripDFCow)
# Quick visual
upsertTripDFCow.show(truncate=False)


+---------+---------+----------+--------+---------+-----------+---------+-------------------+-------------------+-----------------+---------+------------------+-------------------+---------------+-------------+------------+-----------+-----------+-----------+------------+---------------------+------------+-----------------+--------------------+---------+---------+--------------+------------------------+--------------------+-------------+----------+
|taxi_type|trip_year|trip_month|trip_day|trip_hour|trip_minute|vendor_id|pickup_datetime    |dropoff_datetime   |store_and_forward|rate_code|pickup_location_id|dropoff_location_id|passenger_count|trip_distance|fare_amount |surcharge  |mta_tax    |tip_amount |tolls_amount|improvement_surcharge|total_amount|payment_type_code|congestion_surcharge|trip_type|ehail_fee|partition_date|distance_between_service|time_between_service|trip_id      |trip_date |
+---------+---------+----------+--------+---------+-----------+---------+-------------------+-

In [14]:
# Capture the record count of the insert partition before the upsert, to compare afterwards
TRIP_COUNT_BEFORE_INSERT=spark.sql(f"select count(*)  from taxi_db.nyc_taxi_trips_hudi_cow WHERE trip_date='{INSERT_TRIP_DATE}'").collect()[0][0]
print(f"Trip count before insert: {TRIP_COUNT_BEFORE_INSERT}")


Trip count before insert: 245530


In [15]:
# Capture GCS parquet file listing prior to insert
!gsutil ls -alh $HUDI_COW_BASE_GCS_URI/trip_date=2019-03-10


     373 B  2023-08-01T03:06:21Z  gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-cow/trip_date=2019-03-10/.hoodie_partition_metadata.parquet#1690859181660573  metageneration=1
  4.26 MiB  2023-08-01T03:06:21Z  gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-cow/trip_date=2019-03-10/340b03dc-b6e3-4113-b93a-c1d0e4f330e7-0_868-19-9830_20230731203923454.parquet#1690859181667420  metageneration=1
   4.3 MiB  2023-08-01T03:06:21Z  gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-cow/trip_date=2019-03-10/659d68c7-734c-4604-9e09-339705926683-0_867-19-9829_20230731203923454.parquet#1690859181699337  metageneration=1
TOTAL: 3 objects, 8974792 bytes (8.56 MiB)


In [16]:
# Capture the record count of the update partition before the upsert, to compare afterwards
TRIP_COUNT_BEFORE_UPDATE=spark.sql(f"select count(*)  from taxi_db.nyc_taxi_trips_hudi_cow WHERE trip_date='{UPDATE_TRIP_DATE}'").collect()[0][0]
print(f"Trip count before update: {TRIP_COUNT_BEFORE_UPDATE}")


Trip count before update: 289823


In [17]:
# Capture GCS parquet file listing prior to update
!gsutil ls -alh $HUDI_COW_BASE_GCS_URI/trip_date=2019-01-15


     373 B  2023-08-01T03:06:20Z  gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-cow/trip_date=2019-01-15/.hoodie_partition_metadata.parquet#1690859180071337  metageneration=1
  4.22 MiB  2023-08-01T03:06:20Z  gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-cow/trip_date=2019-01-15/ab386b38-4ae1-4e57-b044-46bd2f79c14e-0_478-19-9440_20230731203923454.parquet#1690859180061839  metageneration=1
  4.22 MiB  2023-08-01T03:06:20Z  gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-cow/trip_date=2019-01-15/c186a3d2-5697-4367-bd66-0239178a1e4b-0_477-19-9439_20230731203923454.parquet#1690859180086698  metageneration=1
  1.69 MiB  2023-08-01T03:06:20Z  gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-cow/trip_date=2019-01-15/ed4c608d-c783-40af-a83e-e4411863865b-0_479-19-9441_20230731203923454.parquet#1690859180101788  metageneration=1
TOTAL: 4 objects, 10629229 bytes (10.14 MiB)


In [18]:
# Capture the original record prior to the update, including the commit time and file
# name that currently hold it, for comparison after the upsert
spark.sql(
    "SELECT _hoodie_commit_time,_hoodie_file_name,trip_id,taxi_type,vendor_id,pickup_datetime,dropoff_datetime,pickup_location_id,dropoff_location_id,trip_date "
    " FROM taxi_db.nyc_taxi_trips_hudi_cow "
    f" WHERE trip_date='{UPDATE_TRIP_DATE}' AND trip_id={UPDATE_CANDIDATE_TRIP_ID}"
).show(truncate=False)


+-------------------+----------------------------------------------------------------------------+------------+---------+---------+-------------------+-------------------+------------------+-------------------+----------+
|_hoodie_commit_time|_hoodie_file_name                                                           |trip_id     |taxi_type|vendor_id|pickup_datetime    |dropoff_datetime   |pickup_location_id|dropoff_location_id|trip_date |
+-------------------+----------------------------------------------------------------------------+------------+---------+---------+-------------------+-------------------+------------------+-------------------+----------+
|20230731203923454  |ab386b38-4ae1-4e57-b044-46bd2f79c14e-0_478-19-9440_20230731203923454.parquet|695784702201|yellow   |1        |2019-01-15 10:00:30|2019-01-15 10:49:46|68                |141                |2019-01-15|
+-------------------+----------------------------------------------------------------------------+------------+-

In [19]:
# HUDI options for the upsert operation, grouped by concern and merged in declaration order

# Which table is being written
hudi_table_identity = {
    'hoodie.database.name': 'taxi_db',
    'hoodie.table.name': 'nyc_taxi_trips_hudi_cow',
}

# How records are written: COPY_ON_WRITE table, keyed on trip_id, partitioned by trip_date,
# with pickup_datetime as the precombine field and 'upsert' as the write operation
hudi_write_settings = {
    'hoodie.datasource.write.table.name': 'nyc_taxi_trips_hudi_cow',
    'hoodie.datasource.write.table.type': 'COPY_ON_WRITE',
    'hoodie.datasource.write.recordkey.field': 'trip_id',
    'hoodie.datasource.write.partitionpath.field': 'trip_date',
    'hoodie.datasource.write.precombine.field': 'pickup_datetime',
    'hoodie.datasource.write.hive_style_partitioning': 'true',
    'hoodie.partition.metafile.use.base.format': 'true',
    'hoodie.datasource.write.drop.partition.columns': 'true',
    'hoodie.datasource.write.operation': 'upsert',
}

# Hive sync against the Dataproc Metastore thrift endpoint resolved earlier in the notebook
hudi_hive_sync_settings = {
    'hoodie.datasource.hive_sync.enable': 'true',
    'hoodie.meta.sync.client.tool.class': 'org.apache.hudi.hive.HiveSyncTool',
    'hoodie.datasource.hive_sync.mode': 'hms',
    'hoodie.datasource.hive_sync.metastore.uris': DATAPROC_METASTORE_THRIFT_URI,
    'hoodie.datasource.hive_sync.auto_create_database': 'true',
    'hoodie.datasource.hive_sync.database': 'taxi_db',
    'hoodie.datasource.hive_sync.table': 'nyc_taxi_trips_hudi_cow',
    'hoodie.datasource.hive_sync.partition_fields': 'trip_date',
    'hoodie.datasource.hive_sync.partition_extractor_class': 'org.apache.hudi.hive.MultiPartKeysValueExtractor',
    'hoodie.datasource.hive_sync.use_jdbc': 'false',
    'hoodie.datasource.hive_sync.support_timestamp': 'true',
}

hudi_options = {
    **hudi_table_identity,
    **hudi_write_settings,
    **hudi_hive_sync_settings,
}


### 1.6. Execute the upsert

In [None]:
# Write the combined insert + update dataframe to the Hudi table path in GCS using the
# options declared above (save mode "append"; the options enable Hive sync for the table)
(
    upsertTripDFCow.write
    .format("hudi")
    .options(**hudi_options)
    .mode("append")
    .save(HUDI_COW_BASE_GCS_URI)
)


### 1.7. Validate the insert

In [21]:
# Record count of the insert partition after the upsert, shown next to the count captured before it
TRIP_COUNT_AFTER_INSERT=spark.sql(f"select count(*) from taxi_db.nyc_taxi_trips_hudi_cow WHERE trip_date='{INSERT_TRIP_DATE}'").collect()[0][0]
print(f"Trip count before insert was {TRIP_COUNT_BEFORE_INSERT} and trip count after insert is {TRIP_COUNT_AFTER_INSERT}")


23/08/01 03:28:57 WARN GhfsStorageStatistics: Detected potential high latency for operation stream_read_operations. latencyMs=110; previousMaxLatencyMs=78; operationCount=912; context=gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-cow/.hoodie/20230801032820936.commit
                                                                                

Trip count before insert was 245530 and trip count after insert is 245531


In [22]:
# GCS parquet file listing of the insert partition after the upsert - note the extra file
# compared with the listing captured before the upsert
!gsutil ls -alh $HUDI_COW_BASE_GCS_URI/trip_date=$INSERT_TRIP_DATE


     373 B  2023-08-01T03:06:21Z  gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-cow/trip_date=2019-03-10/.hoodie_partition_metadata.parquet#1690859181660573  metageneration=1
  4.26 MiB  2023-08-01T03:28:40Z  gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-cow/trip_date=2019-03-10/340b03dc-b6e3-4113-b93a-c1d0e4f330e7-0_1-49-4155_20230801032820936.parquet#1690860520263515  metageneration=1
  4.26 MiB  2023-08-01T03:06:21Z  gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-cow/trip_date=2019-03-10/340b03dc-b6e3-4113-b93a-c1d0e4f330e7-0_868-19-9830_20230731203923454.parquet#1690859181667420  metageneration=1
   4.3 MiB  2023-08-01T03:06:21Z  gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-cow/trip_date=2019-03-10/659d68c7-734c-4604-9e09-339705926683-0_867-19-9829_20230731203923454.parquet#1690859181699337  metageneration=1
TOTAL: 4 objects, 13441377 bytes (12.82 MiB)


In [23]:
# Check that the inserted record exists and show the name of the file that holds it
spark.sql(
    "SELECT _hoodie_file_name,trip_id,taxi_type,vendor_id,pickup_datetime,dropoff_datetime,pickup_location_id,dropoff_location_id,trip_date "
    " FROM taxi_db.nyc_taxi_trips_hudi_cow "
    f" WHERE trip_date='{INSERT_TRIP_DATE}' AND trip_id={TO_BE_INSERTED_TRIP_ID}"
).show(truncate=False)


+--------------------------------------------------------------------------+-------------+---------+---------+-------------------+-------------------+------------------+-------------------+----------+
|_hoodie_file_name                                                         |trip_id      |taxi_type|vendor_id|pickup_datetime    |dropoff_datetime   |pickup_location_id|dropoff_location_id|trip_date |
+--------------------------------------------------------------------------+-------------+---------+---------+-------------------+-------------------+------------------+-------------------+----------+
|340b03dc-b6e3-4113-b93a-c1d0e4f330e7-0_1-49-4155_20230801032820936.parquet|1786706865009|yellow   |2        |2019-03-10 14:44:09|2019-03-10 14:59:03|141               |112                |2019-03-10|
+--------------------------------------------------------------------------+-------------+---------+---------+-------------------+-------------------+------------------+-------------------+-------

### 1.8. Validate the update

In [24]:
# Record count of the update partition after the upsert, shown next to the count captured before it
TRIP_COUNT_AFTER_UPDATE=spark.sql(f"select count(*) from taxi_db.nyc_taxi_trips_hudi_cow WHERE trip_date='{UPDATE_TRIP_DATE}'").collect()[0][0]
print(f"Trip count before update was: {TRIP_COUNT_BEFORE_UPDATE} and trip count after update is {TRIP_COUNT_AFTER_UPDATE}")

Trip count before update was: 289823 and trip count after update is 289823


In [25]:
# GCS parquet file listing of the update partition after the update
!gsutil ls -alh $HUDI_COW_BASE_GCS_URI/trip_date=$UPDATE_TRIP_DATE

     373 B  2023-08-01T03:06:20Z  gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-cow/trip_date=2019-01-15/.hoodie_partition_metadata.parquet#1690859180071337  metageneration=1
  4.22 MiB  2023-08-01T03:28:40Z  gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-cow/trip_date=2019-01-15/ab386b38-4ae1-4e57-b044-46bd2f79c14e-0_0-49-4154_20230801032820936.parquet#1690860520226622  metageneration=1
  4.22 MiB  2023-08-01T03:06:20Z  gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-cow/trip_date=2019-01-15/ab386b38-4ae1-4e57-b044-46bd2f79c14e-0_478-19-9440_20230731203923454.parquet#1690859180061839  metageneration=1
  4.22 MiB  2023-08-01T03:06:20Z  gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-cow/trip_date=2019-01-15/c186a3d2-5697-4367-bd66-0239178a1e4b-0_477-19-9439_20230731203923454.parquet#1690859180086698  metageneration=1
  1.69 MiB  2023-08-01T03:06:20Z  gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-cow/trip_date=2019-01-15/ed4c608d-c783-40af-a83e-e4

In [26]:
# Check for the update: re-read the record with its commit time and file name, to compare
# with the values captured before the upsert
spark.sql(
    "SELECT _hoodie_commit_time,_hoodie_file_name,trip_id,taxi_type,vendor_id,pickup_datetime,dropoff_datetime,pickup_location_id,dropoff_location_id,trip_date"
    " FROM taxi_db.nyc_taxi_trips_hudi_cow "
    f" WHERE trip_date='{UPDATE_TRIP_DATE}' AND trip_id={UPDATE_CANDIDATE_TRIP_ID}"
).show(truncate=False)

+-------------------+--------------------------------------------------------------------------+------------+---------+---------+-------------------+-------------------+------------------+-------------------+----------+
|_hoodie_commit_time|_hoodie_file_name                                                         |trip_id     |taxi_type|vendor_id|pickup_datetime    |dropoff_datetime   |pickup_location_id|dropoff_location_id|trip_date |
+-------------------+--------------------------------------------------------------------------+------------+---------+---------+-------------------+-------------------+------------------+-------------------+----------+
|20230801032820936  |ab386b38-4ae1-4e57-b044-46bd2f79c14e-0_0-49-4154_20230801032820936.parquet|695784702201|yellow   |1        |2019-01-15 11:00:30|2019-01-15 11:49:46|68                |141                |2019-01-15|
+-------------------+--------------------------------------------------------------------------+------------+---------+-

### 1.9. Study the commit log

In [27]:
# List the *.commit files under the table's .hoodie directory and keep only the last line
# of the listing; the shell output is captured as a list, so [0] is that single path.
# NOTE(review): assumes gsutil lists objects in name order, so the last entry is the most
# recent commit - confirm
LOG_FILE_LIST=!gsutil ls $HUDI_COW_BASE_GCS_URI/.hoodie/*.commit | tail -n 1 
LOG_FILE=LOG_FILE_LIST[0]
print(f"Log file FQP is {LOG_FILE}")

Log file FQP is gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-cow/.hoodie/20230801032820936.commit


Notice the insert and the update in the latest commit log

In [28]:
# Print the contents of the commit file selected above
!gsutil cat $LOG_FILE

{
  "partitionToWriteStats" : {
    "trip_date=2019-01-15" : [ {
      "fileId" : "ab386b38-4ae1-4e57-b044-46bd2f79c14e-0",
      "path" : "trip_date=2019-01-15/ab386b38-4ae1-4e57-b044-46bd2f79c14e-0_0-49-4154_20230801032820936.parquet",
      "prevCommit" : "20230731203923454",
      "numWrites" : 122745,
      "numDeletes" : 0,
      "numUpdateWrites" : 1,
      "numInserts" : 0,
      "totalWriteBytes" : 4424512,
      "totalWriteErrors" : 0,
      "tempPath" : null,
      "partitionPath" : "trip_date=2019-01-15",
      "totalLogRecords" : 0,
      "totalLogFilesCompacted" : 0,
      "totalLogSizeCompacted" : 0,
      "totalUpdatedRecordsCompacted" : 0,
      "totalLogBlocks" : 0,
      "totalCorruptLogBlock" : 0,
      "totalRollbackBlocks" : 0,
      "fileSizeInBytes" : 4424512,
      "minEventTime" : null,
      "maxEventTime" : null
    } ],
    "trip_date=2019-03-10" : [ {
      "fileId" : "340b03dc-b6e3-4113-b93a-c1d0e4f330e7-0",
      "path" : "trip_date=2019-03-10/340b03dc-b

This concludes the unit. Please proceed to the next notebook.