# Unit 4: UPSERT into COW tables
In this unit, we will learn upsert operations into COW tables.<br>
This unit takes about 5 minutes to complete.


### Initialize Spark Session

In [1]:
from pyspark.sql.functions import lit
from functools import reduce
from pyspark.sql.types import LongType
import pyspark.sql.functions as F
from datetime import datetime

spark = SparkSession.builder \
  .appName("Hudi-Learning-Unit-06-PySpark") \
  .master("yarn")\
  .enableHiveSupport()\
  .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.hudi.catalog.HoodieCatalog") \
  .config("spark.sql.extensions", "org.apache.spark.sql.hudi.HoodieSparkSessionExtension") \
  .getOrCreate()

spark

23/07/30 02:28:43 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


### Declare & define variables

In [2]:
PROJECT_ID_OUTPUT=!gcloud config get-value core/project
PROJECT_ID=PROJECT_ID_OUTPUT[0]
PROJECT_NBR_OUTPUT=!gcloud projects describe $PROJECT_ID --format="value(projectNumber)"
PROJECT_NBR=PROJECT_NBR_OUTPUT[0]
print(f"Project ID is {PROJECT_ID}")
print(f"Project Number is {PROJECT_NBR}")

PERSIST_TO_BUCKET = f"gs://gaia_data_bucket-{PROJECT_NBR}"
HUDI_COW_BASE_GCS_URI = f"{PERSIST_TO_BUCKET}/nyc-taxi-trips-hudi-cow"
DATABASE_NAME = "taxi_db"
COW_TABLE_NAME = "nyc_taxi_trips_hudi_cow"

Project ID is apache-hudi-lab
Project Number is 623600433888


## 1. Upsert into Hudi

Here we will learn how to an insert a new record & and update existing record(s) via the upsert operation.<br>
Just like we did in the previous exercise, we will take some existing record and increment the hour and use in this lab unit.<br>

Insert candidate trip_date to clone & morph: '2019-03-15' <br>
Update candidate trip_date to clone & morph: '2019-01-18'<br>

### 1.1. Trips to clone and use for the lab unit


In [3]:
INSERT_TRIP_DATE='2019-03-10'
INSERT_TRIP_DATE_PREDICATE=f"trip_date=\"{INSERT_TRIP_DATE}\""
INSERT_CLONE_CANDIDATE_TRIP_ID=spark.sql(f"select trip_id  from {DATABASE_NAME}.{COW_TABLE_NAME} WHERE {INSERT_TRIP_DATE_PREDICATE} LIMIT 1").collect()[0][0]
print(f"INSERT_CLONE_CANDIDATE_TRIP_ID: {INSERT_CLONE_CANDIDATE_TRIP_ID}")
INSERT_CANDIDATE_PREDICATES=f"{INSERT_TRIP_DATE_PREDICATE} AND trip_id={INSERT_CLONE_CANDIDATE_TRIP_ID}"
print(f"INSERT_CANDIDATE_PREDICATES: {INSERT_CANDIDATE_PREDICATES}")

UPDATE_TRIP_DATE='2019-01-15'
UPDATE_TRIP_DATE_PREDICATE=f"trip_date=\"{UPDATE_TRIP_DATE}\""
UPDATE_CANDIDATE_TRIP_ID=spark.sql(f"select trip_id  from {DATABASE_NAME}.{COW_TABLE_NAME} WHERE {UPDATE_TRIP_DATE_PREDICATE} LIMIT 1").collect()[0][0]
print(f"UPDATE_CLONE_CANDIDATE_TRIP_ID: {UPDATE_CANDIDATE_TRIP_ID}")
UPDATE_CANDIDATE_PREDICATES=f"{UPDATE_TRIP_DATE_PREDICATE} AND trip_id={UPDATE_CANDIDATE_TRIP_ID}"
print(f"UPDATE_CANDIDATE_PREDICATES: {UPDATE_CANDIDATE_PREDICATES}")

ivysettings.xml file not found in HIVE_HOME or HIVE_CONF_DIR,/etc/hive/conf.dist/ivysettings.xml will be used
                                                                                

INSERT_CLONE_CANDIDATE_TRIP_ID: 1760936629794
INSERT_CANDIDATE_PREDICATES: trip_date="2019-03-10" AND trip_id=1760936629794


[Stage 4:>                                                          (0 + 1) / 1]

UPDATE_CLONE_CANDIDATE_TRIP_ID: 83
UPDATE_CANDIDATE_PREDICATES: trip_date="2019-01-15" AND trip_id=83


                                                                                

### 1.2. Generate unique Trip ID for the insert 

In [4]:
TO_BE_INSERTED_TRIP_ID=spark.sql(f"SELECT max(trip_id) AS max_trip_id FROM {DATABASE_NAME}.{COW_TABLE_NAME} WHERE {INSERT_TRIP_DATE_PREDICATE}").collect()[0][0] + 1
print(f"Insert trip ID is: {TO_BE_INSERTED_TRIP_ID}")



Insert trip ID is: 1786706865009


                                                                                

### 1.3. Insert dataframe creation

In [5]:
# Original record
insertCandidateTripDFCow=spark.sql(f"SELECT * FROM {DATABASE_NAME}.{COW_TABLE_NAME} WHERE {INSERT_CANDIDATE_PREDICATES}")
insertCandidateTripDFCow.show()


23/07/30 02:29:19 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
[Stage 9:>                                                          (0 + 1) / 1]

+-------------------+--------------------+------------------+----------------------+--------------------+---------+---------+----------+--------+---------+-----------+---------+-------------------+-------------------+-----------------+---------+------------------+-------------------+---------------+-------------+------------+---------+-----------+----------+------------+---------------------+------------+-----------------+--------------------+---------+---------+--------------+------------------------+--------------------+-------------+----------+
|_hoodie_commit_time|_hoodie_commit_seqno|_hoodie_record_key|_hoodie_partition_path|   _hoodie_file_name|taxi_type|trip_year|trip_month|trip_day|trip_hour|trip_minute|vendor_id|    pickup_datetime|   dropoff_datetime|store_and_forward|rate_code|pickup_location_id|dropoff_location_id|passenger_count|trip_distance| fare_amount|surcharge|    mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|payment_type_code|congestion_surcharge

                                                                                

In [6]:
# Generate record to insert based off of the above record - increment all the date column by 5 hours
insertTripDFCow = insertCandidateTripDFCow.withColumn('pickup_datetime', insertCandidateTripDFCow.pickup_datetime + F.expr('INTERVAL 5 HOURS')) \
                                    .withColumn('dropoff_datetime', insertCandidateTripDFCow.dropoff_datetime + F.expr('INTERVAL 5 HOURS')) \
                                    .withColumn('trip_hour', insertCandidateTripDFCow.trip_hour + 5) \
                                    .withColumn('trip_id', lit(TO_BE_INSERTED_TRIP_ID)) \
                                    .drop("_hoodie_commit_time") \
                                    .drop("_hoodie_commit_seqno") \
                                    .drop("_hoodie_record_key") \
                                    .drop("_hoodie_trip_date") \
                                    .drop("_hoodie_file_name")

insertTripDFCow.show(truncate=False)


[Stage 11:>                                                         (0 + 1) / 1]

+----------------------+---------+---------+----------+--------+---------+-----------+---------+-------------------+-------------------+-----------------+---------+------------------+-------------------+---------------+-------------+------------+---------+-----------+----------+------------+---------------------+------------+-----------------+--------------------+---------+---------+--------------+------------------------+--------------------+-------------+----------+
|_hoodie_partition_path|taxi_type|trip_year|trip_month|trip_day|trip_hour|trip_minute|vendor_id|pickup_datetime    |dropoff_datetime   |store_and_forward|rate_code|pickup_location_id|dropoff_location_id|passenger_count|trip_distance|fare_amount |surcharge|mta_tax    |tip_amount|tolls_amount|improvement_surcharge|total_amount|payment_type_code|congestion_surcharge|trip_type|ehail_fee|partition_date|distance_between_service|time_between_service|trip_id      |trip_date |
+----------------------+---------+---------+----------

                                                                                

In [7]:
# Original record with just a few fields
spark.sql(f"SELECT trip_id,taxi_type,vendor_id,pickup_datetime,dropoff_datetime,pickup_location_id,dropoff_location_id,trip_date " \
          f" FROM {DATABASE_NAME}.{COW_TABLE_NAME} "\
          f" WHERE {INSERT_CANDIDATE_PREDICATES}") \
        .show(truncate=False)

+-------------+---------+---------+-------------------+-------------------+------------------+-------------------+----------+
|trip_id      |taxi_type|vendor_id|pickup_datetime    |dropoff_datetime   |pickup_location_id|dropoff_location_id|trip_date |
+-------------+---------+---------+-------------------+-------------------+------------------+-------------------+----------+
|1760936629794|green    |2        |2019-03-10 09:26:21|2019-03-10 09:50:20|177               |89                 |2019-03-10|
+-------------+---------+---------+-------------------+-------------------+------------------+-------------------+----------+



In [8]:
# The record we want to insert - note its pickup_datetime and dropoff_datetime are different
insertTripDFCow.select("trip_id","taxi_type","vendor_id","pickup_datetime","dropoff_datetime","pickup_location_id","dropoff_location_id","trip_date") \
               .show(truncate=False)


[Stage 15:>                                                         (0 + 1) / 1]

+-------------+---------+---------+-------------------+-------------------+------------------+-------------------+----------+
|trip_id      |taxi_type|vendor_id|pickup_datetime    |dropoff_datetime   |pickup_location_id|dropoff_location_id|trip_date |
+-------------+---------+---------+-------------------+-------------------+------------------+-------------------+----------+
|1786706865009|green    |2        |2019-03-10 14:26:21|2019-03-10 14:50:20|177               |89                 |2019-03-10|
+-------------+---------+---------+-------------------+-------------------+------------------+-------------------+----------+



                                                                                

### 1.4. Update record generation

In [9]:
# Original record
updateCandidateTripDFCow=spark.sql(f"SELECT * FROM {DATABASE_NAME}.{COW_TABLE_NAME} WHERE {UPDATE_CANDIDATE_PREDICATES}")
updateCandidateTripDFCow.show()


+-------------------+--------------------+------------------+----------------------+--------------------+---------+---------+----------+--------+---------+-----------+---------+-------------------+-------------------+-----------------+---------+------------------+-------------------+---------------+-------------+------------+-----------+-----------+------------+------------+---------------------+------------+-----------------+--------------------+---------+---------+--------------+------------------------+--------------------+-------+----------+
|_hoodie_commit_time|_hoodie_commit_seqno|_hoodie_record_key|_hoodie_partition_path|   _hoodie_file_name|taxi_type|trip_year|trip_month|trip_day|trip_hour|trip_minute|vendor_id|    pickup_datetime|   dropoff_datetime|store_and_forward|rate_code|pickup_location_id|dropoff_location_id|passenger_count|trip_distance| fare_amount|  surcharge|    mta_tax|  tip_amount|tolls_amount|improvement_surcharge|total_amount|payment_type_code|congestion_surchar

In [10]:
# Generate dataframe that updates the above record
updateTripDFCow = updateCandidateTripDFCow.withColumn('pickup_datetime', updateCandidateTripDFCow.pickup_datetime + F.expr('INTERVAL 1 HOURS')) \
                                    .withColumn('dropoff_datetime', updateCandidateTripDFCow.dropoff_datetime + F.expr('INTERVAL 1 HOURS')) \
                                    .withColumn('trip_hour', updateCandidateTripDFCow.trip_hour + 5) \
                                    .drop("_hoodie_commit_time") \
                                    .drop("_hoodie_commit_seqno") \
                                    .drop("_hoodie_record_key") \
                                    .drop("_hoodie_trip_date") \
                                    .drop("_hoodie_file_name")

# The full record we will update
updateTripDFCow.show(truncate=False)


                                                                                

+----------------------+---------+---------+----------+--------+---------+-----------+---------+-------------------+-------------------+-----------------+---------+------------------+-------------------+---------------+-------------+------------+-----------+-----------+------------+------------+---------------------+------------+-----------------+--------------------+---------+---------+--------------+------------------------+--------------------+-------+----------+
|_hoodie_partition_path|taxi_type|trip_year|trip_month|trip_day|trip_hour|trip_minute|vendor_id|pickup_datetime    |dropoff_datetime   |store_and_forward|rate_code|pickup_location_id|dropoff_location_id|passenger_count|trip_distance|fare_amount |surcharge  |mta_tax    |tip_amount  |tolls_amount|improvement_surcharge|total_amount|payment_type_code|congestion_surcharge|trip_type|ehail_fee|partition_date|distance_between_service|time_between_service|trip_id|trip_date |
+----------------------+---------+---------+----------+---

                                                                                

In [11]:
# Original record prior to update - just a few columns for readbility
spark.sql(f"SELECT _hoodie_commit_time,_hoodie_commit_seqno,trip_id,taxi_type,vendor_id,pickup_datetime,dropoff_datetime,pickup_location_id,dropoff_location_id,trip_date " \
          f" FROM {DATABASE_NAME}.{COW_TABLE_NAME} "\
          f" WHERE {UPDATE_CANDIDATE_PREDICATES}") \
        .show(truncate=False)


+-------------------+-----------------------+-------+---------+---------+-------------------+-------------------+------------------+-------------------+----------+
|_hoodie_commit_time|_hoodie_commit_seqno   |trip_id|taxi_type|vendor_id|pickup_datetime    |dropoff_datetime   |pickup_location_id|dropoff_location_id|trip_date |
+-------------------+-----------------------+-------+---------+---------+-------------------+-------------------+------------------+-------------------+----------+
|20230729053557449  |20230729053557449_478_0|83     |yellow   |2        |2019-01-15 10:18:19|2019-01-15 10:19:23|229               |229                |2019-01-15|
+-------------------+-----------------------+-------+---------+---------+-------------------+-------------------+------------------+-------------------+----------+



In [12]:
# Updated details of the above record - note its pickup_datetime and dropoff_datetime are different
updateTripDFCow.select("trip_id","taxi_type","vendor_id","pickup_datetime","dropoff_datetime","pickup_location_id","dropoff_location_id","trip_date") \
               .show(truncate=False)


+-------+---------+---------+-------------------+-------------------+------------------+-------------------+----------+
|trip_id|taxi_type|vendor_id|pickup_datetime    |dropoff_datetime   |pickup_location_id|dropoff_location_id|trip_date |
+-------+---------+---------+-------------------+-------------------+------------------+-------------------+----------+
|83     |yellow   |2        |2019-01-15 11:18:19|2019-01-15 11:19:23|229               |229                |2019-01-15|
+-------+---------+---------+-------------------+-------------------+------------------+-------------------+----------+



### 1.5. Prepare to upsert

In [13]:
# Lets union the dataframes
upsertTripDFCow = insertTripDFCow.union(updateTripDFCow)
# Quick visual 
upsertTripDFCow.show(truncate=False)




+----------------------+---------+---------+----------+--------+---------+-----------+---------+-------------------+-------------------+-----------------+---------+------------------+-------------------+---------------+-------------+------------+-----------+-----------+------------+------------+---------------------+------------+-----------------+--------------------+---------+---------+--------------+------------------------+--------------------+-------------+----------+
|_hoodie_partition_path|taxi_type|trip_year|trip_month|trip_day|trip_hour|trip_minute|vendor_id|pickup_datetime    |dropoff_datetime   |store_and_forward|rate_code|pickup_location_id|dropoff_location_id|passenger_count|trip_distance|fare_amount |surcharge  |mta_tax    |tip_amount  |tolls_amount|improvement_surcharge|total_amount|payment_type_code|congestion_surcharge|trip_type|ehail_fee|partition_date|distance_between_service|time_between_service|trip_id      |trip_date |
+----------------------+---------+---------+--

                                                                                

In [14]:
# Capture record count before insert
TRIP_COUNT_BEFORE_INSERT=spark.sql(f"select count(*)  from {DATABASE_NAME}.{COW_TABLE_NAME} WHERE {INSERT_TRIP_DATE_PREDICATE}").collect()[0][0]
print(f"Trip count before insert: {TRIP_COUNT_BEFORE_INSERT}")


Trip count before insert: 245530


In [15]:
# Capture GCS parquet file listing prior to insert
!gsutil ls -alh $HUDI_COW_BASE_GCS_URI/trip_date=$INSERT_TRIP_DATE


     373 B  2023-07-30T01:59:34Z  gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-cow/trip_date=2019-03-10/.hoodie_partition_metadata.parquet#1690682374925116  metageneration=1
  4.26 MiB  2023-07-30T01:59:34Z  gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-cow/trip_date=2019-03-10/7d98900a-981b-45a0-a5bf-bcbacbef7337-0_868-19-9830_20230729053557449.parquet#1690682374925584  metageneration=1
   4.3 MiB  2023-07-30T01:59:34Z  gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-cow/trip_date=2019-03-10/adec5232-a984-4bd2-8e20-948dcf45b651-0_867-19-9829_20230729053557449.parquet#1690682374945161  metageneration=1
TOTAL: 3 objects, 8984286 bytes (8.57 MiB)


In [16]:
# Capture record count before update
TRIP_COUNT_BEFORE_UPDATE=spark.sql(f"select count(*)  from {DATABASE_NAME}.{COW_TABLE_NAME} WHERE {UPDATE_TRIP_DATE_PREDICATE}").collect()[0][0]
print(f"Trip count before update: {TRIP_COUNT_BEFORE_UPDATE}")

Trip count before update: 289823


In [17]:
# Capture GCS parquet file listing prior to update
!gsutil ls -alh $HUDI_COW_BASE_GCS_URI/trip_date=$UPDATE_TRIP_DATE


     373 B  2023-07-30T01:59:33Z  gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-cow/trip_date=2019-01-15/.hoodie_partition_metadata.parquet#1690682373367806  metageneration=1
  1.69 MiB  2023-07-30T01:59:33Z  gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-cow/trip_date=2019-01-15/02882cd4-4d59-4b22-9a13-797c96667c99-0_479-19-9441_20230729053557449.parquet#1690682373389182  metageneration=1
  4.23 MiB  2023-07-30T01:59:33Z  gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-cow/trip_date=2019-01-15/1a4b14d7-2419-4151-8474-de33d473db5f-0_478-19-9440_20230729053557449.parquet#1690682373423858  metageneration=1
  4.22 MiB  2023-07-30T01:59:33Z  gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-cow/trip_date=2019-01-15/e8680840-8f56-452f-83b6-77acd4227926-0_477-19-9439_20230729053557449.parquet#1690682373392460  metageneration=1
TOTAL: 4 objects, 10638261 bytes (10.15 MiB)


In [18]:
# Capture original record prior to update
spark.sql(f"SELECT _hoodie_commit_time,_hoodie_file_name,trip_id,taxi_type,vendor_id,pickup_datetime,dropoff_datetime,pickup_location_id,dropoff_location_id,trip_date " \
          f" FROM {DATABASE_NAME}.{COW_TABLE_NAME} "\
          f" WHERE {UPDATE_CANDIDATE_PREDICATES}") \
        .show(truncate=False)

+-------------------+----------------------------------------------------------------------------+-------+---------+---------+-------------------+-------------------+------------------+-------------------+----------+
|_hoodie_commit_time|_hoodie_file_name                                                           |trip_id|taxi_type|vendor_id|pickup_datetime    |dropoff_datetime   |pickup_location_id|dropoff_location_id|trip_date |
+-------------------+----------------------------------------------------------------------------+-------+---------+---------+-------------------+-------------------+------------------+-------------------+----------+
|20230729053557449  |1a4b14d7-2419-4151-8474-de33d473db5f-0_478-19-9440_20230729053557449.parquet|83     |yellow   |2        |2019-01-15 10:18:19|2019-01-15 10:19:23|229               |229                |2019-01-15|
+-------------------+----------------------------------------------------------------------------+-------+---------+---------+------

In [19]:
# HUDI options for the upsert operation
hudi_options = {
            'hoodie.database.name': DATABASE_NAME,
            'hoodie.table.name': COW_TABLE_NAME,
            'hoodie.datasource.write.table.name': COW_TABLE_NAME,
            'hoodie.datasource.write.table.type': 'COPY_ON_WRITE',
            'hoodie.datasource.write.recordkey.field': 'trip_id',
            'hoodie.datasource.write.partitionpath.field': 'trip_date',
            'hoodie.datasource.write.precombine.field': 'pickup_datetime',
            'hoodie.datasource.write.hive_style_partitioning': 'true',
            'hoodie.partition.metafile.use.base.format': 'true', 
            'hoodie.datasource.write.drop.partition.columns': 'true',
            'hoodie.datasource.write.operation': 'upsert'   
        }


### 1.6. Execute the upsert

In [20]:
# Append to dataset in GCS
upsertTripDFCow.write.format("hudi"). \
                options(**hudi_options). \
                mode("append"). \
                save(HUDI_COW_BASE_GCS_URI)


23/07/30 02:29:49 WARN GhfsStorageStatistics: Detected potential high latency for operation stream_write_close_operations. latencyMs=109; previousMaxLatencyMs=0; operationCount=1; context=gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-cow/.hoodie/20230730022946927.deltacommit.requested
23/07/30 02:30:00 WARN GhfsStorageStatistics: Detected potential high latency for operation op_get_file_status. latencyMs=290; previousMaxLatencyMs=286; operationCount=46; context=gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-cow/.hoodie/.aux/.bootstrap/.partitions/00000000-0000-0000-0000-000000000000-0_1-0-1_00000000000001.hfile
23/07/30 02:30:00 WARN GhfsStorageStatistics: Detected potential high latency for operation op_create. latencyMs=449; previousMaxLatencyMs=94; operationCount=3; context=gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-cow/.hoodie/20230730022946927.deltacommit.inflight
23/07/30 02:30:01 WARN GhfsStorageStatistics: Detected potential high latency for operati

In [21]:
# Refresh Hive Metsatore Metadata
spark.sql(f"REFRESH TABLE {DATABASE_NAME}.{COW_TABLE_NAME};").show(truncate=False)


++
||
++
++



### 1.7. Validate the insert

In [22]:
print(f"Trip count before insert was: {TRIP_COUNT_BEFORE_INSERT}")


Trip count before insert was: 245530


In [23]:
# Record count after insert
TRIP_COUNT_AFTER_INSERT=spark.sql(f"select count(*) from {DATABASE_NAME}.{COW_TABLE_NAME} WHERE {INSERT_TRIP_DATE_PREDICATE}").collect()[0][0]
print(f"Trip count after insert: {TRIP_COUNT_AFTER_INSERT}")


                                                                                

Trip count after insert: 245531


In [24]:
# GCS parquet file listing after insert - note the extra file
!gsutil ls -alh $HUDI_COW_BASE_GCS_URI/trip_date=$INSERT_TRIP_DATE


     373 B  2023-07-30T01:59:34Z  gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-cow/trip_date=2019-03-10/.hoodie_partition_metadata.parquet#1690682374925116  metageneration=1
  4.26 MiB  2023-07-30T02:30:06Z  gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-cow/trip_date=2019-03-10/7d98900a-981b-45a0-a5bf-bcbacbef7337-0_1-48-4123_20230730022946927.parquet#1690684206281955  metageneration=1
  4.26 MiB  2023-07-30T01:59:34Z  gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-cow/trip_date=2019-03-10/7d98900a-981b-45a0-a5bf-bcbacbef7337-0_868-19-9830_20230729053557449.parquet#1690682374925584  metageneration=1
   4.3 MiB  2023-07-30T01:59:34Z  gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-cow/trip_date=2019-03-10/adec5232-a984-4bd2-8e20-948dcf45b651-0_867-19-9829_20230729053557449.parquet#1690682374945161  metageneration=1
TOTAL: 4 objects, 13454278 bytes (12.83 MiB)


In [25]:
# Check for existence of the record and the filename in which it exists
spark.sql(f"SELECT _hoodie_file_name,trip_id,taxi_type,vendor_id,pickup_datetime,dropoff_datetime,pickup_location_id,dropoff_location_id,trip_date " \
          f" FROM {DATABASE_NAME}.{COW_TABLE_NAME} "\
          f" WHERE {INSERT_TRIP_DATE_PREDICATE} AND trip_id={TO_BE_INSERTED_TRIP_ID}") \
        .show(truncate=False)


                                                                                

+--------------------------------------------------------------------------+-------------+---------+---------+-------------------+-------------------+------------------+-------------------+----------+
|_hoodie_file_name                                                         |trip_id      |taxi_type|vendor_id|pickup_datetime    |dropoff_datetime   |pickup_location_id|dropoff_location_id|trip_date |
+--------------------------------------------------------------------------+-------------+---------+---------+-------------------+-------------------+------------------+-------------------+----------+
|7d98900a-981b-45a0-a5bf-bcbacbef7337-0_1-48-4123_20230730022946927.parquet|1786706865009|green    |2        |2019-03-10 14:26:21|2019-03-10 14:50:20|177               |89                 |2019-03-10|
+--------------------------------------------------------------------------+-------------+---------+---------+-------------------+-------------------+------------------+-------------------+-------

### 1.8. Validate the update

In [26]:
print(f"Trip count before update was: {TRIP_COUNT_BEFORE_UPDATE}")

Trip count before update was: 289823


In [31]:
# Record count after update
TRIP_COUNT_AFTER_UPDATE=spark.sql(f"select count(*) from {DATABASE_NAME}.{COW_TABLE_NAME} WHERE {UPDATE_TRIP_DATE_PREDICATE}").collect()[0][0]
print(f"Trip count after update: {TRIP_COUNT_AFTER_UPDATE}")

Trip count after update: 289822


In [28]:
# GCS parquet file listing after insert
!gsutil ls -alh $HUDI_COW_BASE_GCS_URI/trip_date=$UPDATE_TRIP_DATE

  4.72 KiB  2023-07-30T02:30:03Z  gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-cow/trip_date=2019-01-15/.1a4b14d7-2419-4151-8474-de33d473db5f-0_20230729053557449.log.1_0-48-4122#1690684203324738  metageneration=1
     373 B  2023-07-30T01:59:33Z  gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-cow/trip_date=2019-01-15/.hoodie_partition_metadata.parquet#1690682373367806  metageneration=1
  1.69 MiB  2023-07-30T01:59:33Z  gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-cow/trip_date=2019-01-15/02882cd4-4d59-4b22-9a13-797c96667c99-0_479-19-9441_20230729053557449.parquet#1690682373389182  metageneration=1
  4.23 MiB  2023-07-30T01:59:33Z  gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-cow/trip_date=2019-01-15/1a4b14d7-2419-4151-8474-de33d473db5f-0_478-19-9440_20230729053557449.parquet#1690682373423858  metageneration=1
  4.22 MiB  2023-07-30T01:59:33Z  gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-cow/trip_date=2019-01-15/e8680840-8f56-452f-83b6-77a

In [None]:
# Check for update
spark.sql(f"SELECT _hoodie_commit_time,_hoodie_file_name,trip_id,taxi_type,vendor_id,pickup_datetime,dropoff_datetime,pickup_location_id,dropoff_location_id,trip_date" \
          f" FROM {DATABASE_NAME}.{COW_TABLE_NAME} "\
          f" WHERE {UPDATE_TRIP_DATE_PREDICATE} AND trip_id={UPDATE_CANDIDATE_TRIP_ID}") \
        .show(truncate=False)

NameError: name 'DATABASE_NAME' is not defined

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/usr/lib/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/lib/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/lib/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving
