# Unit 3: INSERT INTO COW tables

In this unit, we will learn INSERT operations into COW tables.<br>


This unit takes about 5 minutes to complete.

In [1]:
from pyspark.sql.functions import lit
from functools import reduce
from pyspark.sql.types import LongType
import pyspark.sql.functions as F
from datetime import datetime

### Initialize Spark Session

In [2]:
spark = SparkSession.builder \
  .appName("Hudi-Learning-Unit-05-PySpark") \
  .master("yarn")\
  .enableHiveSupport()\
  .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
  .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.hudi.catalog.HoodieCatalog") \
  .config("spark.sql.extensions", "org.apache.spark.sql.hudi.HoodieSparkSessionExtension") \
  .getOrCreate()

23/08/01 03:21:50 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [3]:
spark

### Declare & define variables

In [4]:
PROJECT_ID_OUTPUT=!gcloud config get-value core/project
PROJECT_ID=PROJECT_ID_OUTPUT[0]
PROJECT_NBR_OUTPUT=!gcloud projects describe $PROJECT_ID --format="value(projectNumber)"
PROJECT_NBR=PROJECT_NBR_OUTPUT[0]
LOCATION = "us-central1" #Replace with your GCP region

HUDI_COW_BASE_GCS_URI = f"gs://gaia_data_bucket-{PROJECT_NBR}/nyc-taxi-trips-hudi-cow"
TRIP_DATE="2020-01-30"
DATAPROC_METASTORE_THRIFT_URI_LIST=!gcloud metastore services list --location $LOCATION | grep thrift | cut -d' ' -f11
DATAPROC_METASTORE_THRIFT_URI=DATAPROC_METASTORE_THRIFT_URI_LIST[0]

print(f"Project ID is {PROJECT_ID}")
print(f"Project number is {PROJECT_NBR}")
print(f"Project location is is {LOCATION}")
print(f"Dataproc Metastore Service thrift URI is {DATAPROC_METASTORE_THRIFT_URI}")
print(f"Trip date partition we will insert into is {TRIP_DATE}")

Project ID is apache-hudi-lab
Project number is 623600433888
Project location is is us-central1
Dataproc Metastore Service thrift URI is thrift://10.60.192.28:9080
Trip date partition we will insert into is 2020-01-30


**Note**: Ensure you have the right URI for Dataproc Metastore

## 1. [HUDI INSERT FEATURE] Insert into CoW table

### 1.1. Determine trip ID to clone

In [5]:
ORIGINAL_TRIP_ID=spark.sql(f"select trip_id  from taxi_db.nyc_taxi_trips_hudi_cow WHERE trip_date='2020-01-30' AND trip_hour < 12 LIMIT 1").collect()[0][0]
print(f"ID of the trip cloned: {ORIGINAL_TRIP_ID}")

ivysettings.xml file not found in HIVE_HOME or HIVE_CONF_DIR,/etc/hive/conf.dist/ivysettings.xml will be used
23/08/01 03:21:56 WARN GhfsStorageStatistics: Detected potential high latency for operation op_open. latencyMs=131; previousMaxLatencyMs=0; operationCount=1; context=gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-cow/.hoodie/hoodie.properties
[Stage 3:>                                                          (0 + 1) / 1]

ID of the trip cloned: 206158430212


                                                                                

### 1.2. Review the Hudi metadata

In [6]:
!gsutil cat $HUDI_COW_BASE_GCS_URI/.hoodie/hoodie.properties

#Properties saved on 2023-07-31T21:14:19.970133Z
#Mon Jul 31 21:14:19 UTC 2023
hoodie.table.type=COPY_ON_WRITE
hoodie.table.metadata.partitions=files
hoodie.table.precombine.field=pickup_datetime
hoodie.table.partition.fields=trip_date
hoodie.archivelog.folder=archived
hoodie.table.create.schema={"type"\:"record","name"\:"topLevelRecord","fields"\:[{"name"\:"_hoodie_commit_time","type"\:["string","null"]},{"name"\:"_hoodie_commit_seqno","type"\:["string","null"]},{"name"\:"_hoodie_record_key","type"\:["string","null"]},{"name"\:"_hoodie_partition_path","type"\:["string","null"]},{"name"\:"_hoodie_file_name","type"\:["string","null"]},{"name"\:"taxi_type","type"\:["string","null"]},{"name"\:"trip_year","type"\:["int","null"]},{"name"\:"trip_month","type"\:["int","null"]},{"name"\:"trip_day","type"\:["int","null"]},{"name"\:"trip_hour","type"\:["int","null"]},{"name"\:"trip_minute","type"\:["int","null"]},{"name"\:"vendor_id","type"\:["string","null"]},{"name"\:"pickup_datetime","type"\:

### 1.3. Study the data

#### a) Layout and size

In [7]:
# File system layout, files, types & counts by types, byte sizes
!gsutil ls -alh $HUDI_COW_BASE_GCS_URI/trip_date=2020-01-30/

     373 B  2023-08-01T03:06:31Z  gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-cow/trip_date=2020-01-30/.hoodie_partition_metadata.parquet#1690859191405842  metageneration=1
  7.91 MiB  2023-08-01T03:06:31Z  gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-cow/trip_date=2020-01-30/4ae6b6ec-c809-4f6e-8e7b-f4afee169f1f-0_192-38-12676_20230731205659112.parquet#1690859191443082  metageneration=1
TOTAL: 2 objects, 8290683 bytes (7.91 MiB)


#### b) Record count

In [8]:
print(f"Trip Date: {TRIP_DATE}")
ORIGINAL_TRIP_COUNT=spark.sql(f"SELECT count(*) as trip_count from taxi_db.nyc_taxi_trips_hudi_cow where trip_date='2020-01-30'").collect()[0][0]
print(f"Original Trip Count: {ORIGINAL_TRIP_COUNT}")

Trip Date: 2020-01-30




Original Trip Count: 257927


                                                                                

### 1.4. Create a record / taxi trip that we will use for our insert trial
We'll grab a record and change the hour of pickup and dropoff to be 5 hours later:<br>

#### 1.4.1. Generate a new trip ID to use for the record

In [9]:
NEW_TRIP_ID=spark.sql(f"select max(trip_id) as max_trip_id from taxi_db.nyc_taxi_trips_hudi_cow WHERE trip_date='2020-01-30'").collect()[0][0] + 1
print(f"New trip ID is: {NEW_TRIP_ID}")



New trip ID is: 309237772005


                                                                                

#### 1.4.2. Identify a record to use that we will morph and insert

In [10]:
# This query returns exactly one record
candidateTripDFCow=spark.sql(f"SELECT * FROM taxi_db.nyc_taxi_trips_hudi_cow WHERE trip_date='2020-01-30' AND trip_id={ORIGINAL_TRIP_ID}")
candidateTripDFCow.show(truncate=False)

23/08/01 03:22:30 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

+-------------------+-----------------------+------------------+----------------------+-----------------------------------------------------------------------------+---------+---------+----------+--------+---------+-----------+---------+-------------------+-------------------+-----------------+---------+------------------+-------------------+---------------+-------------+-----------+-----------+-----------+-----------+------------+---------------------+------------+-----------------+--------------------+---------+---------+--------------+------------------------+--------------------+------------+----------+
|_hoodie_commit_time|_hoodie_commit_seqno   |_hoodie_record_key|_hoodie_partition_path|_hoodie_file_name                                                            |taxi_type|trip_year|trip_month|trip_day|trip_hour|trip_minute|vendor_id|pickup_datetime    |dropoff_datetime   |store_and_forward|rate_code|pickup_location_id|dropoff_location_id|passenger_count|trip_distance|fare_amount

In [11]:
candidateTripDFCow.printSchema

<bound method DataFrame.printSchema of DataFrame[_hoodie_commit_time: string, _hoodie_commit_seqno: string, _hoodie_record_key: string, _hoodie_partition_path: string, _hoodie_file_name: string, taxi_type: string, trip_year: int, trip_month: int, trip_day: int, trip_hour: int, trip_minute: int, vendor_id: string, pickup_datetime: timestamp, dropoff_datetime: timestamp, store_and_forward: string, rate_code: string, pickup_location_id: string, dropoff_location_id: string, passenger_count: bigint, trip_distance: decimal(38,9), fare_amount: decimal(38,9), surcharge: decimal(38,9), mta_tax: decimal(38,9), tip_amount: decimal(38,9), tolls_amount: decimal(38,9), improvement_surcharge: decimal(10,0), total_amount: decimal(38,9), payment_type_code: string, congestion_surcharge: decimal(10,0), trip_type: string, ehail_fee: decimal(10,0), partition_date: date, distance_between_service: decimal(38,9), time_between_service: bigint, trip_id: bigint, trip_date: string]>

In [12]:
insertTripDFCow = candidateTripDFCow.withColumn('pickup_datetime', candidateTripDFCow.pickup_datetime + F.expr('INTERVAL 5 HOURS')) \
                                    .withColumn('dropoff_datetime', candidateTripDFCow.dropoff_datetime + F.expr('INTERVAL 5 HOURS')) \
                                    .withColumn('trip_hour', candidateTripDFCow.trip_hour + 5) \
                                    .withColumn('trip_id', lit(NEW_TRIP_ID)) \
                                    .drop("_hoodie_commit_time") \
                                    .drop("_hoodie_commit_seqno") \
                                    .drop("_hoodie_record_key") \
                                    .drop("_hoodie_partition_path") \
                                    .drop("_hoodie_file_name")

In [13]:
insertTripDFCow.show(truncate=False)

                                                                                

+---------+---------+----------+--------+---------+-----------+---------+-------------------+-------------------+-----------------+---------+------------------+-------------------+---------------+-------------+-----------+-----------+-----------+-----------+------------+---------------------+------------+-----------------+--------------------+---------+---------+--------------+------------------------+--------------------+------------+----------+
|taxi_type|trip_year|trip_month|trip_day|trip_hour|trip_minute|vendor_id|pickup_datetime    |dropoff_datetime   |store_and_forward|rate_code|pickup_location_id|dropoff_location_id|passenger_count|trip_distance|fare_amount|surcharge  |mta_tax    |tip_amount |tolls_amount|improvement_surcharge|total_amount|payment_type_code|congestion_surcharge|trip_type|ehail_fee|partition_date|distance_between_service|time_between_service|trip_id     |trip_date |
+---------+---------+----------+--------+---------+-----------+---------+-------------------+-----

In [14]:
insertTripDFCow.printSchema

<bound method DataFrame.printSchema of DataFrame[taxi_type: string, trip_year: int, trip_month: int, trip_day: int, trip_hour: int, trip_minute: int, vendor_id: string, pickup_datetime: timestamp, dropoff_datetime: timestamp, store_and_forward: string, rate_code: string, pickup_location_id: string, dropoff_location_id: string, passenger_count: bigint, trip_distance: decimal(38,9), fare_amount: decimal(38,9), surcharge: decimal(38,9), mta_tax: decimal(38,9), tip_amount: decimal(38,9), tolls_amount: decimal(38,9), improvement_surcharge: decimal(10,0), total_amount: decimal(38,9), payment_type_code: string, congestion_surcharge: decimal(10,0), trip_type: string, ehail_fee: decimal(10,0), partition_date: date, distance_between_service: decimal(38,9), time_between_service: bigint, trip_id: bigint, trip_date: string]>

In [15]:
# Original record
spark.sql(f"SELECT trip_id,taxi_type,vendor_id,pickup_datetime,dropoff_datetime,pickup_location_id,dropoff_location_id,trip_date " \
          f" FROM taxi_db.nyc_taxi_trips_hudi_cow "\
          f" WHERE trip_date='2020-01-30' AND trip_id={ORIGINAL_TRIP_ID}") \
        .show(truncate=False)

                                                                                

+------------+---------+---------+-------------------+-------------------+------------------+-------------------+----------+
|trip_id     |taxi_type|vendor_id|pickup_datetime    |dropoff_datetime   |pickup_location_id|dropoff_location_id|trip_date |
+------------+---------+---------+-------------------+-------------------+------------------+-------------------+----------+
|206158430212|yellow   |2        |2020-01-30 10:29:47|2020-01-30 10:42:48|162               |163                |2020-01-30|
+------------+---------+---------+-------------------+-------------------+------------------+-------------------+----------+



In [16]:
# The record we want to insert - note its pickup_datetime and dropoff_datetime are different
insertTripDFCow.select("trip_id","taxi_type","vendor_id","pickup_datetime","dropoff_datetime","pickup_location_id","dropoff_location_id","trip_date") \
               .show(truncate=False)

                                                                                

+------------+---------+---------+-------------------+-------------------+------------------+-------------------+----------+
|trip_id     |taxi_type|vendor_id|pickup_datetime    |dropoff_datetime   |pickup_location_id|dropoff_location_id|trip_date |
+------------+---------+---------+-------------------+-------------------+------------------+-------------------+----------+
|309237772005|yellow   |2        |2020-01-30 15:29:47|2020-01-30 15:42:48|162               |163                |2020-01-30|
+------------+---------+---------+-------------------+-------------------+------------------+-------------------+----------+



In [17]:
# The full record we will insert
insertTripDFCow.show(truncate=False)

                                                                                

+---------+---------+----------+--------+---------+-----------+---------+-------------------+-------------------+-----------------+---------+------------------+-------------------+---------------+-------------+-----------+-----------+-----------+-----------+------------+---------------------+------------+-----------------+--------------------+---------+---------+--------------+------------------------+--------------------+------------+----------+
|taxi_type|trip_year|trip_month|trip_day|trip_hour|trip_minute|vendor_id|pickup_datetime    |dropoff_datetime   |store_and_forward|rate_code|pickup_location_id|dropoff_location_id|passenger_count|trip_distance|fare_amount|surcharge  |mta_tax    |tip_amount |tolls_amount|improvement_surcharge|total_amount|payment_type_code|congestion_surcharge|trip_type|ehail_fee|partition_date|distance_between_service|time_between_service|trip_id     |trip_date |
+---------+---------+----------+--------+---------+-----------+---------+-------------------+-----

### 1.4. Insert the record

In [18]:
hudi_options = {
            'hoodie.database.name': 'taxi_db',
            'hoodie.table.name': 'nyc_taxi_trips_hudi_cow',
            'hoodie.datasource.write.table.name': 'nyc_taxi_trips_hudi_cow',
            'hoodie.datasource.write.table.type': 'COPY_ON_WRITE',
            'hoodie.datasource.write.recordkey.field': 'trip_id',
            'hoodie.datasource.write.partitionpath.field': 'trip_date',
            'hoodie.datasource.write.precombine.field': 'pickup_datetime',
            'hoodie.datasource.write.hive_style_partitioning': 'true',
            'hoodie.partition.metafile.use.base.format': 'true', 
            'hoodie.datasource.write.drop.partition.columns': 'true',
            'hoodie.datasource.write.operation': 'insert',
            'hoodie.datasource.hive_sync.enable': 'true',
            'hoodie.meta.sync.client.tool.class': 'org.apache.hudi.hive.HiveSyncTool',
            'hoodie.datasource.hive_sync.mode':'hms',
            'hoodie.datasource.hive_sync.metastore.uris':DATAPROC_METASTORE_THRIFT_URI,
            'hoodie.datasource.hive_sync.auto_create_database':'true',
            'hoodie.datasource.hive_sync.database': 'taxi_db',
            'hoodie.datasource.hive_sync.table': 'nyc_taxi_trips_hudi_cow',
            'hoodie.datasource.hive_sync.partition_fields': 'trip_date', 
            'hoodie.datasource.hive_sync.partition_extractor_class':'org.apache.hudi.hive.MultiPartKeysValueExtractor',
            'hoodie.datasource.hive_sync.use_jdbc': 'false',
            'hoodie.datasource.hive_sync.support_timestamp': 'true'
        }

In [None]:
# Insert in append mode to the base path, and with a hive metastore metadata sync
insertTripDFCow.write.format("hudi"). \
                options(**hudi_options). \
                mode("append"). \
                save(HUDI_COW_BASE_GCS_URI)

### 1.5. Validate the insert

In [21]:
# Run a count to ensure that the record count increased
AFTER_INSERT_TRIP_COUNT=spark.sql(f"SELECT COUNT(*) as trip_count FROM taxi_db.nyc_taxi_trips_hudi_cow WHERE trip_date='2020-01-30'").collect()[0][0]
print(f"Original trip count was: {ORIGINAL_TRIP_COUNT} and latest trip count is {AFTER_INSERT_TRIP_COUNT}")

                                                                                

Original trip count was: 257927 and latest trip count is 257928


In [22]:
# We started off a certain number of parquets, because its CoW and small data, we should see just another file
!gsutil ls -alh $HUDI_COW_BASE_GCS_URI/trip_date=2020-01-30

     373 B  2023-08-01T03:06:31Z  gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-cow/trip_date=2020-01-30/.hoodie_partition_metadata.parquet#1690859191405842  metageneration=1
   7.9 MiB  2023-08-01T03:22:59Z  gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-cow/trip_date=2020-01-30/4ae6b6ec-c809-4f6e-8e7b-f4afee169f1f-0_0-24-2892_20230801032242395.parquet#1690860179590863  metageneration=1
  7.91 MiB  2023-08-01T03:06:31Z  gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-cow/trip_date=2020-01-30/4ae6b6ec-c809-4f6e-8e7b-f4afee169f1f-0_192-38-12676_20230731205659112.parquet#1690859191443082  metageneration=1
TOTAL: 3 objects, 16577526 bytes (15.81 MiB)


In [23]:
# Lets ensure the original record is still there
spark.sql(f"SELECT trip_id,taxi_type,trip_year,trip_month,trip_day,vendor_id,pickup_datetime,dropoff_datetime,pickup_location_id," \
          "dropoff_location_id,trip_date "\
          f"FROM taxi_db.nyc_taxi_trips_hudi_cow " \
          f" WHERE trip_date='2020-01-30' AND trip_id={ORIGINAL_TRIP_ID}") \
         .show(truncate=False)


                                                                                

+------------+---------+---------+----------+--------+---------+-------------------+-------------------+------------------+-------------------+----------+
|trip_id     |taxi_type|trip_year|trip_month|trip_day|vendor_id|pickup_datetime    |dropoff_datetime   |pickup_location_id|dropoff_location_id|trip_date |
+------------+---------+---------+----------+--------+---------+-------------------+-------------------+------------------+-------------------+----------+
|206158430212|yellow   |2020     |1         |30      |2        |2020-01-30 10:29:47|2020-01-30 10:42:48|162               |163                |2020-01-30|
+------------+---------+---------+----------+--------+---------+-------------------+-------------------+------------------+-------------------+----------+



In [24]:
# New record inserted
spark.sql(f"SELECT _hoodie_commit_time,_hoodie_commit_seqno,_hoodie_file_name,taxi_type,trip_year,trip_month,trip_day,vendor_id,pickup_datetime,dropoff_datetime," \
          "pickup_location_id,dropoff_location_id,trip_date "\
          f"FROM taxi_db.nyc_taxi_trips_hudi_cow "\
          f"WHERE trip_date='2020-01-30' AND trip_id={NEW_TRIP_ID}") \
        .show(truncate=False)

[Stage 54:>                                                         (0 + 1) / 1]

+-------------------+--------------------------+--------------------------------------------------------------------------+---------+---------+----------+--------+---------+-------------------+-------------------+------------------+-------------------+----------+
|_hoodie_commit_time|_hoodie_commit_seqno      |_hoodie_file_name                                                         |taxi_type|trip_year|trip_month|trip_day|vendor_id|pickup_datetime    |dropoff_datetime   |pickup_location_id|dropoff_location_id|trip_date |
+-------------------+--------------------------+--------------------------------------------------------------------------+---------+---------+----------+--------+---------+-------------------+-------------------+------------------+-------------------+----------+
|20230801032242395  |20230801032242395_0_257927|4ae6b6ec-c809-4f6e-8e7b-f4afee169f1f-0_0-24-2892_20230801032242395.parquet|yellow   |2020     |1         |30      |2        |2020-01-30 15:29:47|2020-01-30 15:4

                                                                                

The record is in file listed under _hoodie_file_name 

### 1.6. Study the commit log
Notice the insert in the latest commit log

In [25]:
LOG_FILE_LIST=!gsutil ls $HUDI_COW_BASE_GCS_URI/.hoodie/*.commit | tail -n 1 
LOG_FILE=LOG_FILE_LIST[0]
print(f"Log file FQP is {LOG_FILE}")

Log file FQP is gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-cow/.hoodie/20230801032242395.commit


In [26]:
!gsutil cat $LOG_FILE

{
  "partitionToWriteStats" : {
    "trip_date=2020-01-30" : [ {
      "fileId" : "4ae6b6ec-c809-4f6e-8e7b-f4afee169f1f-0",
      "path" : "trip_date=2020-01-30/4ae6b6ec-c809-4f6e-8e7b-f4afee169f1f-0_0-24-2892_20230801032242395.parquet",
      "prevCommit" : "20230731205659112",
      "numWrites" : 257928,
      "numDeletes" : 0,
      "numUpdateWrites" : 0,
      "numInserts" : 1,
      "totalWriteBytes" : 8286843,
      "totalWriteErrors" : 0,
      "tempPath" : null,
      "partitionPath" : "trip_date=2020-01-30",
      "totalLogRecords" : 0,
      "totalLogFilesCompacted" : 0,
      "totalLogSizeCompacted" : 0,
      "totalUpdatedRecordsCompacted" : 0,
      "totalLogBlocks" : 0,
      "totalCorruptLogBlock" : 0,
      "totalRollbackBlocks" : 0,
      "fileSizeInBytes" : 8286843,
      "minEventTime" : null,
      "maxEventTime" : null
    } ]
  },
  "compacted" : false,
  "extraMetadata" : {
    "schema" : "{\"type\":\"record\",\"name\":\"nyc_taxi_trips_hudi_cow_record\",\"fields\

### 2. [HUDI DEDUPE FEATURE] Hudi dedupes upon insert based on record key specification and using precombine field for latest record
Lets insert the new record yet again and observe what happens

In [27]:
insertTripDFCow.write.format("hudi"). \
                options(**hudi_options). \
                mode("append"). \
                save(HUDI_COW_BASE_GCS_URI)

23/08/01 03:23:56 WARN GhfsStorageStatistics: Detected potential high latency for operation op_create. latencyMs=122; previousMaxLatencyMs=108; operationCount=19; context=gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-cow/.hoodie/metadata/.hoodie/20230801032340309.deltacommit.inflight
                                                                                

In [28]:
# We should not see a record count increase
AFTER_DUPE_INSERT_TRIP_COUNT=spark.sql(f"SELECT COUNT(*) as trip_count FROM taxi_db.nyc_taxi_trips_hudi_cow WHERE trip_date='2020-01-30'").collect()[0][0]
print(f"Original post insert trip count was: {AFTER_INSERT_TRIP_COUNT} and latest trip count is also {AFTER_DUPE_INSERT_TRIP_COUNT}")

                                                                                

Original post insert trip count was: 257928 and latest trip count is also 257928


In [29]:
# We should see an additional parquet files, because Hudi deduped and persisted, but eliminated the dupe
!gsutil ls -alh $HUDI_COW_BASE_GCS_URI/trip_date=2020-01-30

     373 B  2023-08-01T03:06:31Z  gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-cow/trip_date=2020-01-30/.hoodie_partition_metadata.parquet#1690859191405842  metageneration=1
   7.9 MiB  2023-08-01T03:22:59Z  gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-cow/trip_date=2020-01-30/4ae6b6ec-c809-4f6e-8e7b-f4afee169f1f-0_0-24-2892_20230801032242395.parquet#1690860179590863  metageneration=1
   7.9 MiB  2023-08-01T03:23:53Z  gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-cow/trip_date=2020-01-30/4ae6b6ec-c809-4f6e-8e7b-f4afee169f1f-0_0-59-8659_20230801032340309.parquet#1690860233039086  metageneration=1
  7.91 MiB  2023-08-01T03:06:31Z  gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-cow/trip_date=2020-01-30/4ae6b6ec-c809-4f6e-8e7b-f4afee169f1f-0_192-38-12676_20230731205659112.parquet#1690859191443082  metageneration=1
TOTAL: 4 objects, 24864369 bytes (23.71 MiB)


Note that a new file got created, and our record is  within it. (The old file containing the record is still there and useful for time travel)

In [30]:
# Lets query the record we attempted to insert a second time
spark.sql(f"SELECT _hoodie_commit_time,_hoodie_commit_seqno,_hoodie_file_name,taxi_type,trip_year,trip_month,trip_day,vendor_id,pickup_datetime,dropoff_datetime," \
          "pickup_location_id,dropoff_location_id "\
          f"FROM taxi_db.nyc_taxi_trips_hudi_cow "\
          f"WHERE trip_date='2020-01-30' AND trip_id={NEW_TRIP_ID}") \
        .show(truncate=False)


+-------------------+--------------------------+--------------------------------------------------------------------------+---------+---------+----------+--------+---------+-------------------+-------------------+------------------+-------------------+
|_hoodie_commit_time|_hoodie_commit_seqno      |_hoodie_file_name                                                         |taxi_type|trip_year|trip_month|trip_day|vendor_id|pickup_datetime    |dropoff_datetime   |pickup_location_id|dropoff_location_id|
+-------------------+--------------------------+--------------------------------------------------------------------------+---------+---------+----------+--------+---------+-------------------+-------------------+------------------+-------------------+
|20230801032340309  |20230801032340309_0_257927|4ae6b6ec-c809-4f6e-8e7b-f4afee169f1f-0_0-59-8659_20230801032340309.parquet|yellow   |2020     |1         |30      |2        |2020-01-30 15:29:47|2020-01-30 15:42:48|162               |163      

In [31]:
# Review the commit log

LOG_FILE_LIST=!gsutil ls $HUDI_COW_BASE_GCS_URI/.hoodie/*.commit | tail -n 1 
LOG_FILE=LOG_FILE_LIST[0]
print(f"Log file FQP is {LOG_FILE}")

!gsutil cat $LOG_FILE

Log file FQP is gs://gaia_data_bucket-623600433888/nyc-taxi-trips-hudi-cow/.hoodie/20230801032340309.commit
{
  "partitionToWriteStats" : {
    "trip_date=2020-01-30" : [ {
      "fileId" : "4ae6b6ec-c809-4f6e-8e7b-f4afee169f1f-0",
      "path" : "trip_date=2020-01-30/4ae6b6ec-c809-4f6e-8e7b-f4afee169f1f-0_0-59-8659_20230801032340309.parquet",
      "prevCommit" : "20230801032242395",
      "numWrites" : 257928,
      "numDeletes" : 0,
      "numUpdateWrites" : 1,
      "numInserts" : 0,
      "totalWriteBytes" : 8286843,
      "totalWriteErrors" : 0,
      "tempPath" : null,
      "partitionPath" : "trip_date=2020-01-30",
      "totalLogRecords" : 0,
      "totalLogFilesCompacted" : 0,
      "totalLogSizeCompacted" : 0,
      "totalUpdatedRecordsCompacted" : 0,
      "totalLogBlocks" : 0,
      "totalCorruptLogBlock" : 0,
      "totalRollbackBlocks" : 0,
      "fileSizeInBytes" : 8286843,
      "minEventTime" : null,
      "maxEventTime" : null
    } ]
  },
  "compacted" : false,
  "e

This concludes the lab unit, please proceed to the next notebook.