# Generate a Hudi dataset from a Parquet dataset of NYC taxi trip data

### 1. Get or create Spark Session

In [67]:
from pyspark.sql import SparkSession

# Get or create the session on YARN with Hive support enabled; the two config
# entries set Hudi's catalog implementation and Hudi's Spark SQL extension.
spark = (
    SparkSession.builder
    .appName("NYC Taxi Hudi Data Generator")
    .master("yarn")
    .enableHiveSupport()
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.hudi.catalog.HoodieCatalog")
    .config("spark.sql.extensions", "org.apache.spark.sql.hudi.HoodieSparkSessionExtension")
    .getOrCreate()
)

### 2. Variables

In [68]:
import os

PROJECT_ID = ""
PROJECT_NBR = ""

# Get your Google Cloud project ID from gcloud
if not os.getenv("IS_TESTING"):
    project_id_output = !gcloud config list --format 'value(core.project)' 2>/dev/null
    PROJECT_ID = project_id_output[0]
    print("Project ID: ", PROJECT_ID)
    
    
    project_nbr_output = !gcloud projects describe $PROJECT_ID --format='value(projectNumber)'
    PROJECT_NBR = project_nbr_output[0]
    print("Project Number: ", PROJECT_NBR)
    


PERSIST_TO_BUCKET=f"gs://gaia_sample_data_bucket-{PROJECT_NBR}"
print("PERSIST_TO_BUCKET: ",PERSIST_TO_BUCKET)

PARQUET_BASE_GCS_URI=f"{PERSIST_TO_BUCKET}/nyc-taxi-trips/parquet-base/"
HUDI_BASE_GCS_URI=f"{PERSIST_TO_BUCKET}/nyc-taxi-trips/hudi-base/"


DATABASE_NAME="taxi_db"
TABLE_NAME="nyc_taxi_trips_hudi"



Project ID:  apache-hudi-lab-build
Project Number:  343776852591
PERSIST_TO_BUCKET:  gs://gaia_sample_data_bucket-343776852591


### 3. Create database in Apache Hive Metastore
The Dataproc cluster was created with an existing Dataproc Metastore Service attached, which serves as its Apache Hive Metastore.

In [69]:
# Create the database if it does not exist yet.
# The name comes from the DATABASE_NAME constant set in the Variables cell, so
# this cell runs on a fresh kernel (Restart & Run All).
spark.sql(f"create database if not exists {DATABASE_NAME};")

DataFrame[]

In [70]:
# Drop any existing table registration so the run starts clean.
# DATABASE_NAME / TABLE_NAME are the constants set in the Variables cell; using
# them keeps this statement aligned with the CREATE TABLE issued in step 5.
spark.sql(f"drop table if exists {DATABASE_NAME}.{TABLE_NAME}")

DataFrame[]

### 4. Read Taxi trips in Parquet format in Cloud Storage and persist as Hudi

In [71]:
import datetime
# Capture the wall-clock start so it can be compared with the completion timestamp printed later.
startTime = datetime.datetime.now()
print("Started at {}".format(startTime))

Started at 2023-06-26 20:50:12.642343


#### 4.1. Read Parquet from Cloud Storage

In [72]:
# Load the Parquet trip data from the source prefix in Cloud Storage.
tripsDF = spark.read.parquet(PARQUET_BASE_GCS_URI)

In [73]:
# Peek at the first two rows of the source data.
tripsDF.show(n=2)

+---------+---------+-----------+---------+-------------------+-------------------+-----------------+---------+------------------+-------------------+---------------+-------------+------------+-----------+-----------+----------+------------+---------------------+------------+-----------------+--------------------+---------+---------+--------------+------------------------+--------------------+---------+----------+--------+
|taxi_type|trip_hour|trip_minute|vendor_id|    pickup_datetime|   dropoff_datetime|store_and_forward|rate_code|pickup_location_id|dropoff_location_id|passenger_count|trip_distance| fare_amount|  surcharge|    mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|payment_type_code|congestion_surcharge|trip_type|ehail_fee|partition_date|distance_between_service|time_between_service|trip_year|trip_month|trip_day|
+---------+---------+-----------+---------+-------------------+-------------------+-----------------+---------+------------------+----------------

                                                                                

#### 4.2. Persist as Hudi to Cloud Storage

In [74]:
# Hudi writer configuration.
hudi_options = {
    # Table name comes from the TABLE_NAME constant (Variables cell) so the Hudi
    # table written here carries the same name as the table registered in step 5.
    'hoodie.table.name': TABLE_NAME,
    'hoodie.datasource.write.table.name': TABLE_NAME,
    # CustomKeyGenerator is used with a multi-field record key and a multi-field
    # partition path; each partition field carries a ":SIMPLE" type suffix.
    'hoodie.datasource.write.keygenerator.class': 'org.apache.hudi.keygen.CustomKeyGenerator',
    'hoodie.datasource.write.recordkey.field': 'taxi_type,trip_year,trip_month,trip_day,vendor_id,pickup_location_id,dropoff_location_id',
    'hoodie.datasource.write.partitionpath.field': 'trip_year:SIMPLE,trip_month:SIMPLE,trip_day:SIMPLE',
    # pickup_datetime is the precombine field.
    # NOTE(review): presumably the record with the latest pickup_datetime wins when keys collide - confirm.
    'hoodie.datasource.write.precombine.field': 'pickup_datetime',
    # Produces trip_year=.../trip_month=.../trip_day=... partition directories.
    'hoodie.datasource.write.hive_style_partitioning': 'true',
    'hoodie.partition.metafile.use.base.format': 'true',
}

# Overwrite whatever is at the Hudi base path with the Parquet-sourced data.
(
    tripsDF.write.format("hudi")
    .options(**hudi_options)
    .mode("overwrite")
    .save(HUDI_BASE_GCS_URI)
)

23/06/26 20:51:10 WARN HoodieSparkSqlWriter$: hoodie table at gs://gaia_sample_data_bucket-343776852591/nyc-taxi-trips/hudi-base already exists. Deleting existing data & overwriting with new data.
23/06/26 20:51:15 WARN HoodieBackedTableMetadata: Metadata table was not found at path gs://gaia_sample_data_bucket-343776852591/nyc-taxi-trips/hudi-base//.hoodie/metadata
23/06/26 21:13:17 WARN DAGScheduler: Broadcasting large task binary with size 1100.2 KiB
23/06/26 21:15:02 WARN DAGScheduler: Broadcasting large task binary with size 1100.9 KiB
                                                                                

In [75]:
# Capture the wall-clock time once the Hudi write has returned.
completionTime = datetime.datetime.now()
print("Completed at {}".format(completionTime))

Completed at 2023-06-26 21:15:13.525836


#### 4.3. A quick review of the schema

In [76]:
# Print the schema of tripsDF, i.e. the DataFrame loaded from the Parquet source
# (this is the input schema, not a read-back of the Hudi table).
tripsDF.printSchema()

root
 |-- taxi_type: string (nullable = true)
 |-- trip_hour: integer (nullable = true)
 |-- trip_minute: integer (nullable = true)
 |-- vendor_id: string (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- store_and_forward: string (nullable = true)
 |-- rate_code: string (nullable = true)
 |-- pickup_location_id: string (nullable = true)
 |-- dropoff_location_id: string (nullable = true)
 |-- passenger_count: long (nullable = true)
 |-- trip_distance: decimal(38,9) (nullable = true)
 |-- fare_amount: decimal(38,9) (nullable = true)
 |-- surcharge: decimal(38,9) (nullable = true)
 |-- mta_tax: decimal(38,9) (nullable = true)
 |-- tip_amount: decimal(38,9) (nullable = true)
 |-- tolls_amount: decimal(38,9) (nullable = true)
 |-- improvement_surcharge: decimal(10,0) (nullable = true)
 |-- total_amount: decimal(38,9) (nullable = true)
 |-- payment_type_code: string (nullable = true)
 |-- congestion_surcharge: decimal

### 5. Register table in Dataproc Metastore Service

In [77]:
# List the databases visible to this session, without truncating long names.
spark.sql("SHOW DATABASES;").show(truncate=False)

+---------+
|namespace|
+---------+
|default  |
|taxi_db  |
+---------+



In [78]:
# Register the Hudi files under HUDI_BASE_GCS_URI as a table in the metastore;
# IF NOT EXISTS makes the statement a no-op when the table is already registered.
spark.sql(
    f'CREATE TABLE IF NOT EXISTS {DATABASE_NAME}.{TABLE_NAME} USING hudi LOCATION "{HUDI_BASE_GCS_URI}";'
).show()

23/06/26 21:15:15 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.


++
||
++
++



In [79]:
# Count the rows readable through the registered Hudi table.
spark.sql(f"SELECT count(*) FROM {DATABASE_NAME}.{TABLE_NAME}").show()



+--------+
|count(1)|
+--------+
|20939415|
+--------+



                                                                                

In [80]:
# Show two rows from the registered table; the output includes the _hoodie_* metadata columns.
spark.sql(f"SELECT * FROM {DATABASE_NAME}.{TABLE_NAME} LIMIT 2").show()



+-------------------+--------------------+--------------------+----------------------+--------------------+---------+---------+-----------+---------+-------------------+-------------------+-----------------+---------+------------------+-------------------+---------------+-------------+------------+-----------+-----------+----------+------------+---------------------+------------+-----------------+--------------------+---------+---------+--------------+------------------------+--------------------+---------+----------+--------+
|_hoodie_commit_time|_hoodie_commit_seqno|  _hoodie_record_key|_hoodie_partition_path|   _hoodie_file_name|taxi_type|trip_hour|trip_minute|vendor_id|    pickup_datetime|   dropoff_datetime|store_and_forward|rate_code|pickup_location_id|dropoff_location_id|passenger_count|trip_distance| fare_amount|  surcharge|    mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|payment_type_code|congestion_surcharge|trip_type|ehail_fee|partition_date|distance_be

                                                                                

In [81]:
# List the table's partitions without truncating the partition path column.
spark.sql(f"SHOW PARTITIONS {DATABASE_NAME}.{TABLE_NAME}").show(truncate=False)

+---------------------------------------+
|partition                              |
+---------------------------------------+
|trip_year=2019/trip_month=1/trip_day=1 |
|trip_year=2019/trip_month=1/trip_day=10|
|trip_year=2019/trip_month=1/trip_day=11|
|trip_year=2019/trip_month=1/trip_day=12|
|trip_year=2019/trip_month=1/trip_day=13|
|trip_year=2019/trip_month=1/trip_day=14|
|trip_year=2019/trip_month=1/trip_day=15|
|trip_year=2019/trip_month=1/trip_day=16|
|trip_year=2019/trip_month=1/trip_day=17|
|trip_year=2019/trip_month=1/trip_day=18|
|trip_year=2019/trip_month=1/trip_day=19|
|trip_year=2019/trip_month=1/trip_day=2 |
|trip_year=2019/trip_month=1/trip_day=20|
|trip_year=2019/trip_month=1/trip_day=21|
|trip_year=2019/trip_month=1/trip_day=22|
|trip_year=2019/trip_month=1/trip_day=23|
|trip_year=2019/trip_month=1/trip_day=24|
|trip_year=2019/trip_month=1/trip_day=25|
|trip_year=2019/trip_month=1/trip_day=26|
|trip_year=2019/trip_month=1/trip_day=27|
+---------------------------------

In [None]:
# Trip counts per year. ORDER BY makes the displayed row order deterministic;
# GROUP BY alone does not guarantee any ordering of the result rows.
spark.sql(f"SELECT  trip_year, count(*) FROM {DATABASE_NAME}.{TABLE_NAME} GROUP BY trip_year ORDER BY trip_year").show()



+---------+--------+
|trip_year|count(1)|
+---------+--------+
|     2019| 8023712|
|     2020| 4179576|
|     2021| 4713998|
|     2022| 4022129|
+---------+--------+



23/06/26 21:27:48 WARN YarnAllocator: Container from a bad node: container_1687408001156_0036_01_000039 on host: gaia-dpgce-cpu-343776852591-sw-rnj6.us-central1-a.c.apache-hudi-lab-build.internal. Exit status: 143. Diagnostics: [2023-06-26 21:27:48.368]Container killed on request. Exit code is 143
[2023-06-26 21:27:48.368]Container exited with a non-zero exit code 143. 
[2023-06-26 21:27:48.378]Killed by external signal
.
23/06/26 21:27:48 ERROR YarnScheduler: Lost executor 38 on gaia-dpgce-cpu-343776852591-sw-rnj6.us-central1-a.c.apache-hudi-lab-build.internal: Container from a bad node: container_1687408001156_0036_01_000039 on host: gaia-dpgce-cpu-343776852591-sw-rnj6.us-central1-a.c.apache-hudi-lab-build.internal. Exit status: 143. Diagnostics: [2023-06-26 21:27:48.368]Container killed on request. Exit code is 143
[2023-06-26 21:27:48.368]Container exited with a non-zero exit code 143. 
[2023-06-26 21:27:48.378]Killed by external signal
.
23/06/26 21:27:48 WARN YarnSchedulerBackend