# 0. Notebook Setup With Hudi Table Non-PII Reader EMR Runtime Role


In [None]:
%%configure -f
{
    "conf": {
        "spark.jars": "hdfs:///apps/hudi/lib/hudi-spark-bundle.jar",
        "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
        "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.hudi.catalog.HoodieCatalog",
        "spark.sql.extensions": "org.apache.spark.sql.hudi.HoodieSparkSessionExtension,com.amazonaws.emr.recordserver.connector.spark.sql.RecordServerSQLExtension",
        "spark.sql.catalog.spark_catalog.lf.managed": "true"
    }
}

In [None]:
# Import libraries

import os
from datetime import datetime

from pyspark.sql.functions import col,lit, current_timestamp,unix_timestamp, min, when, desc, split

## 0.1 Global variables setup

Go to 'CloudFormation', select the blog stack, and open the 'Outputs' tab. Copy the 'S3BucketName' value and use it to replace `<STACK-OUTPUTS-S3-BUCKET-NAME>` in the following cell.

In [None]:
# Placeholder: replace it with the 'S3BucketName' value from the CloudFormation
# stack outputs (bucket name only, without the "s3://" scheme or any slashes).
S3_BUCKET_NAME = "<STACK-OUTPUTS-S3-BUCKET-NAME>"

In [None]:
# Version suffix appended to the database and table names below.
VERSION = 1

TABLE_NAME = "dl_tpc_customer"
HUDI_DATABASE = f"rsv2_blog_hudi_db_{VERSION}"

# S3 URIs always use "/" as the separator, so they are assembled with plain
# string formatting instead of os.path.join (which follows the local OS
# convention and drops the "s3://" prefix when a component starts with "/").
# Stray leading/trailing slashes in the bucket name are stripped for the same reason.
HUDI_DATABASE_LOCATION = f"s3://{S3_BUCKET_NAME.strip('/')}/{HUDI_DATABASE}"

# Copy-on-Write table: name and S3 location under the database location.
COW_TABLE_NAME_SQL = f"rsv2_blog_hudi_cow_sql_{TABLE_NAME}_{VERSION}"
COW_TABLE_LOCATION_SQL = f"{HUDI_DATABASE_LOCATION}/{COW_TABLE_NAME_SQL}"

# Merge-on-Read table: name and S3 location under the database location.
MOR_TABLE_NAME_SQL = f"rsv2_blog_hudi_mor_sql_{TABLE_NAME}_{VERSION}"
MOR_TABLE_LOCATION_SQL = f"{HUDI_DATABASE_LOCATION}/{MOR_TABLE_NAME_SQL}"

## 0.2 Spark variables setup

In [None]:
# sparkmagic SQL configs: register the Python-side names in the Spark session
# config so that %%sql cells can refer to them with ${...} placeholders.
sql_variables = {
    'hudi_db': HUDI_DATABASE,
    'hudi_db_location': HUDI_DATABASE_LOCATION,
    'cow_table_name_sql': COW_TABLE_NAME_SQL,
    'cow_table_location_sql': COW_TABLE_LOCATION_SQL,
    'mor_table_name_sql': MOR_TABLE_NAME_SQL,
    'mor_table_location_sql': MOR_TABLE_LOCATION_SQL,
}

for conf_key, conf_value in sql_variables.items():
    spark.conf.set(conf_key, conf_value)


# 1. Hudi Table PII Reader Lake Formation Configuration

***Please complete the steps in the blog section "Query Hudi tables with column-level & row-level data filters" before running the following cells.***


# 2. Hudi MoR Table Queries with FGAC


## 2.1 MoR snapshot queries with data filter

In [None]:
%%sql

-- Snapshot query: show the first 10 rows of the MoR table.
SELECT *
FROM ${hudi_db}.${mor_table_name_sql}
LIMIT 10;

## 2.2 MoR incremental queries with data filter

In [None]:
# Collect the distinct Hudi commit times of the MoR table, sorted ascending
commit_time_query = f"""
SELECT DISTINCT _hoodie_commit_time FROM {HUDI_DATABASE}.{MOR_TABLE_NAME_SQL} ORDER BY _hoodie_commit_time
"""
df = spark.sql(commit_time_query)

# Pull the single column out of each collected Row into a plain Python list
commit_ts = [row["_hoodie_commit_time"] for row in df.collect()]
commit_ts

In [None]:
# Incremental read: keep only the rows written at or after the latest commit
# (commit_ts is sorted ascending, so commit_ts[-1] is the most recent one).
# The commit time is passed as a quoted string literal so the comparison does
# not rely on an implicit string-to-number cast.
# NOTE(review): assumes _hoodie_commit_time is a string column, as in the Hudi docs.
incremental_df = spark.sql(f"""
SELECT * FROM {HUDI_DATABASE}.{MOR_TABLE_NAME_SQL} WHERE _hoodie_commit_time >= '{commit_ts[-1]}'
""")

# Register the result so the following %%sql cells can query it by name
incremental_df.createOrReplaceTempView("incremental_view")

In [None]:
%%sql

-- Row count per birth country within the incremental view.
SELECT c_birth_country, count(*)
FROM incremental_view
GROUP BY c_birth_country;

In [None]:
%%sql

-- Display the rows of the incremental view.
SELECT *
FROM incremental_view

## 2.3 MoR time travel queries with data filter

In [None]:
# Re-read the distinct Hudi commit times of the MoR table, sorted ascending
commit_time_query = f"""
SELECT DISTINCT _hoodie_commit_time FROM {HUDI_DATABASE}.{MOR_TABLE_NAME_SQL} ORDER BY _hoodie_commit_time
"""
df = spark.sql(commit_time_query)

# Pull the single column out of each collected Row into a plain Python list
commit_ts = [row["_hoodie_commit_time"] for row in df.collect()]
commit_ts

In [None]:
# Time travel: select the rows written by the second-most-recent commit.
# commit_ts is sorted ascending, so commit_ts[-2] needs at least two commits
# to exist on the table. The commit time is passed as a quoted string literal
# so the comparison does not rely on an implicit string-to-number cast.
time_df = spark.sql(f"""
SELECT * FROM {HUDI_DATABASE}.{MOR_TABLE_NAME_SQL} WHERE _hoodie_commit_time == '{commit_ts[-2]}'
""")

# Row count per birth country; show up to 10 rows without truncating values
time_df.groupBy("c_birth_country").count().show(10, False)

## 2.4 Observations for MoR queries with data filter

* Lake Formation data filter can be applied to non-PII table reader role
* MoR Incremental and time travel queries can work with Lake Formation data filter

* Spark SQL for MoR time travel is not working
