## Load sagemaker_studio_analytics_extension

In [None]:
%load_ext sagemaker_studio_analytics_extension.magics

## 3. Act as a data engineer

Configure Apache Hudi for Apache Spark

In [None]:
%%configure -f
{
    "conf": {
        "spark.jars": "hdfs:///apps/hudi/lib/hudi-spark-bundle.jar",
        "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
        "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.hudi.catalog.HoodieCatalog",
        "spark.sql.extensions": "org.apache.spark.sql.hudi.HoodieSparkSessionExtension"
    }
}

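Connect to the cluster via Livy using the EMR execution role `ENGINEER_ROLE`. Run the next cell to print the role ARNs and the cluster ID, then replace the `<CLUSTER_ID>` and `<ENGINEER_ROLE>` placeholders in the connect command with the printed values.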

In [None]:
%%sh
# Load the shell profile, which is expected to define ENGINEER_ROLE,
# ANALYST_ROLE and CLUSTER_ID. "." is the POSIX spelling of "source", so this
# also works when sh is not bash.
. ~/.bash_profile

# Label and quote each value so an unset variable shows up as an empty
# "NAME=" line instead of an anonymous blank line.
echo "ENGINEER_ROLE=${ENGINEER_ROLE}"
echo "ANALYST_ROLE=${ANALYST_ROLE}"
echo "CLUSTER_ID=${CLUSTER_ID}"

In [None]:
%sm_analytics emr connect \
--cluster-id <CLUSTER_ID> \
--auth-type Basic_Access \
--emr-execution-role-arn <ENGINEER_ROLE>
# Before running: replace <CLUSTER_ID> and <ENGINEER_ROLE> above with the
# actual cluster ID and engineer role ARN. This note must stay below the magic
# so that the magic remains the first thing in the cell.

Show session information

In [None]:
%%info

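Obtain the AWS account ID and Region. Copy the two printed values into the `account_id` and `region` variables in the cell that follows.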

In [None]:
%%sh
# Account ID of the credentials this notebook is running with.
ACCOUNT_ID=$(aws sts get-caller-identity --query "Account" --output text)
echo "ACCOUNT_ID=${ACCOUNT_ID}"

# Prefer the region from the AWS CLI configuration; if none is configured the
# command prints nothing, so fall back to the standard environment variables.
REGION=$(aws configure get region)
REGION=${REGION:-${AWS_REGION:-${AWS_DEFAULT_REGION}}}
echo "REGION=${REGION}"

In [None]:
# Placeholders: replace both values with the real AWS account ID and Region
# before running the cells below, which interpolate them into S3 paths.
account_id = "account_id"
region = "region"

Getting started with Apache Hudi and Apache Spark

In [None]:
import os  # kept: other cells of the notebook may still rely on it
from pyspark.sql.functions import concat, col, lit, to_timestamp, dense_rank, desc, count, rand, when
from pyspark.sql.window import Window
from pyspark.sql.types import StringType


# Common prefix of every data-lake location used below; built once from the
# account_id and region values set in the earlier cell.
dataLakeRoot = f"s3://lf-datalake-{account_id}-{region}/data_lake_location"

rawS3TablePath = f"{dataLakeRoot}/tickets/dms_sample/ticket_purchase_hist/"
hudiTablePath = f"{dataLakeRoot}/hudi/"
cdcTablePath = f"{dataLakeRoot}/cdc/dms_sample/ticket_purchase_hist/"

targetDBName = 'cpa_hudi_sample'
targetTableName = 'cpa_hudi_ticket_purchase_hist'
# S3 URIs always use "/" as the separator, so join explicitly instead of using
# os.path.join, whose separator depends on the platform running the code.
targetPath = f"{hudiTablePath.rstrip('/')}/{targetDBName}/{targetTableName}"

# Column used as the Hudi record key in the write options.
primaryKey = "sporting_event_ticket_id"

# Not referenced in the visible cells; presumably the Hudi table type
# (copy-on-write) consumed elsewhere in the notebook — TODO confirm.
hudiStorageType = 'CoW'

In [None]:
# Make sure the target database exists; a no-op when it is already there.
spark.sql(f"CREATE DATABASE IF NOT EXISTS {targetDBName}")

In [None]:
# Option dictionaries for writing the Hudi table. They are only defined here;
# the write calls that combine them are not part of this section.

# Options shared by every write: table identity, record key / precombine
# field, and Hive metastore sync settings.
commonConfig = {
    "className": "org.apache.hudi",
    "hoodie.datasource.hive_sync.use_jdbc": "false",
    "hoodie.datasource.write.precombine.field": "transaction_date_time",
    "hoodie.datasource.write.recordkey.field": primaryKey,
    "hoodie.table.name": targetTableName,
    "hoodie.consistency.check.enabled": "true",
    "hoodie.datasource.hive_sync.database": targetDBName,
    "hoodie.datasource.hive_sync.table": targetTableName,
    "hoodie.datasource.hive_sync.enable": "true",
    "hoodie.datasource.hive_sync.mode": "hms",
}

# Key generator and partition extractor for a table without partitions.
unpartitionDataConfig = {
    "hoodie.datasource.hive_sync.partition_extractor_class": "org.apache.hudi.hive.NonPartitionedExtractor",
    "hoodie.datasource.write.keygenerator.class": "org.apache.hudi.keygen.NonpartitionedKeyGenerator",
}

# Options selecting the bulk_insert write operation (by its name, intended for
# the initial load).
initLoadConfig = {
    "hoodie.bulkinsert.shuffle.parallelism": 3,
    "hoodie.datasource.write.operation": "bulk_insert",
}

# Options selecting the upsert write operation, plus cleaner settings.
incrementalConfig = {
    "hoodie.upsert.shuffle.parallelism": 20,
    "hoodie.datasource.write.operation": "upsert",
    "hoodie.cleaner.policy": "KEEP_LATEST_COMMITS",
    "hoodie.cleaner.commits.retained": 10,
}

# Column names presumably dropped from incoming change data before writing;
# not used in the visible cells — TODO confirm against the consuming cell.
dropColumnList = ["db", "table_name", "Op"]

In [None]:
# Read the raw CSV files, taking column names from the header row, and print
# the resulting schema.
inputDf = spark.read.csv(rawS3TablePath, header=True)
inputDf.printSchema()

In [None]:
# Preview the first 10 rows without truncating long column values.
inputDf.show(n=10, truncate=False)

## Log in as an analyst

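Connect to the cluster via Livy using the EMR execution role `ANALYST_ROLE`. Run the next cell to print the role ARNs and the cluster ID, then replace the `<CLUSTER_ID>` and `<ANALYST_ROLE>` placeholders in the connect command with the printed values.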

In [None]:
%%sh
# Load the shell profile, which is expected to define ENGINEER_ROLE,
# ANALYST_ROLE and CLUSTER_ID. "." is the POSIX spelling of "source", so this
# also works when sh is not bash.
. ~/.bash_profile

# Label and quote each value so an unset variable shows up as an empty
# "NAME=" line instead of an anonymous blank line.
echo "ENGINEER_ROLE=${ENGINEER_ROLE}"
echo "ANALYST_ROLE=${ANALYST_ROLE}"
echo "CLUSTER_ID=${CLUSTER_ID}"

In [None]:
%sm_analytics emr connect \
--cluster-id <CLUSTER_ID> \
--auth-type Basic_Access \
--emr-execution-role-arn <ANALYST_ROLE>
# Before running: replace <CLUSTER_ID> and <ANALYST_ROLE> above with the
# actual cluster ID and analyst role ARN. This note must stay below the magic
# so that the magic remains the first thing in the cell.

Test the column-level permissions

In [None]:
# List the databases visible through the current connection.
visibleDatabases = spark.sql("show databases")
visibleDatabases.show()

In [None]:
# Database and table names are spelled out here rather than reusing variables
# from earlier cells (presumably because this connection starts a separate
# session — confirm).
analystDb = "cpa_hudi_sample"
analystTable = "cpa_hudi_ticket_purchase_hist"

spark.sql(f"use {analystDb}")
# Optional: uncomment to inspect the table definition.
# spark.sql(f"desc formatted {analystTable}").show(100, False)

# Select every column of a 10-row sample and display it.
sampleRows = spark.sql(f"SELECT * FROM {analystDb}.{analystTable} limit 10")
sampleRows.show()