In [None]:
%%sh
# Install the SageMaker Studio analytics extension package; the next cell
# loads its magics module (sagemaker_studio_analytics_extension.magics).
pip install sagemaker-studio-analytics-extension

In [None]:
%load_ext sagemaker_studio_analytics_extension.magics
# Loads the extension's magics into this kernel; the %sm_analytics command used
# by the connect cells is presumably provided by this module — confirm.

## Act as a data engineer

### 1. Configure Apache Hudi for Apache Spark

In [None]:
%%configure -f
{
    "conf": {
        "spark.jars": "hdfs:///apps/hudi/lib/hudi-spark-bundle.jar",
        "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
        "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.hudi.catalog.HoodieCatalog",
        "spark.sql.extensions": "org.apache.spark.sql.hudi.HoodieSparkSessionExtension"
    }
}

### 2. Connect to EMR Cluster via Livy as the ENGINEER_ROLE

In [None]:
%%sh
# %%sh runs this cell with `sh`, where the bash-only `source` builtin may not
# exist; the POSIX `.` command loads the profile under both sh and bash.
. ~/.bash_profile

# Look up the ID of the active EMR cluster whose name contains
# "emr-roadshow-runtime-role-lf" (the filter is applied by the AWS CLI --query).
EMR_CLUSTER_ID=$(aws emr list-clusters --active \
    --query 'Clusters[?contains(Name,`emr-roadshow-runtime-role-lf`)].Id' \
    --output text)

# Print both values so they can be copied into the connect cell that follows.
# ENGINEER_ROLE is not assigned in this cell; presumably ~/.bash_profile
# exports it — confirm.
echo "$ENGINEER_ROLE"
echo "$EMR_CLUSTER_ID"

In [None]:
%sm_analytics emr connect \
--cluster-id j-Q87F82QR17AT \
--auth-type Basic_Access \
--emr-execution-role-arn arn:aws:iam::260906135353:role/lf-data-access-engineer

In [None]:
# Prerequisite note (intentionally left commented out; the lines below are
# shell commands, not PySpark code). The %%configure cell sets "spark.jars" to
# hdfs:///apps/hudi/lib/hudi-spark-bundle.jar, which is the destination that
# these commands create from /usr/lib/hudi/hudi-spark-bundle.jar.
# NOTE(review): presumably they are run once on the EMR cluster itself — confirm.
#
# %%spark

# hdfs dfs -mkdir -p /apps/hudi/lib
# hdfs dfs -copyFromLocal /usr/lib/hudi/hudi-spark-bundle.jar /apps/hudi/lib/hudi-spark-bundle.jar

### 3. Show session information

In [None]:
%%info

### 4. Obtain the S3 data lake bucket name

In [None]:
%%sh
# Every %%sh cell starts a fresh shell, so load the profile again here, as the
# engineer lookup cell does, before reading the variable.
# NOTE(review): assumes DATALAKE_BUCKET is exported by ~/.bash_profile — confirm.
if [ -f ~/.bash_profile ]; then
    . ~/.bash_profile
fi

# Print the bucket name so it can be copied into the path definitions below.
echo "$DATALAKE_BUCKET"

### 5. Getting started with Apache Hudi and Apache Spark

In [None]:
import os
from pyspark.sql.functions import concat, col, lit, to_timestamp, dense_rank, desc, count, rand, when
from pyspark.sql.window import Window
from pyspark.sql.types import StringType


# Name of the S3 data lake bucket. Python does not expand shell-style "${...}"
# references, so this single token must be replaced with the real bucket name
# (the value printed in step 4) before the paths below point at real data.
dataLakeBucket = "${DATALAKE_BUCKET}"
if "$" in dataLakeBucket:
    # Stay non-fatal so the cell still defines every name, but make the
    # problem visible before a later read/write fails with an obscure S3 error.
    print("WARNING: dataLakeBucket still contains an unexpanded placeholder; "
          "set it to the bucket name from step 4 before reading or writing data.")

dataLakeRoot = "s3://" + dataLakeBucket

# Source, Hudi and CDC locations inside the data lake bucket.
rawS3TablePath = dataLakeRoot + "/raw/ticket_purchase_hist/"
hudiTablePath = dataLakeRoot + "/hudi/"
cdcTablePath = dataLakeRoot + "/cdc/ticket_purchase_hist/"

# Target database/table names and the table's location under the Hudi prefix.
targetDBName = 'hudi_sample'
targetTableName = 'hudi_ticket_purchase_hist'
# Join with "/" explicitly: S3 URIs always use forward slashes, whereas
# os.path.join follows the separator of whatever OS runs the code.
targetPath = "/".join([hudiTablePath.rstrip("/"), targetDBName, targetTableName])

# Column used as the Hudi record key (see commonConfig).
primaryKey = "sporting_event_ticket_id"

# NOTE(review): not referenced by the configuration dictionaries in this
# notebook section; presumably selects the Hudi table type — confirm.
hudiStorageType = 'CoW'

In [None]:
# Make sure the target database exists before any table is written into it.
createDatabaseSql = "CREATE DATABASE IF NOT EXISTS {}".format(targetDBName)
spark.sql(createDatabaseSql)

In [None]:
# Option dictionaries for writing the Hudi table. They are only defined here;
# later cells are expected to combine and apply them.

# Options shared by every write: record key / precombine field, table name,
# and Hive sync settings that register the table under the target database.
commonConfig = {
    "className": "org.apache.hudi",
    "hoodie.datasource.hive_sync.use_jdbc": "false",
    "hoodie.datasource.write.precombine.field": "transaction_date_time",
    "hoodie.datasource.write.recordkey.field": primaryKey,
    "hoodie.table.name": targetTableName,
    "hoodie.consistency.check.enabled": "true",
    "hoodie.datasource.hive_sync.database": targetDBName,
    "hoodie.datasource.hive_sync.table": targetTableName,
    "hoodie.datasource.hive_sync.enable": "true",
    "hoodie.datasource.hive_sync.mode": "hms",
}

# Extra options for a table without partitions: non-partitioned key generator
# on the write side and the matching extractor for Hive sync.
unpartitionDataConfig = {
    "hoodie.datasource.hive_sync.partition_extractor_class": "org.apache.hudi.hive.NonPartitionedExtractor",
    "hoodie.datasource.write.keygenerator.class": "org.apache.hudi.keygen.NonpartitionedKeyGenerator",
}

# Options for the initial load (bulk_insert operation).
initLoadConfig = {
    "hoodie.bulkinsert.shuffle.parallelism": 3,
    "hoodie.datasource.write.operation": "bulk_insert",
}

# Options for incremental loads (upsert operation) plus cleaner settings.
incrementalConfig = {
    "hoodie.upsert.shuffle.parallelism": 20,
    "hoodie.datasource.write.operation": "upsert",
    "hoodie.cleaner.policy": "KEEP_LATEST_COMMITS",
    "hoodie.cleaner.commits.retained": 10,
}

# Column names to drop; not used in this section.
# NOTE(review): presumably metadata columns of the CDC input — confirm.
dropColumnList = ["db", "table_name", "Op"]

In [None]:
# Load the raw CSV files from the data lake, treating the first row of each
# file as the header, then print the resulting schema.
inputDf = spark.read.csv(rawS3TablePath, header=True)
inputDf.printSchema()

In [None]:
# Preview the first 10 rows without truncating long column values.
inputDf.show(n=10, truncate=False)

## Log in as an analyst

### 1. Connect to EMR Cluster via Livy as the ANALYST_ROLE

In [None]:
%%sh
# Every %%sh cell runs in a new shell process, so nothing set by an earlier
# %%sh cell survives: load the profile again and look the cluster ID up again
# instead of echoing variables that would be empty here.
# NOTE(review): assumes ANALYST_ROLE is exported by ~/.bash_profile — confirm.
if [ -f ~/.bash_profile ]; then
    . ~/.bash_profile
fi

# Same lookup as in the engineer section: the active EMR cluster whose name
# contains "emr-roadshow-runtime-role-lf".
EMR_CLUSTER_ID=$(aws emr list-clusters --active \
    --query 'Clusters[?contains(Name,`emr-roadshow-runtime-role-lf`)].Id' \
    --output text)

# Print both values so they can be copied into the connect cell that follows.
echo "$ANALYST_ROLE"
echo "$EMR_CLUSTER_ID"

In [None]:
%sm_analytics emr connect \
--cluster-id <CLUSTER_ID> \
--auth-type Basic_Access \
--emr-execution-role-arn <ANALYST_ROLE>
# Replace <CLUSTER_ID> and <ANALYST_ROLE> with the values echoed by the previous
# cell before running this cell.

### 2. Test the column-level permissions

In [None]:
# List the databases that are visible to the currently connected role.
databasesDf = spark.sql("show databases")
databasesDf.show()

In [None]:
# Switch to the sample database, then display up to 10 rows of the table as
# seen by the currently connected role.
spark.sql("use hudi_sample")

# Optional: uncomment to inspect the table definition as well.
# spark.sql("desc formatted cpa_hudi_ticket_purchase_hist").show(100, False)

# NOTE(review): the engineer section configures the table name
# 'hudi_ticket_purchase_hist' (no 'cpa_' prefix) — confirm this is the
# intended table.
samplePurchasesDf = spark.sql("SELECT * FROM hudi_sample.cpa_hudi_ticket_purchase_hist limit 10")
samplePurchasesDf.show()