In [None]:
%%sh
pip install sagemaker-studio-analytics-extension

In [None]:
%load_ext sagemaker_studio_analytics_extension.magics
# Loads the IPython extension module from the package installed in the first cell.
# The `%sm_analytics` magic used further down is presumably registered by this
# extension -- confirm against the package documentation.

## 1. Get EMR Cluster ID, IAM roles and other variables

In [None]:
%%sh
# Load the workshop variables. `.` is the POSIX spelling of `source`: `%%sh`
# runs this cell with `sh`, and on images where `sh` is dash the `source`
# builtin does not exist.
. ~/.bash_profile

# Look up the ID of the active EMR cluster whose name contains
# `emr-roadshow-runtime-role-lf`.
CLUSTER=$(aws emr list-clusters --active  --query 'Clusters[?contains(Name,`emr-roadshow-runtime-role-lf`)].Id' --output text)

# Make an empty lookup obvious instead of silently printing a blank ID.
[ -n "$CLUSTER" ] || echo "WARNING: no active EMR cluster matched 'emr-roadshow-runtime-role-lf'" >&2

# ENGINEER_ROLE, ANALYST_ROLE and DATALAKE_BUCKET are not assigned in this
# cell; presumably ~/.bash_profile exports them -- confirm in your environment.
echo "EMR ClusterId: $CLUSTER"
echo "ENGINEER_ROLE: $ENGINEER_ROLE"
echo "ANALYST_ROLE: $ANALYST_ROLE"
echo "Data Lake S3 bucket name: $DATALAKE_BUCKET"

## 2. Upload sample data to the data lake

In [None]:
%%sh
# Every %%sh cell runs in its own shell process, so variables loaded by an
# earlier cell are not available here; load them again.
[ -f ~/.bash_profile ] && . ~/.bash_profile

# Stop before touching S3 if the bucket name is missing; otherwise the
# destination would silently become `s3:///raw/`.
: "${DATALAKE_BUCKET:?DATALAKE_BUCKET is not set (expected from ~/.bash_profile)}"

# Download roughly the first 10 KB of the sample CSV. HTTP byte ranges are
# zero-based, so the range starts at 0 to keep the first character of the file.
# The cut-off is by bytes, so the last row in the sample may be incomplete.
aws s3api get-object --bucket aws-dataengineering-day.workshop.aws --key data/dms_sample/ticket_purchase_hist/LOAD00000001.csv --range bytes=0-10000 ticket_purchase_hist.csv
aws s3 cp ticket_purchase_hist.csv "s3://${DATALAKE_BUCKET}/raw/"

# Expected to fail with an access-denied error.
aws s3 ls "s3://${DATALAKE_BUCKET}/raw/"

## 3. Submit a job to EMR as a data engineer
### The data engineer role can create databases

In [None]:
%sm_analytics emr connect \
--cluster-id j-ZZWQ7QVK9O3B \
--auth-type Basic_Access \
--emr-execution-role-arn arn:aws:iam::312026938062:role/lf-data-access-engineer
# Connects to the EMR cluster named by --cluster-id using the
# `lf-data-access-engineer` execution role.
# NOTE(review): the cluster ID and the account ID inside the role ARN are
# hard-coded for one environment. Replace them with the "EMR ClusterId" value
# printed in step 1 and the engineer role for your account (presumably the
# ENGINEER_ROLE value printed there -- confirm it is a full ARN).

In [None]:
%%configure -f
{
  "conf": {
    "spark.jars": "hdfs:///apps/hudi/lib/hudi-spark-bundle.jar",
    "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.hudi.catalog.HoodieCatalog",
    "spark.sql.extensions": "org.apache.spark.sql.hudi.HoodieSparkSessionExtension"
  }
}

In [None]:
import os
from pyspark.sql.functions import concat, col, lit, to_timestamp, dense_rank, desc, count, rand, when
from pyspark.sql.window import Window
from pyspark.sql.types import StringType


# S3 locations of the raw input files and of the Hudi tables.
# NOTE(review): the bucket name embeds an account ID and region, so it is
# specific to one environment -- replace it with the "Data Lake S3 bucket name"
# printed in step 1.
rawS3TablePath = "s3://lf-datalake-312026938062-us-west-2/raw/"
hudiTablePath = "s3://lf-datalake-312026938062-us-west-2/hudi/"

# Database and table that the Hudi data set is registered under.
targetDBName = 'hudi_sample'
targetTableName = 'hudi_ticket_purchase_hist'

# S3 URIs always use "/" as the separator, so join the parts explicitly rather
# than with os.path.join, whose separator depends on the operating system.
targetPath = "/".join([hudiTablePath.rstrip("/"), targetDBName, targetTableName])

# Column used as the Hudi record key.
primaryKey = "sporting_event_ticket_id"

hudiStorageType = 'CoW'


In [None]:
# Create the target database unless it already exists.
spark.sql(f"CREATE DATABASE IF NOT EXISTS {targetDBName}")

In [None]:
# Hudi option dictionaries. This cell only defines them; no data is written here.

# Options that name the table, its record key and precombine field, and the
# Hive sync target.
commonConfig = {
    "className": "org.apache.hudi",
    "hoodie.datasource.hive_sync.use_jdbc": "false",
    "hoodie.datasource.write.precombine.field": "transaction_date_time",
    "hoodie.datasource.write.recordkey.field": primaryKey,
    "hoodie.table.name": targetTableName,
    "hoodie.consistency.check.enabled": "true",
    "hoodie.datasource.hive_sync.database": targetDBName,
    "hoodie.datasource.hive_sync.table": targetTableName,
    "hoodie.datasource.hive_sync.enable": "true",
    "hoodie.datasource.hive_sync.mode": "hms",
}

# Key generator and partition extractor classes for a non-partitioned table.
unpartitionDataConfig = {
    "hoodie.datasource.hive_sync.partition_extractor_class": "org.apache.hudi.hive.NonPartitionedExtractor",
    "hoodie.datasource.write.keygenerator.class": "org.apache.hudi.keygen.NonpartitionedKeyGenerator",
}

# Options selecting the `bulk_insert` write operation.
initLoadConfig = {
    "hoodie.bulkinsert.shuffle.parallelism": 3,
    "hoodie.datasource.write.operation": "bulk_insert",
}

# Options selecting the `upsert` write operation, plus cleaner settings.
incrementalConfig = {
    "hoodie.upsert.shuffle.parallelism": 20,
    "hoodie.datasource.write.operation": "upsert",
    "hoodie.cleaner.policy": "KEEP_LATEST_COMMITS",
    "hoodie.cleaner.commits.retained": 10,
}

# Column names presumably dropped from the input before writing -- the drop
# itself is not in this cell; confirm where this list is used.
dropColumnList = ["db", "table_name", "Op"]

In [None]:
# Read the raw CSV files, treating the first line as the header, and print the schema.
inputDf = spark.read.csv(rawS3TablePath, header=True)
inputDf.printSchema()


In [None]:
# Show the first ten rows without truncating long values.
inputDf.show(n=10, truncate=False)

## 4. Log in as an analyst
### Test the column-level permissions

In [None]:
%sm_analytics emr connect \
--cluster-id YOUR_EMR_CLUSTERID \
--auth-type Basic_Access \
--emr-execution-role-arn arn:aws:iam::633458367150:role/lf-data-access-analyst
# Connects to the EMR cluster using the `lf-data-access-analyst` execution role.
# `YOUR_EMR_CLUSTERID` is a placeholder: replace it with the "EMR ClusterId"
# value printed in step 1 before running this cell.
# NOTE(review): the account ID in this role ARN (633458367150) differs from the
# one used in the engineer connection and in the bucket names (312026938062) --
# confirm which account the analyst role lives in.
     

In [None]:
# List the databases that the current Spark session can see.
databases_df = spark.sql("show databases")
databases_df.show()


In [None]:
# Switch to the sample database, then preview ten rows of the Hudi table.
# The database and table names are written out literally rather than taken
# from variables defined in earlier cells.
spark.sql("use hudi_sample")
preview_df = spark.sql("SELECT * FROM hudi_sample.hudi_ticket_purchase_hist limit 10")
preview_df.show()