In [64]:
%%sh
pip install sagemaker-studio-analytics-extension
%load_ext sagemaker_studio_analytics_extension.magics


Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com


sh: line 1: import: command not found
sh: line 3: fg: no job control


CalledProcessError: Command 'b'import os\npip install sagemaker-studio-analytics-extension\n%load_ext sagemaker_studio_analytics_extension.magics\n'' returned non-zero exit status 1.

## 1. Get EMR Cluster ID

In [70]:
%%sh
export CLUSTER
CLUSTER=$(aws emr list-clusters --active  --query 'Clusters[?contains(Name,`emr-roadshow-runtime-role-lf`)].Id' --output text)

echo $CLUSTER


j-3U8F7PV6UJJ6U


## 2. List the IAM role for each persona

In [90]:
%%sh
export ENGINEER_ROLE
export ANALYST_ROLE

ENGINEER_ROLE=$(aws iam list-roles --query 'Roles[?contains(RoleName,`engineer`)].Arn' --output text)
ANALYST_ROLE=$(aws iam list-roles --query 'Roles[?contains(RoleName,`analyst`)].Arn' --output text)

echo $ENGINEER_ROLE
echo $ANALYST_ROLE


arn:aws:iam::785675716968:role/lf-data-access-engineer
arn:aws:iam::785675716968:role/lf-data-access-analyst


In [102]:
import os
os.environ['MYPATH']=$ENGINEER_ROLE


The code failed because of a fatal error:
	Error sending http request and maximum retry encountered..

Some things to try:
a) Make sure Spark has enough available resources for Jupyter to create a Spark context.
b) Contact your Jupyter administrator to make sure the Spark magics library is configured correctly.
c) Restart the kernel.


In [99]:
%local 
# Sanity check: print MYPATH through a shell command.
# NOTE(review): `%local` is presumably sparkmagic's marker for running the cell
# in the notebook kernel rather than on the Spark cluster - confirm.
!echo $MYPATH

`arn:aws:iam::785675716968:role/lf-data-access-engineer`


## 3. Submit a job to EMR as a data engineer
### The data engineer role can create databases

In [88]:
%sm_analytics emr connect \
--cluster-id $CLUSTER \
--auth-type Basic_Access \
--emr-execution-role-arn $ENGINEER_ROLE
# Connects this notebook to the EMR cluster using the engineer execution role.
# NOTE(review): `$CLUSTER` and `$ENGINEER_ROLE` are presumably expanded by
# IPython from Python variables in the kernel namespace, not from variables
# exported in a `%%sh` cell. If they are undefined the literal text is passed
# through, which would explain an InvalidArnException - confirm both variables
# exist in the kernel before running this cell.

Note: The next major version update will change the default for `--verify-certificate` from False to True, enabling SSL verification for HTTPS connections by default. Refer to %sm_analytics? for more details about options supported by `--verify-certificate` 


InvalidArnException: ARNs must be of the form arn:partition:service:region:accountId:resource

In [None]:
%%configure -f
{
  "conf": {
    "spark.jars": "hdfs:///apps/hudi/lib/hudi-spark-bundle.jar",
    "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.hudi.catalog.HoodieCatalog",
    "spark.sql.extensions": "org.apache.spark.sql.hudi.HoodieSparkSessionExtension"
  }
}

In [None]:
import os
from pyspark.sql.functions import concat, col, lit, to_timestamp, dense_rank, desc, count, rand, when
from pyspark.sql.window import Window
from pyspark.sql.types import StringType


# S3 locations used by this notebook.
rawS3TablePath = "s3://cpa-lakeformation-sagemaker-poc/data_lake_location/tickets/dms_sample/ticket_purchase_hist/"  # CSV input read below
hudiTablePath = "s3://cpa-lakeformation-sagemaker-poc/data_lake_location/hudi/"  # root under which the target table path is built
cdcTablePath = "s3://cpa-lakeformation-sagemaker-poc/data_lake_location/cdc/dms_sample/ticket_purchase_hist/"  # NOTE(review): presumably CDC input; not used in the visible cells

# Target database/table names; also used for the Hive sync settings.
targetDBName = 'cpa_hudi_sample'
targetTableName = 'cpa_hudi_ticket_purchase_hist'

# Build the S3 URI with explicit "/" separators. os.path.join is meant for local
# filesystem paths and uses the host OS separator, so it is not a safe way to
# compose URIs. rstrip avoids a doubled slash after the root prefix.
targetPath = "/".join([hudiTablePath.rstrip("/"), targetDBName, targetTableName])

# Column used as the Hudi record key.
primaryKey = "sporting_event_ticket_id"

# NOTE(review): presumably the Hudi copy-on-write storage type; not referenced
# in the visible cells - confirm it is still needed.
hudiStorageType = 'CoW'


In [None]:
# Create the target database if it is not already present.
spark.sql(f"CREATE DATABASE IF NOT EXISTS {targetDBName}")

In [None]:
# Hudi writer option sets, kept as separate dicts.

# Options describing the target table and its Hive sync settings.
commonConfig = {
    "className": "org.apache.hudi",
    "hoodie.datasource.hive_sync.use_jdbc": "false",
    "hoodie.datasource.write.precombine.field": "transaction_date_time",
    "hoodie.datasource.write.recordkey.field": primaryKey,
    "hoodie.table.name": targetTableName,
    "hoodie.consistency.check.enabled": "true",
    "hoodie.datasource.hive_sync.database": targetDBName,
    "hoodie.datasource.hive_sync.table": targetTableName,
    "hoodie.datasource.hive_sync.enable": "true",
    "hoodie.datasource.hive_sync.mode": "hms",
}

# Key generator / partition extractor classes for a non-partitioned table.
unpartitionDataConfig = {
    "hoodie.datasource.hive_sync.partition_extractor_class": "org.apache.hudi.hive.NonPartitionedExtractor",
    "hoodie.datasource.write.keygenerator.class": "org.apache.hudi.keygen.NonpartitionedKeyGenerator",
}

# Options selecting the bulk_insert write operation.
initLoadConfig = {
    "hoodie.bulkinsert.shuffle.parallelism": 3,
    "hoodie.datasource.write.operation": "bulk_insert",
}

# Options selecting the upsert write operation, plus cleaner settings.
incrementalConfig = {
    "hoodie.upsert.shuffle.parallelism": 20,
    "hoodie.datasource.write.operation": "upsert",
    "hoodie.cleaner.policy": "KEEP_LATEST_COMMITS",
    "hoodie.cleaner.commits.retained": 10,
}

# NOTE(review): presumably columns to drop before writing; not used in the
# visible cells - confirm.
dropColumnList = ["db", "table_name", "Op"]

In [None]:
# Load the raw CSV data (first row is the header) and print the resulting schema.
csv_reader = spark.read.option("header", True)
inputDf = csv_reader.csv(rawS3TablePath)
inputDf.printSchema()


In [None]:
# Preview the first 10 rows without truncating column values.
inputDf.show(n=10, truncate=False)

## 4. Log in as an analyst
### Test the column-level permissions

In [None]:
%sm_analytics emr connect \
--cluster-id j-3NK7CP7TFAEY6 \
--auth-type Basic_Access \
--emr-execution-role-arn arn:aws:iam::633458367150:role/lf-data-access-analyst
# Connects this notebook to EMR using the analyst execution role.
# NOTE(review): the cluster ID and AWS account here are hard-coded and differ
# from the cluster ID (j-3U8F7PV6UJJ6U) and account (785675716968) printed by
# the lookup cells earlier in this notebook - confirm these values, or derive
# them from the same lookups used for the engineer connection.
     

In [None]:
# List the databases visible to the current session.
databases_df = spark.sql("show databases")
databases_df.show()


In [None]:
# Switch to the Hudi sample database, then preview a few rows of the table.
spark.sql("use cpa_hudi_sample")

# To inspect the table metadata instead, run:
#   spark.sql("desc formatted cpa_hudi_ticket_purchase_hist").show(100, False)
preview_df = spark.sql("SELECT * FROM cpa_hudi_sample.cpa_hudi_ticket_purchase_hist limit 10")
preview_df.show()