# Load data incrementally from an Apache Hudi dataset to Amazon Redshift


####  Run this cell to set up and start your interactive session.


In [None]:
%session_id_prefix hudi-redshift-incremental-
%idle_timeout 2880
%glue_version 4.0
%worker_type G.1X
%number_of_workers 5
%connections redshift
%%configure
{
    "--datalake-formats": "hudi"
}

#### Configure your resources

In [None]:
# Account and region are interpolated into the Glue job ARN and the default
# temporary S3 bucket name further down in this notebook.
AWS_ACCOUNT_ID = "123456789101"
REGION = "us-east-1"

# S3 location of the source Hudi table; replace the placeholders before running.
HUDI_DATASET_PATH = "s3://<Your S3 bucket>/<Your S3 prefix>/hudi_incremental/ghcn/"

# Glue connection used to look up the Redshift JDBC URL and credentials.
REDSHIFT_CONNECTION_NAME = "redshift"
# Built from AWS_ACCOUNT_ID so that changing the account in one place keeps the
# role ARN in sync; adjust the role name if yours differs.
REDSHIFT_IAM_ROLE_ARN = f"arn:aws:iam::{AWS_ACCOUNT_ID}:role/RedshiftSpectrumRole"
REDSHIFT_SCHEMA = "public"
REDSHIFT_TABLE_NAME = "ghcn"
# Columns compared in the MERGE ... ON clause to match staged rows to target rows.
REDSHIFT_TABLE_PRIMARY_KEYS = ["ID", "ELEMENT"]

#### Initialize SparkSession and GlueContext

In [None]:
import sys
from datetime import datetime
import boto3
from botocore.exceptions import ClientError
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job

# Reuse the running SparkContext (or create one) and wrap it in a GlueContext.
sc = SparkContext.getOrCreate()
glueContext = GlueContext(sc)
spark = glueContext.spark_session

# Only ask getResolvedOptions for the options that were actually passed on the
# command line; in an interactive session neither is normally present.
params = [name for name in ('JOB_NAME', 'TempDir') if f'--{name}' in sys.argv]
args = getResolvedOptions(sys.argv, params)

# Fall back to a fixed name when no (or an empty) job name was supplied.
job_name = args['JOB_NAME'] if 'JOB_NAME' in args else None
if not job_name:
    job_name = "hudi-ghcn-incremental-load-notebook"

# temp_dir must always be bound before it is tested: previously it was only
# assigned when --TempDir was passed, so the check below raised NameError
# whenever the option was absent.
temp_dir = args['TempDir'] if 'TempDir' in args else None
if not temp_dir:
    temp_dir = f"s3://aws-glue-assets-{AWS_ACCOUNT_ID}-{REGION}/temporary/"

# JDBC URL and credentials for Redshift, taken from the Glue connection.
jdbc_conf = glueContext.extract_jdbc_conf(connection_name=REDSHIFT_CONNECTION_NAME)


#### Determine target time range for incremental query

In [None]:
# The end of the previous run's query window is stored as a tag on the Glue job;
# it becomes the start of this run's window.
glue = boto3.client('glue')

try:
    tags_response = glue.get_tags(
        ResourceArn=f"arn:aws:glue:{REGION}:{AWS_ACCOUNT_ID}:job/{job_name}"
    )
    has_checkpoint = 'Tags' in tags_response and 'lastQueryEndTime' in tags_response['Tags']
    # "000" is the fallback begin time used when no checkpoint tag exists (retrieve all).
    beginTime = tags_response['Tags']['lastQueryEndTime'] if has_checkpoint else "000"
except Exception as e:
    raise Exception("Failed to retrieve lastQueryEndTime tag via get_tags: " + str(e))

# Current time formatted as yyyyMMddHHmmss, used as the upper bound of this run.
endTime = datetime.now().strftime("%Y%m%d%H%M%S")

print(f"beginTime: {beginTime}\nendTime: {endTime}")

#### Run query

In [None]:
# Incremental Hudi read bounded by the begin/end instant times computed earlier.
incremental_reader = (
    spark.read.format("hudi")
    .option("hoodie.datasource.query.type", "incremental")
    .option("hoodie.datasource.read.begin.instanttime", beginTime)
    .option("hoodie.datasource.read.end.instanttime", endTime)
)
df = incremental_reader.load(HUDI_DATASET_PATH)

In [None]:
# Preview the rows returned by the incremental query before they are written out.
df.show()

#### Merge changes into destination table

In [None]:
# Column names of the incremental DataFrame, in schema order; they drive the
# UPDATE SET and INSERT VALUES lists of the MERGE statement.
column_names = df.columns
print(column_names)

In [None]:
# Staging table that the DataFrame is written to before being merged.
tmp_table_name = f"{REDSHIFT_TABLE_NAME}_tmp"

# Fail fast on inputs that would yield syntactically invalid SQL: an empty key
# list gives "... ON  WHEN MATCHED ..." and an empty column list gives
# "UPDATE SET  ... INSERT VALUES ()". Without these checks the error would only
# surface when the statement is executed.
if not REDSHIFT_TABLE_PRIMARY_KEYS:
    raise ValueError("REDSHIFT_TABLE_PRIMARY_KEYS must contain at least one column name")
if not column_names:
    raise ValueError("column_names is empty; cannot build the MERGE statement")

target_table = f"{REDSHIFT_SCHEMA}.{REDSHIFT_TABLE_NAME}"
staging_table = f"{REDSHIFT_SCHEMA}.{tmp_table_name}"

# Rows are matched on every configured primary-key column.
on_clause = ' AND '.join(
    f"{target_table}.{pk} = {staging_table}.{pk}" for pk in REDSHIFT_TABLE_PRIMARY_KEYS
)
# Matched rows have every column overwritten from the staging table.
set_clause = ', '.join(f"{col} = {staging_table}.{col}" for col in column_names)
# Unmatched rows are inserted positionally, in DataFrame column order.
values_clause = ', '.join(f"{staging_table}.{col}" for col in column_names)

# One statement list: create the target if it does not exist (same layout as
# the staging table), merge the staged rows into it, then drop the staging table.
post_actions = (
    f"BEGIN; CREATE TABLE IF NOT EXISTS {target_table} (LIKE {staging_table}); "
    f"MERGE INTO {target_table} USING {staging_table} ON {on_clause}"
    f" WHEN MATCHED THEN UPDATE SET {set_clause}"
    f" WHEN NOT MATCHED THEN INSERT VALUES ({values_clause}"
    f"); DROP TABLE {staging_table}; END;"
)

print(f"post_actions: {post_actions}")

In [None]:
# Write the incremental rows to the staging table, then let the connector run
# post_actions (merge into the target and drop the staging table).
# The staging table is schema-qualified so that it is created in the same schema
# that post_actions refers to; an unqualified name would depend on the
# connection's search_path and could miss a non-default REDSHIFT_SCHEMA.
(
    df.write
    .format("io.github.spark_redshift_community.spark.redshift")
    .option("url", jdbc_conf["fullUrl"])
    .option("user", jdbc_conf["user"])
    .option("password", jdbc_conf["password"])
    .option("dbtable", f"{REDSHIFT_SCHEMA}.{tmp_table_name}")
    .option("postactions", post_actions)
    .option("tempdir", temp_dir)
    .option("aws_iam_role", REDSHIFT_IAM_ROLE_ARN)
    # "error" mode: do not overwrite or append to an existing staging table.
    .mode("error")
    .save()
)

#### Update the last query end time

In [None]:
# Persist this run's upper bound on the Glue job so that the next run can use
# it as the start of its incremental window.
tag = {"lastQueryEndTime": endTime}

try:
    glue.tag_resource(
        ResourceArn=f"arn:aws:glue:{REGION}:{AWS_ACCOUNT_ID}:job/{job_name}",
        TagsToAdd=tag,
    )
except Exception as e:
    # The message names the API that was actually called (tag_resource), and the
    # original exception is chained so its traceback is kept.
    raise Exception("Failed to update lastQueryEndTime tag via tag_resource: " + str(e)) from e