# In [None]:
import os

from pyspark.sql import SparkSession

# Export the AWS profile name before the session is created
# (presumably so the AWS SDK credential chain resolves it — confirm).
os.environ["AWS_PROFILE"] = "blueriver"

CATALOG = "glue_catalog"
ICEBERG_S3_ROOT_PATH = "s3a://blueriver-datalake/iceberg"

# Ordered (key, value) settings applied to the session builder below.
_session_settings = [
    # Iceberg catalog registered under CATALOG and used as the default catalog.
    ("spark.sql.defaultCatalog", CATALOG),
    (f"spark.sql.catalog.{CATALOG}", "org.apache.iceberg.spark.SparkCatalog"),
    (f"spark.sql.catalog.{CATALOG}.catalog-impl", "org.apache.iceberg.aws.glue.GlueCatalog"),
    (f"spark.sql.catalog.{CATALOG}.io-impl", "org.apache.iceberg.aws.s3.S3FileIO"),
    (f"spark.sql.catalog.{CATALOG}.warehouse", ICEBERG_S3_ROOT_PATH),
    (f"spark.sql.catalog.{CATALOG}.s3.path-style-access", True),
    # Iceberg SQL extensions.
    ("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions"),
    # Credentials provider class for the s3a filesystem.
    (
        "spark.hadoop.fs.s3a.aws.credentials.provider",
        "software.amazon.awssdk.auth.credentials.DefaultCredentialsProvider",
    ),
    # Session-wide SQL behaviour: case-sensitive identifiers, UTC time zone.
    ("spark.sql.caseSensitive", True),
    ("spark.sql.session.timeZone", "UTC"),
]

_session_builder = SparkSession.builder.appName("2")
for _setting_key, _setting_value in _session_settings:
    _session_builder = _session_builder.config(_setting_key, _setting_value)

spark = _session_builder.getOrCreate()


def process_batch(df_batch, batch_id: int) -> None:
    """Print the row count and ``last_applied_date`` values of one micro-batch.

    Empty batches are skipped without producing any output.

    Args:
        df_batch: DataFrame-like object holding the micro-batch; it must
            provide ``isEmpty()``, ``count()`` and ``select()``.
        batch_id: Identifier of the micro-batch supplied by the caller.
            Unused, but kept so the function matches the two-argument
            callback signature.
    """
    if df_batch.isEmpty():
        return

    print(df_batch.count())
    # show() writes the table to stdout itself and returns None, so it must
    # not be wrapped in print(): doing so emitted a stray "None" line after
    # every table.
    df_batch.select("last_applied_date").show(truncate=False)


# Options passed to the Iceberg streaming reader, applied in this order.
# NOTE(review): per the option names, overwrite/delete snapshots are skipped,
# each micro-batch is capped at 100 rows, and streaming starts from
# timestamp 0 — confirm exact semantics against the Iceberg documentation.
_reader_options = (
    ("streaming-skip-overwrite-snapshots", "true"),
    ("streaming-skip-delete-snapshots", "true"),
    ("streaming-max-rows-per-micro-batch", 100),
    ("stream-from-timestamp", "0"),
)

_stream_reader = spark.readStream.format("iceberg")
for _option_name, _option_value in _reader_options:
    _stream_reader = _stream_reader.option(_option_name, _option_value)

read_stream = _stream_reader.load("store_bronze.tb_lower")

# Hand every micro-batch to process_batch, triggering once per second and
# keeping the streaming checkpoint under the Iceberg S3 root path.
_stream_writer = read_stream.writeStream.outputMode("append")
_stream_writer = _stream_writer.foreachBatch(process_batch)
_stream_writer = _stream_writer.trigger(processingTime="1 seconds")
_stream_writer = _stream_writer.option(
    "checkpointLocation", f"{ICEBERG_S3_ROOT_PATH}/checkpoints/iceberg-cdc"
)
query = _stream_writer.start()

# Block until the streaming query terminates.
query.awaitTermination()