In [None]:
%env AWS_REGION = us-east-1
%env AWS_ACCESS_KEY_ID = xxxxxy
%env AWS_SECRET_ACCESS_KEY = xxxxxy
%env AWS_SESSION_TOKEN = xxxxxy
%env AWS_DEFAULT_REGION = us-east-1
%env AWS_DEFAULT_OUTPUT = json

In [2]:
import os
from pyspark.sql import SparkSession
from pyspark import SparkConf, __version__ as pyspark_version

# Point this process (and any child process it starts) at the JDK below by
# exporting JAVA_HOME.
# NOTE(review): this is an absolute, machine-specific path (it looks like a
# macOS Amazon Corretto 17 install) -- adjust it when running elsewhere.
JAVA_HOME = "/Library/Java/JavaVirtualMachines/amazon-corretto-17.jdk/Contents/Home"
os.environ.update({"JAVA_HOME": JAVA_HOME})

In [5]:
# Assemble the Maven coordinates passed to Spark through `--packages`:
# the Iceberg Spark runtime, the AWS SDK v2 modules listed below (the original
# author notes that Glue-backed Iceberg tables also need the DynamoDB module),
# and Spark's hadoop-cloud module.

aws_bundles = ["kms", "glue", "dynamodb", "sts", "s3", "url-connection-client"]
aws_version = "2.21.24"
aws_jars = ",".join(
    f"software.amazon.awssdk:{pkg}:{aws_version}" for pkg in aws_bundles
)

iceberg_version = "1.4.1"
scala_version = "2.12"

# The Iceberg runtime artifact is named after Spark's "major.minor" line.
# Take the first two components explicitly: dropping only the last component
# would turn a version such as "3.5.0.dev0" into "3.5.0".
main_pyspark_version = ".".join(pyspark_version.split(".")[:2])

spark_packages = ",".join(
    [
        f"org.apache.iceberg:iceberg-spark-runtime-{main_pyspark_version}_{scala_version}:{iceberg_version}",
        aws_jars,
        # hadoop-cloud is pinned to the exact installed PySpark version.
        f"org.apache.spark:spark-hadoop-cloud_{scala_version}:{pyspark_version}",
    ]
)

# The trailing "pyspark-shell" token is part of the original argument string
# and is kept as-is.
os.environ["PYSPARK_SUBMIT_ARGS"] = f"--packages {spark_packages} pyspark-shell"


In [6]:
# Build (or reuse) the Spark session. The settings below enable the Iceberg SQL
# extensions and define a catalog named "glue_catalog" that uses Iceberg's
# GlueCatalog implementation with S3FileIO and the given S3 warehouse path.
iceberg_session_conf = {
    "spark.sql.extensions": "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
    "spark.sql.catalog.glue_catalog": "org.apache.iceberg.spark.SparkCatalog",
    "spark.sql.catalog.glue_catalog.warehouse": "s3://jegp-emr-lakeformation-useast1-v2/jegp_db.db/",
    "spark.sql.catalog.glue_catalog.catalog-impl": "org.apache.iceberg.aws.glue.GlueCatalog",
    "spark.sql.catalog.glue_catalog.io-impl": "org.apache.iceberg.aws.s3.S3FileIO",
}

session_builder = SparkSession.builder.appName("Iceberg Table Registration")
for conf_key, conf_value in iceberg_session_conf.items():
    session_builder = session_builder.config(conf_key, conf_value)

spark = session_builder.getOrCreate()


In [None]:
# Ensure the target database exists in the "glue_catalog" catalog.
# The IF NOT EXISTS clause keeps this cell safe to run more than once.
database_name = "jegp_db"
create_db_query = "CREATE DATABASE IF NOT EXISTS glue_catalog.{}".format(database_name)
spark.sql(create_db_query)

In [None]:
# Register an Iceberg table that already exists on S3 in the Glue catalog by
# pointing the catalog at one of the table's metadata files.
existing_table_location = "s3://jegp-emr-lakeformation-useast1-v2/jegp_db.db/customer"
existing_metadata_file = "00000-bd40ccb8-e67d-451f-b30a-8dcd1d1dab0d.metadata.json"
new_table_name = "customer"

register_table_query = f"""
CALL glue_catalog.system.register_table(
    table => '{database_name}.{new_table_name}',
    metadata_file => '{existing_table_location}/metadata/{existing_metadata_file}')
"""

# register_table fails when the identifier is already present in the catalog,
# which would break re-running this notebook. Look the table up first and only
# register it when it is missing.
already_registered = (
    spark.sql(
        f"SHOW TABLES IN glue_catalog.{database_name} LIKE '{new_table_name}'"
    ).count()
    > 0
)

if already_registered:
    print(
        f"Table glue_catalog.{database_name}.{new_table_name} is already registered; "
        "skipping register_table."
    )
else:
    spark.sql(register_table_query)

# Verify the table is readable through the catalog.
spark.sql(f"SELECT * FROM glue_catalog.{database_name}.{new_table_name} LIMIT 5").show()
