In [1]:
import logging
import sys
import os
from datetime import datetime
from decimal import Decimal
from IPython.display import display, HTML
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, DecimalType, LongType


# Logging configuration
formatter = logging.Formatter('[%(asctime)s] %(levelname)s @ line %(lineno)d: %(message)s')
handler = logging.StreamHandler(sys.stdout)
handler.setLevel(logging.INFO)
handler.setFormatter(formatter)
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.addHandler(handler)


# Application-specific variables
AppName = "Demo"
# Timestamp (local time, second resolution) used to make the Spark app name unique per run
dt_string = "{:%Y_%m_%d_%H_%M_%S}".format(datetime.now())
# AWS specific variables
# Region comes from the AWS_REGION environment variable, falling back to us-east-1
region = os.getenv('AWS_REGION', 'us-east-1')

# Replace S3_BUCKET and ACCOUNT_NUMBER with your own values
input_csv_path = "s3a://<S3_BUCKET>/s3table-example/input/"
# Ensure this table bucket exists
s3table_arn = "arn:aws:s3tables:{region}:<ACCOUNT_NUMBER>:bucket/doeks-spark-s3-tables".format(region=region)
namespace = "doeks_namespace"
table_name = "employee_s3_table"
# Fully qualified identifier: <catalog>.<namespace>.<table>
full_table_name = ".".join(("s3tablesbucket", namespace, table_name))

# Spark settings, applied to the session builder in the order listed below.
spark_settings = {
    # Iceberg SQL extensions and the "s3tablesbucket" catalog definition
    "spark.sql.extensions": "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
    "spark.sql.catalog.s3tablesbucket": "org.apache.iceberg.spark.SparkCatalog",
    "spark.sql.catalog.s3tablesbucket.type": "rest",
    "spark.sql.catalog.s3tablesbucket.warehouse": s3table_arn,
    "spark.sql.catalog.s3tablesbucket.uri": f"https://s3tables.{region}.amazonaws.com/iceberg",
    # SigV4 request signing for the REST catalog endpoint
    "spark.sql.catalog.s3tablesbucket.rest.sigv4-enabled": "true",
    "spark.sql.catalog.s3tablesbucket.rest.signing-name": "s3tables",
    "spark.sql.catalog.s3tablesbucket.rest.signing-region": region,
    "spark.sql.catalog.s3tablesbucket.io-impl": "org.apache.iceberg.aws.s3.S3FileIO",
    "spark.hadoop.fs.s3.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
    # Unqualified table names resolve against the s3tablesbucket catalog
    "spark.sql.defaultCatalog": "s3tablesbucket",
    # S3A filesystem options (used for the s3a:// input path)
    "spark.hadoop.fs.s3a.connection.timeout": "1200000",
    "spark.hadoop.fs.s3a.path.style.access": "true",
    "spark.hadoop.fs.s3a.connection.maximum": "200",
    "spark.hadoop.fs.s3a.fast.upload": "true",
    "spark.hadoop.fs.s3a.readahead.range": "256K",
    "spark.hadoop.fs.s3a.input.fadvise": "random",
    # Credentials are resolved through the web-identity token file provider
    "spark.hadoop.fs.s3a.aws.credentials.provider.mapping": "com.amazonaws.auth.WebIdentityTokenCredentialsProvider=software.amazon.awssdk.auth.credentials.WebIdentityTokenFileCredentialsProvider",
    "spark.hadoop.fs.s3a.aws.credentials.provider": "software.amazon.awssdk.auth.credentials.WebIdentityTokenFileCredentialsProvider",
}

# The application name carries the run timestamp so each run is distinguishable.
session_builder = SparkSession.builder.appName(f"{AppName}_{dt_string}")
for setting_key, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_key, setting_value)

spark = session_builder.getOrCreate()

In [2]:
# Make sure the target namespace exists in the s3tablesbucket catalog;
# IF NOT EXISTS keeps this cell safe to re-run.
create_namespace_sql = f"CREATE NAMESPACE IF NOT EXISTS s3tablesbucket.{namespace}"
spark.sql(create_namespace_sql)

DataFrame[]

In [3]:
# Load the CSV data under input_csv_path: the first row is treated as the
# header and column types are inferred from the data.
employee_df = (
    spark.read
    .format("csv")
    .options(header=True, inferSchema=True)
    .load(input_csv_path)
)
# Print the resulting schema so the inferred types can be checked.
employee_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- level: string (nullable = true)
 |-- salary: double (nullable = true)



In [4]:
# Create the destination Iceberg table (format version 2) if it is not there yet.
# The column list matches the schema printed for the CSV input above.
create_table_sql = f"""
    CREATE TABLE IF NOT EXISTS {full_table_name} (
        id INT,
        name STRING,
        level STRING,
        salary DOUBLE
    )
    USING iceberg
    OPTIONS ('format-version'='2')
"""
spark.sql(create_table_sql)

DataFrame[]

In [5]:
# Append the CSV rows to the Iceberg table.
# NOTE: this is an append, so every execution of this cell adds the rows again.
table_writer = employee_df.writeTo(full_table_name).using('iceberg')
table_writer.append()

In [6]:
# Load the table back through the Iceberg data source to verify the write
iceberg_reader = spark.read.format("iceberg")
iceberg_data_df = iceberg_reader.load(full_table_name)

In [7]:
# Preview the first 10 rows without truncating column values
iceberg_data_df.show(10, truncate=False)

# Report the total number of rows in the table
row_total = iceberg_data_df.count()
print(f"DataFrame count: {row_total}")

# List up to 10 entries from the table's history metadata table
history_df = spark.sql(f"SELECT * FROM {full_table_name}.history LIMIT 10")
history_df.show()

+---+-----------+------+--------+
|id |name       |level |salary  |
+---+-----------+------+--------+
|1  |Employee_1 |Senior|197000.0|
|2  |Employee_2 |Mid   |117500.0|
|3  |Employee_3 |Senior|54500.0 |
|4  |Employee_4 |Mid   |110000.0|
|5  |Employee_5 |Junior|59000.0 |
|6  |Employee_6 |Mid   |165500.0|
|7  |Employee_7 |Senior|137000.0|
|8  |Employee_8 |Junior|71000.0 |
|9  |Employee_9 |Exec  |140000.0|
|10 |Employee_10|Senior|129500.0|
+---+-----------+------+--------+
only showing top 10 rows

DataFrame count: 700
+--------------------+-------------------+-------------------+-------------------+
|     made_current_at|        snapshot_id|          parent_id|is_current_ancestor|
+--------------------+-------------------+-------------------+-------------------+
|2025-04-14 17:04:...|1247367129977964401|               NULL|               true|
|2025-04-14 17:04:...|2293528557878885471|1247367129977964401|               true|
|2025-04-14 17:29:...|8156129775284340201|2293528557878885471|