In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark import SparkConf
import boto3
import os

In [2]:
# Runtime configuration comes from the process environment.
# Each value is None when the corresponding variable is not set.
database_name = os.getenv("DB_NAME")
table_name = os.getenv("TB_NAME")
bucket_name = os.getenv("BUCKET_NAME")

In [3]:
# Builder-time settings that register the "AwsGlueCatalog" Iceberg catalog
# (Glue as the catalog implementation, S3FileIO for storage) and load the
# Iceberg SQL extensions.
iceberg_settings = {
    "spark.sql.catalog.AwsGlueCatalog": "org.apache.iceberg.spark.SparkCatalog",
    "spark.sql.catalog.AwsGlueCatalog.catalog-impl": "org.apache.iceberg.aws.glue.GlueCatalog",
    "spark.sql.catalog.AwsGlueCatalog.warehouse": "s3a://bd-datawarehouse/",
    "spark.sql.catalog.AwsGlueCatalog.io-impl": "org.apache.iceberg.aws.s3.S3FileIO",
    "spark.sql.extensions": "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
}

# Build (or reuse) the session with every setting applied to the builder
session_builder = SparkSession.builder.appName('sample_spark')
for conf_key, conf_value in iceberg_settings.items():
    session_builder = session_builder.config(conf_key, conf_value)
spark = session_builder.getOrCreate()

# Only WARN and above from here on
spark.sparkContext.setLogLevel("WARN")

# Intended to let Iceberg add new columns on write.
# NOTE(review): confirm this key is actually recognised by Iceberg/Spark.
spark.conf.set("spark.sql.iceberg.schema.evolution.enabled", "true")


SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/home/glue_user/spark/jars/slf4j-reload4j-1.7.36.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/home/glue_user/spark/jars/log4j-slf4j-impl-2.17.2.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/home/glue_user/aws-glue-libs/jars/slf4j-reload4j-1.7.36.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/home/glue_user/aws-glue-libs/jars/log4j-slf4j-impl-2.17.2.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.slf4j.impl.Reload4jLoggerFactory]
25/02/03 17:39:48 INFO SparkContext: Running Spark version 3.3.0-amzn-1
25/02/03 17:39:48 INFO ResourceUtils: No custom resources configured for spark.driver.
25/02/03 17:39:48 INFO SparkContext: Submitted application: sample_spark
25/02/03 17:39:48 INFO Reso

In [4]:
# Load the sample CSV: the first row is the header and column types are inferred
df = spark.read.options(inferSchema=True, header=True).csv('data/tips.csv')

In [5]:
# Stamp every row with the current timestamp in a 'created_at' column
df = df.withColumn('created_at', F.current_timestamp())

# Move 'created_at' to the front; the filter keeps it from appearing twice
df = df.select(["created_at"] + [name for name in df.columns if name != "created_at"])


In [6]:
# S3 client shared by the path helpers defined in this cell
s3 = boto3.client("s3")

# Target location: the warehouse bucket plus a "<database>/<table>/" key prefix.
# NOTE: this assignment replaces the BUCKET_NAME value read from the
# environment earlier in the notebook.
bucket_name = "bd-datawarehouse"
s3_path = "{}/{}/".format(database_name, table_name)  # Path inside the bucket

def s3_path_exists(bucket, path):
    """Report whether at least one object in ``bucket`` has a key starting with ``path``.

    Args:
        bucket: Name of the S3 bucket to query.
        path: Key prefix to look for.

    Returns:
        True if the ``list_objects_v2`` response contains a "Contents" entry,
        False otherwise.
    """
    # One matching key is enough to prove existence, so cap the listing at a
    # single entry instead of fetching a full page of keys.
    response = s3.list_objects_v2(Bucket=bucket, Prefix=path, MaxKeys=1)
    return "Contents" in response

def create_s3_path(bucket, path):
    """Make sure the prefix ``path`` is present in ``bucket``.

    When nothing exists under the prefix yet, a zero-byte
    ``placeholder.txt`` object is uploaded beneath it; otherwise nothing is
    written. A status line is printed in both cases.

    Args:
        bucket: Name of the S3 bucket.
        path: Key prefix; the placeholder key is ``path`` + "placeholder.txt".
    """
    if s3_path_exists(bucket, path):
        print(f"Path already exists: s3://{bucket}/{path}")
        return

    # Materialise the prefix by uploading an empty marker object
    marker_key = f"{path}placeholder.txt"
    s3.put_object(Bucket=bucket, Key=marker_key, Body=b"")
    print(f"Created path: s3://{bucket}/{path}")

# Ensure the table's prefix is present in the bucket before the table is created
create_s3_path(bucket=bucket_name, path=s3_path)

Created path: s3://bd-datawarehouse/database_name/table_name/


In [7]:
# Create the Glue database if it does not exist yet
glue_client = boto3.client('glue', region_name="eu-west-1")  # Change to your region

# get_databases is a paginated API: a single call only returns the first page,
# so an existing database on a later page would be missed and create_database
# would then fail. Walk every page before deciding.
existing_databases = [
    db['Name']
    for page in glue_client.get_paginator('get_databases').paginate()
    for db in page['DatabaseList']
]
if database_name not in existing_databases:
    glue_client.create_database(DatabaseInput={'Name': database_name})
    print(f"Database {database_name} created successfully.")
else:
    print(f"Database {database_name} already exists.")

Database database_name created successfully.


In [8]:
# DDL for the Iceberg table: created only if absent, partitioned by the year
# of created_at, stored under the database/table prefix of the warehouse bucket.
# NOTE(review): 'format', 'write_compression' and the 'vacuum_*' entries look
# like Athena-style property names — confirm Spark/Iceberg acts on them.
create_table_sql = f"""
    CREATE TABLE IF NOT EXISTS AwsGlueCatalog.{database_name}.{table_name} (
        created_at TIMESTAMP
    )
    USING iceberg
    PARTITIONED BY (year(created_at))
    LOCATION 's3a://bd-datawarehouse/{database_name}/{table_name}'
    TBLPROPERTIES (
        'table_type' = 'ICEBERG',
        'format' = 'parquet',        
        'vacuum_max_snapshot_age_seconds'='1296000',
        'vacuum_min_snapshots_to_keep'='4',
        'write_compression'='snappy',
        'write.spark.accept-any-schema'='true'
    )
"""

# Run the statement; the resulting DataFrame is the cell output
spark.sql(create_table_sql)

SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.


DataFrame[]

In [9]:
# Overwrite the Iceberg table with the dataframe. The mergeSchema option is
# passed with the intent that dataframe columns missing from the table are
# added automatically during the write.
target_table = f"AwsGlueCatalog.{database_name}.{table_name}"
iceberg_writer = (
    df.write.format("iceberg")
    .mode("overwrite")
    .option("catalog", "AwsGlueCatalog")
    .option("mergeSchema", "true")
)
iceberg_writer.save(target_table)


                                                                                

In [10]:
# Stop the SparkSession created earlier in the notebook
spark.stop()

In [None]:
# from pyspark.sql import SparkSession
# from pyspark.sql.functions import *  # Import Spark functions

# # Initialize a SparkSession
# spark = SparkSession.builder.appName("IcebergExample").getOrCreate()

# # Configure Iceberg (replace with your actual configuration)
# spark.conf.set("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
# spark.conf.set("spark.sql.catalog.iceberg", "org.apache.iceberg.spark.IcebergCatalog")
# spark.conf.set("spark.sql.catalog.iceberg.type", "hadoop") # or hive
# spark.conf.set("spark.sql.catalog.iceberg.warehouse", "s3a://your-iceberg-warehouse") # or hdfs://path

# # Create a sample DataFrame
# data = [("Alice", 25), ("Bob", 30), ("Charlie", 28)]
# df = spark.createDataFrame(data, ["name", "age"])

# # Write to Iceberg (create a new table or overwrite if it exists)
# df.write.format("iceberg").mode("overwrite").saveAsTable("iceberg.your_catalog.your_table") # iceberg.your_catalog is required. your_table is the table name.

# # Read from Iceberg
# iceberg_df = spark.read.format("iceberg").table("iceberg.your_catalog.your_table") # Read from iceberg.your_catalog.your_table

# iceberg_df.show()

# # Example Iceberg queries
# # You can use SQL queries to interact with Iceberg tables
# spark.sql("SELECT * FROM iceberg.your_catalog.your_table WHERE age > 25").show()

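# # Example of updating data in an Iceberg table
# # NOTE: "merge" is not a DataFrameWriter save mode (valid modes: append, overwrite, ignore, error/errorifexists);
# # row-level upserts on Iceberg are done with SQL `MERGE INTO` instead.
# updatesDF = spark.createDataFrame([("Alice", 26)], ["name", "age"]) # Create a dataframe with updates
# updatesDF.write.format("iceberg").mode("merge").option("mergeSchema", "true").saveAsTable("iceberg.your_catalog.your_table") # Merge the updates

# # Example of deleting data from an Iceberg table
# # NOTE: "delete" is not a DataFrameWriter save mode either; use SQL `DELETE FROM ... WHERE ...`.
# df.filter("age > 27").write.format("iceberg").mode("delete").saveAsTable("iceberg.your_catalog.your_table")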

# # Show the updated table
# iceberg_df = spark.read.format("iceberg").table("iceberg.your_catalog.your_table") # Read from iceberg.your_catalog.your_table
# iceberg_df.show()

# # Stop the SparkSession
# spark.stop()

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark import SparkConf
import boto3
import os