In [4]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark import SparkConf
import boto3
import os

In [5]:
# Pull the Glue database name, table name and bucket name from the environment.
# os.environ.get returns None for a variable that is not set, so each of these
# may be None.
# NOTE(review): later cells interpolate these into S3 prefixes and SQL text —
# confirm DB_NAME / TB_NAME are always exported before the notebook runs.
database_name, table_name, bucket_name = (
    os.environ.get(variable)
    for variable in ("DB_NAME", "TB_NAME", "BUCKET_NAME")
)

In [7]:
# Build the SparkSession with an Iceberg catalog named "AwsGlueCatalog" that is
# backed by the Glue catalog implementation and S3FileIO.
#
# A recorded run of this cell failed with UnsupportedClassVersionError: the
# Iceberg extensions class was compiled to class file version 55.0 while the
# running JVM only accepted versions up to 52.0. The JVM and the Iceberg
# runtime jar on the classpath have to be aligned before this cell can succeed.
catalog_key = "spark.sql.catalog.AwsGlueCatalog"

spark = (
    SparkSession.builder
    .appName('sample_spark')
    .config(catalog_key, "org.apache.iceberg.spark.SparkCatalog")
    .config(f"{catalog_key}.catalog-impl", "org.apache.iceberg.aws.glue.GlueCatalog")
    .config(f"{catalog_key}.warehouse", "s3a://bd-datawarehouse/")
    .config(f"{catalog_key}.io-impl", "org.apache.iceberg.aws.s3.S3FileIO")
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    .getOrCreate()
)

# Keep driver logs at WARN and above.
spark.sparkContext.setLogLevel("WARN")

# Intended to let Iceberg add new columns on write.
# NOTE(review): confirm this key is actually recognised by the Iceberg runtime in use.
spark.conf.set("spark.sql.iceberg.schema.evolution.enabled", "true")


Py4JJavaError: An error occurred while calling None.org.apache.spark.sql.SparkSession.
: java.lang.UnsupportedClassVersionError: org/apache/iceberg/spark/extensions/IcebergSparkSessionExtensions has been compiled by a more recent version of the Java Runtime (class file version 55.0), this version of the Java Runtime only recognizes class file versions up to 52.0
	at java.lang.ClassLoader.defineClass1(Native Method)
	at java.lang.ClassLoader.defineClass(ClassLoader.java:756)
	at java.security.SecureClassLoader.defineClass(SecureClassLoader.java:142)
	at java.net.URLClassLoader.defineClass(URLClassLoader.java:473)
	at java.net.URLClassLoader.access$100(URLClassLoader.java:74)
	at java.net.URLClassLoader$1.run(URLClassLoader.java:369)
	at java.net.URLClassLoader$1.run(URLClassLoader.java:363)
	at java.security.AccessController.doPrivileged(Native Method)
	at java.net.URLClassLoader.findClass(URLClassLoader.java:362)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:418)
	at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:352)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:405)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:351)
	at java.lang.Class.forName0(Native Method)
	at java.lang.Class.forName(Class.java:348)
	at org.apache.spark.util.Utils$.classForName(Utils.scala:219)
	at org.apache.spark.sql.SparkSession$.$anonfun$applyExtensions$1(SparkSession.scala:1221)
	at org.apache.spark.sql.SparkSession$.$anonfun$applyExtensions$1$adapted(SparkSession.scala:1219)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.sql.SparkSession$.org$apache$spark$sql$SparkSession$$applyExtensions(SparkSession.scala:1219)
	at org.apache.spark.sql.SparkSession.<init>(SparkSession.scala:106)
	at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
	at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
	at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
	at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:238)
	at py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
	at py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:750)


In [None]:
# Load the sample CSV: the first row supplies the column names and Spark
# infers the column types from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('data/tips.csv')
)

In [None]:
# Stamp every row with the current timestamp in a 'created_at' column.
stamped = df.withColumn('created_at', F.current_timestamp())

# Move 'created_at' to the front; every other column keeps its relative order
# and 'created_at' is listed exactly once.
remaining = [name for name in stamped.columns if name != "created_at"]
df = stamped.select("created_at", *remaining)


In [None]:
# S3 client shared by the helper functions defined in this cell.
s3 = boto3.client("s3")

# Bucket that holds the warehouse data.
# NOTE(review): this assignment replaces whatever was read from the BUCKET_NAME
# environment variable earlier in the notebook — confirm that is intended.
bucket_name = "bd-datawarehouse"

# Key prefix of the table inside the bucket: "<database>/<table>/".
s3_path = "{}/{}/".format(database_name, table_name)

def s3_path_exists(bucket, path):
    """Return True if at least one object in `bucket` has a key starting with `path`.

    Uses the module-level `s3` client.

    Args:
        bucket: Name of the S3 bucket to query.
        path: Key prefix to look for.

    Returns:
        bool: True when the listing contains at least one object, else False.
    """
    # One matching key is enough to prove existence, so cap the listing at a
    # single entry instead of fetching a full page of keys.
    response = s3.list_objects_v2(Bucket=bucket, Prefix=path, MaxKeys=1)
    # A missing or empty "Contents" entry both mean nothing matched the prefix.
    return bool(response.get("Contents"))

def create_s3_path(bucket, path):
    """Ensure the prefix `path` is present in `bucket`.

    If `s3_path_exists` reports nothing under the prefix, an empty object
    named "placeholder.txt" is uploaded under it; otherwise nothing is
    written. A message describing the outcome is printed either way.

    Args:
        bucket: Name of the S3 bucket.
        path: Key prefix; the placeholder key is `path` + "placeholder.txt".
    """
    if s3_path_exists(bucket, path):
        print(f"Path already exists: s3://{bucket}/{path}")
        return

    # Zero-byte marker object so the prefix shows up in listings.
    marker_key = f"{path}placeholder.txt"
    s3.put_object(Bucket=bucket, Key=marker_key, Body=b"")
    print(f"Created path: s3://{bucket}/{path}")

# Make sure the table's prefix is present in the bucket (the helper only
# uploads a placeholder when nothing is found under the prefix).
create_s3_path(bucket=bucket_name, path=s3_path)

In [None]:
# Create the Glue database if it does not exist yet.
glue_client = boto3.client('glue', region_name="eu-west-1")  # Change to your region

# Look the database up by name instead of scanning get_databases(): that call
# returns results in pages, so checking only its first response can miss an
# existing database and make create_database fail.
try:
    glue_client.get_database(Name=database_name)
except glue_client.exceptions.EntityNotFoundException:
    # Glue reports a missing database with EntityNotFoundException.
    glue_client.create_database(DatabaseInput={'Name': database_name})
    print(f"Database {database_name} created successfully.")
else:
    print(f"Database {database_name} already exists.")

In [None]:
# Create the Iceberg table if it does not exist, partitioned by the year of
# created_at. Only the created_at column is declared here.
# 1296000 seconds is 15 days.
# NOTE(review): 'format', 'write_compression' and the 'vacuum_*' keys are
# passed through as table properties — confirm the Iceberg runtime in use
# acts on these exact names.
create_table_sql = f"""
    CREATE TABLE IF NOT EXISTS AwsGlueCatalog.{database_name}.{table_name} (
        created_at TIMESTAMP
    )
    USING iceberg
    PARTITIONED BY (year(created_at))
    LOCATION 's3a://bd-datawarehouse/{database_name}/{table_name}'
    TBLPROPERTIES (
        'table_type' = 'ICEBERG',
        'format' = 'parquet',        
        'vacuum_max_snapshot_age_seconds'='1296000',
        'vacuum_min_snapshots_to_keep'='4',
        'write_compression'='snappy',
        'write.spark.accept-any-schema'='true'
    )
"""
spark.sql(create_table_sql)

In [None]:
# Write data to table
# df.writeTo(f'glue_catalog.{database_name}.{table_name}')

# df.write.parquet('df_big_rep_parquet')

# df.write.format("iceberg") \
#     .mode("overwrite") \
#     .option("catalog", "AwsGlueCatalog") \
#     .option("table", f"AwsGlueCatalog.{database_name}.{table_name}") \
#     .option("path", f"s3://bd-datawarehouse/warehouse/dw/table_name/") \
#     .save()

In [None]:
# Overwrite the Iceberg table with the dataframe. The mergeSchema write option
# is set so that dataframe columns missing from the table can be added on
# write (schema evolution).
target_table = f"AwsGlueCatalog.{database_name}.{table_name}"

(
    df.write
    .format("iceberg")
    .mode("overwrite")
    .option("catalog", "AwsGlueCatalog")
    .option("mergeSchema", "true")
    .save(target_table)
)


In [None]:
# Stop the Spark session now that the preceding cells have finished with it.
spark.stop()

In [None]:
# from pyspark.sql import SparkSession
# from pyspark.sql.functions import *  # Import Spark functions

# # Initialize a SparkSession
# spark = SparkSession.builder.appName("IcebergExample").getOrCreate()

# # Configure Iceberg (replace with your actual configuration)
# spark.conf.set("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
# spark.conf.set("spark.sql.catalog.iceberg", "org.apache.iceberg.spark.IcebergCatalog")
# spark.conf.set("spark.sql.catalog.iceberg.type", "hadoop") # or hive
# spark.conf.set("spark.sql.catalog.iceberg.warehouse", "s3a://your-iceberg-warehouse") # or hdfs://path

# # Create a sample DataFrame
# data = [("Alice", 25), ("Bob", 30), ("Charlie", 28)]
# df = spark.createDataFrame(data, ["name", "age"])

# # Write to Iceberg (create a new table or overwrite if it exists)
# df.write.format("iceberg").mode("overwrite").saveAsTable("iceberg.your_catalog.your_table") # iceberg.your_catalog is required. your_table is the table name.

# # Read from Iceberg
# iceberg_df = spark.read.format("iceberg").table("iceberg.your_catalog.your_table") # Read from iceberg.your_catalog.your_table

# iceberg_df.show()

# # Example Iceberg queries
# # You can use SQL queries to interact with Iceberg tables
# spark.sql("SELECT * FROM iceberg.your_catalog.your_table WHERE age > 25").show()

# # Example of updating data in an Iceberg table
# updatesDF = spark.createDataFrame([("Alice", 26)], ["name", "age"]) # Create a dataframe with updates
# updatesDF.write.format("iceberg").mode("merge").option("mergeSchema", "true").saveAsTable("iceberg.your_catalog.your_table") # Merge the updates

# # Example of deleting data from an Iceberg table
# df.filter("age > 27").write.format("iceberg").mode("delete").saveAsTable("iceberg.your_catalog.your_table")

# # Show the updated table
# iceberg_df = spark.read.format("iceberg").table("iceberg.your_catalog.your_table") # Read from iceberg.your_catalog.your_table
# iceberg_df.show()

# # Stop the SparkSession
# spark.stop()

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark import SparkConf
import boto3
import os