In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql import functions as F
from pyspark.sql import Window

# https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-3.3_2.12/
# https://iceberg.apache.org/multi-engine-support/#apache-spark

import os

# Build (or reuse) the Spark session for this notebook: S3A access to the MinIO
# endpoint plus an Iceberg catalog named `owshq` whose warehouse lives on s3a://.
spark = (SparkSession
    .builder
    .appName("etl-device-subscription")
    # Comma-separated Maven coordinates; joined explicitly so there is no
    # dangling trailing comma (an empty coordinate) in the final value.
    # NOTE(review): the Iceberg runtime artifact is expected to match the
    # Spark/Scala version in use -- confirm that 3.2_2.12 is correct here.
    .config("spark.jars.packages", ",".join([
        "org.apache.hadoop:hadoop-aws:3.3.1",
        "com.amazonaws:aws-java-sdk-bundle:1.12.709",
        "org.apache.iceberg:iceberg-spark-runtime-3.2_2.12:0.13.0",
    ]))
    # S3 / Minio
    # Credentials come from the environment when available so secrets do not
    # have to be committed with the notebook; the literals are only a fallback
    # that keeps the previous behavior when the variables are unset.
    .config("spark.hadoop.fs.s3a.access.key", os.environ.get("MINIO_ACCESS_KEY", "andreyolv"))
    .config("spark.hadoop.fs.s3a.secret.key", os.environ.get("MINIO_SECRET_KEY", "andreyolv"))
    .config("spark.hadoop.fs.s3a.endpoint", "http://minio.minio:9000")
    .config("spark.hadoop.fs.s3a.path.style.access", True)
    .config("spark.hadoop.fs.s3a.fast.upload", True)
    .config("spark.hadoop.fs.s3a.multipart.size", 104857600)  # 100 MiB
    # Uses the `spark.hadoop.` prefix like every other S3A option so the value
    # is forwarded to the Hadoop configuration the same way.
    .config("spark.hadoop.fs.s3a.connection.maximum", 100)
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
    # Iceberg
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    .config("spark.sql.catalog.owshq", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.owshq.s3.endpoint", "http://minio.minio:9000")
    # Catalog on S3
    .config("spark.sql.catalog.owshq.type", "hadoop")
    .config("spark.sql.catalog.owshq.warehouse", "s3a://lakehouse/iceberg")
    .getOrCreate()
)

In [2]:
# DDL for the demo table. IF NOT EXISTS lets this cell be re-run safely once
# the table is already present in the `owshq` catalog.
create_example_sql = """
        CREATE TABLE IF NOT EXISTS owshq.db.example 
        (
           id int, 
           name string,
           age int,
           city string,
           __op string
        ) 
        USING iceberg
"""

# Run the statement; the DataFrame returned by spark.sql is kept under the
# same name as before.
example_table_iceberg = spark.sql(create_example_sql)

In [3]:
# Schema for the sample DataFrame: five nullable columns, declared in the same
# order as the columns of the owshq.db.example table.
schema = (
    StructType()
    .add("id", IntegerType(), True)
    .add("name", StringType(), True)
    .add("age", IntegerType(), True)
    .add("city", StringType(), True)
    .add("__op", StringType(), True)
)

# Sample rows, one tuple per record, in schema order:
# (id, name, age, city, __op)
data = [
    (1, "Alice", 25, "New York", "r"),
    (2, "Joao", 30, "San Francisco", "r"),
    (3, "Flavio", 21, "Sao Paulo", "r"),
]

# Materialize the rows as a Spark DataFrame with the explicit schema.
df = spark.createDataFrame(data=data, schema=schema)

# Save the sample rows as the Iceberg table, using overwrite mode.
df.write.saveAsTable(
    "owshq.db.example",
    format="iceberg",
    mode="overwrite",
)

In [4]:
# Read the table back through the Iceberg source to check the write.
df2 = spark.read.load("owshq.db.example", format="iceberg")

# Print the rows that were read.
df2.show()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- city: string (nullable = true)
 |-- __op: string (nullable = true)

