In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql import functions as F
from pyspark.sql import Window

# Name under which the Iceberg catalog is registered with Spark
# ('local' is the alternative value noted by the author).
catalog_name = 'testdb' # 'local'

# All catalog-specific settings share this configuration key prefix.
catalog_conf_prefix = f"spark.sql.catalog.{catalog_name}"

# Session options, applied to the builder in the order listed.
session_options = {
    # Iceberg runtime artifact for Spark 3.3 / Scala 2.12.
    "spark.jars.packages": "org.apache.iceberg:iceberg-spark-runtime-3.3_2.12:1.2.1",
    "spark.sql.catalog.spark_catalog.type": "hive",
    # Hadoop-type Iceberg catalog whose warehouse is a local directory.
    catalog_conf_prefix: "org.apache.iceberg.spark.SparkCatalog",
    f"{catalog_conf_prefix}.type": "hadoop",
    f"{catalog_conf_prefix}.warehouse": "/home/jovyan/warehouse2",
}

session_builder = SparkSession.builder.appName("IcebergSession")
for option_key, option_value in session_options.items():
    session_builder = session_builder.config(option_key, option_value)

# Reuses an already-running session if one exists, otherwise starts a new one.
spark = session_builder.getOrCreate()

In [3]:
# Create the example Iceberg table (no-op if it already exists).
# The table is addressed through `catalog_name`, the only Iceberg catalog
# registered on the session, instead of a hard-coded catalog that may not
# be configured.
example_table_iceberg = spark.sql(f"""
        CREATE TABLE IF NOT EXISTS {catalog_name}.db.example
        (
           id int,
           name string,
           age int,
           city string,
           __op string
        )
        USING iceberg
""")

In [21]:
# Column name / Spark type pairs for the sample DataFrame.
column_specs = [
    ("id", IntegerType()),
    ("name", StringType()),
    ("age", IntegerType()),
    ("city", StringType()),
    ("__op", StringType()),
]

# Every column is declared nullable.
schema = StructType(
    [StructField(column, spark_type, True) for column, spark_type in column_specs]
)

# Three sample rows, one tuple per row in schema order.
data = [
    (1, "Alice", 25, "New York", "r"),
    (2, "Joao", 30, "San Francisco", "r"),
    (3, "Flavio", 21, "Sao Paulo", "r"),
]

df = spark.createDataFrame(data, schema)

# Print the rows to confirm the DataFrame contents.
df.show()

+---+------+---+-------------+----+
| id|  name|age|         city|__op|
+---+------+---+-------------+----+
|  1| Alice| 25|     New York|   r|
|  2|  Joao| 30|San Francisco|   r|
|  3|Flavio| 21|    Sao Paulo|   r|
+---+------+---+-------------+----+



In [22]:
# Write `df` to the example table in overwrite mode.
# The target goes through `catalog_name` so the write lands in the catalog
# that is actually registered on the session.
(df.write
     .format("iceberg")
     .mode("overwrite")
     .save(f"{catalog_name}.db.example")
)

In [23]:
# Read the example table back through the configured catalog and print its
# rows to verify the write.
df2 = spark.read.format("iceberg").load(f"{catalog_name}.db.example")

df2.show()

+---+------+---+-------------+----+
| id|  name|age|         city|__op|
+---+------+---+-------------+----+
|  1| Alice| 25|     New York|   r|
|  2|  Joao| 30|San Francisco|   r|
|  3|Flavio| 21|    Sao Paulo|   r|
+---+------+---+-------------+----+



## Insert data

In [24]:
# Schema of the rows to append; it has the same columns as the example table.
schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True),
    StructField("city", StringType(), True),
    StructField("__op", StringType(), True)
])

# Two additional rows (ids 4 and 5).
data = [(4, "Alic2e", 252, "New York2", "r"),
        (5, "Flavio2", 212, "Sao Paulo2", "r")]

df_new = spark.createDataFrame(data, schema)

# Append the new rows to the example table. The target goes through
# `catalog_name` so it resolves against the catalog registered on the session.
(df_new.write
     .format("iceberg")
     .mode("append")
     .save(f"{catalog_name}.db.example")
)

In [25]:
# Show the table's `history` metadata table (when each snapshot was made
# current and its parent), addressed through the configured catalog.
spark.sql(f"""
SELECT * FROM {catalog_name}.db.example.history;
""").show()

+--------------------+-------------------+-------------------+-------------------+
|     made_current_at|        snapshot_id|          parent_id|is_current_ancestor|
+--------------------+-------------------+-------------------+-------------------+
|2024-04-26 22:17:...|6372009097357199076|               null|               true|
|2024-04-26 22:28:...|4004617722887816301|6372009097357199076|               true|
|2024-04-26 22:29:...|5616672445895419694|4004617722887816301|               true|
|2024-04-26 22:29:...|3648637280184859472|5616672445895419694|               true|
+--------------------+-------------------+-------------------+-------------------+



In [26]:
# Show the current contents of the example table, addressed through the
# configured catalog.
spark.sql(f"""
SELECT * FROM {catalog_name}.db.example;
""").show()

+---+-------+---+-------------+----+
| id|   name|age|         city|__op|
+---+-------+---+-------------+----+
|  4| Alic2e|252|    New York2|   r|
|  5|Flavio2|212|   Sao Paulo2|   r|
|  1|  Alice| 25|     New York|   r|
|  2|   Joao| 30|San Francisco|   r|
|  3| Flavio| 21|    Sao Paulo|   r|
+---+-------+---+-------------+----+



In [29]:
# Time travel (SQL): query the table as of its first snapshot.
# The snapshot id is looked up from the history table instead of being
# hard-coded, so the cell works against any warehouse/catalog and not only
# the one in which the original id was generated.
example_table_name = f"{catalog_name}.db.example"

oldest_history_row = spark.sql(f"""
SELECT snapshot_id FROM {example_table_name}.history ORDER BY made_current_at
""").first()
if oldest_history_row is None:
    raise ValueError(f"{example_table_name} has no snapshots to time travel to")
oldest_snapshot_id = oldest_history_row["snapshot_id"]

spark.sql(f"""
SELECT * FROM {example_table_name} FOR SYSTEM_VERSION AS OF {oldest_snapshot_id};
""").show()

+---+------+---+-------------+----+
| id|  name|age|         city|__op|
+---+------+---+-------------+----+
|  1| Alice| 25|     New York|   r|
|  2|  Joao| 30|San Francisco|   r|
|  3|Flavio| 21|    Sao Paulo|   r|
+---+------+---+-------------+----+



In [28]:
# Time travel (SQL): query the table as of its second snapshot.
# The id is taken from the history table rather than hard-coded; if the table
# has only one snapshot, that single snapshot is used.
example_table_name = f"{catalog_name}.db.example"

earliest_history_rows = spark.sql(f"""
SELECT snapshot_id FROM {example_table_name}.history ORDER BY made_current_at LIMIT 2
""").collect()
if not earliest_history_rows:
    raise ValueError(f"{example_table_name} has no snapshots to time travel to")
second_snapshot_id = earliest_history_rows[-1]["snapshot_id"]

spark.sql(f"""
SELECT * FROM {example_table_name} FOR SYSTEM_VERSION AS OF {second_snapshot_id};
""").show()

+---+-------+---+----------+----+
| id|   name|age|      city|__op|
+---+-------+---+----------+----+
|  4| Alic2e|252| New York2|   r|
|  5|Flavio2|212|Sao Paulo2|   r|
+---+-------+---+----------+----+



In [33]:
# Time travel (DataFrame reader): load the table as of its first snapshot via
# the `snapshot-id` read option. The id is looked up from the history table
# instead of being hard-coded, and the table is addressed through the
# configured catalog.
example_table_name = f"{catalog_name}.db.example"

oldest_history_row = spark.sql(f"""
SELECT snapshot_id FROM {example_table_name}.history ORDER BY made_current_at
""").first()
if oldest_history_row is None:
    raise ValueError(f"{example_table_name} has no snapshots to time travel to")

(spark.read
     .format("iceberg")
     .option("snapshot-id", oldest_history_row["snapshot_id"])
     .load(example_table_name)
     .show()
)

+---+------+---+-------------+----+
| id|  name|age|         city|__op|
+---+------+---+-------------+----+
|  1| Alice| 25|     New York|   r|
|  2|  Joao| 30|San Francisco|   r|
|  3|Flavio| 21|    Sao Paulo|   r|
+---+------+---+-------------+----+



In [19]:
# Show the table's `snapshots` metadata table (commit time, operation,
# manifest list and summary per snapshot), addressed through the configured
# catalog.
spark.sql(f"""
SELECT * FROM {catalog_name}.db.example.snapshots;
""").show()

+--------------------+-------------------+-------------------+---------+--------------------+--------------------+
|        committed_at|        snapshot_id|          parent_id|operation|       manifest_list|             summary|
+--------------------+-------------------+-------------------+---------+--------------------+--------------------+
|2024-04-26 22:17:...|6372009097357199076|               null|overwrite|/home/jovyan/ware...|{spark.app.id -> ...|
|2024-04-26 22:28:...|4004617722887816301|6372009097357199076|overwrite|/home/jovyan/ware...|{spark.app.id -> ...|
+--------------------+-------------------+-------------------+---------+--------------------+--------------------+



In [31]:
# Show the table's `metadata_log_entries` metadata table (one row per
# metadata file with the latest snapshot/schema/sequence number it records),
# addressed through the configured catalog.
spark.sql(f"""
SELECT * FROM {catalog_name}.db.example.metadata_log_entries;
""").show()

+--------------------+--------------------+-------------------+----------------+----------------------+
|           timestamp|                file| latest_snapshot_id|latest_schema_id|latest_sequence_number|
+--------------------+--------------------+-------------------+----------------+----------------------+
|2024-04-26 22:11:...|/home/jovyan/ware...|               null|            null|                  null|
|2024-04-26 22:17:...|/home/jovyan/ware...|6372009097357199076|               0|                     0|
|2024-04-26 22:28:...|/home/jovyan/ware...|4004617722887816301|               0|                     0|
|2024-04-26 22:29:...|/home/jovyan/ware...|5616672445895419694|               0|                     0|
|2024-04-26 22:29:...|/home/jovyan/ware...|3648637280184859472|               0|                     0|
+--------------------+--------------------+-------------------+----------------+----------------------+



In [2]:
# Column name / Spark type pairs; every column is declared nullable below.
field_specs = [
    ("id", IntegerType()),
    ("name", StringType()),
    ("age", IntegerType()),
    ("city", StringType()),
    ("__op", StringType()),
]
schema = StructType(
    [StructField(field_name, field_type, True) for field_name, field_type in field_specs]
)

# Three sample rows, one tuple per row in schema order.
data = [
    (1, "Alice", 25, "New York", "r"),
    (2, "Joao", 30, "San Francisco", "r"),
    (3, "Flavio", 21, "Sao Paulo", "r"),
]

df = spark.createDataFrame(data, schema)

# Save the rows as an Iceberg table in the configured catalog, using
# overwrite mode.
target_table = f"{catalog_name}.db2.tabela_iceberg"
iceberg_writer = df.write.format("iceberg").mode("overwrite")
iceberg_writer.saveAsTable(target_table)

In [5]:
# Load the table written through the configured catalog and print its
# column names, types and nullability.
source_table = f"{catalog_name}.db2.tabela_iceberg"
iceberg_reader = spark.read.format("iceberg")
df2 = iceberg_reader.load(source_table)

df2.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- city: string (nullable = true)
 |-- __op: string (nullable = true)



----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 37552)
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/socketserver.py", line 316, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/opt/conda/lib/python3.10/socketserver.py", line 347, in process_request
    self.finish_request(request, client_address)
  File "/opt/conda/lib/python3.10/socketserver.py", line 360, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/opt/conda/lib/python3.10/socketserver.py", line 747, in __init__
    self.handle()
  File "/usr/local/spark/python/pyspark/accumulators.py", line 281, in handle
    poll(accum_updates)
  File "/usr/local/spark/python/pyspark/accumulators.py", line 253, in poll
    if func():
  File "/usr/local/spark/python/pyspark/accumulators.py", line 257, in accum_updates
    num_updates = read_int(self.rfile)
  File "/usr/local/spark/python/

In [None]:
# Further reading (Apache Iceberg documentation):
# - Structured Streaming: https://iceberg.apache.org/docs/latest/spark-structured-streaming/
# - Writes and updates: https://iceberg.apache.org/docs/latest/spark-writes/
# - Spark DDL: https://iceberg.apache.org/docs/latest/spark-ddl/
# - Branching and tagging: https://iceberg.apache.org/docs/latest/branching/