In [1]:
import os
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [2]:
# The Iceberg warehouse is a "warehouse" folder directly under the directory
# the notebook kernel was started in.
current_dir = os.getcwd()
dir_warehouse = current_dir + "/warehouse"

In [3]:
# Build (or reuse) the Spark session used by every cell below.
# It registers an Iceberg catalog named `hadoop_catalog`, backed by the local
# filesystem warehouse at `dir_warehouse`, and makes it the default catalog.
spark = (
    SparkSession.builder
    .appName("IcebergWithSpark")
    # Jars fetched from Maven at startup: the Iceberg runtime artifact for
    # Spark 3.3 / Scala 2.12, plus the PostgreSQL JDBC driver.
    .config(
        "spark.jars.packages",
        "org.apache.iceberg:iceberg-spark-runtime-3.3_2.12:1.6.1,org.postgresql:postgresql:42.3.1",
    )
    # Enables Iceberg's SQL extensions in this session.
    .config(
        "spark.sql.extensions",
        "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
    )
    # Catalog definition: name -> implementation, type and warehouse location.
    .config("spark.sql.catalog.hadoop_catalog", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.hadoop_catalog.type", "hadoop")
    .config("spark.sql.catalog.hadoop_catalog.warehouse", dir_warehouse)
    # The Spark setting is `spark.sql.defaultCatalog`. The previous key,
    # `spark.sql.default.catalog`, is not a recognized option and was silently
    # ignored, so the default catalog stayed `spark_catalog`.
    .config("spark.sql.defaultCatalog", "hadoop_catalog")
    .getOrCreate()
)

25/01/09 21:24:17 WARN Utils: Your hostname, dell resolves to a loopback address: 127.0.1.1; using 192.168.15.6 instead (on interface wlp0s20f3)
25/01/09 21:24:17 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
:: loading settings :: url = jar:file:/home/apolo/anaconda3/envs/pyspark/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/apolo/.ivy2/cache
The jars for the packages stored in: /home/apolo/.ivy2/jars
org.apache.iceberg#iceberg-spark-runtime-3.3_2.12 added as a dependency
org.postgresql#postgresql added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-92254366-de4a-42d7-922e-d30f11af58ca;1.0
	confs: [default]
	found org.apache.iceberg#iceberg-spark-runtime-3.3_2.12;1.6.1 in central
	found org.postgresql#postgresql;42.3.1 in central
	found org.checkerframework#checker-qual;3.5.0 in central
downloading https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-3.3_2.12/1.6.1/iceberg-spark-runtime-3.3_2.12-1.6.1.jar ...
	[SUCCESSFUL ] org.apache.iceberg#iceberg-spark-runtime-3.3_2.12;1.6.1!iceberg-spark-runtime-3.3_2.12.jar (4175ms)
downloading https://repo1.maven.org/maven2/org/postgresql/postgresql/42.3.1/postgresql-42.3.1.jar ...
	[SUCCESSFUL ] org.postgresql#postgresql;42.3.1!postgresql.jar (281ms)
downloading https://repo1.mave

25/01/09 21:24:24 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [4]:
# Optional setup, left disabled: unpack the vendas_iceberg archive and copy
# its contents into ./warehouse/default/vendas_iceberg. Uncomment to run.
# !unzip ../Iceberg/vendas_iceberg.zip -d ./
# !mkdir -p ./warehouse/default/vendas_iceberg
# !cp -r ../Iceberg/vendas_iceberg/* ./warehouse/default/vendas_iceberg/

In [5]:
# List the databases exposed through Spark's catalog API.
spark.catalog.listDatabases()

[Database(name='default', description='default database', locationUri='file:/home/apolo/Dropbox/programacao/Udemy/2024/engenharia_de_dados_com_apache_iceberg_e_spark/00-scripts_apolo/spark-warehouse')]

In [6]:
# List the tables the catalog API reports for the `default` database.
spark.catalog.listTables(dbName="default")

[]

In [7]:
# Start from a clean slate: drop the table if a previous run left it behind.
drop_vendas_sql = "DROP TABLE IF EXISTS hadoop_catalog.default.vendas"
spark.sql(drop_vendas_sql)

# Create the `vendas` (sales) table in the catalog, stored as an Iceberg table.
create_vendas_sql = """
CREATE TABLE hadoop_catalog.default.vendas (
    id INT,
    produto STRING,
    quantidade INT,
    preco DOUBLE,
    data_venda DATE
)
USING iceberg
"""
spark.sql(create_vendas_sql)

DataFrame[]

In [8]:
# Seed rows for the vendas table; data_venda starts out as an ISO date string.
columns = ["id", "produto", "quantidade", "preco", "data_venda"]
data = [
    (1, "Produto A", 10, 15.5, "2024-11-01"),
    (2, "Produto B", 5, 22.0, "2024-11-02"),
    (3, "Produto C", 8, 30.0, "2024-11-03"),
]

# Build the DataFrame in one chain, parsing data_venda into a real DATE so it
# matches the table column type.
df = (
    spark.createDataFrame(data, columns)
    .withColumn("data_venda", F.to_date("data_venda", "yyyy-MM-dd"))
)

# Append the rows to the vendas table.
df.writeTo("hadoop_catalog.default.vendas").append()

                                                                                

In [9]:
# Check the data now stored in the table
spark.sql("SELECT * FROM hadoop_catalog.default.vendas").show()

+---+---------+----------+-----+----------+
| id|  produto|quantidade|preco|data_venda|
+---+---------+----------+-----+----------+
|  1|Produto A|        10| 15.5|2024-11-01|
|  2|Produto B|         5| 22.0|2024-11-02|
|  3|Produto C|         8| 30.0|2024-11-03|
+---+---------+----------+-----+----------+



### Evolução do Schema

In [10]:
# Schema evolution: add a new `desconto` (discount) column to the table.
add_desconto_sql = """
ALTER TABLE hadoop_catalog.default.vendas
ADD COLUMN desconto DOUBLE
"""
spark.sql(add_desconto_sql)

DataFrame[]

In [11]:
# Show the updated schema
spark.sql("DESCRIBE hadoop_catalog.default.vendas").show()

+---------------+---------+-------+
|       col_name|data_type|comment|
+---------------+---------+-------+
|             id|      int|       |
|        produto|   string|       |
|     quantidade|      int|       |
|          preco|   double|       |
|     data_venda|     date|       |
|       desconto|   double|       |
|               |         |       |
| # Partitioning|         |       |
|Not partitioned|         |       |
+---------------+---------+-------+



In [12]:
# Rows that populate the newly added `desconto` column as well.
columns_new = ["id", "produto", "quantidade", "preco", "data_venda", "desconto"]
data_new = [
    (4, "Produto D", 12, 25.0, "2024-11-04", 2.5),
    (5, "Produto E", 7, 18.0, "2024-11-05", 1.0),
]

# Build the DataFrame in one chain, parsing data_venda into a real DATE.
df_new = (
    spark.createDataFrame(data_new, columns_new)
    .withColumn("data_venda", F.to_date("data_venda", "yyyy-MM-dd"))
)

# Append the rows to the vendas table.
df_new.writeTo("hadoop_catalog.default.vendas").append()

In [14]:
# Query the table again; in the recorded output, rows written before the
# schema change show null for desconto.
spark.sql("SELECT * FROM hadoop_catalog.default.vendas").show()

+---+---------+----------+-----+----------+--------+
| id|  produto|quantidade|preco|data_venda|desconto|
+---+---------+----------+-----+----------+--------+
|  4|Produto D|        12| 25.0|2024-11-04|     2.5|
|  1|Produto A|        10| 15.5|2024-11-01|    null|
|  5|Produto E|         7| 18.0|2024-11-05|     1.0|
|  2|Produto B|         5| 22.0|2024-11-02|    null|
|  3|Produto C|         8| 30.0|2024-11-03|    null|
+---+---------+----------+-----+----------+--------+



In [33]:
# Load the table's `snapshots` metadata table, oldest commit first, and show
# which columns it exposes.
snapshots_query = "SELECT * FROM hadoop_catalog.default.vendas.snapshots order by committed_at asc"
snapshots_df = spark.sql(snapshots_query)
print(snapshots_df.columns)

['committed_at', 'snapshot_id', 'parent_id', 'operation', 'manifest_list', 'summary']


In [34]:
# One row per snapshot: commit time, snapshot id and operation, untruncated.
snapshots_df.select("committed_at", "snapshot_id", "operation").show(truncate=False)

+-----------------------+-------------------+---------+
|committed_at           |snapshot_id        |operation|
+-----------------------+-------------------+---------+
|2025-01-09 21:24:44.304|4794066217950379325|append   |
|2025-01-09 21:25:31.192|8905770237618997467|append   |
+-----------------------+-------------------+---------+



In [35]:
# Pull the id and summary of every snapshot to the driver.
record = snapshots_df.select("snapshot_id", "summary").collect()

# Position of the snapshot to inspect in the collected list.
idx = 1
selected_snapshot = record[idx]
print(selected_snapshot.snapshot_id)
selected_snapshot.summary

8905770237618997467


{'engine-version': '3.3.0',
 'added-data-files': '2',
 'total-equality-deletes': '0',
 'app-id': 'local-1736468665595',
 'added-records': '2',
 'total-records': '5',
 'spark.app.id': 'local-1736468665595',
 'changed-partition-count': '1',
 'engine-name': 'spark',
 'total-position-deletes': '0',
 'added-files-size': '3380',
 'total-delete-files': '0',
 'iceberg-version': 'Apache Iceberg 1.6.1 (commit 8e9d59d299be42b0bca9461457cd1e95dbaad086)',
 'total-files-size': '7654',
 'total-data-files': '5'}