In [1]:
import os
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [2]:
# Resolve the Iceberg warehouse directory: a "warehouse" folder located
# directly under the directory the notebook is running from.
current_dir = os.getcwd()
dir_warehouse = current_dir + "/warehouse"

In [3]:
# Build (or reuse) the Spark session used by the whole notebook. It registers
# an Iceberg catalog named "hadoop_catalog" of type "hadoop" whose warehouse
# lives at `dir_warehouse`.
spark = (
    SparkSession.builder
    .appName("IcebergWithSpark")
    # Iceberg runtime for Spark 3.3 / Scala 2.12 plus the PostgreSQL JDBC driver.
    .config("spark.jars.packages", "org.apache.iceberg:iceberg-spark-runtime-3.3_2.12:1.6.1,org.postgresql:postgresql:42.3.1")
    # Enables the Iceberg SQL extensions.
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    .config("spark.sql.catalog.hadoop_catalog", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.hadoop_catalog.type", "hadoop")
    .config("spark.sql.catalog.hadoop_catalog.warehouse", dir_warehouse)
    # The Spark setting that selects the default catalog is
    # `spark.sql.defaultCatalog`; the previously used key
    # `spark.sql.default.catalog` is not an option Spark reads, so the
    # default catalog was never actually switched to hadoop_catalog.
    .config("spark.sql.defaultCatalog", "hadoop_catalog")
    .getOrCreate()
)

25/01/09 21:50:18 WARN Utils: Your hostname, dell resolves to a loopback address: 127.0.1.1; using 192.168.15.6 instead (on interface wlp0s20f3)
25/01/09 21:50:18 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
:: loading settings :: url = jar:file:/home/apolo/anaconda3/envs/pyspark/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/apolo/.ivy2/cache
The jars for the packages stored in: /home/apolo/.ivy2/jars
org.apache.iceberg#iceberg-spark-runtime-3.3_2.12 added as a dependency
org.postgresql#postgresql added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-249f2e58-debb-4d44-aaaa-87adec112767;1.0
	confs: [default]
	found org.apache.iceberg#iceberg-spark-runtime-3.3_2.12;1.6.1 in central
	found org.postgresql#postgresql;42.3.1 in central
	found org.checkerframework#checker-qual;3.5.0 in central
:: resolution report :: resolve 89ms :: artifacts dl 4ms
	:: modules in use:
	org.apache.iceberg#iceberg-spark-runtime-3.3_2.12;1.6.1 from central in [default]
	org.checkerframework#checker-qual;3.5.0 from central in [default]
	org.postgresql#postgresql;42.3.1 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| sea

25/01/09 21:50:19 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


25/01/09 21:50:20 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [4]:
# Optional one-off setup, left disabled: unpack the vendas_iceberg archive and
# copy its contents into ./warehouse/default/vendas_iceberg.
# !unzip ../Iceberg/vendas_iceberg.zip -d ./
# !mkdir -p ./warehouse/default/vendas_iceberg
# !cp -r ../Iceberg/vendas_iceberg/* ./warehouse/default/vendas_iceberg/

In [5]:
# Create the vendas (sales) table. Any existing copy is dropped first so the
# table always starts out empty with the schema declared below.
drop_vendas_sql = "DROP TABLE IF EXISTS hadoop_catalog.default.vendas"
create_vendas_sql = """
CREATE TABLE hadoop_catalog.default.vendas (
    id INT,
    produto STRING,
    quantidade INT,
    preco DOUBLE,
    data_venda DATE
)
USING iceberg
"""

spark.sql(drop_vendas_sql)
spark.sql(create_vendas_sql)

DataFrame[]

In [6]:
# Load three sample rows into the vendas table.
columns = ["id", "produto", "quantidade", "preco", "data_venda"]
data = [
    (1, "Produto A", 10, 15.5, "2024-11-01"),
    (2, "Produto B", 5, 22.0, "2024-11-02"),
    (3, "Produto C", 8, 30.0, "2024-11-03"),
]

# data_venda arrives as a string; parse it into a date before writing.
df = (
    spark.createDataFrame(data, columns)
    .withColumn("data_venda", F.to_date("data_venda", "yyyy-MM-dd"))
)

df.writeTo("hadoop_catalog.default.vendas").append()

                                                                                

In [7]:
# Display the rows currently stored in the vendas table
vendas_rows = spark.sql("SELECT * FROM hadoop_catalog.default.vendas")
vendas_rows.show()

+---+---------+----------+-----+----------+
| id|  produto|quantidade|preco|data_venda|
+---+---------+----------+-----+----------+
|  1|Produto A|        10| 15.5|2024-11-01|
|  2|Produto B|         5| 22.0|2024-11-02|
|  3|Produto C|         8| 30.0|2024-11-03|
+---+---------+----------+-----+----------+



### Uso de Metadados

In [8]:
# 1. List the schemas (namespaces) available in the catalog
print("Schemas no catálogo:")
schemas_df = spark.sql("SHOW SCHEMAS IN hadoop_catalog")
schemas_df.show()

Schemas no catálogo:
+---------+
|namespace|
+---------+
|  default|
+---------+



In [9]:
# 2. List the tables that live in the default schema
print("Tabelas no schema default:")
default_tables_df = spark.sql("SHOW TABLES IN hadoop_catalog.default")
default_tables_df.show()

Tabelas no schema default:
+---------+--------------------+-----------+
|namespace|           tableName|isTemporary|
+---------+--------------------+-----------+
|  default|      vendas_iceberg|      false|
|  default|  vendas_partitioned|      false|
|  default|    vendas_versioned|      false|
|  default|              vendas|      false|
|  default|clientes_partitioned|      false|
+---------+--------------------+-----------+



In [10]:
# 3. Column names and types of the vendas table
print("Schema da tabela vendas:")
vendas_schema_df = spark.sql("DESCRIBE TABLE hadoop_catalog.default.vendas")
vendas_schema_df.show()

Schema da tabela vendas:
+---------------+---------+-------+
|       col_name|data_type|comment|
+---------------+---------+-------+
|             id|      int|       |
|        produto|   string|       |
|     quantidade|      int|       |
|          preco|   double|       |
|     data_venda|     date|       |
|               |         |       |
| # Partitioning|         |       |
|Not partitioned|         |       |
+---------------+---------+-------+



In [11]:
# 4. Extended description of the vendas table, printed without truncating
#    the cell contents
print("Propriedades da tabela vendas:")
vendas_details_df = spark.sql("DESCRIBE EXTENDED hadoop_catalog.default.vendas")
vendas_details_df.show(truncate=False)

Propriedades da tabela vendas:
+----------------------------+-----------------------------------------------------------------------------------------------------------------------------------+-------+
|col_name                    |data_type                                                                                                                          |comment|
+----------------------------+-----------------------------------------------------------------------------------------------------------------------------------+-------+
|id                          |int                                                                                                                                |       |
|produto                     |string                                                                                                                             |       |
|quantidade                  |int                                                                                 

In [None]:
# Disabled loop variant: it would run the same SELECT * against each metadata
# table named in the list and print a header line before each result.
# # 5. Listing the metadata tables for vendas
# metadata_tables = ["snapshots", "manifests", "partitions", "files", "history", "refs"]

# for table in metadata_tables:
#     print(f"\nMetadados da tabela '{table}':")
#     spark.sql(f"SELECT * FROM hadoop_catalog.default.vendas.{table}").show(truncate=False)

In [19]:
# Query the `snapshots` metadata table of vendas and print it untruncated
snapshots_query = "SELECT * FROM hadoop_catalog.default.vendas.snapshots"
spark.sql(snapshots_query).show(truncate=False)

+-----------------------+-------------------+---------+---------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|committed_at           |snapshot_id        |parent_id|operation|manifest_list                                                                                                                                                                                            

In [21]:
# Query the `manifests` metadata table of vendas and print it untruncated
manifests_query = "SELECT * FROM hadoop_catalog.default.vendas.manifests"
spark.sql(manifests_query).show(truncate=False)

+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------+-----------------+-------------------+----------------------+-------------------------+------------------------+------------------------+---------------------------+--------------------------+-------------------+
|content|path                                                                                                                                                                                     |length|partition_spec_id|added_snapshot_id  |added_data_files_count|existing_data_files_count|deleted_data_files_count|added_delete_files_count|existing_delete_files_count|deleted_delete_files_count|partition_summaries|
+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------

In [22]:
# Query the `partitions` metadata table of vendas and print it untruncated
partitions_query = "SELECT * FROM hadoop_catalog.default.vendas.partitions"
spark.sql(partitions_query).show(truncate=False)

+------------+----------+-----------------------------+----------------------------+--------------------------+----------------------------+--------------------------+-----------------------+------------------------+
|record_count|file_count|total_data_file_size_in_bytes|position_delete_record_count|position_delete_file_count|equality_delete_record_count|equality_delete_file_count|last_updated_at        |last_updated_snapshot_id|
+------------+----------+-----------------------------+----------------------------+--------------------------+----------------------------+--------------------------+-----------------------+------------------------+
|3           |3         |4274                         |0                           |0                         |0                           |0                         |2025-01-09 21:50:27.386|7874342425608235472     |
+------------+----------+-----------------------------+----------------------------+--------------------------+---------------------

In [23]:
# Query the `files` metadata table of vendas and print it untruncated
files_query = "SELECT * FROM hadoop_catalog.default.vendas.files"
spark.sql(files_query).show(truncate=False)

+-------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------+-------+------------+------------------+---------------------------------------------+----------------------------------------+----------------------------------------+----------------+-----------------------------------------------------------------+-----------------------------------------------------------------+------------+-------------+------------+-------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------+
|content|file_path                                                                                                                                                                                             |file_format|spec_id|record_count|file_si

In [24]:
# Query the `history` metadata table of vendas and print it untruncated
history_query = "SELECT * FROM hadoop_catalog.default.vendas.history"
spark.sql(history_query).show(truncate=False)

+-----------------------+-------------------+---------+-------------------+
|made_current_at        |snapshot_id        |parent_id|is_current_ancestor|
+-----------------------+-------------------+---------+-------------------+
|2025-01-09 21:50:27.386|7874342425608235472|null     |true               |
+-----------------------+-------------------+---------+-------------------+



In [25]:
# Query the `refs` metadata table of vendas and print it untruncated
refs_query = "SELECT * FROM hadoop_catalog.default.vendas.refs"
spark.sql(refs_query).show(truncate=False)

+----+------+-------------------+-----------------------+---------------------+----------------------+
|name|type  |snapshot_id        |max_reference_age_in_ms|min_snapshots_to_keep|max_snapshot_age_in_ms|
+----+------+-------------------+-----------------------+---------------------+----------------------+
|main|BRANCH|7874342425608235472|null                   |null                 |null                  |
+----+------+-------------------+-----------------------+---------------------+----------------------+



In [13]:
# 6. List the catalogs known to this Spark session
print("Catálogos:")
catalogs_df = spark.sql("SHOW CATALOGS")
catalogs_df.show()

Catálogos:
+--------------+
|       catalog|
+--------------+
|hadoop_catalog|
+--------------+



In [14]:
# 7. Print every Spark configuration entry whose key mentions hadoop_catalog
print("Visualizando hadoop_catalog:")
catalog_conf = spark.sparkContext.getConf().getAll()

for key, value in catalog_conf:
    # Skip settings that do not belong to the hadoop_catalog definition.
    if "hadoop_catalog" not in key:
        continue
    print(f"{key}: {value}")

Visualizando hadoop_catalog:
spark.sql.catalog.hadoop_catalog.warehouse: /home/apolo/Dropbox/programacao/Udemy/2024/engenharia_de_dados_com_apache_iceberg_e_spark/00-scripts_apolo/warehouse
spark.sql.catalog.hadoop_catalog: org.apache.iceberg.spark.SparkCatalog
spark.sql.catalog.hadoop_catalog.type: hadoop


In [15]:
# 8. Add a new schema named sales (no-op when it already exists)
create_sales_schema_sql = "CREATE SCHEMA IF NOT EXISTS hadoop_catalog.sales"
spark.sql(create_sales_schema_sql)

DataFrame[]

In [16]:
# List the schemas again, now that sales has been created
print("Schemas no hadoop_catalog após adicionar sales:")

schemas_after_sales_df = spark.sql("SHOW SCHEMAS IN hadoop_catalog")
schemas_after_sales_df.show()

Schemas no hadoop_catalog após adicionar sales:
+---------+
|namespace|
+---------+
|    sales|
|  default|
+---------+



In [17]:
# 9. Create the orders table inside the sales schema.
# IF NOT EXISTS keeps this cell re-runnable: the warehouse persists on disk,
# so a plain CREATE TABLE fails with "table already exists" on any run after
# the first one.
spark.sql("""
CREATE TABLE IF NOT EXISTS hadoop_catalog.sales.orders (
    order_id INT,
    customer_id INT,
    amount DOUBLE,
    order_date DATE
)
USING iceberg
""")

DataFrame[]

In [18]:
# List the tables that live in the sales schema
print("Tabelas no schema 'sales':")

sales_tables_df = spark.sql("SHOW TABLES IN hadoop_catalog.sales")
sales_tables_df.show()

Tabelas no schema 'sales':
+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
|    sales|   orders|      false|
+---------+---------+-----------+

