In [1]:
import os
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [2]:
# The Iceberg warehouse lives in a "warehouse" folder under the directory the
# notebook is launched from.
current_dir = os.getcwd()
dir_warehouse = "/".join((current_dir, "warehouse"))

In [3]:
# Build (or reuse) a Spark session with the Iceberg runtime and SQL extensions
# loaded, and register a Hadoop-type Iceberg catalog named "hadoop_catalog"
# whose warehouse is the local `dir_warehouse` directory.
spark = (
    SparkSession.builder
    .appName("IcebergWithSpark")
    # Iceberg runtime (Spark 3.3 / Scala 2.12 build) plus the PostgreSQL JDBC driver.
    .config("spark.jars.packages", "org.apache.iceberg:iceberg-spark-runtime-3.3_2.12:1.6.1,org.postgresql:postgresql:42.3.1")
    # Enables Iceberg SQL extensions such as the CALL <catalog>.system.* procedures.
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    .config("spark.sql.catalog.hadoop_catalog", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.hadoop_catalog.type", "hadoop")
    .config("spark.sql.catalog.hadoop_catalog.warehouse", dir_warehouse)
    # Spark's option for the default catalog is `spark.sql.defaultCatalog`.
    # The previously used key `spark.sql.default.catalog` is not a Spark
    # option, so the default catalog was never actually switched.
    .config("spark.sql.defaultCatalog", "hadoop_catalog")
    .getOrCreate()
)

25/01/09 21:38:45 WARN Utils: Your hostname, dell resolves to a loopback address: 127.0.1.1; using 192.168.15.6 instead (on interface wlp0s20f3)
25/01/09 21:38:45 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
:: loading settings :: url = jar:file:/home/apolo/anaconda3/envs/pyspark/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/apolo/.ivy2/cache
The jars for the packages stored in: /home/apolo/.ivy2/jars
org.apache.iceberg#iceberg-spark-runtime-3.3_2.12 added as a dependency
org.postgresql#postgresql added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-9a5f6236-9534-4ff5-84e2-69918e260623;1.0
	confs: [default]
	found org.apache.iceberg#iceberg-spark-runtime-3.3_2.12;1.6.1 in central
	found org.postgresql#postgresql;42.3.1 in central
	found org.checkerframework#checker-qual;3.5.0 in central
:: resolution report :: resolve 134ms :: artifacts dl 4ms
	:: modules in use:
	org.apache.iceberg#iceberg-spark-runtime-3.3_2.12;1.6.1 from central in [default]
	org.checkerframework#checker-qual;3.5.0 from central in [default]
	org.postgresql#postgresql;42.3.1 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| se

25/01/09 21:38:46 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


25/01/09 21:38:46 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [4]:
# !unzip ../Iceberg/vendas_iceberg.zip -d ./
# !mkdir -p ./warehouse/default/vendas_iceberg
# !cp -r ../Iceberg/vendas_iceberg/* ./warehouse/default/vendas_iceberg/

In [5]:
# Drop the table if it already exists so the cell can be re-run from scratch.
sql_drop_vendas = "DROP TABLE IF EXISTS hadoop_catalog.default.vendas"
spark.sql(sql_drop_vendas)

# Create the "vendas" (sales) Iceberg table.
sql_create_vendas = """
CREATE TABLE hadoop_catalog.default.vendas (
    id INT,
    produto STRING,
    quantidade INT,
    preco DOUBLE,
    data_venda DATE
)
USING iceberg
"""
spark.sql(sql_create_vendas)

DataFrame[]

In [6]:
# Inserir dados fracionados para criar vários arquivos

def inserir_dados(pequeno_lote):
    """Append one small batch of sales rows to hadoop_catalog.default.vendas.

    Args:
        pequeno_lote: rows passed to ``spark.createDataFrame``; each row holds
            (id, produto, quantidade, preco, data_venda), with ``data_venda``
            given as a "yyyy-MM-dd" string.

    Each call issues its own append to the table. It relies on the
    module-level ``spark`` session and the ``F`` functions alias.
    """
    colunas = ["id", "produto", "quantidade", "preco", "data_venda"]
    (
        spark.createDataFrame(pequeno_lote, colunas)
        # The date arrives as text; convert it to a date column before writing.
        .withColumn("data_venda", F.to_date(F.col("data_venda"), "yyyy-MM-dd"))
        .writeTo("hadoop_catalog.default.vendas")
        .append()
    )


# Insert 10 single-row batches, one append per batch, for days 01..10 of
# November 2024.
for i in range(1, 11):
    produto = f"Produto {i}"
    data_venda = f"2024-11-{i:02d}"
    data = [(i, produto, i * 2, i * 10.0, data_venda)]
    inserir_dados(data)

                                                                                

In [8]:
# Count records per data file before compaction: grouping by
# input_file_name() yields one result row per file that was read.
print("Antes da compactação:")
sql_arquivos_antes = """
SELECT
    COUNT(*) AS registros,
    input_file_name() AS arquivo
FROM hadoop_catalog.default.vendas
GROUP BY input_file_name()
"""
df_files_before = spark.sql(sql_arquivos_antes)
df_files_before.show(n=100, truncate=False)

# One row per file, so the row count is the number of data files.
num_arquivos_antes = df_files_before.count()
print(f"Número total de arquivos: {num_arquivos_antes}")

Antes da compactação:
+---------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|registros|arquivo                                                                                                                                                                                                |
+---------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|1        |/home/apolo/Dropbox/programacao/Udemy/2024/engenharia_de_dados_com_apache_iceberg_e_spark/00-scripts_apolo/warehouse/default/vendas/data/00015-79-2d80149b-1984-4028-912b-dd4b9671a311-0-00001.parquet |
|1        |/home/apolo/Dropbox/programacao/Udemy/2024/engenharia_de_dados_com_apache_iceberg_e_spark/00-scripts_apolo/warehouse/de

In [9]:
# Maximum number of records per written file (Spark session setting).
# NOTE(review): this is a Spark file-writer option; Iceberg's rewrite_data_files
# is documented to size its output via options such as target-file-size-bytes.
# Confirm whether this setting has any effect on the compaction below.
spark.conf.set("spark.sql.files.maxRecordsPerFile", 1000)

# Compact the table's data files with the 'rewrite_data_files' procedure,
# using the procedure's default options.
sql_compactacao = """
CALL hadoop_catalog.system.rewrite_data_files(
    table => 'default.vendas'
)
"""
spark.sql(sql_compactacao)

ANTLR Tool version 4.9.3 used for code generation does not match the current runtime version 4.8ANTLR Runtime version 4.9.3 used for parser compilation does not match the current runtime version 4.8ANTLR Tool version 4.9.3 used for code generation does not match the current runtime version 4.8ANTLR Runtime version 4.9.3 used for parser compilation does not match the current runtime version 4.8

DataFrame[rewritten_data_files_count: int, added_data_files_count: int, rewritten_bytes_count: bigint]

In [10]:
# Count records per data file after compaction: grouping by
# input_file_name() yields one result row per file that was read.
print("Após a compactação:")
sql_arquivos_depois = """
SELECT
    COUNT(*) AS registros,
    input_file_name() AS arquivo
FROM hadoop_catalog.default.vendas
GROUP BY input_file_name()
"""
df_files_after = spark.sql(sql_arquivos_depois)
df_files_after.show(truncate=False)

# One row per file, so the row count is the number of data files.
num_arquivos_depois = df_files_after.count()
print(f"Número total de arquivos após a compactação: {num_arquivos_depois}")

Após a compactação:
+---------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|registros|arquivo                                                                                                                                                                                                |
+---------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|10       |/home/apolo/Dropbox/programacao/Udemy/2024/engenharia_de_dados_com_apache_iceberg_e_spark/00-scripts_apolo/warehouse/default/vendas/data/00000-164-fc415516-8d39-4154-9c16-bb1cde2ce0c7-0-00001.parquet|
+---------+-------------------------------------------------------------------------------------------------------------------------

In [11]:
# Define the snapshot retention: keep only the most recent snapshot.
#
# expire_snapshots only removes snapshots older than `older_than`, which
# defaults to 5 days ago. Every snapshot in this notebook was created moments
# ago, so without an explicit cutoff nothing would be expired and
# `retain_last => 1` would have no visible effect. Passing the current time as
# the cutoff makes `retain_last` the effective limit.
# NOTE(review): the literal is built from local wall-clock time and is
# interpreted in the Spark session time zone; these normally match on a local
# setup, but confirm if the session time zone is overridden.
expire_cutoff = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
spark.sql(f"""
CALL hadoop_catalog.system.expire_snapshots(
    table => 'default.vendas',
    older_than => TIMESTAMP '{expire_cutoff}',
    retain_last => 1
)
""")

DataFrame[deleted_data_files_count: bigint, deleted_position_delete_files_count: bigint, deleted_equality_delete_files_count: bigint, deleted_manifest_files_count: bigint, deleted_manifest_lists_count: bigint, deleted_statistics_files_count: bigint]