In [1]:
from pyspark.sql import SparkSession
# Build (or reuse) the Spark session with the Iceberg runtime jar and SQL extensions.
# NOTE(review): the catalog registered below is named `local`, but the rest of this
# notebook addresses a catalog named `iceberg` (`USE iceberg`, `iceberg.bronze.vendas`).
# Presumably `iceberg` is configured outside this notebook (e.g. spark-defaults.conf),
# which would make the `local` settings unused — confirm before relying on them.
spark = (
    SparkSession.builder
    .appName('dataincode')
    # Iceberg Spark runtime jar shipped with the image
    .config("spark.jars", "/opt/spark/jars/iceberg-spark-runtime-3.5_2.12-1.6.0.jar")
    # Enables Iceberg's SQL extensions
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    # Registers an Iceberg catalog under the name `local`
    .config("spark.sql.catalog.local", "org.apache.iceberg.spark.SparkCatalog")
    # NOTE(review): this sets the type on `spark_catalog`, not on `local` — confirm intent
    .config("spark.sql.catalog.spark_catalog.type", "hive")
    # Warehouse location for the `local` catalog
    .config("spark.sql.catalog.local.warehouse", "s3a://datalake/iceberg")
    .getOrCreate()
)

# Lower the log level from WARN to ERROR to keep cell outputs readable
spark.sparkContext.setLogLevel("ERROR")

24/10/25 19:53:57 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [2]:
# Importar funções 
from IPython.display import display, HTML

In [3]:
# Execute the helper notebook with IPython's %run magic so its definitions are
# available to the cells below.
# NOTE(review): presumably this is where `create_dataframe` is defined — confirm.
%run ./Includes/Utils.ipynb

In [4]:
# Execute the datasets notebook with IPython's %run magic so its definitions are
# available to the cells below.
# NOTE(review): presumably this provides `lista_amostras`, `columns_schema`,
# `init_data` and `feb_data`, which later cells use — confirm.
%run ./Includes/Datasets.ipynb

## Criar DataFrames e escrever tabelas no catálogo

In [7]:
# Quick start: print `lista_amostras` (presumably the names of the sample objects
# loaded by the includes — verify against Datasets.ipynb).
# To read the helper's documentation, uncomment one of the commented lines below.

# help(create_dataframe)
print(lista_amostras)
# create_dataframe?

['columns_schema', 'init_data', 'feb_data', 'mar_data', 'apr_data']


In [23]:
# Build the DataFrame for the initial batch from the sample schema and rows.
# NOTE(review): `create_dataframe`, `columns_schema` and `init_data` are not defined in
# this notebook — presumably they come from the %run includes; confirm.
init_data_df = create_dataframe(columns_schema, init_data)

In [9]:
# Preview up to 5 rows of the initial batch as a plain-text table.
init_data_df.show(5)

                                                                                

+------------+----------+-----------+----------+----------+----------+---------------+---------+
|order_number|order_date|qty_ordered|unit_price|    status|product_id|product_line_id|  country|
+------------+----------+-----------+----------+----------+----------+---------------+---------+
|       10168|2024-01-23|          5|    98.115|  Disputed|  S10_1949|           1002|   France|
|       10180|2024-01-22|          1|    951.87|In Process|  S10_2016|           1002|   Norway|
|       10188|2024-01-21|         65|    95.202| Cancelled|  S10_4698|           1002|Australia|
|       10201|2024-01-26|          8|    951.17|   On Hold|  S10_4757|           1221|  Finland|
+------------+----------+-----------+----------+----------+----------+---------------+---------+



In [11]:
# Make `iceberg` the current catalog for this session.
# `USE` returns no rows, so calling `.show()` on it only printed an empty grid;
# spark.sql() already runs the command eagerly. The trailing `;` suppresses the
# empty `DataFrame[]` repr in the cell output.
spark.sql("USE iceberg");

++
||
++
++



In [12]:
# List the catalogs Spark reports for the current session.
spark.sql("show catalogs").show()

+-------------+
|      catalog|
+-------------+
|      iceberg|
|spark_catalog|
+-------------+



### Criar Tabela
- ``` df.writeTo(t).create() ``` é equivalente a ``` CREATE TABLE AS SELECT ```
- ``` df.writeTo(t).replace() ``` é equivalente a ``` REPLACE TABLE AS SELECT ```
- ``` df.writeTo(t).append() ``` é equivalente a ``` INSERT INTO ```
- ``` df.writeTo(t).overwritePartitions() ``` é equivalente a ``` INSERT OVERWRITE ``` dinâmico

Fonte: https://iceberg.apache.org/docs/1.6.0/spark-writes/?h=df.writeto%28t%29.create%28%29#writing-with-dataframes

In [24]:
# Write the initial batch to storage as an Iceberg table partitioned by `country`.
# The namespace is created first (idempotently) so the cell also works against an
# empty warehouse, where writing into a missing namespace can fail.
spark.sql("CREATE NAMESPACE IF NOT EXISTS iceberg.bronze")

(
    init_data_df
    .writeTo("iceberg.bronze.vendas")
    .partitionedBy("country")
    # createOrReplace() keeps the cell re-runnable: an existing table is replaced
    .createOrReplace()
)

In [25]:
# List the tables in the bronze namespace. The catalog is spelled out so this cell
# does not depend on a previously executed `USE iceberg`.
spark.sql("SHOW TABLES IN iceberg.bronze").show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
|   bronze|   vendas|      false|
+---------+---------+-----------+



In [15]:
## Quick, lightweight look at the data: print the rows as a plain-text table

spark.sql("SELECT * FROM iceberg.bronze.vendas").show()

[Stage 6:>                                                          (0 + 1) / 1]

+------------+----------+-----------+----------+----------+----------+---------------+---------+
|order_number|order_date|qty_ordered|unit_price|    status|product_id|product_line_id|  country|
+------------+----------+-----------+----------+----------+----------+---------------+---------+
|       10180|2024-01-22|          1|    951.87|In Process|  S10_2016|           1002|   Norway|
|       10201|2024-01-26|          8|    951.17|   On Hold|  S10_4757|           1221|  Finland|
|       10168|2024-01-23|          5|    98.115|  Disputed|  S10_1949|           1002|   France|
|       10188|2024-01-21|         65|    95.202| Cancelled|  S10_4698|           1002|Australia|
+------------+----------+-----------+----------+----------+----------+---------------+---------+



                                                                                

In [16]:
## View the data as a pandas DataFrame (rendered as a rich table by the notebook).
## toPandas() collects the whole result to the driver, which is fine for this small sample.

spark.sql("SELECT * FROM iceberg.bronze.vendas").toPandas()

Unnamed: 0,order_number,order_date,qty_ordered,unit_price,status,product_id,product_line_id,country
0,10180,2024-01-22,1,951.87,In Process,S10_2016,1002,Norway
1,10201,2024-01-26,8,951.17,On Hold,S10_4757,1221,Finland
2,10168,2024-01-23,5,98.115,Disputed,S10_1949,1002,France
3,10188,2024-01-21,65,95.202,Cancelled,S10_4698,1002,Australia


In [26]:
## Insert new data: build a DataFrame from the `feb_data` sample and append it to the table

feb_data_df = create_dataframe(columns_schema, feb_data)

# append() adds rows to the existing table, so re-running this cell inserts the batch again
feb_data_df.writeTo("iceberg.bronze.vendas").append()

### Primeiras Impressões

In [18]:
## Describe the table: column names, data types and partition information

spark.sql("DESCRIBE iceberg.bronze.vendas").show()

+--------------------+---------+-------+
|            col_name|data_type|comment|
+--------------------+---------+-------+
|        order_number|   string|   NULL|
|          order_date|   string|   NULL|
|         qty_ordered|   string|   NULL|
|          unit_price|   string|   NULL|
|              status|   string|   NULL|
|          product_id|   string|   NULL|
|     product_line_id|   string|   NULL|
|             country|   string|   NULL|
|# Partition Infor...|         |       |
|          # col_name|data_type|comment|
|             country|   string|   NULL|
+--------------------+---------+-------+



In [19]:
## Inspect the table: query its `history` metadata table (one row per snapshot made current)

spark.sql("SELECT * FROM iceberg.bronze.vendas.history").show()

+--------------------+-------------------+-------------------+-------------------+
|     made_current_at|        snapshot_id|          parent_id|is_current_ancestor|
+--------------------+-------------------+-------------------+-------------------+
|2024-10-25 19:57:...|2782458204956677777|               NULL|               true|
|2024-10-25 19:58:...|5095974369396353990|2782458204956677777|               true|
+--------------------+-------------------+-------------------+-------------------+



In [20]:
# Show the table properties as a pandas DataFrame.
spark.sql("SHOW TBLPROPERTIES iceberg.bronze.vendas").toPandas()

Unnamed: 0,key,value
0,current-snapshot-id,5095974369396353990
1,format,iceberg/parquet
2,format-version,2
3,write.parquet.compression-codec,zstd


In [21]:
# Count the rows currently in the table.
spark.sql("SELECT COUNT(*) FROM iceberg.bronze.vendas").show()

+--------+
|count(1)|
+--------+
|       9|
+--------+



## Testes

In [22]:
## Remove the table completely, from both the catalog and storage (PURGE).
## IF EXISTS keeps the cell re-runnable once the table has already been dropped.
spark.sql("DROP TABLE IF EXISTS iceberg.bronze.vendas PURGE")

                                                                                

DataFrame[]

In [27]:
# Stop the Spark session; later cells cannot use `spark` until it is recreated.
spark.stop()