In [2]:
import os
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [3]:
# The Iceberg warehouse lives in a `warehouse` folder under the directory
# the notebook kernel was started from.
current_dir = os.getcwd()
dir_warehouse = "/".join((current_dir, "warehouse"))

In [3]:
# Build (or reuse) the Spark session with an Iceberg catalog named
# `hadoop_catalog`, of type `hadoop`, whose warehouse is `dir_warehouse`.
spark = (
    SparkSession.builder
    .appName("IcebergWithSpark")
    # Iceberg Spark runtime plus the PostgreSQL JDBC driver used for the source reads.
    .config(
        "spark.jars.packages",
        "org.apache.iceberg:iceberg-spark-runtime-3.3_2.12:1.6.1,org.postgresql:postgresql:42.3.1",
    )
    # Iceberg SQL extensions; this notebook relies on them for MERGE INTO.
    .config(
        "spark.sql.extensions",
        "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
    )
    .config("spark.sql.catalog.hadoop_catalog", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.hadoop_catalog.type", "hadoop")
    .config("spark.sql.catalog.hadoop_catalog.warehouse", dir_warehouse)
    # The Spark property is `spark.sql.defaultCatalog`. The key used before,
    # `spark.sql.default.catalog`, is not a Spark setting, so the default
    # catalog silently stayed on the built-in session catalog.
    .config("spark.sql.defaultCatalog", "hadoop_catalog")
    .getOrCreate()
)

25/01/07 21:25:16 WARN Utils: Your hostname, dell resolves to a loopback address: 127.0.1.1; using 192.168.15.6 instead (on interface wlp0s20f3)
25/01/07 21:25:16 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


25/01/07 21:25:17 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
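# Optional one-time setup, left disabled: unpack the `vendas_iceberg` archive
# and copy its files into the local warehouse directory.
# !unzip ../Iceberg/vendas_iceberg.zip -d ./
# !mkdir -p ./warehouse/default/vendas_iceberg
# !cp -r ../Iceberg/vendas_iceberg/* ./warehouse/default/vendas_iceberg/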

In [4]:
# Show the databases reported by the session's catalog API.
spark.catalog.listDatabases()

[Database(name='default', description='default database', locationUri='file:/home/apolo/Dropbox/programacao/Udemy/2024/engenharia_de_dados_com_apache_iceberg_e_spark/00-scripts_apolo/spark-warehouse')]

In [5]:
# Show the tables the catalog API reports for the `default` database.
spark.catalog.listTables(dbName="default")

[]

In [None]:
# Create the vendas table.
# Any existing copy is dropped first, so this cell always leaves an empty table.
spark.sql("DROP TABLE IF EXISTS hadoop_catalog.default.vendas_iceberg")

# Iceberg table partitioned by the day of `data_venda`.
create_vendas_sql = """
CREATE TABLE IF NOT EXISTS hadoop_catalog.default.vendas_iceberg (
    id_vendas INT,
    id_veiculos INT,
    id_concessionarias INT,
    id_vendedores INT,
    id_clientes INT,
    valor_pago DOUBLE,
    data_venda TIMESTAMP,
    data_inclusao TIMESTAMP,
    data_atualizacao TIMESTAMP
)
USING iceberg
PARTITIONED BY (days(data_venda))
"""
spark.sql(create_vendas_sql)

In [None]:
# PostgreSQL connection settings.
# Each value can be overridden with an environment variable. The password has
# no in-notebook default, so the secret is never stored in the notebook file.
jdbc_hostname = os.environ.get("NOVADRIVE_JDBC_HOST", "159.223.187.110")
jdbc_port = int(os.environ.get("NOVADRIVE_JDBC_PORT", "5432"))
jdbc_database = os.environ.get("NOVADRIVE_JDBC_DATABASE", "novadrive")
jdbc_username = os.environ.get("NOVADRIVE_JDBC_USER", "etlreadonly")
jdbc_password = os.environ.get("NOVADRIVE_JDBC_PASSWORD", "")

if not jdbc_password:
    # Warn here rather than raise, so the configuration cell still runs;
    # the missing secret is reported before any connection is attempted.
    print(
        "WARNING: NOVADRIVE_JDBC_PASSWORD is not set; export it before starting "
        "the kernel, otherwise the PostgreSQL read is expected to fail authentication."
    )

# JDBC connection URL
jdbc_url = f"jdbc:postgresql://{jdbc_hostname}:{jdbc_port}/{jdbc_database}"

# JDBC connection properties
connection_properties = {
    "user": jdbc_username,
    "password": jdbc_password,
    "driver": "org.postgresql.Driver"
}

In [None]:
def read_postgres_data(last_run_timestamp):
    """Read the `vendas` rows inserted or updated after a watermark.

    The filter is written as a parenthesised, aliased subquery and handed to
    `spark.read.jdbc` as its `table` argument, using the notebook-level
    `jdbc_url` and `connection_properties`.

    Args:
        last_run_timestamp: Watermark interpolated verbatim into the SQL text
            as a quoted literal and compared with `data_inclusao` and
            `data_atualizacao`. It is not escaped, so it must be a value the
            notebook itself produced.

    Returns:
        The DataFrame returned by `spark.read.jdbc` for that subquery.
    """
    incremental_subquery = f"""(
    SELECT *
    FROM vendas
    WHERE data_inclusao > '{last_run_timestamp}' OR data_atualizacao > '{last_run_timestamp}'
    ) AS vendas_subquery
    """
    return spark.read.jdbc(
        jdbc_url,
        incremental_subquery,
        properties=connection_properties,
    )

In [4]:
# Watermark used only for the very first execution.

# Current moment
current_date = datetime.now()

# Same moment six months earlier
last_run_timestamp = current_date + relativedelta(months=-6)

# Render it as a string in the desired format
last_run_timestamp_str = format(last_run_timestamp, "%Y-%m-%d %H:%M:%S")

print("Data seis meses atrás:", last_run_timestamp_str)

Data seis meses atrás: 2024-07-07 21:57:33


In [None]:
# Read from PostgreSQL the rows changed since the watermark
df_postgres = read_postgres_data(last_run_timestamp=last_run_timestamp)

# Preview what was read (these are `show`'s default arguments, spelled out)
df_postgres.show(n=20, truncate=True)

In [None]:
# Expose the rows read from PostgreSQL to Spark SQL as a temporary view
df_postgres.createOrReplaceTempView("vendas_updates")

# Upsert into the Iceberg table: rows whose `id_vendas` already exists are
# updated, all others are inserted.
merge_vendas_sql = """
MERGE INTO hadoop_catalog.default.vendas_iceberg AS target
USING vendas_updates AS source
ON target.id_vendas = source.id_vendas
WHEN MATCHED THEN
    UPDATE SET *
WHEN NOT MATCHED THEN
    INSERT *
"""
spark.sql(merge_vendas_sql)

In [None]:
# Count the rows currently stored in the Iceberg table.
count_result = spark.sql("SELECT COUNT(*) FROM hadoop_catalog.default.vendas_iceberg")
row_count = count_result.first()[0]
print(f"Total de linhas na tabela vendas_iceberg: {row_count}")
# 26214  -- NOTE(review): presumably the count seen on an earlier run; confirm

In [None]:
# Advance the watermark for the next incremental run.
#
# The watermark is taken from the newest change timestamp stored in the
# Iceberg table rather than from the local clock. `datetime.now()` here runs
# after the read and the merge, so rows committed to PostgreSQL in between
# (or any skew between this machine's clock and the database's) would fall
# below the new watermark without ever having been loaded.
latest_change = spark.sql("""
SELECT date_format(
    MAX(GREATEST(data_inclusao, data_atualizacao)),
    'yyyy-MM-dd HH:mm:ss.SSSSSS'
) AS latest_change
FROM hadoop_catalog.default.vendas_iceberg
""").first()[0]

# When the query returns NULL (no rows, or no timestamps at all) the previous
# watermark is kept, so nothing is skipped on the next run.
if latest_change is not None:
    last_run_timestamp = latest_change

print(f"Processo concluído. Novo timestamp da última execução: {last_run_timestamp}")