In [1]:
#DELTA
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType, DoubleType


conf = (
    pyspark.SparkConf()
        .setAppName('app_name')
        .set('spark.jars.packages', 'org.apache.hadoop:hadoop-aws:3.3.4,io.delta:delta-core_2.12:2.4.0,org.postgresql:postgresql:42.6.0,software.amazon.awssdk:bundle:2.17.178,software.amazon.awssdk:url-connection-client:2.17.178')
        .set('spark.sql.extensions', 'io.delta.sql.DeltaSparkSessionExtension')
        .set('spark.sql.catalog.spark_catalog', 'org.apache.spark.sql.delta.catalog.DeltaCatalog')
        .set("spark.hadoop.fs.s3a.path.style.access", "true")
        .set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
        .set("spark.hadoop.fs.s3a..connection.ssl.enabled", "true")
        .set("spark.hadoop.fs.s3a.endpoint", "http://minioserver:9000")
        .set('spark.hadoop.fs.s3a.access.key', 'SmRXzZ0KrpROvPSKDddy')
        .set('spark.hadoop.fs.s3a.secret.key', 'ApanXjLP51CxV3bjPqOlankgLDpeoIZyR4xewaOS')
)

# Inicializa a sessão Spark com suporte ao Delta Lake
spark = SparkSession.builder \
    .appName("DataIngestion") \
    .master("local")\
    .config(conf=conf)\
    .config("spark.driver.memory", "2g") \
    .getOrCreate()


In [1]:
#ICEBERG
import pyspark
from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType, DoubleType


conf = (
    SparkConf()
    .set('spark.jars.packages', 'org.apache.iceberg:iceberg-spark-runtime-3.4_2.12:1.3.1,org.apache.hadoop:hadoop-aws:3.3.4,org.postgresql:postgresql:42.6.0,software.amazon.awssdk:bundle:2.17.178,software.amazon.awssdk:url-connection-client:2.17.178')
    .set("spark.sql.extensions", 
         "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions") # Use Iceberg with Spark
    
    .set("spark.sql.catalog.data", "org.apache.iceberg.spark.SparkCatalog")
    .set("spark.sql.catalog.data.io-impl", "org.apache.iceberg.aws.s3.S3FileIO")
    .set("spark.sql.catalog.data.warehouse", "s3a://warehouse")
    .set("spark.sql.catalog.data.type", "hadoop")
    .set("spark.sql.catalog.data.s3.endpoint", "http://minioserver:9000")
    .set("spark.sql.defaultCatalog", "data") # Name of the Iceberg catalog
    .set("spark.sql.catalogImplementation", "in-memory")
    .set("spark.executor.heartbeatInterval", "300000")
    .set("spark.network.timeout", "400000")
)

spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Disable below line to see INFO logs
spark.sparkContext.setLogLevel("ERROR")


def load_config(spark_context: SparkContext):
    spark_context._jsc.hadoopConfiguration().set("fs.s3a.access.key", "SmRXzZ0KrpROvPSKDddy")
    spark_context._jsc.hadoopConfiguration().set("fs.s3a.secret.key", "ApanXjLP51CxV3bjPqOlankgLDpeoIZyR4xewaOS")
    spark_context._jsc.hadoopConfiguration().set("fs.s3a.endpoint", "http://minioserver:9000")
    spark_context._jsc.hadoopConfiguration().set("fs.s3a.connection.ssl.enabled", "true")
    spark_context._jsc.hadoopConfiguration().set("fs.s3a.path.style.access", "true")
    spark_context._jsc.hadoopConfiguration().set("fs.s3a.attempts.maximum", "1")
    spark_context._jsc.hadoopConfiguration().set("fs.s3a.connection.establish.timeout", "5000")
    spark_context._jsc.hadoopConfiguration().set("fs.s3a.connection.timeout", "10000")

load_config(spark.sparkContext)

In [93]:
# Configurações do PostgreSQL
pg_url = "jdbc:postgresql://postgres:5432/db"
pg_properties = {
    "user": "user",
    "password": "admin",
    "driver": "org.postgresql.Driver"
}

# Carrega dados das tabelas PostgreSQL
def load_table(table_name):
    return spark.read.jdbc(url=pg_url, table=table_name, properties=pg_properties)

condominios_df = load_table("public.condominios")
moradores_df = load_table("public.moradores")
transacoes_df = load_table("public.transacoes")
imoveis_df = load_table("public.imoveis")



In [3]:
# Configurações do MinIO
minio_endpoint = "http://minioserver:9000"
minio_access_key = "minioadmin"
minio_secret_key = "minioadmin"
minio_bucket = "condomanage"

# Salva os dados no MinIO em formato Parquet
def save_to_minio(df, path):
    df.write.mode("overwrite").parquet(f"s3a://{minio_bucket}/raw/{path}")

save_to_minio(condominios, "condominios")
save_to_minio(moradores, "moradores")
save_to_minio(imoveis, "imoveis")
save_to_minio(transacoes, "transacoes")

In [4]:
# Salva os dados no MinIO em formato Delta
def save_to_delta(df, path):
    df.write.format("delta").mode("overwrite").save(f"s3a://{minio_bucket}/raw/delta_{path}")

save_to_delta(condominios_df, "condominios_delta")
save_to_delta(moradores_df, "moradores_delta")
save_to_delta(imoveis_df, "imoveis_delta")
save_to_delta(transacoes_df, "transacoes_delta")

In [8]:
# Salva os dados no MinIO em formato iceberg
def save_to_iceberg(df, table):
    df.writeTo(f"{table}").using('iceberg').createOrReplace()

save_to_iceberg(condominios_df, "condominios_iceberg")
save_to_iceberg(moradores_df, "moradores_iceberg")
save_to_iceberg(imoveis_df, "imoveis_iceberg")
save_to_iceberg(transacoes_df, "transacoes_iceberg")

In [3]:
condominios_df.write.format('delta').save('s3a://condomanage/bronze/upsell/condominios')
moradores_df.write.format('delta').save('s3a://condomanage/bronze/upsell/moradores')
transacoes_df.write.format('delta').save('s3a://condomanage/bronze/upsell/transacoes')
imoveis_df.write.format('delta').save('s3a://condomanage/bronze/upsell/imoveis')

In [114]:
df = spark.read.parquet('s3a://condomanage/raw/postgres.public.moradores')

df_selecionado = df.select(
    "value.after.condominio_id",
    "value.after.data_registro",
    "value.after.morador_id",
    "value.after.nome",
    "value.op",
    "value.ts_ms"
)

# Renomeia as colunas para facilitar o uso, se necessário
df_selecionado = df_selecionado.withColumnRenamed("value.after.condominio_id", "condominio_id") \
                               .withColumnRenamed("value.after.data_registro", "data_registro") \
                               .withColumnRenamed("value.after.morador_id", "morador_id") \
                               .withColumnRenamed("value.after.nome", "nome") \
                               .withColumnRenamed("value.op", "op") \
                               .withColumnRenamed("value.ts_ms", "ts_ms")

from pyspark.sql.functions import col, expr, to_date
df_selecionado = df_selecionado.withColumn("data_registro", to_date(expr("date_add('1970-01-01', data_registro)")))


In [116]:
df_selecionado.dtypes

[('condominio_id', 'int'),
 ('data_registro', 'date'),
 ('morador_id', 'int'),
 ('nome', 'string'),
 ('op', 'string'),
 ('ts_ms', 'bigint')]

+-------------+-------------+----------+-----------------+---+-------------+----------+
|condominio_id|data_registro|morador_id|             nome| op|        ts_ms|   as_date|
+-------------+-------------+----------+-----------------+---+-------------+----------+
|           80|        18826|         9|Valerie Garza &&&|  u|1722358655663|2021-07-18|
|           80|        18826|         9| Valerie Garza &&|  u|1722358664771|2021-07-18|
|           80|        18826|         9|   Valerie Garza |  u|1722356513277|2021-07-18|
|           32|        18608|        10|       Lisa Ryan |  u|1722356513279|2020-12-12|
|           78|        18492|        17|     Casey Young |  u|1722356513280|2020-08-18|
|           80|        18826|         9|    Valerie Garza|  u|1722359294390|2021-07-18|
+-------------+-------------+----------+-----------------+---+-------------+----------+



In [107]:
df_selecionado.createOrReplaceTempView('teste')

In [113]:
cdc = spark.sql('''
WITH 
    qualify as (select *, ROW_NUMBER() over(partition by morador_id order by ts_ms desc) as qualify from teste)
select * from qualify where qualify = 1''')

In [53]:
spark.read.parquet('s3a://condomanage/bronze/upsell/moradores').show()

+----------+----------------+-------------+-------------+
|morador_id|            nome|condominio_id|data_registro|
+----------+----------------+-------------+-------------+
|         2|  Kevin Andersen|           48|   2023-03-11|
|         3|  Melissa Harmon|           71|   2020-03-14|
|         4|Jennifer Montoya|           15|   2021-07-03|
|         5|     Dawn Obrien|           17|   2021-07-15|
|         8|  Michael Martin|           40|   2022-09-21|
|        11|   Cindy Barrett|           61|   2021-01-24|
|        12|  Lawrence Wiley|           38|   2021-12-26|
|        13|   Marcus Parker|           22|   2024-07-08|
|        14|    Justin Avila|           84|   2021-12-08|
|        15|  Tiffany Palmer|           20|   2020-12-03|
|        16|Stephanie Parker|           15|   2023-01-16|
|        18|    Terry Murray|           59|   2022-04-10|
|        19|     Gary Parker|           69|   2021-01-31|
|        20|Jason Cunningham|           10|   2020-02-16|
|        22|  

In [111]:
from delta.tables import DeltaTable
bronze = DeltaTable.forPath(spark, 's3a://condomanage/bronze/upsell/moradores')
cdc.dtypes

[('condominio_id', 'int'),
 ('data_registro', 'date'),
 ('morador_id', 'int'),
 ('nome', 'string'),
 ('op', 'string'),
 ('ts_ms', 'bigint'),
 ('qualify', 'int')]

In [112]:
cdc = cdc.withColumn("data_registro", to_date(expr("date_add('1970-01-01', data_registro)")))

(bronze.alias('b')
    .merge(cdc.alias('d'), 
    'b.morador_id = d.morador_id')
    .whenMatchedUpdateAll(condition = "d.op = 'u'")
    .whenMatchedDelete(condition = "d.op = 'd'")
    .whenNotMatchedInsertAll(condition = "d.op = 'i'")
    .execute()
)

AnalysisException: [DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE] Cannot resolve "date_add(1970-01-01, data_registro)" due to data type mismatch: Parameter 2 requires the ("INT" or "SMALLINT" or "TINYINT") type, however "data_registro" has the type "DATE".; line 1 pos 0;
'Project [condominio_id#5913, to_date(date_add(cast(1970-01-01 as date), data_registro#5962), None, Some(Etc/UTC)) AS data_registro#5995, morador_id#5915, nome#5916, op#5917, ts_ms#5918L, qualify#5969]
+- WithCTE
   :- CTERelationDef 8, false
   :  +- SubqueryAlias qualify
   :     +- Project [condominio_id#5913, data_registro#5962, morador_id#5915, nome#5916, op#5917, ts_ms#5918L, qualify#5969]
   :        +- Project [condominio_id#5913, data_registro#5962, morador_id#5915, nome#5916, op#5917, ts_ms#5918L, qualify#5969, qualify#5969]
   :           +- Window [row_number() windowspecdefinition(morador_id#5915, ts_ms#5918L DESC NULLS LAST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS qualify#5969], [morador_id#5915], [ts_ms#5918L DESC NULLS LAST]
   :              +- Project [condominio_id#5913, data_registro#5962, morador_id#5915, nome#5916, op#5917, ts_ms#5918L]
   :                 +- SubqueryAlias teste
   :                    +- View (`teste`, [condominio_id#5913,data_registro#5962,morador_id#5915,nome#5916,op#5917,ts_ms#5918L])
   :                       +- Project [condominio_id#5913, to_date(date_add(cast(1970-01-01 as date), data_registro#5914), None, Some(Etc/UTC)) AS data_registro#5962, morador_id#5915, nome#5916, op#5917, ts_ms#5918L]
   :                          +- Project [value#5911.after.condominio_id AS condominio_id#5913, value#5911.after.data_registro AS data_registro#5914, value#5911.after.morador_id AS morador_id#5915, value#5911.after.nome AS nome#5916, value#5911.op AS op#5917, value#5911.ts_ms AS ts_ms#5918L]
   :                             +- Relation [value#5911] parquet
   +- Project [condominio_id#5913, data_registro#5962, morador_id#5915, nome#5916, op#5917, ts_ms#5918L, qualify#5969]
      +- Filter (qualify#5969 = 1)
         +- SubqueryAlias qualify
            +- CTERelationRef 8, true, [condominio_id#5913, data_registro#5962, morador_id#5915, nome#5916, op#5917, ts_ms#5918L, qualify#5969]


In [105]:
spark.read.parquet('s3a://condomanage/bronze/upsell/moradores').show()

+----------+----------------+-------------+-------------+
|morador_id|            nome|condominio_id|data_registro|
+----------+----------------+-------------+-------------+
|         2|  Kevin Andersen|           48|   2023-03-11|
|         3|  Melissa Harmon|           71|   2020-03-14|
|         4|Jennifer Montoya|           15|   2021-07-03|
|         5|     Dawn Obrien|           17|   2021-07-15|
|         8|  Michael Martin|           40|   2022-09-21|
|        11|   Cindy Barrett|           61|   2021-01-24|
|        12|  Lawrence Wiley|           38|   2021-12-26|
|        13|   Marcus Parker|           22|   2024-07-08|
|        14|    Justin Avila|           84|   2021-12-08|
|        15|  Tiffany Palmer|           20|   2020-12-03|
|        16|Stephanie Parker|           15|   2023-01-16|
|        18|    Terry Murray|           59|   2022-04-10|
|        19|     Gary Parker|           69|   2021-01-31|
|        20|Jason Cunningham|           10|   2020-02-16|
|        22|  