##### Enriquecimento da tabela personalfinance.bronze.gastos_bradesco para a silver layer
01. Realizar a leitura da bronze layer com spark.table()

In [0]:
# Read the bronze table into `df`; every later cell in this notebook
# transforms this same DataFrame in place.
df = spark.table('personalfinance.bronze.gastos_bradesco')

In [0]:
from pyspark.sql.functions import col, lit, year, concat, trim, upper, to_date, regexp_extract, length, when, count, asc, sum, regexp_replace, split, make_date, year, month, dayofmonth, concat_ws
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, TimestampType, DateType

In [0]:
# Target Spark type for each silver-layer column. A later cell iterates this
# mapping and casts every listed column of `df` to the type given here.
schemas_silver = dict(
    card_final=StringType(),
    data_da_compra=DateType(),
    descricao=StringType(),
    valor_usd=DoubleType(),
    valor_brl=DoubleType(),
    status_fatura=StringType(),
    file_name=StringType(),
    file_path=StringType(),
    ingestion_timestamp=TimestampType(),
)

In [0]:
# Keep only rows whose purchase-date text starts with two digits, a slash and
# two more digits; rows that do not match (or are null) are dropped.
df = df.where(col('data_da_compra').rlike(r'^\d{2}/\d{2}'))

In [0]:
# Normalise every column except the ingestion metadata to trimmed, upper-case
# text. A single select() is used instead of one withColumn() per column:
# each withColumn() adds another projection to the query plan, and Spark's
# documentation advises against calling it in a loop. Column order and names
# are preserved.
colunas_metadados = {'ingestion_timestamp', 'file_name', 'file_path'}
df = df.select([
    col(coluna) if coluna in colunas_metadados
    else upper(trim(col(coluna))).alias(coluna)
    for coluna in df.columns
])

In [0]:
# Split the "dd/mm" purchase date into numeric day and month columns and
# attach a year.
#
# The year is read from the statement file name (first `202\d` token, the same
# token the due-date cell extracts) instead of being hard-coded to 2025/2026
# behind month conditions. With the hard-coded rules, a January purchase on a
# 2025 statement, or a purchase with month > 1 on a 2026 statement, received a
# null year and therefore a null purchase date further down.
#
# NOTE(review): this assigns the statement's year to every purchase; purchases
# that actually belong to the previous year are expected to be shifted back by
# the later cell that compares data_da_compra with data_vencimento_fatura.
partes_data = split(col("data_da_compra"), "/")
ano_arquivo = regexp_extract(col("file_name"), r"202\d", 0)

df = df \
    .withColumn("compra_dia", partes_data.getItem(0).cast("int")) \
    .withColumn("compra_mes", partes_data.getItem(1).cast("int")) \
    .withColumn(
        "compra_ano",
        when(
            # Only year-less "dd/mm" values get a year, as before; the empty
            # string check skips file names with no year token so the cast
            # below never sees an unparseable value.
            (length(col("data_da_compra")) == 5) & (ano_arquivo != ""),
            ano_arquivo.cast("int")
        )
    )


In [0]:
# Debug view: show, next to the purchase date and file name, whether the file
# name contains the text "2026".
display(
    df.select(
        'data_da_compra',
        'file_name',
        col('file_name').contains('2026').alias('bool_ano'),
    )
)

In [0]:
# Visual check of the DataFrame after the compra_dia / compra_mes / compra_ano
# columns have been added.
display(df)

In [0]:
# List the distinct values of compra_ano (a null entry here means some rows
# did not receive a year).
df.select('compra_ano').distinct().show()

In [0]:
# Debug view: rows for which no purchase year could be assigned.
display(df.where(col('compra_ano').isNull()))

In [0]:
# Cast every column listed in schemas_silver to its silver-layer type.
for coluna, tipo_destino in schemas_silver.items():
    if coluna == 'data_da_compra':
        # Build the date directly from the numeric parts created by the
        # earlier cell. When any part is null the result is a null date,
        # instead of a partial "d/M" string being handed to the date parser
        # (concat_ws silently skips null inputs).
        coluna_tipada = make_date(
            col("compra_ano"), col("compra_mes"), col("compra_dia")
        )

    elif coluna in ("valor_usd", "valor_brl"):
        valor_texto = col(coluna)
        # Amounts written with a thousands separator ("1.234,56") used to
        # become "1.234.56" and cast to null. Strip the dots only when the
        # text matches exactly that pattern; every other value goes through
        # the same comma-to-dot replacement as before.
        sem_milhar = when(
            valor_texto.rlike(r"^-?\d{1,3}(\.\d{3})+,\d+$"),
            regexp_replace(valor_texto, r"\.", ""),
        ).otherwise(valor_texto)
        coluna_tipada = regexp_replace(sem_milhar, ",", ".").cast(tipo_destino)

    else:
        coluna_tipada = col(coluna).cast(tipo_destino)

    df = df.withColumn(coluna, coluna_tipada)


In [0]:
# Visual check of the DataFrame after the columns were cast to the types in
# schemas_silver.
display(df)

In [0]:
# Read the cards dimension table; it is joined to `df` on card_final in the
# next cell.
df_cards = spark.table('personalfinance.silver.dim_cards')

In [0]:
# Attach the card attributes to each row with a left join on card_final, so
# rows without a matching card are kept with null card attributes.
# NOTE(review): dim_cards exposes an is_current flag; if it can hold more than
# one row per card_final this join would multiply rows - confirm.
df = df.join(
    df_cards.select(
        "card_final",
        "full_descricao",
        "bank",
        "bandeira",
        "is_current",
        "dia_vencimento",
    ),
    on='card_final',
    how='left',
)

In [0]:
# Statement due date: year and "MM." tokens taken from the file name plus the
# card's due day, glued into "yyyy-MM.-d" text and parsed as a date. concat()
# yields null when any piece is null (for example dia_vencimento after an
# unmatched join), which leaves the due date null.
df = df.withColumn(
    'data_vencimento_fatura',
    to_date(
        concat(
            regexp_extract(col('file_name'), r'202\d{1}', 0),
            lit('-'),
            regexp_extract(col('file_name'), r"\d{2}\.", 0),
            lit('-'),
            col('dia_vencimento'),
        ),
        'yyyy-MM.-d',
    ),
)

In [0]:
# A purchase dated after its statement's due date is moved back one year
# (same month and day). Rows where the comparison is false or null keep their
# date unchanged.
df = df.withColumn(
    'data_da_compra',
    when(
        col('data_da_compra') > col('data_vencimento_fatura'),
        make_date(
            year('data_da_compra') - 1,
            month('data_da_compra'),
            dayofmonth('data_da_compra'),
        ),
    ).otherwise(col('data_da_compra')),
)

In [0]:
# Classify each row from its description: a "/" marks an instalment, "IOF"
# marks the tax line of an international purchase, anything else is a
# single-payment purchase. The first matching rule wins.
# The fallback label was stored as mojibake ("Ã" followed by a no-break
# space, i.e. UTF-8 "à" decoded as Latin-1); it is written here as the
# intended "à".
df = df.withColumn(
    'tipo_compra',
    when(col('descricao').rlike(r'/'), 'Parcela sem juros')
    .when(col('descricao').rlike(r'IOF'), 'IOF de compra internacional')
    .otherwise('Compra à vista'),
)

In [0]:
# Rows with no USD amount get 0 in valor_usd.
df = df.na.fill({'valor_usd': 0})

In [0]:
# df.write \
#     .mode("overwrite") \
#     .format("delta") \
#     .option("overwriteSchema", "true") \
#     .saveAsTable('personalfinance.silver.gastos_bradesco')