In [0]:
# Databricks notebook source
# 02_processing.py

from pyspark.sql.functions import (
    col,
    when,
    lit,
    to_date,
    concat_ws,
    sum as _sum,
    count,
    max as _max
)
from pyspark.sql import SparkSession

# Utility functions (these could be imported from an external module instead)
def load_bronze_data(spark, bronze_path):
    """Read the three bronze CSV extracts into DataFrames.

    Every file is read with a header row and with schema inference enabled.

    Args:
        spark: Session object whose ``read`` attribute builds the CSV readers.
        bronze_path: Directory holding ``clients.csv``, ``transactions.csv``
            and ``high_risk_countries.csv``.

    Returns:
        A ``(clients_df, transactions_df, high_risk_countries_df)`` tuple.
    """

    def _read_bronze_csv(file_name):
        # One fresh reader per file, configured identically for all three.
        reader = spark.read.format("csv")
        reader = reader.option("header", "true").option("inferSchema", "true")
        return reader.load(f"{bronze_path}/{file_name}")

    return (
        _read_bronze_csv("clients.csv"),
        _read_bronze_csv("transactions.csv"),
        _read_bronze_csv("high_risk_countries.csv"),
    )

def save_to_silver(df, silver_path):
    """Write ``df`` as a Delta table in the silver layer.

    The data is saved to ``<silver_path>/client_transactions_risk`` in
    ``overwrite`` mode, partitioned by the ``evaluation_date`` column.

    Args:
        df: DataFrame to persist; it is partitioned by ``evaluation_date``,
            so that column must be present.
        silver_path: Base location of the silver layer.
    """
    target_location = f"{silver_path}/client_transactions_risk"

    writer = df.write.format("delta").mode("overwrite")
    # Let the overwrite replace the stored schema too, so a run whose columns
    # differ from the existing table does not fail on a schema mismatch.
    writer = writer.option("overwriteSchema", "true")
    # NOTE(review): a plain overwrite appears to replace the whole table, not
    # just the evaluation_date being written - confirm this is intended.
    writer.partitionBy("evaluation_date").save(target_location)

def enrich_risk(
    clients_df,
    transactions_df,
    high_risk_countries_df,
    evaluation_timestamp,
    *,
    high_value_threshold=10000,
    minor_age=18,
    risk_flag_threshold=2,
):
    """Join transactions with client data and derive per-transaction risk columns.

    Transactions are inner-joined to clients on ``client_id`` and then
    left-joined to the high-risk country list on the client's ``country``.
    The following columns are added to the joined rows:

    * ``is_high_risk_country`` - True when the client's country matched a row
      of ``high_risk_countries_df``.
    * ``is_high_value_transaction`` - 1 when ``transaction_amount`` exceeds
      ``high_value_threshold``, else 0 (a null amount yields 0).
    * ``is_minor`` - 1 when ``age`` is below ``minor_age``, else 0 (a null age
      yields 0).
    * ``risk_score`` - ``1 * is_high_risk_country + 1.5 *
      is_high_value_transaction + 1 * is_minor``.
    * ``risk_flag`` - True when ``risk_score >= risk_flag_threshold``.
    * ``evaluation_timestamp`` / ``evaluation_date`` - the supplied timestamp
      cast to a timestamp, and its date part.
    * ``event_id`` - ``client_id`` and ``transaction_id`` joined with ``_``.

    Args:
        clients_df: Client DataFrame; must provide ``client_id`` and ``country``.
        transactions_df: Transaction DataFrame; must provide ``client_id``.
        high_risk_countries_df: DataFrame with a ``country`` column listing
            high-risk countries.
        evaluation_timestamp: Value cast to a Spark timestamp and stamped on
            every row (the notebook passes a ``"YYYY-MM-DD HH:MM:SS"`` string).
        high_value_threshold: Amount above which a transaction counts as high
            value. Defaults to 10000.
        minor_age: Age below which a client counts as a minor. Defaults to 18.
        risk_flag_threshold: Minimum ``risk_score`` that raises ``risk_flag``.
            Defaults to 2.

    Returns:
        The joined DataFrame with the risk columns described above. The joined
        data must also expose ``transaction_amount``, ``age`` and
        ``transaction_id``.
    """
    # Rename BEFORE aliasing: withColumnRenamed builds a new projection, so an
    # alias applied earlier does not qualify the renamed column and
    # "h.high_risk_country" may fail to resolve. Aliasing last guarantees that
    # every column of the lookup carries the "h" qualifier.
    # NOTE(review): assumes one row per country in the lookup; duplicate
    # countries would fan out the matching transactions - confirm upstream.
    high_risk_lookup = (
        high_risk_countries_df
        .withColumnRenamed("country", "high_risk_country")
        .alias("h")
    )

    joined_df = (
        transactions_df.alias("t")
        .join(clients_df.alias("c"), on="client_id", how="inner")
        .join(
            high_risk_lookup,
            col("c.country") == col("h.high_risk_country"),
            how="left",
        )
        # The left join leaves high_risk_country null for non-listed countries.
        .withColumn("is_high_risk_country", col("h.high_risk_country").isNotNull())
    )

    risk_df = (
        joined_df
        .withColumn(
            "is_high_value_transaction",
            when(col("transaction_amount") > high_value_threshold, lit(1)).otherwise(lit(0)),
        )
        .withColumn("is_minor", when(col("age") < minor_age, lit(1)).otherwise(lit(0)))
        .withColumn(
            "risk_score",
            # Weighted sum of the three indicators; high value weighs 1.5.
            col("is_high_risk_country").cast("int") * 1
            + col("is_high_value_transaction").cast("int") * 1.5
            + col("is_minor").cast("int") * 1,
        )
        .withColumn(
            "risk_flag",
            when(col("risk_score") >= risk_flag_threshold, lit(True)).otherwise(lit(False)),
        )
        .withColumn("evaluation_timestamp", lit(evaluation_timestamp).cast("timestamp"))
        .withColumn("evaluation_date", to_date(col("evaluation_timestamp")))
        .withColumn("event_id", concat_ws("_", col("client_id"), col("transaction_id")))
    )

    return risk_df

# Notebook parameters: register each text widget with its default value.
_WIDGET_DEFAULTS = (
    ("bronze_path", "/mnt/kycproject/raw_data"),
    ("silver_path", "/mnt/datalake/silver/kyc_risk_analysis"),
    ("evaluation_timestamp", "2025-06-10 12:00:00"),
)
for _widget_name, _widget_default in _WIDGET_DEFAULTS:
    dbutils.widgets.text(_widget_name, _widget_default)

# Read back the values in effect for this run.
bronze_path = dbutils.widgets.get("bronze_path")
silver_path = dbutils.widgets.get("silver_path")
evaluation_timestamp = dbutils.widgets.get("evaluation_timestamp")

spark = SparkSession.builder.appName("processing-kyc-risk").getOrCreate()

# Load the bronze inputs.
clients_df, transactions_df, high_risk_countries_df = load_bronze_data(
    spark, bronze_path
)

# Enrich the transactions with the risk columns.
risk_df = enrich_risk(
    clients_df=clients_df,
    transactions_df=transactions_df,
    high_risk_countries_df=high_risk_countries_df,
    evaluation_timestamp=evaluation_timestamp,
)

# Persist the result to the silver layer.
save_to_silver(risk_df, silver_path)

print("Processamento e gravação no Silver concluídos.")
