In [0]:
# Databricks notebook source
# 03_orchestration.py

from pyspark.sql.functions import (
    col,
    when,
    lit,
    to_date,
    concat_ws,
    sum as _sum,
    count,
    max as _max
)
from pyspark.sql import SparkSession


def aggregate_by_client(risk_df):
    """Collapse transaction-level risk rows into one summary row per client.

    Rows are grouped by ``client_id`` and the result carries these columns:

    * ``total_transactions``     - number of rows in the group.
    * ``total_amount``           - sum of ``transaction_amount``.
    * ``high_risk_transactions`` - number of rows whose ``risk_flag`` is true.
    * ``max_risk_score``         - maximum of ``risk_score``.
    * ``ever_high_risk_country`` - maximum of ``is_high_risk_country``.
    * ``is_minor``               - maximum of ``is_minor``.
    * ``high_risk_ratio``        - ``high_risk_transactions / total_transactions``.

    Args:
        risk_df: DataFrame holding the columns referenced above.
            NOTE(review): ``risk_flag`` is used directly as a condition, so it
            is presumably a boolean column - confirm against the silver schema.

    Returns:
        The aggregated DataFrame, one row per ``client_id``.
    """
    # Count a row as 1 when it is flagged and 0 otherwise, so that summing the
    # indicator yields the number of flagged transactions.
    flagged_indicator = when(col("risk_flag"), 1).otherwise(0)

    client_metrics = [
        count("*").alias("total_transactions"),
        _sum("transaction_amount").alias("total_amount"),
        _sum(flagged_indicator).alias("high_risk_transactions"),
        _max("risk_score").alias("max_risk_score"),
        _max("is_high_risk_country").alias("ever_high_risk_country"),
        _max("is_minor").alias("is_minor"),
    ]
    per_client = risk_df.groupBy("client_id").agg(*client_metrics)

    # Every group contains at least one row, so the denominator is never zero.
    flagged_share = col("high_risk_transactions") / col("total_transactions")
    return per_client.withColumn("high_risk_ratio", flagged_share)

def save_to_gold(aggr_df, gold_path):
    """Write ``aggr_df`` as a Delta table under the gold location.

    The data is saved in ``overwrite`` mode to
    ``<gold_path>/aggregated_client_risk``.

    Args:
        aggr_df: DataFrame to persist.
        gold_path: Base path of the gold layer; the table folder name is
            appended to it.
    """
    destination = f"{gold_path}/aggregated_client_risk"
    delta_writer = aggr_df.write.format("delta").mode("overwrite")
    delta_writer.save(destination)

# Parameters: declare the notebook text widgets for the silver (input) and
# gold (output) base paths, each with a default value.
dbutils.widgets.text("silver_path", "/mnt/datalake/silver/kyc_risk_analysis")
dbutils.widgets.text("gold_path", "/mnt/datalake/gold/kyc_risk_analysis")

# Read the current widget values; these drive every path used below.
silver_path = dbutils.widgets.get("silver_path")
gold_path = dbutils.widgets.get("gold_path")

# Obtain the Spark session used for the read below.
spark = SparkSession.builder.appName("orchestration-kyc-risk").getOrCreate()

# Read the silver table (Delta format) from <silver_path>/client_transactions_risk.
risk_df = spark.read.format("delta").load(f"{silver_path}/client_transactions_risk")

# Aggregate the transaction-level rows into one row per client.
aggr_df = aggregate_by_client(risk_df)

# Save the aggregated result to the gold layer.
save_to_gold(aggr_df, gold_path)

# Completion message (kept in Portuguese: it is runtime output, not a comment).
print("Agregação e gravação no Gold concluídas.")



# COMMAND ----------

# Preview the silver input table (client_transactions_risk).
# The location is built from the `silver_path` widget value read earlier in this
# notebook instead of a hard-coded default, so the preview always shows the same
# table that was aggregated, even when the widget is overridden.
df_silver = spark.read.format("delta").load(f"{silver_path}/client_transactions_risk")
df_silver.display()
