In [0]:
# Databricks notebook source
# 01_ingestion_uc.py - Ingestion with Unity Catalog (production-ready version)

from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType, DecimalType
import random
from datetime import datetime, timedelta

# -----------------------------------------------
# Step 1: Define catalog and schema via widgets
# -----------------------------------------------
# Register one text widget per setting, using the second element as its
# default value, then read the current widget values back.
for widget_name, default_value in (
    ("catalog", "governance_risk"),
    ("schema", "kyc_project"),
):
    dbutils.widgets.text(widget_name, default_value)

catalog = dbutils.widgets.get("catalog")
schema = dbutils.widgets.get("schema")

# Echo the resolved target so the cell output records where tables are written.
print(f"Using catalog: {catalog}, schema: {schema}")

# --- Sample data ---
# Value pools for the synthetic records. Repeated entries ('Wei', the empty
# surname, 'A.', ...) are intentional to keep: with random.choice a repeated
# value is proportionally more likely to be drawn.
first_names = (
    ['Carlos', 'Sofia', 'Lucas', 'Isabela', 'Gabriel', 'Laura', 'Mateus', 'Júlia', 'Pedro', 'Beatriz']
    + ['John', 'Maria', 'James', 'Patricia', 'Robert']
    + ['Wei', 'Li', 'Jing', 'Yang', 'Wei']
    + ['Fatima', 'Mohammed', 'Ahmed', 'Aisha']
)

# The first five entries are empty strings (no surname); the rest are initials.
last_names = (
    [''] * 5
    + ['C.', 'G.', 'S.', 'R.', 'S.']
    + ['S.', 'G.', 'J.', 'B.', 'J.']
    + ['C.', 'N.', 'W.', 'L.', 'Z.']
    + ['A.'] * 4
)

countries = [
    'Brazil',
    'Portugal',
    'Spain',
    'China',
    'Oman',
    'Germany',
    'India',
    'Russia',
    'Argentina',
    'Mexico',
    'Iran',
    'Venezuela',
    'Myanmar',
    'Lebanon',
    'North Korea',
]

# Inclusive (min, max) bounds, unpacked into random.randint by the client generator.
min_age, max_age = 15, 75
age_range = (min_age, max_age)

# --- Generate clients ---
# Builds n_clients synthetic rows shaped as
#   (client_id, first_name, last_name, date_birth, residency_country)
# and overwrites the `clients` Delta table in the configured catalog/schema.
n_clients = 150

# Single reference day so every sampled age is measured against the same date.
today = datetime.now().date()

clients = []
for i in range(1, n_clients + 1):
    first_name = random.choice(first_names)
    last_name = random.choice(last_names)

    # age_range is expressed in years, but the `date_birth` column is a
    # DateType. Handing Spark the raw integer age fails schema verification in
    # createDataFrame, so turn the age into an (approximate) birth date:
    # whole years plus a random offset inside the following year.
    age_years = random.randint(*age_range)
    date_birth = today - timedelta(
        days=int(age_years * 365.25) + random.randint(0, 364)
    )

    clients.append((
        i,
        first_name,
        last_name,
        date_birth,  # datetime.date, as required by DateType
        random.choice(countries)
    ))

clients_schema = StructType([
    StructField("client_id", IntegerType(), False),
    StructField("first_name", StringType(), False),
    StructField("last_name", StringType(), False),
    StructField("date_birth", DateType(), False),
    StructField("residency_country", StringType(), False)
])

clients_df = spark.createDataFrame(clients, schema=clients_schema)

# Save as Delta table in Unity Catalog
clients_df.write.mode("overwrite").format("delta").saveAsTable(f"{catalog}.{schema}.clients")

import random
from datetime import datetime, timedelta
import decimal
from pyspark.sql.types import StructType, StructField, IntegerType, DecimalType, DateType

# --- Generate transactions ---
def _random_transaction(transaction_id):
    """Return one (transaction_id, client_id, amount, date) row for a randomly chosen client."""
    owner = random.choice(clients)
    raw_amount = random.uniform(10.0, 50000.0)
    # Format to two decimal places before building the Decimal so the value
    # matches the scale of the DecimalType(15,2) column.
    rounded_amount = decimal.Decimal(f"{raw_amount:.2f}")
    age_in_days = random.randint(0, 365)
    # Keep a date object (not a string) for the DateType column.
    booked_on = (datetime.now() - timedelta(days=age_in_days)).date()
    return (
        transaction_id,
        owner[0],  # client_id is the first element of a client row
        rounded_amount,
        booked_on,
    )


n_transactions = 500
transactions = [
    _random_transaction(transaction_id)
    for transaction_id in range(1, n_transactions + 1)
]

# Every column is declared non-nullable.
transactions_schema = StructType([
    StructField(column_name, column_type, False)
    for column_name, column_type in (
        ("transaction_id", IntegerType()),
        ("client_id", IntegerType()),
        ("transaction_amount", DecimalType(15,2)),
        ("transaction_date", DateType()),
    )
])

transactions_df = spark.createDataFrame(transactions, schema=transactions_schema)

# Overwrite the Delta table in the configured catalog/schema.
(
    transactions_df.write
    .mode("overwrite")
    .format("delta")
    .saveAsTable(f"{catalog}.{schema}.transactions")
)

# --- Generate high risk countries ---
# Draw a random subset of distinct countries and store it as a one-column table.
n_high_risk_countries = 10
high_risk_sample = random.sample(countries, n_high_risk_countries)
# zip over a single list yields one 1-tuple per country, i.e. one row each.
high_risk = list(zip(high_risk_sample))

high_risk_schema = StructType([StructField("high_risk_country", StringType(), False)])

high_risk_df = spark.createDataFrame(high_risk, schema=high_risk_schema)

# Overwrite the Delta table in the configured catalog/schema.
(
    high_risk_df.write
    .mode("overwrite")
    .format("delta")
    .saveAsTable(f"{catalog}.{schema}.high_risk_countries")
)

print("Delta tables successfully created in Unity Catalog.")


In [0]:
# Show which catalog/schema is active, then inspect the high-risk country
# DataFrame: first its rows, then its schema object.
active_target = f"Using catalog: {catalog}, schema: {schema}"
print(active_target)

display(high_risk_df)

high_risk_struct = high_risk_df.schema
print(high_risk_struct)