In [1]:
import os
from pathlib import Path

import pandas as pd
import psycopg2
from psycopg2 import extras

In [2]:
# Gold-layer DDL script, resolved relative to the notebook's working directory.
DDL_GOLD_PATH = Path("scripts/ddl_gold.sql")

# PostgreSQL connection settings passed verbatim to psycopg2.connect(**conn_params).
# Each value can be overridden through an environment variable so real credentials
# never have to be committed with the notebook; the fallbacks are the local
# development values this notebook has always used.
conn_params = {
    "host": os.environ.get("DW_DB_HOST", "localhost"),
    "port": int(os.environ.get("DW_DB_PORT", "5433")),
    "dbname": os.environ.get("DW_DB_NAME", "transactions"),
    "user": os.environ.get("DW_DB_USER", "admin"),
    "password": os.environ.get("DW_DB_PASSWORD", "admin"),
}

In [None]:
def extract_table(conn_params: dict, batch_size=100_000) -> pd.DataFrame:
    """Read silver.transactions_cards_users_mcc_fraud into a single DataFrame.

    The table is streamed from PostgreSQL in batches of ``batch_size`` rows
    through a named (server-side) cursor, so the client never holds the raw
    result set and the DataFrame chunks at the same time.

    Parameters
    ----------
    conn_params : dict
        Keyword arguments forwarded to ``psycopg2.connect``.
    batch_size : int
        Number of rows fetched per round trip. Must be positive.

    Returns
    -------
    pd.DataFrame
        All rows of the table, with a fresh 0..n-1 index. An empty table
        yields an empty frame (with the table's column names when the driver
        reports them) instead of raising.

    Raises
    ------
    ValueError
        If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    print("EXtraindo tabela da camada silver")
    conn = psycopg2.connect(**conn_params)
    try:
        # A *named* cursor is server-side: rows are shipped only when fetched.
        # On an unnamed cursor `itersize` is ignored and execute() downloads
        # the entire result set up front.
        cursor = conn.cursor(name="silver_extract")
        try:
            cursor.itersize = batch_size

            query = "SELECT * FROM silver.transactions_cards_users_mcc_fraud;"
            cursor.execute(query)

            chunks = []
            columns = None
            rows = 0

            while True:
                batch = cursor.fetchmany(batch_size)
                # Server-side cursors only expose `description` after the
                # first fetch, so column names are resolved lazily here.
                if columns is None and cursor.description is not None:
                    columns = [desc[0] for desc in cursor.description]
                if not batch:
                    break
                rows += len(batch)

                chunk_df = pd.DataFrame(batch, columns=columns)
                chunks.append(chunk_df)
                print(f"Lote processado ---> Linhas {len(chunk_df)} de {rows}")
        finally:
            cursor.close()
    finally:
        # Always release the connection, including when a fetch fails midway.
        conn.close()

    if chunks:
        df = pd.concat(chunks, ignore_index=True)
    else:
        # pd.concat([]) raises ValueError; an empty table is a valid result.
        df = pd.DataFrame(columns=columns)

    print(f"Extração concluída. Linhas {len(df)}")
    return df

In [4]:
def transform_dimensions(df: pd.DataFrame):
    """Derive the four dimension frames from the denormalised silver frame.

    Every dimension is the set of distinct rows over a fixed list of columns,
    re-indexed from 0. The time dimension starts from the distinct ``date``
    values and gains ``year``, ``quarter``, ``month`` and ``day`` columns read
    through the ``.dt`` accessor, so ``date`` must be a datetime-like column.

    The input frame is not modified.

    Returns
    -------
    tuple
        ``(dim_client, dim_time, dim_cards, dim_merchant)``.
    """
    source = df.copy(deep=False)

    client_columns = [
        "client_id", "gender", "current_age", "birth_year", "birth_month",
        "per_capita_income", "yearly_income", "total_debt", "credit_score",
    ]
    card_columns = [
        "card_id", "card_brand", "card_type", "has_chip", "credit_limit",
        "acct_open_date", "card_on_dark_web", "num_cards_issued",
    ]
    merchant_columns = [
        "merchant_id", "merchant_city", "merchant_state", "zip", "mcc", "mcc_description",
    ]

    def distinct(columns):
        # Project the requested columns and keep one row per distinct combination.
        projected = source[columns]
        return projected.drop_duplicates().reset_index(drop=True)

    print("Criando dimensões...")

    print("Criando dimensão client (dim_client)")
    dim_client = distinct(client_columns)
    print(f"Dimensão cliente criada com sucesso. Registros: {len(dim_client)}")

    print("Criando dimensão cards (dim_cards)")
    dim_cards = distinct(card_columns)
    print(f"Dimensão cards criada com sucesso. Registros: {len(dim_cards)}")

    print("Criando dimensão time (dim_time)")
    dim_time = distinct(["date"])
    # Calendar attributes are appended in this order: year, quarter, month, day.
    for part in ("year", "quarter", "month", "day"):
        dim_time[part] = getattr(dim_time["date"].dt, part)
    print(f"Dimensão time criada com sucesso. Registros: {len(dim_time)}")

    print("Criando dimensão merchant (dim_merchant)")
    dim_merchant = distinct(merchant_columns)
    print(f"Dimensão merchant criada com sucesso. Registros: {len(dim_merchant)}")

    print("Todas as dimensões foram criadas com sucesso.")
    return dim_client, dim_time, dim_cards, dim_merchant

In [5]:
def load_to_dw(df_fact, dim_client, dim_time, dim_cards, dim_merchant, conn_params, ddl_gold_path: Path):
    """Create the gold-layer tables and load the dimensions and the fact table.

    Steps, each committed separately:
      1. execute the DDL script read from ``ddl_gold_path``;
      2. bulk-insert the four dimension frames into ``dw.dim_*``;
      3. build the fact rows from ``df_fact`` and bulk-insert them into
         ``dw.fact_transactions``.

    Parameters
    ----------
    df_fact : pd.DataFrame
        Source rows; must contain amount, use_chip, errors, is_fraud,
        client_id, date, card_id and merchant_id.
    dim_client, dim_time, dim_cards, dim_merchant : pd.DataFrame
        Dimension frames; their column names are used as the INSERT column list.
    conn_params : dict
        Keyword arguments forwarded to ``psycopg2.connect``.
    ddl_gold_path : Path
        SQL script executed before loading.

    Any exception rolls back the open transaction and is re-raised; the
    cursor and connection are closed in every case. Batches committed before
    the failure stay committed.
    """
    conn = psycopg2.connect(**conn_params)
    try:
        cur = conn.cursor()
        try:
            print(f"Lendo o DDL da camada gold: {ddl_gold_path}")
            ddl = ddl_gold_path.read_text()
            cur.execute(ddl)
            conn.commit()
            print("Tabelas da camada gold criadas (ou já existentes).")

            print("Carregando dimensões...")

            tables = {
                "dw.dim_client": dim_client,
                "dw.dim_time": dim_time,
                "dw.dim_cards": dim_cards,
                "dw.dim_merchant": dim_merchant
            }

            for table, df in tables.items():
                cols = ", ".join(df.columns)
                values = [tuple(x) for x in df.to_numpy()]
                insert = f"INSERT INTO {table} ({cols}) VALUES %s"
                extras.execute_values(cur, insert, values, page_size=20000)
                conn.commit()
                print(f" -> {table} carregada com {len(df)} registros.")

            print("Criando e carregando a tabela fato...")
            fact = df_fact[[
                "amount", "use_chip", "errors", "is_fraud",
                "client_id", "date", "card_id", "merchant_id"
            ]].copy()

            # NOTE(review): the foreign keys are the dimension frames' positional
            # index (0-based). That only matches the database if its surrogate
            # keys are assigned in the same order starting at 0 — confirm
            # against the DDL, or look the keys up from the loaded tables.
            lookups = (
                (dim_client, "client_id", "fk_client"),
                (dim_time, "date", "fk_time"),
                (dim_cards, "card_id", "fk_card"),
                (dim_merchant, "merchant_id", "fk_merchant"),
            )
            for dim, key, fk in lookups:
                # Only the key and its index are needed; merging the full
                # dimension would copy every attribute onto each fact row.
                key_index = dim[[key]].reset_index().rename(columns={"index": fk})
                fact = fact.merge(key_index, on=key, how="left")

            fact_final = fact[[
                "amount", "use_chip", "errors", "is_fraud",
                "fk_client", "fk_time", "fk_card", "fk_merchant"
            ]]

            cols = ", ".join(fact_final.columns)
            values = [tuple(x) for x in fact_final.to_numpy()]
            insert_fact = f"INSERT INTO dw.fact_transactions ({cols}) VALUES %s"
            extras.execute_values(cur, insert_fact, values, page_size=50000)
            conn.commit()

            print("Carga da camada gold concluída com sucesso!")
        finally:
            cur.close()
    except Exception:
        # Do not leave a half-finished transaction open on the server.
        conn.rollback()
        raise
    finally:
        conn.close()


In [6]:
# Pull the whole silver-layer table into memory; extract_table prints one progress line per batch.
df_silver = extract_table(conn_params)

EXtraindo tabela da camada silver
Lote processado ---> Linhas 100000 de 100000
Lote processado ---> Linhas 100000 de 200000
Lote processado ---> Linhas 100000 de 300000
Lote processado ---> Linhas 100000 de 400000
Lote processado ---> Linhas 100000 de 500000
Lote processado ---> Linhas 100000 de 600000
Lote processado ---> Linhas 100000 de 700000
Lote processado ---> Linhas 100000 de 800000
Lote processado ---> Linhas 100000 de 900000
Lote processado ---> Linhas 100000 de 1000000
Lote processado ---> Linhas 100000 de 1100000
Lote processado ---> Linhas 100000 de 1200000
Lote processado ---> Linhas 100000 de 1300000
Lote processado ---> Linhas 100000 de 1400000
Lote processado ---> Linhas 100000 de 1500000
Lote processado ---> Linhas 100000 de 1600000
Lote processado ---> Linhas 100000 de 1700000
Lote processado ---> Linhas 100000 de 1800000
Lote processado ---> Linhas 100000 de 1900000
Lote processado ---> Linhas 100000 de 2000000
Lote processado ---> Linhas 100000 de 2100000
Lote proc

In [7]:
# Split the silver frame into the client, time, cards and merchant dimension frames.
dim_client, dim_time, dim_cards, dim_merchant = transform_dimensions(df_silver)

Criando dimensões...
Criando dimensão client (dim_client)
Dimensão cliente criada com sucesso. Registros: 1212
Criando dimensão cards (dim_cards)
Dimensão cards criada com sucesso. Registros: 3609
Criando dimensão time (dim_time)
Dimensão time criada com sucesso. Registros: 1171215
Criando dimensão merchant (dim_merchant)
Dimensão merchant criada com sucesso. Registros: 111973
Todas as dimensões foram criadas com sucesso.


In [17]:
# Run the gold-layer load with positional arguments.
# NOTE(review): the output recorded for this cell ends in
# "NumericValueOutOfRange: integer out of range" right after the dimension load starts.
load_to_dw(df_silver, dim_client, dim_time, dim_cards, dim_merchant, conn_params, DDL_GOLD_PATH)

Lendo o DDL da camada gold: scripts/ddl_gold.sql
Tabelas da camada gold criadas (ou já existentes).
Carregando dimensões...


NumericValueOutOfRange: integer out of range


In [None]:
def _to_db_rows(frame: pd.DataFrame) -> list:
    """Convert a frame to a list of row tuples, mapping missing values to None.

    A value counts as missing when ``pd.isna`` says so or when its stripped
    string form is one of "", "NaT", "nan", "None".
    """
    null_tokens = ("", "NaT", "nan", "None")
    return [
        tuple(
            None if (pd.isna(v) or str(v).strip() in null_tokens) else v
            for v in row
        )
        for row in frame.values
    ]


def _insert_in_batches(cur, conn, insert_sql: str, frame: pd.DataFrame, batch_size: int) -> int:
    """Insert ``frame`` with execute_values, committing after every batch.

    Returns the number of rows in ``frame``.
    """
    total_rows = len(frame)
    for start in range(0, total_rows, batch_size):
        end = min(start + batch_size, total_rows)
        chunk = frame.iloc[start:end]
        extras.execute_values(cur, insert_sql, _to_db_rows(chunk), page_size=batch_size)
        conn.commit()
        print(f" -> Batch {start:,}–{end:,} inserido ({len(chunk)} registros)")
    return total_rows


def _fetch_dim_map(cur, table: str, id_col: str, sk_col: str) -> dict:
    """Read ``{natural key: surrogate key}`` for one dimension table."""
    cur.execute(f"SELECT {id_col}, {sk_col} FROM {table}")
    return dict(cur.fetchall())


def load_to_dw(
    df_fact: pd.DataFrame,
    dim_client: pd.DataFrame,
    dim_time: pd.DataFrame,
    dim_cards: pd.DataFrame,
    dim_merchant: pd.DataFrame,
    conn_params: dict,
    ddl_gold_path: Path,
    batch_size: int = 10_000
):
    """
    Run the GOLD-layer load:
    - create the schema and tables by executing the DDL script;
    - insert the four dimensions;
    - build the fact table, resolving foreign keys from the surrogate keys
      stored in the database, and insert it;
    - everything goes through plain psycopg2, in batches of ``batch_size``
      rows, with a commit after each batch.

    ``dim_time`` is normalised to day granularity before loading (one row per
    calendar date, with year/quarter/month/day recomputed from it), and the
    fact ``date`` is truncated the same way so both sides match.

    Raises ValueError if ``batch_size`` is not positive. On any other error
    the open transaction is rolled back and the connection is closed; batches
    that were already committed remain in the database.

    NOTE(review): inserts are plain INSERTs, so running this twice against the
    same database presumably duplicates dimension rows unless the DDL has
    constraints that prevent it — confirm against ddl_gold.sql.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    print(f"Lendo DDL da camada gold: {ddl_gold_path}")
    ddl = ddl_gold_path.read_text()

    dim_time = dim_time.copy()
    dim_time["date"] = pd.to_datetime(dim_time["date"], errors="coerce").dt.date
    # One row per calendar date: the date -> sk_time lookup below needs unique dates.
    dim_time = (
        dim_time.dropna(subset=["date"])
        .drop_duplicates(subset=["date"])
        .reset_index(drop=True)
    )
    calendar = pd.to_datetime(dim_time["date"]).dt
    dim_time["year"] = calendar.year
    dim_time["quarter"] = calendar.quarter
    dim_time["month"] = calendar.month
    dim_time["day"] = calendar.day

    conn = psycopg2.connect(**conn_params)
    try:
        # `with conn` only scopes the transaction (commit / rollback on exit);
        # psycopg2 does not close the connection there, hence the outer finally.
        with conn:
            with conn.cursor() as cur:
                cur.execute(ddl)
                conn.commit()
                print("Tabelas da camada gold criadas (ou já existentes).")

                dim_tables = {
                    "dw.dim_client": dim_client,
                    "dw.dim_time": dim_time,
                    "dw.dim_cards": dim_cards,
                    "dw.dim_merchant": dim_merchant,
                }

                for table, df in dim_tables.items():
                    print(f"\nCarregando {table}...")

                    cols = ", ".join(df.columns)
                    insert = f"INSERT INTO {table} ({cols}) VALUES %s"

                    total_rows = _insert_in_batches(cur, conn, insert, df, batch_size)
                    print(f"✅ {table} carregada ({total_rows:,} registros).")

                print("\nConstruindo tabela fato...")

                fact = df_fact[[
                    "amount", "use_chip", "errors", "is_fraud",
                    "client_id", "date", "card_id", "merchant_id"
                ]].copy()
                fact["date"] = pd.to_datetime(fact["date"], errors="coerce").dt.date

                client_map = _fetch_dim_map(cur, "dw.dim_client", "client_id", "sk_client")
                time_map = _fetch_dim_map(cur, "dw.dim_time", "date", "sk_time")
                card_map = _fetch_dim_map(cur, "dw.dim_cards", "card_id", "sk_card")
                merchant_map = _fetch_dim_map(cur, "dw.dim_merchant", "merchant_id", "sk_merchant")

                fact["fk_client"] = fact["client_id"].map(client_map)
                fact["fk_time"] = fact["date"].map(time_map)
                fact["fk_card"] = fact["card_id"].map(card_map)
                fact["fk_merchant"] = fact["merchant_id"].map(merchant_map)

                fk_columns = ["fk_client", "fk_time", "fk_card", "fk_merchant"]
                fact_final = fact[["amount", "use_chip", "errors", "is_fraud"] + fk_columns]

                # Rows whose key was not found are loaded with a NULL foreign
                # key; report them so they do not go unnoticed.
                for fk_col, n_missing in fact_final[fk_columns].isna().sum().items():
                    if n_missing:
                        print(f"⚠️ {n_missing:,} registros sem correspondência em {fk_col}")

                print("\nCarregando tabela fato (em batches)...")

                cols = ", ".join(fact_final.columns)
                insert_fact = f"INSERT INTO dw.fact_transactions ({cols}) VALUES %s"

                total_rows = _insert_in_batches(cur, conn, insert_fact, fact_final, batch_size)
                print(f"✅ Tabela fato carregada ({total_rows:,} registros).")
    finally:
        conn.close()

    print("\n🚀 Carga da camada gold concluída com sucesso!")

In [32]:
# Run the gold-layer load using the batched load_to_dw defined in the previous cell,
# inserting 10,000 rows per batch.
load_to_dw(
    df_silver,
    dim_client,
    dim_time,
    dim_cards,
    dim_merchant,
    conn_params,
    DDL_GOLD_PATH,
    batch_size=10000
)


Lendo DDL da camada gold: scripts/ddl_gold.sql
✅ Tabelas da camada gold criadas (ou já existentes).

Carregando dw.dim_client...
 -> Batch 0–1,212 inserido (1212 registros)
✅ dw.dim_client carregada (1,212 registros).

Carregando dw.dim_time...
 -> Batch 0–1,034 inserido (1034 registros)
✅ dw.dim_time carregada (1,034 registros).

Carregando dw.dim_cards...
 -> Batch 0–3,609 inserido (3609 registros)
✅ dw.dim_cards carregada (3,609 registros).

Carregando dw.dim_merchant...
 -> Batch 0–10,000 inserido (10000 registros)
 -> Batch 10,000–20,000 inserido (10000 registros)
 -> Batch 20,000–30,000 inserido (10000 registros)
 -> Batch 30,000–40,000 inserido (10000 registros)
 -> Batch 40,000–50,000 inserido (10000 registros)
 -> Batch 50,000–60,000 inserido (10000 registros)
 -> Batch 60,000–70,000 inserido (10000 registros)
 -> Batch 70,000–80,000 inserido (10000 registros)
 -> Batch 80,000–90,000 inserido (10000 registros)
 -> Batch 90,000–100,000 inserido (10000 registros)
 -> Batch 100,0