In [38]:
import os
from pathlib import Path

import pandas as pd
import psycopg2
from psycopg2 import extras

In [39]:
# Location of the gold-layer DDL script, relative to the notebook's working directory.
DDL_GOLD_PATH = Path("scripts/ddl_gold.sql")

# Connection settings are read from the environment so that real credentials
# never have to be saved inside the notebook. The fallback values are the
# local development settings this notebook was originally written with, so
# running it without any DWH_DB_* variable set behaves exactly as before.
conn_params = {
    "host": os.environ.get("DWH_DB_HOST", "localhost"),
    # The port is kept as an int, as in the original configuration.
    "port": int(os.environ.get("DWH_DB_PORT", "5433")),
    "dbname": os.environ.get("DWH_DB_NAME", "transactions"),
    "user": os.environ.get("DWH_DB_USER", "admin"),
    "password": os.environ.get("DWH_DB_PASSWORD", "admin"),
}

In [40]:
def extract_table(conn_params: dict, batch_size=100_000) -> pd.DataFrame:
    """Read the whole silver table into one DataFrame, fetching it in batches.

    Args:
        conn_params: Keyword arguments forwarded to ``psycopg2.connect``.
        batch_size: Number of rows requested from the server per fetch.

    Returns:
        A DataFrame with every row of ``silver.transactions_cards_users_mcc``.
        When the table is empty the frame has zero rows but still carries the
        table's column names.

    Raises:
        Any exception raised by psycopg2 while connecting or querying. The
        connection is closed before the exception propagates.
    """
    print("EXtraindo tabela da camada silver")
    query = "SELECT * FROM silver.transactions_cards_users_mcc;"

    chunks = []
    columns = None
    rows = 0

    conn = psycopg2.connect(**conn_params)
    try:
        # A *named* cursor is a server-side cursor: rows are shipped to the
        # client only as fetchmany() asks for them. The default client-side
        # cursor downloads the full result set during execute(), and setting
        # `itersize` on it has no effect.
        with conn.cursor(name="silver_extract") as cursor:
            cursor.execute(query)

            while True:
                batch = cursor.fetchmany(batch_size)

                # For a named cursor the column metadata is only available
                # once the first fetch has been issued.
                if columns is None and cursor.description is not None:
                    columns = [desc[0] for desc in cursor.description]

                if not batch:
                    break

                rows += len(batch)
                chunk_df = pd.DataFrame(batch, columns=columns)
                chunks.append(chunk_df)
                print(f"Lote processado ---> Linhas {len(chunk_df)} de {rows}")
    finally:
        # Always release the connection, including when a fetch fails midway.
        conn.close()

    if chunks:
        df = pd.concat(chunks, ignore_index=True)
    else:
        # pd.concat([]) raises ValueError; an empty table yields an empty frame.
        df = pd.DataFrame(columns=columns)

    print(f"Extração concluída. Linhas {len(df)}")
    return df

In [41]:
def _build_dimension(df: pd.DataFrame, column_map: dict) -> pd.DataFrame:
    """Select the keys of ``column_map`` from ``df``, drop duplicate rows and rename to the mapped names."""
    return (
        df[list(column_map)]
        .drop_duplicates()
        .reset_index(drop=True)
        .rename(columns=column_map)
    )


def transform_dimensions(df: pd.DataFrame):
    """Derive the four gold-layer dimension tables from the silver frame.

    Each dimension is the set of distinct rows over a fixed list of silver
    columns, renamed to the abbreviated gold column names. The input frame is
    not modified.

    Args:
        df: Silver-layer frame. It must contain the client, card, merchant
            and ``date`` columns referenced below.

    Returns:
        A tuple ``(dim_clt, dim_tim, dim_crd, dim_mer)``. Note that this is
        client, time, cards, merchant, which is not the order in which the
        dimensions are built.

    Raises:
        KeyError: If any of the expected silver columns is missing.
    """
    print("[GOLD] Criando dimensões para a camada Gold...")

    print("[GOLD] Criando dimensão client (dim_client)")
    dim_clt = _build_dimension(df, {
        "client_id": "clt_ide",
        "gender": "gdr",
        "current_age": "crt_age",
        "per_capita_income": "per_cpt_icm",
        "yearly_income": "yrl_icm",
        "total_debt": "ttl_dbt",
        "credit_score": "cdt_scr",
        "num_credit_cards": "num_cdt_crd",
    })
    print(f"[GOLD] Dimensão cliente criada com sucesso. Registros: {len(dim_clt)}")

    print("[GOLD] Criando dimensão cards (dim_cards)")
    dim_crd = _build_dimension(df, {
        "card_id": "crd_ide",
        "card_brand": "crd_brd",
        "card_type": "crd_tpe",
        "has_chip": "has_chp",
        "credit_limit": "cdt_lmt",
        "acct_open_date": "act_opn_dat",
        "num_cards_issued": "num_crd_isd",
        "year_pin_last_changed": "yrr_pin_lst_cgd",
    })
    print(f"[GOLD] Dimensão cards criada com sucesso. Registros: {len(dim_crd)}")

    print("[GOLD] Criando dimensão time (dim_time)")
    # Coerce to datetime first: when the column arrives as object dtype
    # (e.g. plain datetime.date values) the `.dt` accessor is unavailable.
    # normalize() truncates to midnight so the dimension has exactly one row
    # per calendar day, matching its year/quarter/month/day attributes.
    days = pd.to_datetime(df["date"], errors="coerce").dt.normalize()
    dim_tim = (
        days.dropna()  # a missing date cannot be described by a time row
        .drop_duplicates()
        .reset_index(drop=True)
        .to_frame(name="dat")
    )
    dim_tim["yrr"] = dim_tim["dat"].dt.year
    dim_tim["qtr"] = dim_tim["dat"].dt.quarter
    dim_tim["mth"] = dim_tim["dat"].dt.month
    dim_tim["day"] = dim_tim["dat"].dt.day
    print(f"[GOLD] Dimensão time criada com sucesso. Registros: {len(dim_tim)}")

    print("[GOLD] Criando dimensão merchant (dim_merchant)")
    dim_mer = _build_dimension(df, {
        "merchant_id": "mer_ide",
        "merchant_city": "mer_cit",
        "merchant_state": "mer_stt",
        "zip": "zip",
        "mcc": "mcc",
        "mcc_description": "mcc_dcp",
    })
    print(f"[GOLD] Dimensão merchant criada com sucesso. Registros: {len(dim_mer)}")

    print("[GOLD] Todas as dimensões foram criadas com sucesso.")
    return dim_clt, dim_tim, dim_crd, dim_mer

In [42]:
def _to_sql_rows(frame: pd.DataFrame) -> list:
    """Convert a DataFrame slice into plain tuples, mapping missing-like values to None (SQL NULL)."""
    null_markers = ("", "NaT", "nan", "None")
    # itertuples() yields native Python scalars column by column, whereas
    # DataFrame.values returns numpy scalars (e.g. numpy.int64) for
    # all-numeric frames, a type psycopg2 has no default adapter for.
    return [
        tuple(
            None if (pd.isna(value) or str(value).strip() in null_markers) else value
            for value in row
        )
        for row in frame.itertuples(index=False, name=None)
    ]


def _insert_in_batches(cur, conn, table: str, frame: pd.DataFrame, batch_size: int, log_prefix: str) -> int:
    """Insert ``frame`` into ``table`` in slices of ``batch_size`` rows, committing after each slice; returns the row count."""
    cols = ", ".join(frame.columns)
    insert = f"INSERT INTO {table} ({cols}) VALUES %s"

    total_rows = len(frame)
    for start in range(0, total_rows, batch_size):
        end = min(start + batch_size, total_rows)
        chunk = frame.iloc[start:end]
        extras.execute_values(cur, insert, _to_sql_rows(chunk), page_size=batch_size)
        conn.commit()
        print(f"{log_prefix} -> Batch {start:,}–{end:,} inserido ({len(chunk)} registros)")
    return total_rows


def load_to_dwh(
    df_fat: pd.DataFrame,
    dim_clt: pd.DataFrame,
    dim_tim: pd.DataFrame,
    dim_crd: pd.DataFrame,
    dim_mer: pd.DataFrame,
    conn_params: dict,
    ddl_gold_path: Path,
    batch_size: int = 10_000
):
    """Load the gold layer: run the DDL, insert the dimensions, then the fact table.

    Steps, in order:
      1. Execute the DDL script found at ``ddl_gold_path``.
      2. Insert the four dimension frames into ``dwh.dim_clt``, ``dwh.dim_tim``,
         ``dwh.dim_crd`` and ``dwh.dim_mer``.
      3. Read each dimension's natural-key -> surrogate-key pairs back from the
         database and use them to resolve ``srk_*`` columns on the fact rows.
      4. Insert the fact rows into ``dwh.fat_trn``.

    Every batch is committed on its own, so a failure part-way through leaves
    the batches already committed in place.

    NOTE(review): rows are appended with plain INSERTs. Unless the DDL script
    resets the tables, calling this twice loads everything twice. Confirm
    against the DDL script.

    Args:
        df_fat: Silver frame providing ``amount``, ``use_chip``, ``errors``,
            ``client_id``, ``date``, ``card_id`` and ``merchant_id``.
        dim_clt, dim_tim, dim_crd, dim_mer: Dimension frames whose column
            names match the target table columns.
        conn_params: Keyword arguments forwarded to ``psycopg2.connect``.
        ddl_gold_path: Path (``Path`` or ``str``) of the gold-layer DDL script.
        batch_size: Rows per INSERT batch; must be positive.

    Raises:
        ValueError: If ``batch_size`` is not positive.
        OSError: If the DDL script cannot be read.
        Any psycopg2 exception raised while connecting or executing SQL. The
        connection is closed before it propagates.
    """
    # A negative step makes range() empty, which would silently load nothing.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    print(f"[GOLD] Lendo DDL da camada gold: {ddl_gold_path}")
    ddl = Path(ddl_gold_path).read_text(encoding="utf-8")

    conn = psycopg2.connect(**conn_params)
    try:
        # `with conn` only commits or rolls back the open transaction; it does
        # NOT close the connection, hence the surrounding try/finally.
        with conn:
            with conn.cursor() as cur:
                cur.execute(ddl)
                conn.commit()
                print("[GOLD] Tabelas da camada gold criadas (ou já existentes).")

                dim_tables = {
                    "dwh.dim_clt": dim_clt,
                    "dwh.dim_tim": dim_tim,
                    "dwh.dim_crd": dim_crd,
                    "dwh.dim_mer": dim_mer,
                }

                for table, dim_df in dim_tables.items():
                    print(f"\n[GOLD] Carregando {table}...")
                    total_rows = _insert_in_batches(cur, conn, table, dim_df, batch_size, log_prefix="")
                    print(f"[GOLD] {table} carregada ({total_rows:,} registros).")

                print("\nConstruindo tabela fato...")

                fact = df_fat[[
                    "amount", "use_chip", "errors",
                    "client_id", "date", "card_id", "merchant_id"
                ]].copy()

                # The lookup keys for the time dimension are compared as
                # datetime.date values, so reduce the fact dates to dates too.
                fact["date"] = pd.to_datetime(fact["date"], errors="coerce").dt.date

                # (surrogate-key column, fact column, dimension table, natural-key column)
                lookups = (
                    ("srk_clt", "client_id", "dwh.dim_clt", "clt_ide"),
                    ("srk_tim", "date", "dwh.dim_tim", "dat"),
                    ("srk_crd", "card_id", "dwh.dim_crd", "crd_ide"),
                    ("srk_mer", "merchant_id", "dwh.dim_mer", "mer_ide"),
                )
                for sk_col, fact_col, table, id_col in lookups:
                    cur.execute(f"SELECT {id_col}, {sk_col} FROM {table}")
                    fact[sk_col] = fact[fact_col].map(dict(cur.fetchall()))

                    # Keys with no match are loaded as NULL; report them
                    # instead of letting them pass unnoticed.
                    unmatched = int(fact[sk_col].isna().sum())
                    if unmatched:
                        print(
                            f"[GOLD] AVISO: {unmatched:,} registros da fato sem "
                            f"correspondência em {table} ({sk_col} será NULL)."
                        )

                fact_final = fact.rename(columns={
                    "amount": "amt",
                    "use_chip": "use_chp",
                    "errors": "err"
                })[[
                    "amt", "use_chp", "err",
                    "srk_clt", "srk_tim", "srk_crd", "srk_mer"
                ]]

                print("\n[GOLD] Carregando tabela fato (em batches)...")
                total_rows = _insert_in_batches(
                    cur, conn, "dwh.fat_trn", fact_final, batch_size, log_prefix="[GOLD]"
                )
                print(f"[GOLD] Tabela fato carregada com sucesso ({total_rows:,} registros).")
    finally:
        conn.close()

    print("\n[GOLD] Carga da camada gold concluída com sucesso!")

In [43]:
# Load the whole silver table into memory; extract_table prints one progress line per fetched batch.
df_silver = extract_table(conn_params)

EXtraindo tabela da camada silver
Lote processado ---> Linhas 100000 de 100000
Lote processado ---> Linhas 100000 de 200000
Lote processado ---> Linhas 100000 de 300000
Lote processado ---> Linhas 100000 de 400000
Lote processado ---> Linhas 100000 de 500000
Lote processado ---> Linhas 100000 de 600000
Lote processado ---> Linhas 100000 de 700000
Lote processado ---> Linhas 100000 de 800000
Lote processado ---> Linhas 100000 de 900000
Lote processado ---> Linhas 100000 de 1000000
Lote processado ---> Linhas 100000 de 1100000
Lote processado ---> Linhas 100000 de 1200000
Lote processado ---> Linhas 100000 de 1300000
Lote processado ---> Linhas 100000 de 1400000
Lote processado ---> Linhas 100000 de 1500000
Lote processado ---> Linhas 100000 de 1600000
Lote processado ---> Linhas 100000 de 1700000
Lote processado ---> Linhas 100000 de 1800000
Lote processado ---> Linhas 100000 de 1900000
Lote processado ---> Linhas 100000 de 2000000
Lote processado ---> Linhas 45955 de 2045955
Extração c

In [44]:
# Build the four gold dimensions from the silver frame.
# Return order is client, time, cards, merchant.
dim_clt, dim_tim, dim_crd, dim_mer = transform_dimensions(df_silver)

[GOLD] Criando dimensões para a camada Gold...
[GOLD] Criando dimensão client (dim_client)
[GOLD] Dimensão cliente criada com sucesso. Registros: 1212
[GOLD] Criando dimensão cards (dim_cards)
[GOLD] Dimensão cards criada com sucesso. Registros: 3605
[GOLD] Criando dimensão time (dim_time)
[GOLD] Dimensão time criada com sucesso. Registros: 739
[GOLD] Criando dimensão merchant (dim_merchant)
[GOLD] Dimensão merchant criada com sucesso. Registros: 83302
[GOLD] Todas as dimensões foram criadas com sucesso.


In [48]:
# Run the gold-layer load. Arguments are passed by keyword so the four
# dimension frames cannot be swapped by accident.
load_to_dwh(
    df_fat=df_silver,
    dim_clt=dim_clt,
    dim_tim=dim_tim,
    dim_crd=dim_crd,
    dim_mer=dim_mer,
    conn_params=conn_params,
    ddl_gold_path=DDL_GOLD_PATH,
    batch_size=25000,
)

[GOLD] Lendo DDL da camada gold: scripts/ddl_gold.sql
[GOLD] Tabelas da camada gold criadas (ou já existentes).

[GOLD] Carregando dwh.dim_clt...
 -> Batch 0–1,212 inserido (1212 registros)
[GOLD] dwh.dim_clt carregada (1,212 registros).

[GOLD] Carregando dwh.dim_tim...
 -> Batch 0–739 inserido (739 registros)
[GOLD] dwh.dim_tim carregada (739 registros).

[GOLD] Carregando dwh.dim_crd...
 -> Batch 0–3,605 inserido (3605 registros)
[GOLD] dwh.dim_crd carregada (3,605 registros).

[GOLD] Carregando dwh.dim_mer...
 -> Batch 0–25,000 inserido (25000 registros)
 -> Batch 25,000–50,000 inserido (25000 registros)
 -> Batch 50,000–75,000 inserido (25000 registros)
 -> Batch 75,000–83,302 inserido (8302 registros)
[GOLD] dwh.dim_mer carregada (83,302 registros).

Construindo tabela fato...

[GOLD] Carregando tabela fato (em batches)...
[GOLD] -> Batch 0–25,000 inserido (25000 registros)
[GOLD] -> Batch 25,000–50,000 inserido (25000 registros)
[GOLD] -> Batch 50,000–75,000 inserido (25000 regi