In [2]:
import os
from pathlib import Path

import pandas as pd 
import psycopg2 
from psycopg2 import extras

In [3]:
# Location of the SQL script that is executed before the gold-layer load.
DDL_GOLD_PATH = Path("scripts/ddl_gold.sql")

# Keyword arguments forwarded to psycopg2.connect().
# Every value can be overridden through an environment variable so that real
# credentials never have to be committed with the notebook; the fallbacks are
# the values that were previously hard-coded here.
conn_params = {
    "host": os.environ.get("TRANSACTIONS_DB_HOST", "localhost"),
    # Environment values are strings, so normalise the port to int in both cases.
    "port": int(os.environ.get("TRANSACTIONS_DB_PORT", "5433")),
    "dbname": os.environ.get("TRANSACTIONS_DB_NAME", "transactions"),
    "user": os.environ.get("TRANSACTIONS_DB_USER", "admin"),
    "password": os.environ.get("TRANSACTIONS_DB_PASSWORD", "admin"),
}

In [4]:
def extract_table(conn_params: dict, batch_size=100_000) -> pd.DataFrame:
    """Read public.transactions_cards_users_mcc_fraud into a single DataFrame.

    Rows are pulled from the server in batches of ``batch_size`` and a progress
    line is printed for each batch.

    Args:
        conn_params: Keyword arguments passed to ``psycopg2.connect``.
        batch_size: Number of rows requested per ``fetchmany`` call.

    Returns:
        A DataFrame with one column per result column. When the table has no
        rows an empty DataFrame is returned instead of raising.
    """
    print("EXtraindo tabela da camada silver")
    query = "SELECT * FROM public.transactions_cards_users_mcc_fraud;"

    chunks = []
    columns = None
    rows = 0

    conn = psycopg2.connect(**conn_params)
    try:
        # A named cursor is a server-side cursor: each fetchmany() transfers only
        # one batch. An unnamed cursor downloads the whole result set during
        # execute(), and `itersize` is ignored for it.
        cursor = conn.cursor(name="silver_extract")
        cursor.execute(query)

        while True:
            batch = cursor.fetchmany(batch_size)

            # Column metadata is read after the first fetch because a named
            # cursor may not expose `description` before any row was requested.
            if columns is None and cursor.description is not None:
                columns = [desc[0] for desc in cursor.description]

            if not batch:
                break
            rows += len(batch)

            chunk_df = pd.DataFrame(batch, columns=columns)
            chunks.append(chunk_df)
            print(f"Lote processado ---> Linhas {len(chunk_df)} de {rows}")
    finally:
        # Closing the connection also discards the server-side cursor, and it
        # runs even when the query or a fetch fails.
        conn.close()

    if chunks:
        df = pd.concat(chunks, ignore_index=True)
    else:
        # pd.concat([]) raises ValueError, so build the empty frame explicitly.
        df = pd.DataFrame(columns=columns or [])

    print(f"Extração concluída. Linhas {len(df)}")
    return df 

In [5]:
def transform_dimensions(df: pd.DataFrame): 
    """Derive the four dimension tables from the flat silver DataFrame.

    Each dimension is the set of distinct rows over a fixed list of columns,
    re-indexed from zero. The time dimension additionally gets year, quarter,
    month and day columns computed from ``date`` through the ``.dt`` accessor.

    Args:
        df: Flat DataFrame containing every column listed below.

    Returns:
        The tuple ``(dim_client, dim_time, dim_cards, dim_merchant)``.
    """
    source = df.copy(deep=False)

    def distinct_rows(column_names):
        # Project, de-duplicate, then renumber the rows from zero.
        projected = source[column_names]
        unique_rows = projected.drop_duplicates()
        return unique_rows.reset_index(drop=True)

    print("Criando dimensões...")

    # --- client ---
    print("Criando dimensão client (dim_client)")
    client_columns = [
        "client_id", "gender", "current_age", "birth_year", "birth_month",
        "per_capita_income", "yearly_income", "total_debt", "credit_score",
    ]
    dim_client = distinct_rows(client_columns)
    print(f"Dimensão cliente criada com sucesso. Registros: {len(dim_client)}")

    # --- cards ---
    print("Criando dimensão cards (dim_cards)")
    card_columns = [
        "card_id", "card_brand", "card_type", "has_chip", "credit_limit",
        "acct_open_date", "card_on_dark_web", "num_cards_issued",
    ]
    dim_cards = distinct_rows(card_columns)
    print(f"Dimensão cards criada com sucesso. Registros: {len(dim_cards)}")

    # --- time ---
    print("Criando dimensão time (dim_time)")
    dim_time = distinct_rows(["date"])
    date_accessor = dim_time["date"].dt
    for part in ("year", "quarter", "month", "day"):
        dim_time[part] = getattr(date_accessor, part)
    print(f"Dimensão time criada com sucesso. Registros: {len(dim_time)}")

    # --- merchant ---
    print("Criando dimensão merchant (dim_merchant)")
    merchant_columns = [
        "merchant_id", "merchant_city", "merchant_state", "zip", "mcc", "mcc_description",
    ]
    dim_merchant = distinct_rows(merchant_columns)
    print(f"Dimensão merchant criada com sucesso. Registros: {len(dim_merchant)}")

    print("Todas as dimensões foram criadas com sucesso.")
    return dim_client, dim_time, dim_cards, dim_merchant

In [7]:
def _frame_to_records(frame):
    """Turn a DataFrame into a list of plain tuples suitable for execute_values."""
    # Casting to object boxes NumPy scalars into Python/pandas objects, and
    # where() replaces NaN/NaT/NA with None so they are sent as SQL NULL.
    boxed = frame.astype(object).where(frame.notna(), None)
    return list(boxed.itertuples(index=False, name=None))


def _dimension_keys(df_fact, dimension, natural_key, fk_name):
    """Return, per row of df_fact, the index label of its matching dimension row."""
    if natural_key not in df_fact.columns or natural_key not in dimension.columns:
        raise KeyError(natural_key)

    # Join on every column the dimension shares with the fact source, not only
    # the id: an id that repeats with different attributes in the dimension
    # would otherwise duplicate fact rows.
    shared = [col for col in dimension.columns if col in df_fact.columns]
    lookup = dimension[shared].copy()
    lookup[fk_name] = dimension.index.to_numpy()

    # validate= makes pandas raise MergeError instead of silently fanning out
    # when the dimension is not unique on the join columns.
    matched = df_fact[shared].merge(lookup, on=shared, how="left", validate="many_to_one")
    return matched[fk_name].to_numpy()


def _bulk_insert(cur, table, frame, page_size):
    """Insert every row of frame into table using psycopg2's execute_values."""
    cols = ", ".join(frame.columns)
    insert = f"INSERT INTO {table} ({cols}) VALUES %s"
    extras.execute_values(cur, insert, _frame_to_records(frame), page_size=page_size)


def load_to_dw(df_fact, dim_client, dim_time, dim_cards, dim_merchant, conn_params, ddl_gold_path: Path):
    """Run the gold-layer DDL, then load the four dimensions and the fact table.

    The DDL script is executed and committed first. Each dimension is inserted
    and committed in turn (dw.dim_client, dw.dim_time, dw.dim_cards,
    dw.dim_merchant), followed by dw.fact_transactions.

    Foreign keys in the fact table are the index labels of the matching
    dimension rows. Rows are matched on all columns a dimension shares with
    ``df_fact``; this assumes the dimensions were derived from ``df_fact``.

    NOTE(review): the dimension inserts do not send those index labels, so the
    fact foreign keys only line up with the database if the DDL produces the
    same 0-based keys. The DDL is not visible here; confirm.

    Args:
        df_fact: Flat DataFrame with the measures (amount, use_chip, errors,
            is_fraud) and the columns used to match the dimensions.
        dim_client, dim_time, dim_cards, dim_merchant: Dimension DataFrames.
        conn_params: Keyword arguments passed to ``psycopg2.connect``.
        ddl_gold_path: Path of the SQL script executed before loading.

    Raises:
        KeyError: If client_id, date, card_id or merchant_id is missing.
        pandas.errors.MergeError: If a dimension is not unique on the columns
            it shares with ``df_fact``.
    """
    # Resolve the fact rows before touching the database so that a mapping
    # problem is raised before any dimension has been inserted.
    fact_final = df_fact[["amount", "use_chip", "errors", "is_fraud"]].copy()
    key_specs = (
        ("fk_client", dim_client, "client_id"),
        ("fk_time", dim_time, "date"),
        ("fk_card", dim_cards, "card_id"),
        ("fk_merchant", dim_merchant, "merchant_id"),
    )
    for fk_name, dimension, natural_key in key_specs:
        fact_final[fk_name] = _dimension_keys(df_fact, dimension, natural_key, fk_name)

    conn = psycopg2.connect(**conn_params)
    try:
        with conn.cursor() as cur:
            print(f"Lendo o DDL da camada gold: {ddl_gold_path}")
            ddl = Path(ddl_gold_path).read_text()
            cur.execute(ddl)
            conn.commit()
            print("Tabelas da camada gold criadas (ou já existentes).")

            print("Carregando dimensões...")

            tables = {
                "dw.dim_client": dim_client,
                "dw.dim_time": dim_time,
                "dw.dim_cards": dim_cards,
                "dw.dim_merchant": dim_merchant
            }

            for table, dimension in tables.items():
                _bulk_insert(cur, table, dimension, page_size=20000)
                conn.commit()
                print(f" -> {table} carregada com {len(dimension)} registros.")

            print("Criando e carregando a tabela fato...")
            _bulk_insert(cur, "dw.fact_transactions", fact_final, page_size=50000)
            conn.commit()

        print("Carga da camada gold concluída com sucesso!")
    finally:
        # Always release the connection; closing it discards any uncommitted work.
        conn.close()


In [None]:
# Extract the silver table, derive the dimensions and load everything into the gold layer.
df_silver = extract_table(conn_params)

# Keep the individual names available for later inspection in the notebook.
dimension_frames = transform_dimensions(df_silver)
dim_client, dim_time, dim_cards, dim_merchant = dimension_frames

# transform_dimensions returns the frames in the order load_to_dw expects them.
load_to_dw(df_silver, *dimension_frames, conn_params, DDL_GOLD_PATH)

EXtraindo tabela da camada silver
Lote processado ---> Linhas 100000 de 100000
Lote processado ---> Linhas 100000 de 200000
Lote processado ---> Linhas 100000 de 300000
Lote processado ---> Linhas 100000 de 400000
Lote processado ---> Linhas 100000 de 500000
Lote processado ---> Linhas 100000 de 600000
Lote processado ---> Linhas 100000 de 700000
Lote processado ---> Linhas 100000 de 800000
Lote processado ---> Linhas 100000 de 900000
Lote processado ---> Linhas 100000 de 1000000
Lote processado ---> Linhas 100000 de 1100000
Lote processado ---> Linhas 100000 de 1200000
Lote processado ---> Linhas 100000 de 1300000
Lote processado ---> Linhas 100000 de 1400000
Lote processado ---> Linhas 100000 de 1500000
Lote processado ---> Linhas 100000 de 1600000
Lote processado ---> Linhas 100000 de 1700000
Lote processado ---> Linhas 100000 de 1800000
Lote processado ---> Linhas 100000 de 1900000
Lote processado ---> Linhas 100000 de 2000000
Lote processado ---> Linhas 100000 de 2100000
Lote proc