# Step 1: Data Audit + Grain Lock

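Run this notebook after placing the raw Olist CSVs in `data/raw/`. The code reads them through the relative path `../data/raw`, so run it from the directory that contains this notebook.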


In [None]:
import pandas as pd
from pathlib import Path

# Raw CSV folder; the path is relative, so it resolves against the directory
# the notebook is run from.
DATA_PATH = Path("../data/raw")

# One DataFrame per raw table. The audit and grain checks below read from
# these names.
orders = pd.read_csv(DATA_PATH.joinpath("olist_orders_dataset.csv"))
items = pd.read_csv(DATA_PATH.joinpath("olist_order_items_dataset.csv"))
payments = pd.read_csv(DATA_PATH.joinpath("olist_order_payments_dataset.csv"))
customers = pd.read_csv(DATA_PATH.joinpath("olist_customers_dataset.csv"))
products = pd.read_csv(DATA_PATH.joinpath("olist_products_dataset.csv"))

def audit(df, name, pk=None):
    """Print a quick data-quality summary for one table.

    The report contains the table shape, the ten columns with the highest
    share of missing values (as percentages), and, when ``pk`` is given, the
    number of rows that repeat an earlier row's key.

    Args:
        df: DataFrame to inspect.
        name: Label printed in the section header.
        pk: Column label(s) expected to identify a row uniquely. When
            omitted or empty, the duplicate check is skipped.

    Returns:
        None. The report is written to standard output.
    """
    print(f"\n=== {name} ===")
    print("Shape:", df.shape)

    # Per-column share of missing cells, worst offenders first.
    missing_pct = df.isna().mean() * 100
    worst_columns = missing_pct.sort_values(ascending=False).head(10)
    print("Missing (%):")
    print(worst_columns)

    if not pk:
        return

    # duplicated() flags every repeat after the first occurrence of a key,
    # so this counts surplus rows rather than distinct repeated keys.
    duplicate_rows = df.duplicated(pk).sum()
    print(f"Duplicate PK rows ({pk}):", duplicate_rows)

# (frame, report label, expected key columns) for each raw table, audited in
# this order. payments is checked on order_id alone: any duplicates reported
# there are the rows that the payments aggregation further down collapses to
# order grain.
for audit_df, audit_name, audit_pk in (
    (orders, "orders", ["order_id"]),
    (items, "order_items", ["order_id", "order_item_id"]),
    (payments, "payments", ["order_id"]),
    (customers, "customers", ["customer_id"]),
    (products, "products", ["product_id"]),
):
    audit(audit_df, audit_name, pk=audit_pk)


In [None]:
# A. orders -> order_items coverage
# Reduce items to one row per order_id so the left join cannot multiply
# order rows.
item_order_ids = items[["order_id"]].drop_duplicates()

# indicator=True adds a `_merge` column: "both" for orders that have at least
# one item row, "left_only" for orders that have none.
orders_with_items = pd.merge(
    orders,
    item_order_ids,
    how="left",
    on="order_id",
    indicator=True,
)

# Share of orders in each `_merge` category.
orders_with_items["_merge"].value_counts(normalize=True)


In [None]:
# B. payments must aggregate to order grain
# Sum payment_value across all payment rows of an order, giving one row per
# order_id with the total in `payment_value_total`.
payments_agg = payments.groupby("order_id", as_index=False).agg(
    payment_value_total=("payment_value", "sum")
)

payments_agg.head()


In [None]:
# C. customer_id vs customer_unique_id
# Distinct-value count of each identifier column, side by side; a gap between
# the two counts means the identifiers are not one-to-one.
customers.loc[:, ["customer_id", "customer_unique_id"]].nunique()


In [None]:
# D. order status distribution
# Fraction of orders in each status, most frequent first (missing statuses
# are left out of the counts).
orders.loc[:, "order_status"].value_counts(normalize=True)
