In [5]:
import pandas as pd
from pathlib import Path

# Directory containing the Olist CSV files, relative to the working directory.
DATA = Path("../data")

# Columns of the orders table that are parsed as datetimes at load time.
ORDER_DATE_COLUMNS = [
    "order_purchase_timestamp",
    "order_approved_at",
    "order_delivered_carrier_date",
    "order_delivered_customer_date",
    "order_estimated_delivery_date",
]


def _load(filename, **read_kwargs):
    """Read one CSV file from the DATA directory into a DataFrame."""
    return pd.read_csv(DATA / filename, **read_kwargs)


orders = _load("olist_orders_dataset.csv", parse_dates=ORDER_DATE_COLUMNS)
items = _load("olist_order_items_dataset.csv")
products = _load("olist_products_dataset.csv")
cust = _load("olist_customers_dataset.csv")
sellers = _load("olist_sellers_dataset.csv")
pay = _load("olist_order_payments_dataset.csv")
reviews = _load("olist_order_reviews_dataset.csv")
geo = _load("olist_geolocation_dataset.csv")

# Quick confirmation that every read succeeded, with the order count.
print("✅ Tous les fichiers sont bien chargés :", len(orders), "commandes")

# --- Main merge ---
# Build one wide table by left-joining every lookup table onto the orders.
# Joining the items first means an order with several items produces several
# rows, so the result is item-level rather than order-level.

# Collapse payments to a single row per order: total amount paid and the
# largest number of installments across the order's payment records.
payments_per_order = pay.groupby("order_id", as_index=False).agg(
    {"payment_value": "sum", "payment_installments": "max"}
)

customer_cols = [
    "customer_id", "customer_unique_id", "customer_zip_code_prefix",
    "customer_city", "customer_state",
]
seller_cols = ["seller_id", "seller_zip_code_prefix", "seller_city", "seller_state"]

# `validate` makes pandas raise MergeError if a join key is unexpectedly
# duplicated, instead of silently multiplying rows in the merged frame.
df = (
    orders
    .merge(items, on="order_id", how="left", validate="one_to_many")
    .merge(products, on="product_id", how="left", validate="many_to_one")
    # NOTE: payment_value is an order-level total and is therefore repeated on
    # every item row of the same order; de-duplicate on order_id before summing.
    .merge(payments_per_order, on="order_id", how="left", validate="many_to_one")
    .merge(cust[customer_cols], on="customer_id", how="left", validate="many_to_one")
    .merge(sellers[seller_cols], on="seller_id", how="left", validate="many_to_one")
)

# --- Feature engineering ---
# Adds delivery-timing and purchase-calendar features to `df` in place.

delivered_at = df["order_delivered_customer_date"]
estimated_at = df["order_estimated_delivery_date"]

# Late-delivery flag: 1 = delivered after the estimated date, 0 = on time.
# A comparison involving NaT evaluates to False, which would label orders with
# no delivery date as "on time". Those rows are kept as <NA> instead, hence
# the nullable Int64 dtype. Use .fillna(0) downstream if they should count as 0.
dates_known = delivered_at.notna() & estimated_at.notna()
df["delivered_late"] = (
    (delivered_at > estimated_at).astype("Int64").where(dates_known)
)

# Purchase-to-delivery time in (fractional) days; NaN when not delivered.
df["days_to_deliver"] = (
    df["order_delivered_customer_date"] - df["order_purchase_timestamp"]
).dt.total_seconds() / 3600 / 24

# Hours between order approval and hand-over to the carrier.
df["seller_handling_hours"] = (
    df["order_delivered_carrier_date"] - df["order_approved_at"]
).dt.total_seconds() / 3600

# Hours between carrier hand-over and delivery to the customer.
df["carrier_to_customer_hours"] = (
    df["order_delivered_customer_date"] - df["order_delivered_carrier_date"]
).dt.total_seconds() / 3600

# Calendar features of the purchase timestamp (dayofweek: Monday=0 ... Sunday=6).
df["purchase_dow"] = df["order_purchase_timestamp"].dt.dayofweek
df["purchase_hour"] = df["order_purchase_timestamp"].dt.hour
df["purchase_month"] = df["order_purchase_timestamp"].dt.month

print("✅ DataFrame fusionné :", df.shape)
print(df.head())


✅ Tous les fichiers sont bien chargés : 99441 commandes
✅ DataFrame fusionné : (113425, 38)
                           order_id                       customer_id  \
0  e481f51cbdc54678b7cc49136f2d6af7  9ef432eb6251297304e76186b10a928d   
1  53cdb2fc8bc7dce0b6741e2150273451  b0830fb4747a6c6d20dea0b8c802d7ef   
2  47770eb9100c2d0c44946d9cf07ec65d  41ce2a54c0b03bf3443c3d931a367089   
3  949d5b44dbf5de918fe9c16f97b45f8a  f88197465ea7920adcdbec7375364d82   
4  ad21c59c0840e6cb83a9ceb5573f8159  8ab97904e6daea8866dbdbc4fb7aad2c   

  order_status order_purchase_timestamp   order_approved_at  \
0    delivered      2017-10-02 10:56:33 2017-10-02 11:07:15   
1    delivered      2018-07-24 20:41:37 2018-07-26 03:24:27   
2    delivered      2018-08-08 08:38:49 2018-08-08 08:55:23   
3    delivered      2017-11-18 19:28:06 2017-11-18 19:45:59   
4    delivered      2018-02-13 21:18:39 2018-02-13 22:20:29   

  order_delivered_carrier_date order_delivered_customer_date  \
0          2017-10-04 19:5