In [None]:
# ----------------------- README.md -----------------------
"""
Order‑Delay Predictor
=====================
A streamlined project that predicts whether an Olist marketplace order will be
delivered **late (1) or on time (0)**.  It can be built in a single weekend and
covers the practical skills expected for the Vinted Data Science & Analytics
Academy: data wrangling, feature engineering, model training, evaluation, and
a Streamlit demo.

Quick‑start
-----------
```bash
python -m venv .venv && source .venv/bin/activate
pip install -r requirements.txt

# Step 1: preprocess & feature engineering
python -m src.feature_engineering

# Step 2: train + evaluation
python -m src.train_model

# Step 3: launch the Streamlit app
streamlit run app/streamlit_app.py
```

Directory layout
----------------
```
order_delay_predictor/
├── app/
│   └── streamlit_app.py
├── src/
│   ├── __init__.py
│   ├── config.py
│   ├── data_loader.py
│   ├── feature_engineering.py
│   ├── train_model.py
│   └── predict.py
├── data/
│   └── raw/
│       ├── olist_customers_dataset.csv
│       ├── olist_order_reviews_dataset.csv
│       ├── olist_orders_dataset.csv
│       ├── olist_products_dataset.csv
│       ├── olist_sellers_dataset.csv
│       ├── olist_geolocation_dataset.csv
│       ├── olist_order_items_dataset.csv
│       ├── olist_order_payments_dataset.csv
│       └── product_category_name_translation.csv   ← ignored for now (name doesn’t start with “olist_”)
├── models/
├── requirements.txt
└── README.md

```
"""

# ----------------------- src/__init__.py -----------------------
"""Package shortcut imports for notebooks and Streamlit."""

from .config import CFG
from .predict import predict_delay

# ----------------------- src/config.py -----------------------
"""Centralised paths and hyper‑parameters."""

from pathlib import Path

class _CFG:
    """Project-wide filesystem paths and training constants.

    All paths are derived from ``ROOT`` so the project can be run from any
    working directory. Use the module-level ``CFG`` instance rather than
    instantiating this class directly.
    """

    # config.py lives in <project>/src/, so parents[0] is src/ and parents[1]
    # is the project root that holds data/ and models/.
    ROOT = Path(__file__).resolve().parents[1]
    DATA_RAW = ROOT / "data" / "raw"
    DATA_PROC = ROOT / "data" / "processed"
    MODELS = ROOT / "models"

    # Intermediate artefacts written under data/processed/.
    ORDERS_PARQUET = DATA_PROC / "orders.parquet"
    TRAIN_TEST_PARQUET = DATA_PROC / "train_test.parquet"

    # Serialized estimator and numeric scaler written under models/.
    CLF_PATH = MODELS / "delay_clf.joblib"
    SCALER_PATH = MODELS / "scaler.joblib"

    # Hold-out fraction, RNG seed and the name of the binary label column.
    TEST_SIZE = 0.2
    RANDOM_STATE = 42
    TARGET_COL = "is_late"


CFG = _CFG()

# ----------------------- src/data_loader.py -----------------------
"""Load and merge Olist CSVs into a single orders DataFrame."""

import pandas as pd
from pathlib import Path
from .config import CFG

def load_raw_tables():
    """Read every ``olist_*.csv`` file in ``CFG.DATA_RAW`` into a DataFrame.

    Returns:
        dict mapping each file's stem (file name without ``.csv``) to the
        DataFrame parsed from it.

    Raises:
        FileNotFoundError: if the raw-data directory does not exist.
    """
    raw_dir = CFG.DATA_RAW
    if not raw_dir.is_dir():
        raise FileNotFoundError(f"Raw data directory not found: {raw_dir}")

    # iterdir() already yields paths that include raw_dir, so they are read
    # as-is; joining them onto raw_dir again would duplicate the prefix
    # whenever raw_dir is a relative path.
    csv_paths = sorted(
        path
        for path in raw_dir.iterdir()
        if path.suffix == ".csv" and path.name.startswith("olist_")
    )
    return {path.stem: pd.read_csv(path) for path in csv_paths}


def build_orders():
    """Assemble one row per order with customer, item and seller attributes.

    Loads the raw tables, attaches the customer ZIP prefix, the per-order
    item totals and the seller ZIP prefix to the orders table, parses the
    timestamp columns and adds the binary target ``CFG.TARGET_COL``.

    Returns:
        DataFrame with the orders columns plus ``customer_zip_code_prefix``,
        ``seller_id``, ``price``, ``freight_value``,
        ``seller_zip_code_prefix`` and the target column.
    """
    tbl = load_raw_tables()
    orders = tbl["olist_orders_dataset"]
    customers = tbl["olist_customers_dataset"][["customer_id", "customer_zip_code_prefix"]]
    items = tbl["olist_order_items_dataset"][
        ["order_id", "seller_id", "price", "freight_value"]
    ]
    sellers = tbl["olist_sellers_dataset"][["seller_id", "seller_zip_code_prefix"]]

    # Collapse items to one row per order. seller_id must survive the
    # aggregation, otherwise the seller merge below has no key to join on.
    # An order may contain items from several sellers; the first one is kept
    # as a simplification.
    items_per_order = items.groupby("order_id", as_index=False).agg(
        seller_id=("seller_id", "first"),
        price=("price", "sum"),
        freight_value=("freight_value", "sum"),
    )

    # Merge basic dimensions (left joins keep orders that have no items).
    df = (
        orders.merge(customers, on="customer_id", how="left")
        .merge(items_per_order, on="order_id", how="left")
        .merge(sellers, on="seller_id", how="left")
    )

    # Dates → datetime
    for col in [
        "order_purchase_timestamp",
        "order_approved_at",
        "order_delivered_carrier_date",
        "order_delivered_customer_date",
        "order_estimated_delivery_date",
    ]:
        df[col] = pd.to_datetime(df[col])

    # Target: late if delivered after estimated date.
    # NOTE(review): orders with a missing delivery date compare as False and
    # are therefore labelled 0 (on time) — confirm this is intended.
    df[CFG.TARGET_COL] = (
        df["order_delivered_customer_date"] > df["order_estimated_delivery_date"]
    ).astype(int)

    return df

# ----------------------- src/feature_engineering.py -----------------------
"""Create flat feature table and train/test split."""

import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import joblib

from .config import CFG
from .data_loader import build_orders
from tqdm.auto import tqdm

# Module-level tqdm/pandas integration hook.
# NOTE(review): no progress_* call is visible in this file — confirm it is still needed.
tqdm.pandas()

# Columns the training pipeline one-hot encodes.
CATEGORICAL = ["order_status", "customer_zip_code_prefix", "seller_zip_code_prefix"]
# Columns standardised with StandardScaler before training and inference.
NUMERIC = [
    "price",
    "freight_value",
    "days_estimated",
    "order_hour",
    # Despite the name, engineer() computes this value in hours.
    "days_to_approve",
]


def engineer(df: pd.DataFrame) -> pd.DataFrame:
    """Derive model features from the merged orders table.

    Adds ``days_estimated`` (whole days between purchase and the estimated
    delivery date), ``order_hour`` (hour of purchase) and ``days_to_approve``
    (time from purchase to approval, expressed in hours despite the name;
    missing values are replaced by the column median). Identifier and raw
    timestamp columns are then dropped.

    Args:
        df: orders table with parsed datetime columns. It is not modified.

    Returns:
        A new DataFrame containing the remaining and derived columns.

    Raises:
        KeyError: if a required input column is absent.
    """
    # Work on a copy so the caller's frame is left untouched.
    df = df.copy()

    purchased_at = df["order_purchase_timestamp"]

    # Simple numeric features
    df["days_estimated"] = (df["order_estimated_delivery_date"] - purchased_at).dt.days
    df["order_hour"] = purchased_at.dt.hour

    hours_to_approve = (df["order_approved_at"] - purchased_at).dt.total_seconds() / 3600
    # Assign the filled series back instead of calling fillna(inplace=True)
    # on a column selection: that chained form is deprecated and has no
    # effect once pandas Copy-on-Write is enabled.
    df["days_to_approve"] = hours_to_approve.fillna(hours_to_approve.median())

    # Drop high‑cardinality IDs and the raw timestamps the features came from
    drop_cols = [
        "order_id",
        "customer_id",
        "seller_id",
        "order_purchase_timestamp",
        "order_approved_at",
        "order_delivered_customer_date",
        "order_estimated_delivery_date",
    ]
    return df.drop(columns=drop_cols)


def main():
    """Build the feature table, split it, scale numeric columns and persist.

    Writes ``train.parquet`` and ``test.parquet`` to ``CFG.DATA_PROC`` and the
    fitted ``StandardScaler`` to ``CFG.SCALER_PATH``. The scaler is fitted on
    the training split only and then applied to the test split.
    """
    CFG.DATA_PROC.mkdir(parents=True, exist_ok=True)
    # The scaler is dumped into models/, which may not exist on a fresh
    # checkout; create it up front so joblib.dump cannot fail on a missing
    # directory.
    CFG.SCALER_PATH.parent.mkdir(parents=True, exist_ok=True)

    df = engineer(build_orders())

    # Train/test split & save scaler for numeric cols
    train_df, test_df = train_test_split(
        df,
        test_size=CFG.TEST_SIZE,
        random_state=CFG.RANDOM_STATE,
        stratify=df[CFG.TARGET_COL],
    )
    # Take explicit copies: the split results are selections of ``df`` and
    # assigning scaled values onto them triggers SettingWithCopy warnings.
    train_df = train_df.copy()
    test_df = test_df.copy()

    scaler = StandardScaler()
    train_df[NUMERIC] = scaler.fit_transform(train_df[NUMERIC])
    test_df[NUMERIC] = scaler.transform(test_df[NUMERIC])

    train_df.to_parquet(CFG.DATA_PROC / "train.parquet", index=False)
    test_df.to_parquet(CFG.DATA_PROC / "test.parquet", index=False)
    joblib.dump(scaler, CFG.SCALER_PATH)

    print(f"✔ Processed data saved to {CFG.DATA_PROC}")


if __name__ == "__main__":
    main()

# ----------------------- src/train_model.py -----------------------
"""Train delay classifier and output metrics."""

import pandas as pd
from pathlib import Path
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
import joblib

from .config import CFG
from .feature_engineering import CATEGORICAL, NUMERIC


def main():
    """Fit the delay classifier on the processed split and report metrics.

    Reads ``train.parquet`` / ``test.parquet`` from ``CFG.DATA_PROC``, fits a
    preprocessing + gradient-boosting pipeline, saves it to ``CFG.CLF_PATH``
    and prints a classification report and confusion matrix for the test
    split.
    """
    # Imported locally so this function carries its own dependency.
    from sklearn.impute import SimpleImputer

    train_df = pd.read_parquet(CFG.DATA_PROC / "train.parquet")
    test_df = pd.read_parquet(CFG.DATA_PROC / "test.parquet")

    X_train = train_df.drop(columns=[CFG.TARGET_COL])
    y_train = train_df[CFG.TARGET_COL]
    X_test = test_df.drop(columns=[CFG.TARGET_COL])
    y_test = test_df[CFG.TARGET_COL]

    # Columns not listed here are dropped by ColumnTransformer's default.
    # Numeric columns can contain NaN (orders joined to no items have no
    # price/freight), and GradientBoostingClassifier rejects NaN, so they
    # are median-imputed instead of passed through unchanged.
    preproc = ColumnTransformer([
        ("cat", OneHotEncoder(handle_unknown="ignore"), CATEGORICAL),
        ("num", SimpleImputer(strategy="median"), NUMERIC),
    ])

    clf = GradientBoostingClassifier(random_state=CFG.RANDOM_STATE)

    pipe = Pipeline([
        ("preproc", preproc),
        ("clf", clf),
    ])

    pipe.fit(X_train, y_train)

    # Make sure the target directory exists before serialising the model.
    CFG.CLF_PATH.parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(pipe, CFG.CLF_PATH)

    preds = pipe.predict(X_test)
    print("\n--- Classification Report ---")
    print(classification_report(y_test, preds, digits=3))
    print("Confusion Matrix:\n", confusion_matrix(y_test, preds))
    print(f"✔ Model saved to {CFG.CLF_PATH}")


if __name__ == "__main__":
    main()

# ----------------------- src/predict.py -----------------------
"""Inference helper for Streamlit or external usage."""

import pandas as pd
import joblib
from pathlib import Path
from .config import CFG
from .feature_engineering import CATEGORICAL, NUMERIC

# Artifacts are loaded on first use rather than at import time: the package
# __init__ imports this module, so an eager joblib.load would make the
# feature-engineering and training entry points fail before any model has
# been written to disk.
_MODEL = None
_SCALER = None


def _load_artifacts():
    """Return the cached (model, scaler) pair, loading both on first call."""
    global _MODEL, _SCALER
    if _MODEL is None or _SCALER is None:
        _MODEL = joblib.load(CFG.CLF_PATH)
        _SCALER = joblib.load(CFG.SCALER_PATH)
    return _MODEL, _SCALER


def predict_delay(row: dict):
    """Take a dict of order features (raw) → probability of delay (0‑1).

    Args:
        row: mapping with one value for every name in ``CATEGORICAL`` and
            ``NUMERIC``; numeric values are unscaled.

    Returns:
        Predicted probability of the positive ("late") class.

    Raises:
        FileNotFoundError: if the model or scaler file has not been created.
        KeyError: if a numeric feature is missing from ``row``.
    """
    model, scaler = _load_artifacts()
    df = pd.DataFrame([row])

    # ZIP prefixes arrive as text from the UI (e.g. "01086"), while the raw
    # CSVs are read without an explicit dtype, so the encoder presumably
    # learned them as numbers — coerce so a known prefix can match its
    # category. NOTE(review): revisit if training switches to string ZIPs.
    for col in ("customer_zip_code_prefix", "seller_zip_code_prefix"):
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce")

    # Numeric scaling (Streamlit supplies raw values)
    df[NUMERIC] = scaler.transform(df[NUMERIC])
    proba = model.predict_proba(df)[:, 1][0]
    return proba

# ----------------------- app/streamlit_app.py -----------------------
"""Simple Streamlit front‑end."""

import streamlit as st
import pandas as pd

from src.predict import predict_delay
from src.feature_engineering import NUMERIC, CATEGORICAL

# Page chrome.
st.set_page_config(page_title="Order Delay Predictor", layout="centered")
st.title("🚚 Order‑Delay Predictor (Olist)")

# All inputs live in one form so the script only reacts on submit. The
# widgets are created in dict-literal order, which is also the feature layout
# handed to predict_delay().
with st.form("delay_form"):
    st.markdown("#### Enter order details")
    features = {
        "order_status": st.selectbox("Order status", ["processing", "shipped", "delivered"]),
        "customer_zip_code_prefix": st.text_input("Customer ZIP prefix", "01086"),
        "seller_zip_code_prefix": st.text_input("Seller ZIP prefix", "04538"),
        "price": st.number_input("Total item price (R$)", min_value=0.0, value=100.0),
        "freight_value": st.number_input("Freight value (R$)", min_value=0.0, value=15.0),
        "days_estimated": st.slider("Estimated delivery days", 1, 20, 5),
        "order_hour": st.slider("Purchase hour (0‑23)", 0, 23, 14),
        # The feature is named "days_to_approve" but is entered in hours.
        "days_to_approve": st.slider("Hours to approve", 0, 72, 4),
    }
    submitted = st.form_submit_button("Predict delay risk →")

# Score the order and render the verdict once the form has been submitted.
if submitted:
    late_risk = predict_delay(features)
    st.subheader("Result")
    st.metric("Probability of being late", f"{late_risk:.2%}")
    notify, message = (
        (st.error, "⚠️ High risk of delay. Consider proactive communication with the buyer.")
        if late_risk > 0.5
        else (st.success, "✅ Likely on‑time.")
    )
    notify(message)

# ----------------------- requirements.txt -----------------------
pandas>=2.0
numpy>=1.24
scikit-learn>=1.4
joblib>=1.4
streamlit>=1.35
tqdm>=4.66
pyarrow>=14.0


Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best params: {'model__learning_rate': 0.1, 'model__l2_leaf_reg': 1, 'model__depth': 10}
MAE  :  0.525


TypeError: got an unexpected keyword argument 'squared'