# UC1: Feature Engineering - Late Payment Risk

**Note: this notebook is for exploration and validation only.
The production feature pipeline lives in `src/uc1_late_risk/build_features.py`.**

## 0. Setup

In [None]:
# Install the dependencies listed in ../requirements.txt.
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install -r ../requirements.txt

In [None]:
import sys
from pathlib import Path

# The notebook is expected to run from notebooks/, whose parent is ML/.
# ML/ must be on sys.path for the "import src...." statements below to resolve.
ML_ROOT = Path.cwd().parent
_ml_root_entry = str(ML_ROOT)
if _ml_root_entry not in sys.path:
    # Prepend so this checkout wins over any other "src" package on the path.
    sys.path.insert(0, _ml_root_entry)

ML_ROOT: c:\Users\PC\Documents\DXC Hackathon\bnpl-intelligent-analytics\ML


In [29]:
# Run the production feature pipeline from inside the notebook.
# Per this cell's recorded output, main() reports an MLflow run_id, the path
# of the saved gold CSV, and the overall late_rate.
from src.uc1_late_risk.build_features import main
main()

MLflow run_id: 5ea3b6cfee88492f8093c6b62e397efb
Saved gold to: c:\Users\PC\Documents\DXC Hackathon\bnpl-intelligent-analytics\data\gold\gold_uc1_features.csv
late_rate: 0.198


In [30]:
import pandas as pd
from src.config import GOLD_UC1_FILE

# Load the gold UC1 table from the configured path and preview the first rows.
gold_uc1 = pd.read_csv(filepath_or_buffer=GOLD_UC1_FILE)
gold_uc1.head(5)


Unnamed: 0,installment_id,order_id,user_id,merchant_id,installment_number,due_date,paid_date,status,late_days,anchor_date,...,checkout_abandon_rate_30d,checkout_friction_score,merchant_name,category,city_merchant,merchant_status,merchant_status_num,merchant_dispute_rate_90d,merchant_refund_rate_90d,merchant_risk_score
0,inst_0000001,order_000006,user_00002,merchant_0109,1,2026-01-31,2026-01-30,paid,0.0,2026-01-31,...,0.5,1.693147,Merchant 109,electronics,Casablanca,active,1,0.0,0.0,0.0
1,inst_0000003,order_000006,user_00002,merchant_0109,3,2026-04-01,2026-04-06,late,5.0,2026-04-01,...,0.0,0.0,Merchant 109,electronics,Casablanca,active,1,0.0,0.0,0.0
2,inst_0000004,order_000007,user_00002,merchant_0053,1,2025-12-10,2025-12-10,paid,0.0,2025-12-10,...,0.0,0.0,Merchant 53,travel,Marrakech,active,1,0.0,0.0,0.0
3,inst_0000005,order_000007,user_00002,merchant_0053,2,2026-01-09,2026-01-08,paid,0.0,2026-01-09,...,0.0,0.0,Merchant 53,travel,Marrakech,active,1,0.130435,0.043478,0.304348
4,inst_0000006,order_000007,user_00002,merchant_0053,3,2026-02-08,2026-02-07,paid,0.0,2026-02-08,...,1.0,2.693147,Merchant 53,travel,Marrakech,active,1,0.166667,0.111111,0.444444


## 2. HARD validation (must-pass)

In [31]:
# Hard, must-pass checks on the gold table: any failed assert stops the run.
#
# TARGET and GOLD_UC1_FEATURES are imported here because this is the first
# cell that uses them; without this import the cell raises NameError on a
# fresh kernel (Restart & Run All).
from src.config import GOLD_UC1_FEATURES, TARGET

# Target checks: the label column must exist and be fully populated.
assert TARGET in gold_uc1.columns, f"Target column {TARGET!r} not found in gold table"
assert gold_uc1[TARGET].notna().all(), f"Target column {TARGET!r} contains missing values"

# Anchor date: must exist and be set on every row.
assert "anchor_date" in gold_uc1.columns, "anchor_date column not found in gold table"
assert gold_uc1["anchor_date"].notna().all(), "anchor_date contains missing values"

# Duplicates: no two rows may be identical across all columns.
n_duplicate_rows = int(gold_uc1.duplicated().sum())
assert n_duplicate_rows == 0, f"Found {n_duplicate_rows} fully duplicated rows"

# Feature presence: every configured feature must be a column of the table.
missing = [c for c in GOLD_UC1_FEATURES if c not in gold_uc1.columns]
assert len(missing) == 0, f"Missing features: {missing}"


## 3. Feature completeness check

In [32]:
from src.config import GOLD_UC1_FEATURES, TARGET, ID_COLS

# Collect every configured feature that is not a column of the gold table,
# report the list, and fail if it is non-empty.
missing = [
    feature_name
    for feature_name in GOLD_UC1_FEATURES
    if feature_name not in gold_uc1.columns
]
print(f"Missing features: {missing}")

assert not missing, "Missing features in gold table"


Missing features: []


## 4. Leakage safety (critical)

In [33]:
# Columns treated as leakage here: none of them may appear in the
# configured feature list.
leakage_cols = ["paid_date", "late_days", "late_days_final", "status"]

bad = [
    leaky_name
    for leaky_name in leakage_cols
    if leaky_name in GOLD_UC1_FEATURES
]
assert not bad, f"LEAKAGE FEATURES FOUND: {bad}"


## 5. Distribution sanity (soft checks)

In [34]:
# Soft checks: nothing is asserted, the numbers are printed for inspection.

# Share of each target class.
print("Target balance:")
target_share = gold_uc1[TARGET].value_counts(normalize=True)
print(target_share)

# Fraction of missing values per feature, worst ten first.
print("\nMissingness (top 10):")
null_fraction = gold_uc1[GOLD_UC1_FEATURES].isna().mean()
print(null_fraction.sort_values(ascending=False).head(10))


Target balance:
is_late
0    0.802032
1    0.197968
Name: proportion, dtype: float64

Missingness (top 10):
sum_order_amount_30d        0.523217
avg_order_amount_30d        0.523217
max_order_amount_30d        0.523217
avg_late_days_90d           0.156716
on_time_payment_rate_90d    0.156716
max_late_days_90d           0.156716
late_payment_rate_90d       0.156716
account_age_days            0.000000
user_city                   0.000000
kyc_level_num               0.000000
dtype: float64
