In [3]:
# Load packages

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re

# When True, the "*_missing" indicator columns are removed from both datasets.
no_flag_missing = True

## Load imputed data
df_train_imp = pd.read_parquet("../data/processed/merged/imputed/train.parquet")
# TODO: move this fill into the imputer.
df_train_imp['first_time_homebuyer_flag'] = (
    df_train_imp['first_time_homebuyer_flag'].fillna(False)
)

df_val_imp = pd.read_parquet("../data/processed/merged/imputed/validation.parquet")


if no_flag_missing:
    # 1) List the "missing" indicator columns (case-insensitive "_missing" suffix).
    missing_flag_cols = list(
        filter(lambda name: re.search(r'(?i)_missing$', name), df_train_imp.columns)
    )
    print("Missing flags (train):", missing_flag_cols)

    # 2) Drop those indicators from train and validation; the list comes from
    #    train, so columns absent from validation are simply ignored.
    df_train_imp = df_train_imp.drop(columns=missing_flag_cols, errors="ignore")
    df_val_imp = df_val_imp.drop(columns=missing_flag_cols, errors="ignore")


print(df_train_imp.shape)
print(df_val_imp.shape)


# Method 1 (optimized):



# ============================================
# Pipeline "max |Gini|" — version complète, robuste et parallélisée
# ============================================

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Parallélisation
try:
    from joblib import Parallel, delayed
except Exception:  # fallback si joblib indisponible
    Parallel = None
    def delayed(f): return f

# --------------------------------------------
# Chargement des données (imputed uniquement)
# --------------------------------------------
df_train_imp = pd.read_parquet("../data/processed/merged/imputed/train.parquet")
df_val_imp = pd.read_parquet("../data/processed/merged/imputed/validation.parquet")

# Dataset-specific adjustment (ideally to be moved into the imputer):
# a missing first-time-homebuyer flag is treated as False in both datasets.
for _frame in (df_train_imp, df_val_imp):
    if "first_time_homebuyer_flag" in _frame.columns:
        _frame["first_time_homebuyer_flag"] = _frame["first_time_homebuyer_flag"].fillna(False)
del _frame

# ============================================
# Utils Gini (X=Good, Y=Bad)
# ============================================
def gini_trapz(df_cum,
               y_col="bad_client_share_cumsum",
               x_col="good_client_share_cumsum",
               signed=False):
    """
    Compute the Gini coefficient of a cumulative curve: Gini = 1 - 2 * area(y vs x).

    Parameters
    ----------
    df_cum : DataFrame holding the cumulative shares (one row per curve point).
    y_col, x_col : names of the ordinate / abscissa columns.
    signed : if False (default) the absolute value |Gini| is returned.

    The coordinates are clamped to [0, 1] and the end points (0, 0) and (1, 1)
    are added when they are missing. An empty frame therefore yields the
    diagonal, i.e. a Gini of 0.
    """
    # Stable sort: points sharing the same x keep their input order, so the
    # vertical segments of the curve are not shuffled (the default sort
    # algorithm gives no such guarantee).
    pts = df_cum[[x_col, y_col]].astype(float).sort_values(x_col, kind="mergesort")
    x = pts[x_col].clip(0, 1).to_numpy()
    y = pts[y_col].clip(0, 1).to_numpy()

    # Make sure the curve starts at (0, 0) ...
    if x.size == 0 or x[0] > 0 or y[0] > 0:
        x = np.concatenate(([0.0], x))
        y = np.concatenate(([0.0], y))
    # ... and ends at (1, 1).
    if x[-1] < 1 - 1e-12 or y[-1] < 1 - 1e-12:
        x = np.concatenate((x, [1.0]))
        y = np.concatenate((y, [1.0]))

    # np.trapz was renamed np.trapezoid in recent NumPy versions.
    area = np.trapezoid(y, x) if hasattr(np, "trapezoid") else np.trapz(y, x)
    g = 1 - 2 * area
    return g if signed else abs(g)

# ======================================================
# Étape 0 — Dé-one-hot (protège la cible)
# ======================================================
def detect_onehot_groups(df, allow_singleton=True, exclude_cols=None):
    """
    Detect one-hot column groups by splitting each name at its LAST underscore.

    A column qualifies when it is boolean, or numeric with only 0/1 among its
    non-null values. Any suffix is accepted (state_CA, grade_A, ...).

    Parameters
    ----------
    df : DataFrame to inspect.
    allow_singleton : keep groups made of a single column when True.
    exclude_cols : columns to ignore (e.g. the target).

    Returns
    -------
    dict mapping base name -> list of (column name, label) tuples.
    """
    skipped = set(exclude_cols or [])
    by_base = {}
    for name in df.columns:
        if name in skipped or "_" not in name:
            continue
        column = df[name]
        if pd.api.types.is_bool_dtype(column):
            binary = True
        elif pd.api.types.is_numeric_dtype(column):
            binary = column.dropna().isin([0, 1]).all()
        else:
            binary = False
        if not binary:
            continue
        prefix, suffix = name.rsplit("_", 1)
        by_base.setdefault(prefix, []).append((name, suffix))

    return {prefix: members for prefix, members in by_base.items()
            if allow_singleton or len(members) >= 2}

def deonehot_categoricals(df, allow_singleton=False, exclude_cols=None):
    """
    Rebuild a single categorical column (dtype category) from each one-hot group.

    - groups of two or more columns are merged; rows where no indicator is set
      stay missing
    - a single-column group is merged only if allow_singleton=True; rows where
      the indicator is not set receive the "__OTHER__" level
    - the textual label "<NA>" becomes a missing value and is applied last
    - exclude_cols are never touched (e.g. the target)

    Returns a copy of df where the indicator columns are replaced by one column
    named after the group's base name.
    """
    groups = detect_onehot_groups(df, allow_singleton=allow_singleton, exclude_cols=exclude_cols)
    result = df.copy()

    def _na_last(member):
        # Sort by label, with the "<NA>" label pushed to the end.
        label = member[1]
        return (1, "") if label == "<NA>" else (0, str(label))

    for base, members in groups.items():
        ordered = sorted(members, key=_na_last)

        if len(ordered) == 1:
            if not allow_singleton:
                continue
            fill = "__OTHER__"
        elif len(ordered) >= 2:
            fill = pd.NA
        else:
            continue

        merged = pd.Series(fill, index=df.index, dtype="object")
        for indicator, label in ordered:
            merged[df[indicator] == 1] = pd.NA if label == "<NA>" else label

        result[base] = merged.astype("category")
        result.drop(columns=[indicator for indicator, _ in ordered],
                    inplace=True, errors="ignore")

    return result

# ======================================
# Étape 1 — Cible binaire (auto/forcée)
# ======================================
def infer_binary_target(df, prefer_name_patterns=('default', 'delinq', 'bad', 'target', 'label')):
    """
    Guess which column of df is the binary target.

    Eligible columns are boolean, numeric with only 0/1 among their non-null
    values, or categorical with exactly two observed levels. Columns without
    any non-null value are ignored: they trivially pass the 0/1 check but
    cannot serve as a target.

    Each candidate is scored with +10 per pattern of prefer_name_patterns found
    in its lower-cased name, plus (1 - positive rate) so that rarer positives
    are preferred. The best-scoring column name is returned.

    Raises
    ------
    ValueError if no eligible column exists.
    """
    candidates = []
    for col in df.columns:
        s = df[col]
        observed = s.dropna()
        if observed.empty:
            continue

        is_bool = pd.api.types.is_bool_dtype(s)
        is_binary_num = pd.api.types.is_numeric_dtype(s) and observed.isin([0, 1]).all()
        is_binary_cat = isinstance(s.dtype, pd.CategoricalDtype) and observed.nunique() == 2
        if not (is_bool or is_binary_num or is_binary_cat):
            continue

        name_lower = str(col).lower()
        score = 10.0 * sum(1 for p in prefer_name_patterns if p in name_lower)
        try:
            positive_rate = float(observed.astype("Int64").mean())
        except (TypeError, ValueError):
            # e.g. a two-level categorical with non-numeric labels:
            # no rate bonus, the name bonus alone decides.
            positive_rate = None
        if positive_rate is not None:
            score += 1.0 - min(max(positive_rate, 1e-6), 1 - 1e-6)
        candidates.append((score, col))

    if not candidates:
        raise ValueError("Aucune colonne binaire éligible trouvée pour servir de cible.")
    candidates.sort(reverse=True)
    return candidates[0][1]

# ===================================================
# Étape 2 — Colonnes catégorielles brutes
# ===================================================
def find_categorical_columns(df, target_col=None, max_levels_object=50, exclude_ids=None):
    """
    List the columns of df to be handled as raw categorical variables.

    A column is selected when it is:
    - of dtype category or boolean;
    - of object/string dtype with at most max_levels_object distinct values;
    - of integer dtype with at most 8 distinct values, unless its lower-cased
      name contains 'id', 'sequence' or 'loan_sequence'.

    target_col and every name in exclude_ids are skipped.
    Returns the selected names in column order.
    """
    skipped = set(exclude_ids or [])
    id_markers = ['id', 'sequence', 'loan_sequence']

    def _is_categorical(name, column):
        if isinstance(column.dtype, pd.CategoricalDtype) or pd.api.types.is_bool_dtype(column):
            return True
        if pd.api.types.is_object_dtype(column) or str(column.dtype).startswith("string"):
            return column.nunique(dropna=True) <= max_levels_object
        if pd.api.types.is_integer_dtype(column) and column.nunique(dropna=True) <= 8:
            lowered = name.lower()
            return not any(marker in lowered for marker in id_markers)
        return False

    selected = []
    for name in df.columns:
        if name == target_col or name in skipped:
            continue
        if _is_categorical(name, df[name]):
            selected.append(name)
    return selected

def extract_ordinal_info(df, cat_cols):
    """
    Identify the ordered categorical columns among cat_cols.

    Returns a tuple (ordinal_cols, explicit_orders) where ordinal_cols lists the
    columns whose dtype is an ordered category and explicit_orders maps each of
    them to its list of categories, in dtype order.
    """
    ordinal_cols = []
    explicit_orders = {}
    for name in cat_cols:
        dtype = df[name].dtype
        if not isinstance(dtype, pd.CategoricalDtype):
            continue
        if not getattr(dtype, "ordered", False):
            continue
        ordinal_cols.append(name)
        explicit_orders[name] = list(dtype.categories)
    return ordinal_cols, explicit_orders

# =========================================================
# Étape 3 — Binning catégoriel (fusion pour max |Gini|)
# =========================================================
def _cat_stats(df, col, target_col="target",
               include_missing=True, missing_label="__MISSING__"):
    target = df[target_col].astype(int)
    ser = df[col]
    if include_missing:
        ser = ser.astype("object").where(ser.notna(), missing_label)
    tmp = pd.DataFrame({col: ser, target_col: target})
    agg = tmp.groupby(col, dropna=not include_missing)[target_col].agg(["sum", "count"])
    agg.rename(columns={"sum": "n_bad", "count": "n_total"}, inplace=True)
    agg["n_good"] = agg["n_total"] - agg["n_bad"]
    n_total = len(df)
    n_bad = int(target.sum())
    n_good = n_total - n_bad
    denom_bad = n_bad if n_bad > 0 else 1
    denom_good = n_good if n_good > 0 else 1
    agg["bad_rate"] = agg["n_bad"] / agg["n_total"].where(agg["n_total"] > 0, 1)
    agg["bad_share"] = agg["n_bad"] / denom_bad
    agg["good_share"] = agg["n_good"] / denom_good
    return agg.reset_index().rename(columns={col: "modality"})

def _groups_df_from_bins(stats_df, bins, order_key="bad_rate", ascending=True):
    rows = []
    for i, mods in enumerate(bins):
        sub = stats_df[stats_df["modality"].isin(mods)]
        n_bad = int(sub["n_bad"].sum())
        n_good = int(sub["n_good"].sum())
        n_tot = int(sub["n_total"].sum())
        br = n_bad / n_tot if n_tot > 0 else 0.0
        rows.append({"bin_id": i, "modalities": tuple(mods),
                     "n_total": n_tot, "n_bad": n_bad, "n_good": n_good,
                     "bad_rate": br,
                     "bad_share": sub["bad_share"].sum(),
                     "good_share": sub["good_share"].sum()})
    gdf = pd.DataFrame(rows).sort_values(order_key, ascending=ascending, kind="mergesort").reset_index(drop=True)
    gdf["bad_cum"] = gdf["bad_share"].cumsum()
    gdf["good_cum"] = gdf["good_share"].cumsum()
    return gdf

def _gini_from_bins(stats_df, bins, order_key="bad_rate", ascending=True):
    """
    Return |Gini| of the cumulative bad-vs-good curve induced by a binning.

    The bins are aggregated and ordered by _groups_df_from_bins, then the
    cumulative shares are handed to gini_trapz (x = good, y = bad).
    """
    table = _groups_df_from_bins(stats_df, bins, order_key, ascending)
    curve = pd.DataFrame({
        "good_client_share_cumsum": table["good_cum"],
        "bad_client_share_cumsum": table["bad_cum"],
    })
    return gini_trapz(curve,
                      y_col="bad_client_share_cumsum",
                      x_col="good_client_share_cumsum",
                      signed=False)

def _initial_order(stats_df, ordered=False, explicit_order=None, nominal_order_key="bad_rate"):
    if ordered:
        order = list(explicit_order) if explicit_order is not None else list(stats_df["modality"])
        order = [m for m in order if m in set(stats_df["modality"])] + \
                [m for m in stats_df["modality"] if m not in set(order)]
    else:
        order = list(stats_df.sort_values(nominal_order_key)["modality"])
    return order

def _reorder_after_merge(groups, stats_df, ordered, nominal_order_key="bad_rate"):
    if ordered:
        return groups

    def grp_bad_rate(mods):
        sub = stats_df[stats_df["modality"].isin(mods)]
        nb, nt = sub["n_bad"].sum(), sub["n_total"].sum()
        return (nb / nt) if nt > 0 else 0.0

    return sorted(groups, key=lambda mods: grp_bad_rate(mods))

def maximize_gini_via_merging(
    df, col, target_col,
    include_missing=True, missing_label="__MISSING__",
    ordered=False, explicit_order=None,
    max_bins=6, min_bin_size=200,
    order_key_for_curve="bad_rate", nominal_order_key="bad_rate"
):
    """
    Greedily merge the modalities of a categorical column until the binning
    satisfies the constraints, keeping |Gini| as high as possible.

    Starting from one bin per modality, and as long as there are more than
    max_bins bins or a bin smaller than min_bin_size, the pair of adjacent bins
    whose merge gives the highest |Gini| is merged. Merging stops when a single
    bin remains, even if that bin is still smaller than min_bin_size.

    Returns a dict with:
    - "mapping": modality -> bin index
    - "gini_before" / "gini_after": |Gini| with one bin per modality / after merging
    - "bins_table": per-bin statistics (see _groups_df_from_bins)
    - "bins": list of tuples of modalities
    """
    stats_df = _cat_stats(df, col, target_col, include_missing, missing_label)
    order = _initial_order(stats_df, ordered, explicit_order, nominal_order_key)
    groups = [[m] for m in order]
    if len(groups) <= 1:
        # Nothing to merge: zero or one modality.
        mapping = {m: 0 for m in order}
        gdf_final = _groups_df_from_bins(stats_df, groups, order_key_for_curve, True)
        g = _gini_from_bins(stats_df, groups, order_key_for_curve, True)
        return {"mapping": mapping, "gini_before": float(g), "gini_after": float(g),
                "bins_table": gdf_final, "bins": [tuple(mods) for mods in groups]}

    def constraints_ok(candidate):
        # Both the bin count and the minimum bin size must be respected.
        if max_bins is not None and len(candidate) > max_bins:
            return False
        if min_bin_size and min_bin_size > 0:
            for mods in candidate:
                if int(stats_df[stats_df["modality"].isin(mods)]["n_total"].sum()) < min_bin_size:
                    return False
        return True

    def merge_at(current, i):
        # Merge bins i and i+1, then restore the bad-rate order (nominal case).
        merged = current[:i] + [current[i] + current[i + 1]] + current[i + 2:]
        return _reorder_after_merge(merged, stats_df, ordered, nominal_order_key)

    # Greedy merge while the constraints are violated. The len() guard stops
    # the loop once a single bin is left: without it, a column with fewer than
    # min_bin_size rows in total would try to merge a non-existent neighbour.
    while len(groups) > 1 and not constraints_ok(groups):
        best_g, best_i = -np.inf, 0  # falls back on the first pair if no score compares
        for i in range(len(groups) - 1):
            g_try = _gini_from_bins(stats_df, merge_at(groups, i), order_key_for_curve, True)
            if g_try > best_g:
                best_g, best_i = g_try, i
        groups = merge_at(groups, best_i)

    gini_before = _gini_from_bins(stats_df, [[m] for m in order], order_key_for_curve, True)
    gini_after = _gini_from_bins(stats_df, groups, order_key_for_curve, True)
    final_bins = [tuple(mods) for mods in groups]
    mapping = {m: b for b, mods in enumerate(final_bins) for m in mods}
    gdf_final = _groups_df_from_bins(stats_df, groups, order_key_for_curve, True)
    return {"mapping": mapping, "gini_before": float(gini_before), "gini_after": float(gini_after),
            "bins_table": gdf_final, "bins": final_bins}

# ==========================================================
# Étape 4 — Binning numérique (quantiles glouton, max |Gini|)
#            + conversions dates -> jours + edges sûres
# ==========================================================
def _is_period_dtype(dt):
    try:
        return pd.api.types.is_period_dtype(dt)
    except Exception:
        return False

def _to_float_series(s):
    # Period -> début de période -> jours depuis epoch
    if _is_period_dtype(s.dtype):
        ts = s.dt.to_timestamp(how="start")
        days = (ts.view("int64") // 86_400_000_000_000)  # ns -> jours
        return days.astype("float64")
    # Datetime -> jours depuis epoch (protège les tz)
    if pd.api.types.is_datetime64_any_dtype(s):
        if hasattr(s.dt, "tz") and s.dt.tz is not None:
            s = s.dt.tz_localize(None)
        days = (s.astype("datetime64[ns]").view("int64") // 86_400_000_000_000)
        return days.astype("float64")
    # Numérique
    if pd.api.types.is_numeric_dtype(s):
        return pd.to_numeric(s, errors="coerce").astype("float64")
    # Objet -> numérique (coerce)
    return pd.to_numeric(s, errors="coerce").astype("float64")

def _safe_edges_for_cut(edges, s_float):
    """
    edges: liste triée [-inf, t1, ..., +inf] -> array strictement croissante
    élargit extrémités et corrige les égalités numériques.
    """
    e = np.array(edges, dtype="float64")
    for i in range(1, len(e)):
        if not (e[i] > e[i - 1]):
            e[i] = np.nextafter(e[i - 1], np.inf)

    s_vals = s_float.to_numpy()
    try:
        s_min = float(np.nanmin(s_vals))
        s_max = float(np.nanmax(s_vals))
    except ValueError:
        # tout NaN
        s_min, s_max = -1.0, 1.0

    rel_eps_lo = 1e-6 * (abs(e[1]) + 1.0) if len(e) > 1 else 1e-6
    rel_eps_hi = 1e-6 * (abs(e[-2]) + 1.0) if len(e) > 1 else 1e-6
    if len(e) >= 2:
        e[0] = min(e[1] - rel_eps_lo, s_min - rel_eps_lo)
        e[-1] = max(e[-2] + rel_eps_hi, s_max + rel_eps_hi)
    return e

def _gini_from_numeric_bins(y_int, x_float, edges, include_missing=True):
    """
    Evaluate a numeric binning: return (|Gini|, per-bin table).

    Values are assigned to right-closed bins delimited by the interior edges
    (edges[1:-1]). NaN values are kept out of the regular bins and, when
    include_missing is True and at least one NaN exists, form one extra bin
    numbered len(edges) - 1. Bins are ordered by bad rate before the cumulative
    shares are computed. An empty table yields (0.0, table).
    """
    targets = y_int.astype(int).to_numpy()
    values = x_float.to_numpy()
    missing = np.isnan(values)
    assigned = np.digitize(values, edges[1:-1], right=True)
    n_bins = len(edges) - 1

    def _row(bin_id, members):
        size = int(members.sum())
        bad = int(targets[members].sum())
        return {"bin": bin_id, "n_total": size, "n_bad": bad, "n_good": size - bad,
                "bad_rate": bad / size if size > 0 else 0.0}

    rows = [_row(k, (assigned == k) & ~missing) for k in range(n_bins)]
    if include_missing and missing.any():
        rows.append(_row(n_bins, missing))

    table = pd.DataFrame(rows)
    if table.empty:
        return 0.0, table

    # Shares are relative to the whole sample; a zero denominator becomes 1.
    total_bad = int(targets.sum())
    total_good = len(targets) - total_bad
    table["bad_share"] = table["n_bad"] / (total_bad if total_bad > 0 else 1)
    table["good_share"] = table["n_good"] / (total_good if total_good > 0 else 1)

    table = table.sort_values("bad_rate").reset_index(drop=True)
    table["bad_cum"] = table["bad_share"].cumsum()
    table["good_cum"] = table["good_share"].cumsum()

    curve = pd.DataFrame({
        "good_client_share_cumsum": table["good_cum"],
        "bad_client_share_cumsum": table["bad_cum"],
    })
    gini = gini_trapz(curve,
                      y_col="bad_client_share_cumsum",
                      x_col="good_client_share_cumsum",
                      signed=False)
    return gini, table

def optimize_numeric_binning_by_quantiles(
    df, col, target_col,
    max_bins=6, min_bin_size=200,
    n_quantiles=50, q_low=0.02, q_high=0.98,
    include_missing=True, min_gain=1e-5
):
    """
    Greedy forward selection of cut points for a numeric column.

    Candidate thresholds are n_quantiles quantiles of the column taken between
    q_low and q_high. At each step the threshold with the largest |Gini| gain
    is added, provided the gain exceeds min_gain and every resulting bin holds
    at least min_bin_size non-missing rows; the search stops at max_bins bins
    or when no candidate qualifies.

    Returns a dict with:
    - "edges": learned edges, starting at -inf and ending at +inf
    - "edges_for_cut": finite edges to pass to pd.cut
    - "labels": one text label per bin
    - "gini_before" / "gini_after": |Gini| with a single bin / with the final bins
    - "bins_table": per-bin statistics (empty for a column with < 2 distinct values)
    """
    s = _to_float_series(df[col])
    y = df[target_col].astype(int)
    observed = s.dropna()
    if observed.nunique() < 2:
        # Zero or one distinct value -> a single bin.
        g0, _ = _gini_from_numeric_bins(y, s, [-np.inf, np.inf], include_missing)
        cut_edges = [-1.0, 1.0]  # placeholder when there is no usable value
        if not observed.empty:
            value = float(observed.iloc[0])
            if np.isfinite(value):
                # Bracket the constant so that pd.cut puts it in bin 0 instead
                # of coding it as out of range.
                pad = 1e-6 * (abs(value) + 1.0)
                cut_edges = [value - pad, value + pad]
        return {"edges": [-np.inf, np.inf], "edges_for_cut": cut_edges, "labels": ["(-inf, inf]"],
                "gini_before": float(g0), "gini_after": float(g0), "bins_table": pd.DataFrame()}

    qs = np.linspace(q_low, q_high, n_quantiles)
    cand_vals = np.unique(s.quantile(qs).dropna().to_numpy())
    edges = [-np.inf, np.inf]

    # Loop invariants of the size check, computed once.
    x_arr = s.to_numpy()
    valid = ~np.isnan(x_arr)

    def edges_ok(e):
        # Every bin must contain at least min_bin_size non-missing rows.
        n_bins = len(e) - 1
        bins_idx = np.digitize(x_arr, e[1:-1], right=True)
        counts = np.bincount(bins_idx[valid], minlength=n_bins)
        return bool((counts[:n_bins] >= min_bin_size).all())

    gini0, _ = _gini_from_numeric_bins(y, s, edges, include_missing)
    best_gini = gini0
    improved = True
    while improved and (len(edges) - 1) < max_bins:
        improved = False
        best_gain = min_gain
        best_t = None
        g_best = best_gini
        for t in cand_vals:
            if t in edges:
                continue
            new_edges = sorted([*edges, t])
            # Skip thresholds that are numerically indistinguishable from a neighbour.
            if any(np.isclose(new_edges[i], new_edges[i + 1]) for i in range(len(new_edges) - 1)):
                continue
            if not edges_ok(new_edges):
                continue
            g_try, _ = _gini_from_numeric_bins(y, s, new_edges, include_missing)
            gain = g_try - best_gini
            if gain > best_gain:
                best_gain, best_t, g_best = gain, t, g_try
        if best_t is not None:
            edges = sorted([*edges, best_t])
            best_gini = g_best
            improved = True

    gini_after, bins_table = _gini_from_numeric_bins(y, s, edges, include_missing)
    e = sorted(edges)
    e_cut = _safe_edges_for_cut(e, s)
    labels = [f"({e[i]}, {e[i + 1]}]" for i in range(len(e) - 1)]
    return {"edges": e, "edges_for_cut": e_cut, "labels": labels,
            "gini_before": float(gini0), "gini_after": float(gini_after),
            "bins_table": bins_table}

# ==========================================================
# Parallélisation — helpers
# ==========================================================
def _compute_cat_bin_result(df_small, col, target_col,
                            include_missing, missing_label,
                            is_ord, explicit_order,
                            max_bins, min_bin_size,
                            order_key_for_curve, nominal_order_key):
    """
    Worker for the parallel categorical binning.

    df_small is meant to hold only [col, target_col]. Returns (col, result) so
    that results can be matched back to their column.
    """
    merge_options = dict(
        include_missing=include_missing,
        missing_label=missing_label,
        ordered=is_ord,
        explicit_order=explicit_order,
        max_bins=max_bins,
        min_bin_size=min_bin_size,
        order_key_for_curve=order_key_for_curve,
        nominal_order_key=nominal_order_key,
    )
    return col, maximize_gini_via_merging(df_small, col, target_col, **merge_options)

def _compute_num_bin_result(df_small, col, target_col,
                            max_bins, min_bin_size, n_quantiles,
                            include_missing):
    """
    Worker for the parallel numeric binning.

    df_small is meant to hold only [col, target_col]. Returns (col, result) so
    that results can be matched back to their column.
    """
    search_options = dict(
        max_bins=max_bins,
        min_bin_size=min_bin_size,
        n_quantiles=n_quantiles,
        include_missing=include_missing,
    )
    return col, optimize_numeric_binning_by_quantiles(df_small, col, target_col, **search_options)

# ==========================================================
# Étape 3 bis — Catégorielles (parallélisées)
# ==========================================================
def auto_bin_all_categoricals(
    df, cat_columns, target_col,
    include_missing=True, missing_label="__MISSING__",
    ordinal_cols=None, explicit_orders=None,
    max_bins=6, min_bin_size=200,
    order_key_for_curve="bad_rate", nominal_order_key="bad_rate",
    add_binned_columns=True, bin_col_suffix="__BIN",
    n_jobs=1, verbose=0
):
    """
    Bin every column of cat_columns with maximize_gini_via_merging.

    The binnings are computed in parallel with joblib when n_jobs != 1 and
    joblib is available, sequentially otherwise. The learned mappings are then
    applied one column at a time to a copy of df, adding "<col><bin_col_suffix>"
    columns (nullable Int64) when add_binned_columns is True.

    Returns a dict with:
    - "results": column -> result of maximize_gini_via_merging
    - "summary": one row per variable, sorted by decreasing gini_after
      (empty, but with its columns, when cat_columns is empty)
    - "df": copy of df with the binned columns
    """
    summary_columns = ["variable", "type", "n_bins_final",
                       "gini_before", "gini_after", "gini_gain"]
    ordinal_cols = set(ordinal_cols or [])
    explicit_orders = explicit_orders or {}
    df_out = df.copy()
    results, summary_rows = {}, []

    # 1) Compute the binnings (no mapping applied at this stage).
    if n_jobs != 1 and Parallel is not None and len(cat_columns) > 0:
        tasks = [
            delayed(_compute_cat_bin_result)(
                df_out[[col, target_col]].copy(),  # ship only the two needed columns
                col, target_col,
                include_missing, missing_label,
                (col in ordinal_cols), explicit_orders.get(col),
                max_bins, min_bin_size,
                order_key_for_curve, nominal_order_key
            )
            for col in cat_columns
        ]
        for col, res in Parallel(n_jobs=n_jobs, backend="loky", verbose=verbose)(tasks):
            results[col] = res
    else:
        for col in cat_columns:
            results[col] = maximize_gini_via_merging(
                df=df_out, col=col, target_col=target_col,
                include_missing=include_missing, missing_label=missing_label,
                ordered=(col in ordinal_cols), explicit_order=explicit_orders.get(col),
                max_bins=max_bins, min_bin_size=min_bin_size,
                order_key_for_curve=order_key_for_curve, nominal_order_key=nominal_order_key
            )

    # 2) Apply the mappings sequentially to limit memory usage.
    for col, res in results.items():
        summary_rows.append({
            "variable": col, "type": "categorical",
            "n_bins_final": len(res["bins"]),
            "gini_before": res["gini_before"],
            "gini_after": res["gini_after"],
            "gini_gain": res["gini_after"] - res["gini_before"]
        })
        if add_binned_columns and col in df_out.columns:
            ser = df_out[col].astype("object")
            if include_missing:
                ser = ser.where(ser.notna(), missing_label)
            df_out[col + bin_col_suffix] = ser.map(res["mapping"]).astype("Int64")

    # Explicit columns keep the sort valid when no variable was binned.
    summary = (pd.DataFrame(summary_rows, columns=summary_columns)
               .sort_values("gini_after", ascending=False)
               .reset_index(drop=True))
    return {"results": results, "summary": summary, "df": df_out}

# ==========================================================
# Étape 4 bis — Numériques (parallélisées)
# ==========================================================
def auto_bin_all_numerics(
    df, target_col,
    max_bins=6, min_bin_size=200,
    n_quantiles=50, include_missing=True,
    add_binned_columns=True, bin_col_suffix="__BIN",
    exclude_ids=None,
    n_jobs=1, verbose=0
):
    """
    Detect and bin the numeric, Period and datetime columns of df with
    optimize_numeric_binning_by_quantiles.

    Skipped: target_col, exclude_ids, columns already ending with
    bin_col_suffix (bins produced by a previous step), boolean and 0/1 columns,
    integer columns with at most 8 distinct values, and numeric columns whose
    lower-cased name contains an identifier-like keyword.

    The binnings are computed in parallel with joblib when n_jobs != 1 and
    joblib is available, sequentially otherwise. The cuts are then applied one
    column at a time to a copy of df, adding "<col><bin_col_suffix>" columns
    (nullable Int64; -1 marks values that fall in no bin, including missing).

    Returns a dict with:
    - "results": column -> result of optimize_numeric_binning_by_quantiles
    - "summary": one row per variable, sorted by decreasing gini_after
      (empty, but with its columns, when no column qualifies)
    - "df": copy of df with the binned columns
    """
    summary_columns = ["variable", "type", "n_bins_final",
                       "gini_before", "gini_after", "gini_gain"]
    id_keywords = ['id', 'sequence', 'postal', 'zip', 'msa', 'code', 'seller', 'servicer']
    exclude_ids = set(exclude_ids or [])
    df_out = df.copy()
    results, summary_rows = {}, []

    # Detect numeric / Period / datetime columns.
    numeric_cols = []
    for col in df.columns:
        if col == target_col or col in exclude_ids:
            continue
        if bin_col_suffix and str(col).endswith(bin_col_suffix):
            # Already a bin code (e.g. from the categorical step): do not re-bin.
            continue
        s = df[col]
        if pd.api.types.is_numeric_dtype(s) and not pd.api.types.is_bool_dtype(s):
            if s.dropna().isin([0, 1]).all():
                continue
            if pd.api.types.is_integer_dtype(s) and s.dropna().nunique() <= 8:
                continue
            if any(k in col.lower() for k in id_keywords):
                continue
            numeric_cols.append(col)
        elif _is_period_dtype(s.dtype) or pd.api.types.is_datetime64_any_dtype(s):
            numeric_cols.append(col)

    # 1) Compute the binnings (no cut applied at this stage).
    if n_jobs != 1 and Parallel is not None and len(numeric_cols) > 0:
        tasks = [
            delayed(_compute_num_bin_result)(
                df_out[[col, target_col]].copy(),  # ship only the two needed columns
                col, target_col,
                max_bins, min_bin_size, n_quantiles,
                include_missing
            )
            for col in numeric_cols
        ]
        for col, res in Parallel(n_jobs=n_jobs, backend="loky", verbose=verbose)(tasks):
            results[col] = res
    else:
        for col in numeric_cols:
            results[col] = optimize_numeric_binning_by_quantiles(
                df=df_out, col=col, target_col=target_col,
                max_bins=max_bins, min_bin_size=min_bin_size,
                n_quantiles=n_quantiles, include_missing=include_missing
            )

    # 2) Apply the cuts sequentially to limit memory usage.
    for col, res in results.items():
        summary_rows.append({
            "variable": col, "type": "numeric",
            "n_bins_final": len(res["edges"]) - 1,
            "gini_before": res["gini_before"],
            "gini_after": res["gini_after"],
            "gini_gain": res["gini_after"] - res["gini_before"]
        })
        if add_binned_columns and col in df_out.columns:
            s = _to_float_series(df_out[col])
            b = pd.cut(s, bins=res["edges_for_cut"], include_lowest=True, duplicates="drop")
            b = b.cat.codes.astype("Int64")
            if include_missing and s.isna().any():
                b = b.where(~s.isna(), -1).astype("Int64")
            df_out[col + bin_col_suffix] = b

    # Explicit columns keep the sort valid when no variable was binned.
    summary = (pd.DataFrame(summary_rows, columns=summary_columns)
               .sort_values("gini_after", ascending=False)
               .reset_index(drop=True))
    return {"results": results, "summary": summary, "df": df_out}

# ============================================
# Étape 5 — Assemblage final + One-Hot des BIN
# ============================================
def build_final_datasets(out_cat, out_num, drop_original=True, bin_col_suffix="__BIN",
                         keep_vars=None):
    """
    Assemble the final datasets from the categorical and numeric binning outputs.

    Parameters
    ----------
    out_cat, out_num : dicts with keys "df" and "results", as returned by
        auto_bin_all_categoricals / auto_bin_all_numerics.
    drop_original : if True, df_binned drops the raw variables and renames the
        bin columns to the raw names; otherwise df_binned is a copy of df_enrichi.
    bin_col_suffix : suffix identifying the bin columns.
    keep_vars : iterable of variable names (without the bin suffix) to retain.
        None keeps every binned variable. Variables left out are removed from
        df_binned (when drop_original is True) and from df_ohe, bin columns
        included.

    Returns
    -------
    (df_enrichi, df_binned, df_ohe):
    - df_enrichi: out_num["df"] plus the categorical bin columns it lacks
    - df_binned: see drop_original
    - df_ohe: one-hot encoding of the retained bin columns, columns sorted by name
    """
    df_enrichi = out_num["df"].copy()
    # Also pick up the categorical bin columns that only exist in out_cat.
    cat_df = out_cat["df"]
    for c in cat_df.columns:
        if c.endswith(bin_col_suffix) and c not in df_enrichi.columns:
            df_enrichi[c] = cat_df[c]

    cat_cols_all = list(out_cat["results"].keys())
    num_cols_all = list(out_num["results"].keys())
    all_vars = cat_cols_all + num_cols_all

    if keep_vars is not None:
        wanted = set(keep_vars)
        kept_vars = [c for c in all_vars if c in wanted]
    else:
        kept_vars = all_vars
    kept_set = set(kept_vars)

    present = set(df_enrichi.columns)
    bin_cols = [c + bin_col_suffix for c in kept_vars if c + bin_col_suffix in present]
    # Bin columns of the variables filtered out by keep_vars: they must not
    # survive as raw integer codes in the model-ready outputs.
    rejected_bin_cols = [c + bin_col_suffix for c in all_vars
                         if c not in kept_set and c + bin_col_suffix in present]

    # Strip the suffix at the END of the name only.
    base_names = {b: b[:len(b) - len(bin_col_suffix)] for b in bin_cols}

    model_base = df_enrichi.drop(columns=all_vars + rejected_bin_cols, errors="ignore")

    if drop_original:
        df_binned = model_base.rename(columns=base_names)
    else:
        df_binned = df_enrichi.copy()

    # Final one-hot encoding of the retained bin columns only.
    df_ohe = pd.get_dummies(model_base, columns=bin_cols, prefix=base_names, dummy_na=False)
    # Stable column order.
    df_ohe = df_ohe.reindex(sorted(df_ohe.columns), axis=1)
    return df_enrichi, df_binned, df_ohe

# ============================================
# Étape 6 — LANCEUR complet (protège la cible, parallélisé)
# ============================================
def run_full_pipeline_on_onehot_df(
    df_onehot,
    target_col=None,                         # pass e.g. "default_24m" to force the target
    max_bins_categ=6, min_bin_size_categ=200,
    max_bins_num=6,   min_bin_size_num=200, n_quantiles_num=50,
    include_missing=True, missing_label="__MISSING__", max_levels_object=50,
    bin_col_suffix="__BIN",
    exclude_ids=("loan_sequence_number", "postal_code", "seller_name", "servicer_name", "msa_md"),
    n_jobs_categ=-1, n_jobs_num=-1, verbose=0,
    min_gini_keep=None   # optional: drop variables whose final Gini is below this value
):
    """
    Run the whole binning pipeline on a one-hot encoded DataFrame.

    Steps: rebuild categorical columns from one-hot groups (the target is
    protected when given), resolve the target, bin the categorical then the
    numeric variables, and assemble the final datasets.

    Returns a dict with keys "target", "summary", "df_enrichi", "df_binned",
    "df_ohe", "cat_results" and "num_results".
    """
    # 0) Undo the one-hot encoding, protecting the target when it is provided.
    protected = [target_col] if target_col else None
    DF = deonehot_categoricals(
        df_onehot,
        allow_singleton=False,   # do not swallow ambiguous single-column groups
        exclude_cols=protected
    )

    # 1) Target: use the requested column if present, otherwise infer it.
    if target_col is None:
        TARGET = infer_binary_target(DF)
    elif target_col in DF.columns:
        TARGET = target_col
    else:
        print(f"[INFO] Colonne cible '{target_col}' introuvable après préparation. Inférence automatique...")
        TARGET = infer_binary_target(DF)

    # 2) Categorical variables.
    cat_cols = find_categorical_columns(
        DF, target_col=TARGET,
        max_levels_object=max_levels_object,
        exclude_ids=exclude_ids
    )
    ordinal_cols, explicit_orders = extract_ordinal_info(DF, cat_cols)
    out_cat = auto_bin_all_categoricals(
        df=DF, cat_columns=cat_cols, target_col=TARGET,
        include_missing=include_missing, missing_label=missing_label,
        ordinal_cols=ordinal_cols, explicit_orders=explicit_orders,
        max_bins=max_bins_categ, min_bin_size=min_bin_size_categ,
        order_key_for_curve="bad_rate", nominal_order_key="bad_rate",
        add_binned_columns=True, bin_col_suffix=bin_col_suffix,
        n_jobs=n_jobs_categ, verbose=verbose
    )

    # 3) Numeric variables, computed on the frame enriched by step 2.
    out_num = auto_bin_all_numerics(
        df=out_cat["df"], target_col=TARGET,
        max_bins=max_bins_num, min_bin_size=min_bin_size_num,
        n_quantiles=n_quantiles_num, include_missing=include_missing,
        add_binned_columns=True, bin_col_suffix=bin_col_suffix,
        exclude_ids=exclude_ids,
        n_jobs=n_jobs_num, verbose=verbose
    )

    # 4) Combined summary and optional Gini filter.
    summary = pd.concat([out_cat["summary"], out_num["summary"]], ignore_index=True)
    summary = summary.sort_values(["type", "gini_after"], ascending=[True, False])
    summary = summary.reset_index(drop=True)

    keep_vars = None
    if min_gini_keep is not None:
        threshold = float(min_gini_keep)
        keep_vars = summary.loc[summary["gini_after"] >= threshold, "variable"].tolist()
        if verbose:
            nb_drop = (summary["gini_after"] < threshold).sum()
            print(f"[INFO] min_gini_keep={min_gini_keep} -> exclusion de {nb_drop} variables.")

    df_enrichi, df_binned, df_ohe = build_final_datasets(
        out_cat, out_num,
        drop_original=True,
        bin_col_suffix=bin_col_suffix,
        keep_vars=keep_vars
    )

    return {
        "target": TARGET,
        "summary": summary,
        "df_enrichi": df_enrichi,    # still holds the bin columns (with suffix)
        "df_binned": df_binned,      # raw variables dropped, bin columns renamed
        "df_ohe": df_ohe,            # final one-hot encoding
        "cat_results": out_cat["results"],
        "num_results": out_num["results"]
    }

# ============================================
# Étape 7 — Transformer val/test avec bins appris (protège la cible)
# ============================================
def transform_with_learned_bins(df_raw_onehot, res, bin_col_suffix="__BIN",
                                include_missing=True,
                                exclude_ids=("loan_sequence_number", "postal_code", "seller_name", "servicer_name", "msa_md"),
                                missing_label="__MISSING__"):
    """Apply bins learned on the training set to a new frame (validation/test).

    Parameters
    ----------
    df_raw_onehot : DataFrame
        Raw frame, passed through ``deonehot_categoricals`` first. The target
        column is listed in ``exclude_cols`` so it is left out of that step.
    res : dict
        Result of the fit. Keys read here: ``"target"``, ``"cat_results"``
        (per variable, a dict holding ``"mapping"``) and ``"num_results"``
        (per variable, a dict holding ``"edges_for_cut"``).
    bin_col_suffix : str
        Suffix appended to a variable name to form its bin-code column.
    include_missing : bool
        When true, rows whose numeric value is NaN are explicitly set to -1.
        NOTE(review): ``pd.cut`` already yields code -1 for NaN (and for
        values outside the learned edges), so this flag currently does not
        change the resulting codes.
    exclude_ids : iterable of str
        Identifier columns removed from the returned frame when present.
    missing_label : str
        Label substituted for missing categorical values before the learned
        mapping is applied. Presumably it must equal the label used at fit
        time — verify against the fitting code.

    Returns
    -------
    DataFrame
        Frame where each bin-code column is one-hot encoded (dummy names are
        prefixed with the original variable name), columns sorted by name,
        identifier columns dropped. Variables absent from the input are
        skipped silently.
    """
    DF = deonehot_categoricals(
        df_raw_onehot,
        allow_singleton=False,
        exclude_cols=[res["target"]]  # keep the target column out of the de-one-hot step
    )

    # 1) Categorical variables: apply the learned category -> bin mapping.
    for col, r in res["cat_results"].items():
        if col not in DF.columns:
            continue
        s = DF[col].astype("object").where(DF[col].notna(), missing_label)
        mapped = s.map(r["mapping"]).astype("Int64")
        # Categories that are not keys of the mapping (never seen at fit) -> -2.
        DF[col + bin_col_suffix] = mapped.fillna(-2).astype("Int64")

    # 2) Numeric variables: cut with the learned edges.
    for col, r in res["num_results"].items():
        if col not in DF.columns:
            continue
        s = _to_float_series(DF[col])
        e = np.array(r["edges_for_cut"], dtype="float64")
        b = pd.cut(s, bins=e, include_lowest=True, duplicates="drop")
        # Categorical codes are -1 for NaN and for values outside the edges.
        b = b.cat.codes.astype("Int64")
        if include_missing and s.isna().any():
            b = b.where(~s.isna(), -1).astype("Int64")
        DF[col + bin_col_suffix] = b

    # 3) Final one-hot encoding of the bin-code columns.
    learned_cols = list(res["cat_results"]) + list(res["num_results"])
    # Map each bin column to its source variable name directly, instead of
    # stripping the suffix with str.replace (which would also remove the
    # suffix text from the middle of a variable name).
    prefix_by_bin_col = {
        col + bin_col_suffix: col
        for col in learned_cols
        if col + bin_col_suffix in DF.columns
    }

    df_model = pd.get_dummies(
        DF.drop(columns=learned_cols, errors="ignore"),
        columns=list(prefix_by_bin_col),
        prefix=prefix_by_bin_col,
        dummy_na=False
    )
    # Stable column order.
    df_model = df_model.reindex(sorted(df_model.columns), axis=1)
    # Remove identifier columns.
    df_model = df_model.drop(columns=[c for c in exclude_ids if c in df_model.columns], errors="ignore")
    return df_model

# ============================================
# Étape 8 — Plots des courbes (départ à 0,0)
# ============================================
def _curve_from_binned(df, bcol, target):
    """Build the concentration curve of ``target`` over the bins in ``bcol``.

    Bins are ordered by increasing bad rate; the curve is the cumulative
    share of goods (x) against the cumulative share of bads (y), starting at
    (0, 0). Missing bin codes are grouped under code -1.

    Returns ``(curve_df, gini)`` where ``curve_df`` has the columns
    ``good_client_share_cumsum`` and ``bad_client_share_cumsum`` and ``gini``
    is the value returned by ``gini_trapz(curve_df, signed=False)``. When the
    data holds no bad or no good at all, the two-point diagonal and 0.0 are
    returned.
    """
    labels = df[target].astype(int)
    codes = df[bcol].astype("Int64").fillna(-1).astype("int64")

    # Per-bin counts: sum of the target = bads, row count = total.
    frame = pd.DataFrame({bcol: codes, target: labels})
    per_bin = frame.groupby(bcol)[target].agg(["sum", "count"])
    per_bin.columns = ["n_bad", "n_total"]
    per_bin["n_good"] = per_bin["n_total"] - per_bin["n_bad"]

    total_bad = int(per_bin["n_bad"].sum())
    total_good = int(per_bin["n_good"].sum())
    if not (total_bad and total_good):
        # Only one class present: no discrimination can be measured.
        diagonal = pd.DataFrame({"good_client_share_cumsum": [0.0, 1.0],
                                 "bad_client_share_cumsum": [0.0, 1.0]})
        return diagonal, 0.0

    # Guard the denominator against a zero count.
    safe_total = per_bin["n_total"].where(per_bin["n_total"] > 0, 1)
    per_bin["bad_rate"] = per_bin["n_bad"] / safe_total
    per_bin["bad_share"] = per_bin["n_bad"] / total_bad
    per_bin["good_share"] = per_bin["n_good"] / total_good

    # Stable sort so bins with equal bad rates keep their original order.
    ordered = per_bin.sort_values("bad_rate", kind="mergesort")
    curve = pd.DataFrame({
        "good_client_share_cumsum": np.concatenate(([0.0], ordered["good_share"].cumsum().to_numpy())),
        "bad_client_share_cumsum": np.concatenate(([0.0], ordered["bad_share"].cumsum().to_numpy())),
    })
    return curve, float(gini_trapz(curve, signed=False))

def plot_all_concentration_curves_from_binned(res, top_n=None, types=("categorical", "numeric"),
                                              bin_col_suffix="__BIN"):
    """Plot one concentration curve per binned variable, best Gini first.

    Parameters
    ----------
    res : dict
        Pipeline result. Keys read here: ``"df_enrichi"`` (frame holding the
        bin-code columns), ``"target"``, ``"cat_results"`` and
        ``"num_results"`` (only their keys, i.e. the variable names, are used).
    top_n : int or None
        When given, only the ``top_n`` variables with the highest Gini are
        plotted.
    types : iterable of str
        Variable families to include: ``"categorical"`` and/or ``"numeric"``.
    bin_col_suffix : str
        Suffix that forms the bin-code column name (``var + bin_col_suffix``).
        Must match the suffix used when the bin columns were created.

    Variables whose bin column is absent from ``res["df_enrichi"]`` are
    skipped. Each figure is shown with ``plt.show()`` and a summary line is
    printed; nothing is returned.
    """
    df_base = res["df_enrichi"]  # holds the bin-code columns
    target = res["target"]

    # Compute every curve first so they can be ranked before plotting.
    rows = []
    for t, store in (("categorical", res["cat_results"]), ("numeric", res["num_results"])):
        if t not in types:
            continue
        for var in store:
            bcol = f"{var}{bin_col_suffix}"
            if bcol not in df_base.columns:
                continue
            df_cum, g = _curve_from_binned(df_base, bcol, target)
            rows.append((t, var, g, df_cum))
    rows.sort(key=lambda x: x[2], reverse=True)
    if top_n is not None:
        rows = rows[:int(top_n)]

    # One figure per variable, with the diagonal as reference.
    for t, var, g, df_cum in rows:
        plt.figure(figsize=(6, 6))
        plt.plot(df_cum["good_client_share_cumsum"], df_cum["bad_client_share_cumsum"], marker="o")
        plt.plot([0, 1], [0, 1], linestyle="--")  # no explicit colour
        plt.title(f"{var} [{t}] — Gini = {g:.4f}")
        plt.xlabel("Cumulative good share")
        plt.ylabel("Cumulative bad share")
        plt.grid(True)
        plt.xlim(0, 1)
        plt.ylim(0, 1)
        plt.tight_layout()
        plt.show()
        print(f"{t} — {var}: Gini = {g:.6f}, nb_points={len(df_cum)}")

# =======================
# Exemple d'utilisation :
# =======================

# 1) Fit on the training set (frame that is one-hot encoded, or a mix of
#    one-hot and raw columns). `res` holds the learned bins reused below.
res = run_full_pipeline_on_onehot_df(
    df_onehot=df_train_imp,
    target_col="default_24m",   # target column; kept out of the de-one-hot step
    max_bins_categ=6, min_bin_size_categ=200,
    max_bins_num=6,   min_bin_size_num=200, n_quantiles_num=50,
    include_missing=True, missing_label="__MISSING__", bin_col_suffix="__BIN",
    n_jobs_categ=-1, n_jobs_num=-1,  # -1: use all available cores
    verbose=10,
    # min_gini_keep=1e-6,        # uncomment to automatically exclude variables that are too weak
)

# 2) Final training set for the model (X, y): drop identifier columns, then
#    split off the target. `pop` removes the target from `df_final` in place,
#    so `X_train` is the same object as `df_final`, without the target.
cols_id = ["loan_sequence_number", "postal_code", "seller_name", "servicer_name", "msa_md"]
df_final = res["df_ohe"].drop(columns=[c for c in cols_id if c in res["df_ohe"].columns], errors="ignore")
y_train = df_final.pop(res["target"]).astype(int)
X_train = df_final

# 3) Transform validation/test with the bins learned on train.
df_val_final = transform_with_learned_bins(df_val_imp, res)
y_val = df_val_final.pop(res["target"]).astype(int)
# Align on the training columns: columns missing from validation are created
# and filled with 0; columns that exist only in validation are discarded.
X_val = df_val_final.reindex(columns=X_train.columns, fill_value=0)

# 4) (Optional) Plot the 30 curves with the highest Gini.
plot_all_concentration_curves_from_binned(res, top_n=30)


Missing flags (train): ['cs_missing', 'mi_missing', 'dti_missing', 'cltv_missing', 'original_loan_term_missing', 'number_of_borrowers_missing']
(9590892, 35)
(931745, 35)


KeyboardInterrupt: 