# Credit Risk Model Stability — Submission Notebook

Self-contained Kaggle-submittable notebook. Reproduces the full pipeline:

1. Load raw parquet data
2. Build features (depth-0, depth-1, depth-2 aggregations)
3. Train tuned CatBoost / LightGBM / XGBoost with 5-fold StratifiedGroupKFold
4. Stack via CalibratedClassifierCV(RidgeClassifier) meta-learner
5. Save `submission.csv`

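**No Optuna tuning loops** — the notebook uses fixed, pre-tuned hyperparameters: the built-in defaults below, replaced by the values in `../artifacts/best_params.json` when that file exists and the notebook is run outside Kaggle.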

In [None]:
import gc
import json
import os
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
import polars as pl
import lightgbm as lgb
from catboost import CatBoostClassifier
from category_encoders import CatBoostEncoder
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedGroupKFold, StratifiedKFold
from xgboost import XGBClassifier

warnings.filterwarnings("ignore")

# ── Paths ────────────────────────────────────────────────────────
# The /kaggle/input mount is used as the marker for a Kaggle session.
KAGGLE = os.path.exists("/kaggle/input")
if KAGGLE:
    DATA_PATH = Path("/kaggle/input/home-credit-credit-risk-model-stability")
else:
    DATA_PATH = Path("../data")

# ── Tuned hyperparameters ────────────────────────────────────────
# Built-in defaults per model. Outside Kaggle, each entry is replaced by the
# "best_params" stored in ../artifacts/best_params.json when that file exists.
TUNED_PARAMS = {
    "catboost": {
        "depth": 6,
        "learning_rate": 0.05,
        "l2_leaf_reg": 3.0,
        "subsample": 0.8,
        "colsample_bylevel": 0.8,
        "bootstrap_type": "Bernoulli",
    },
    "lightgbm": {
        "max_depth": 7,
        "num_leaves": 64,
        "learning_rate": 0.05,
        "reg_lambda": 3.0,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
    },
    "xgboost": {
        "max_depth": 6,
        "learning_rate": 0.05,
        "reg_lambda": 3.0,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
    },
}

# No override file is looked up on Kaggle.
params_path = None if KAGGLE else Path("../artifacts/best_params.json")
if params_path is not None and params_path.exists():
    with open(params_path) as f:
        saved = json.load(f)
    for model_name in TUNED_PARAMS:
        has_override = model_name in saved and "best_params" in saved[model_name]
        if has_override:
            # The saved dict replaces the defaults wholesale (no key-level merge).
            TUNED_PARAMS[model_name] = saved[model_name]["best_params"]
    print("Loaded tuned params from", params_path)
else:
    print("Using default tuned params (update from Optuna results)")

for name, params in TUNED_PARAMS.items():
    print(f"  {name}: {params}")

# ── Cross-validation / ensembling constants ─────────────────────
N_SPLITS = 5                     # folds for the base models and the stacking CV
SEED = 42                        # base seed; per-fold seeds are SEED + fold
ENSEMBLE_SEEDS = [42, 123, 456]  # seeds averaged by the stacking meta-learner
ENSEMBLE_ALPHA = 1.0             # RidgeClassifier alpha of the meta-learner

---

## Pipeline Functions

In [None]:
def load_table_group(data_path, table_name, split="train"):
    """Read every parquet shard of one table group and return a single frame.

    Shards are the files ``<data_path>/parquet_files/<split>/<split>_<table_name>*.parquet``,
    processed in sorted filename order. Multiple shards are concatenated with
    ``how="vertical_relaxed"``; a single shard is returned as read.

    Raises:
        FileNotFoundError: when no file matches the pattern.
    """
    split_dir = Path(data_path) / "parquet_files" / split
    prefix = f"{split}_{table_name}"
    shards = sorted(split_dir.glob(f"{prefix}*.parquet"))
    if not shards:
        raise FileNotFoundError(f"No files: '{prefix}*.parquet' in {split_dir}")

    frames = [pl.read_parquet(shard) for shard in shards]
    if len(frames) == 1:
        return frames[0]
    return pl.concat(frames, how="vertical_relaxed")


def preprocess_table(df, missing_threshold=0.98, max_string_cardinality=10_000):
    """Shrink a raw table: narrow numerics, then prune unusable columns.

    Steps, in order:
      1. Float64 columns are cast to Float32 and Int64 columns to Int32;
         every other column is kept as is.
      2. Columns whose null fraction exceeds ``missing_threshold`` are dropped
         (skipped for an empty frame).
      3. String columns with more than ``max_string_cardinality`` distinct
         values are dropped.
    """
    def narrowed(name):
        # Select expression for one column, downcast where applicable.
        expr = pl.col(name)
        kind = df[name].dtype
        if kind == pl.Float64:
            return expr.cast(pl.Float32)
        if kind == pl.Int64:
            return expr.cast(pl.Int32)
        return expr

    df = df.select([narrowed(name) for name in df.columns])

    row_count = df.height
    if row_count > 0:
        null_rates = df.null_count() / row_count
        dense_enough = [
            name for name in df.columns
            if null_rates[name][0] <= missing_threshold
        ]
        df = df.select(dense_enough)

    too_many_values = [
        name for name in df.columns
        if df[name].dtype in (pl.String, pl.Utf8)
        and df[name].n_unique() > max_string_cardinality
    ]
    if not too_many_values:
        return df
    return df.drop(too_many_values)

In [None]:
# Polars dtypes handled as numeric features: floats plus every signed and
# unsigned integer width.
NUMERIC_DTYPES = frozenset((
    pl.Float32, pl.Float64,
    pl.Int8, pl.Int16, pl.Int32, pl.Int64,
    pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64,
))

# Polars dtypes handled as string-like / categorical features.
STRING_DTYPES = frozenset((pl.Categorical, pl.Utf8, pl.String))


def handle_dates(df):
    """Turn date and year columns into numeric offsets from ``date_decision``.

    * Columns whose name ends in ``D`` (other than ``date_decision``) become
      the day difference to ``date_decision`` divided by -365, i.e. positive
      when the column's date lies before the decision date.
    * Columns whose name contains ``year`` (case-insensitive) and that are not
      already handled as date columns become ``column - decision year``.

    Both kinds are written back under the original column name as Float32.
    ``date_decision`` and ``MONTH`` are dropped from the result when present.

    Date columns that are string-typed (including ``date_decision`` itself)
    are cast to ``pl.Date`` before the arithmetic, so the function accepts
    frames where dates were read as text as well as frames that already carry
    temporal dtypes.

    Returns ``df`` unchanged when it has no ``date_decision`` column.
    """
    if "date_decision" not in df.columns:
        return df

    def as_date(name):
        # Subtraction and the `.dt` namespace need temporal dtypes; only text
        # columns are cast so already-temporal columns pass through untouched.
        expr = pl.col(name)
        if df[name].dtype in (pl.String, pl.Utf8):
            return expr.cast(pl.Date)
        return expr

    decision = as_date("date_decision")

    date_d_cols = [c for c in df.columns if c.endswith("D") and c != "date_decision"]
    year_cols = [
        c for c in df.columns
        if "year" in c.lower() and c not in date_d_cols and c != "date_decision"
    ]

    exprs = [
        ((as_date(col) - decision).dt.total_days() / -365)
        .cast(pl.Float32).alias(col)
        for col in date_d_cols
    ]
    exprs.extend(
        (pl.col(col) - decision.dt.year()).cast(pl.Float32).alias(col)
        for col in year_cols
    )
    if exprs:
        df = df.with_columns(exprs)
    return df.drop([c for c in ("date_decision", "MONTH") if c in df.columns])


def create_domain_ratios(df):
    """Add ratio features built from the loan amount columns present in ``df``.

    Each ratio is added only when both of its source columns exist:

    * ``loan_burden_ratio``      = price_1097A / annuity_780A
    * ``disbursed_credit_ratio`` = disbursedcredamount_1113A / credamount_770A
    * ``debt_credit_ratio``      = totaldebt_9A / (1 + credamount_770A)
    * ``eir_credit_ratio``       = eir_270L / credamount_770A

    All ratios are Float32. A zero denominator produces a null ratio rather
    than ``inf``/``NaN``, so the features stay finite for downstream models.
    Returns ``df`` unchanged when no ratio can be built.
    """
    COL = {
        "price": "price_1097A", "annuity": "annuity_780A",
        "disbursed": "disbursedcredamount_1113A", "credit_amount": "credamount_770A",
        "total_debt": "totaldebt_9A", "eir": "eir_270L",
    }
    avail = {k: v for k, v in COL.items() if v in df.columns}

    def ratio(numerator, denominator, alias):
        # Division by zero would give ±inf (or NaN for 0/0); masking the
        # denominator makes those rows null instead.
        safe_den = pl.when(denominator != 0).then(denominator).otherwise(None)
        return (numerator / safe_den).cast(pl.Float32).alias(alias)

    ratios = []
    if {"price", "annuity"} <= avail.keys():
        ratios.append(ratio(
            pl.col(avail["price"]), pl.col(avail["annuity"]),
            "loan_burden_ratio"))
    if {"disbursed", "credit_amount"} <= avail.keys():
        ratios.append(ratio(
            pl.col(avail["disbursed"]), pl.col(avail["credit_amount"]),
            "disbursed_credit_ratio"))
    if {"total_debt", "credit_amount"} <= avail.keys():
        ratios.append(ratio(
            pl.col(avail["total_debt"]), 1 + pl.col(avail["credit_amount"]),
            "debt_credit_ratio"))
    if {"eir", "credit_amount"} <= avail.keys():
        ratios.append(ratio(
            pl.col(avail["eir"]), pl.col(avail["credit_amount"]),
            "eir_credit_ratio"))
    return df.with_columns(ratios) if ratios else df


def _build_agg_exprs(df, skip):
    """Build the per-group aggregation expressions for the columns of ``df``.

    Columns named in ``skip`` are ignored. The remaining columns are
    classified by dtype and produce, in this order:

    * numeric columns: ``_mean``, ``_max``, ``_min``, ``_first``, ``_last``, ``_std``
    * numeric columns whose name ends in ``A``: ``_cv`` (std / (|mean| + 1e-9))
    * string columns with at most 200 distinct values: ``_mode``, ``_nunique``
    * string columns with at most 10 distinct values and a null rate below 0.9:
      one ``_<value>_count`` per non-null value observed in ``df``

    Returns the list of expressions (possibly empty).
    """
    total_rows = df.height
    numeric, amounts, mode_candidates = [], [], []
    values_to_count = {}

    for name in df.columns:
        if name in skip:
            continue
        series = df[name]
        kind = series.dtype
        if kind in NUMERIC_DTYPES:
            numeric.append(name)
            if name.endswith("A"):
                amounts.append(name)
            continue
        if kind not in STRING_DTYPES:
            # Neither numeric nor string-like: contributes no features.
            continue
        distinct = series.n_unique()
        null_rate = series.null_count() / total_rows if total_rows > 0 else 1.0
        if distinct <= 200:
            mode_candidates.append(name)
        if distinct <= 10 and null_rate < 0.9:
            values_to_count[name] = series.drop_nulls().unique().to_list()

    exprs = []
    for name in numeric:
        for stat in ("mean", "max", "min", "first", "last", "std"):
            exprs.append(getattr(pl.col(name), stat)().alias(f"{name}_{stat}"))

    for name in amounts:
        spread = pl.col(name).std()
        scale = pl.col(name).mean().abs() + 1e-9
        exprs.append((spread / scale).alias(f"{name}_cv"))

    for name in mode_candidates:
        exprs.append(pl.col(name).drop_nulls().mode().first().alias(f"{name}_mode"))
        exprs.append(pl.col(name).n_unique().alias(f"{name}_nunique"))

    for name, observed in values_to_count.items():
        for value in observed:
            # Spaces and slashes are replaced so the value can sit in a column name.
            safe = str(value).replace(" ", "_").replace("/", "_")
            exprs.append((pl.col(name) == value).sum().alias(f"{name}_{safe}_count"))
    return exprs


def aggregate_depth1(df, group_col="case_id"):
    """Collapse a depth-1 table to one row per ``group_col``.

    When ``num_group1`` is present the rows are first sorted by
    ``(group_col, num_group1)``. ``group_col``, ``num_group1`` and
    ``num_group2`` are never aggregated. If no aggregation expression can be
    built, the distinct ``group_col`` values are returned instead.
    """
    ordered = df.sort(group_col, "num_group1") if "num_group1" in df.columns else df
    exprs = _build_agg_exprs(ordered, {group_col, "num_group1", "num_group2"})
    if not exprs:
        return ordered.select(group_col).unique()
    return ordered.group_by(group_col).agg(exprs)


def aggregate_depth2(df):
    """Collapse a depth-2 table to one row per ``case_id`` in two passes.

    Pass 1 aggregates over ``num_group2`` within each
    ``(case_id, num_group1)`` pair; pass 2 aggregates the pass-1 features over
    ``num_group1`` within each ``case_id``. Rows are sorted by the group keys
    before each pass (the first sort only when ``num_group2`` is present).
    Whenever a pass has nothing to aggregate, the distinct ``case_id`` values
    are returned instead.
    """
    ordered = df
    if "num_group2" in ordered.columns:
        ordered = ordered.sort("case_id", "num_group1", "num_group2")

    inner_exprs = _build_agg_exprs(ordered, {"case_id", "num_group1", "num_group2"})
    if not inner_exprs:
        return ordered.select("case_id").unique()

    per_group1 = (
        ordered.group_by(["case_id", "num_group1"])
        .agg(inner_exprs)
        .sort("case_id", "num_group1")
    )
    outer_exprs = _build_agg_exprs(per_group1, {"case_id", "num_group1"})
    if not outer_exprs:
        return per_group1.select("case_id").unique()
    return per_group1.group_by("case_id").agg(outer_exprs)


def drop_correlated_columns(df, threshold=0.95, sample_n=50_000):
    """Greedily remove one column from every highly correlated numeric pair.

    Correlations are absolute Pearson coefficients computed on at most
    ``sample_n`` rows (sampled with seed 42), with nulls filled by 0 and
    undefined coefficients treated as 0. ``case_id``, ``target`` and
    ``WEEK_NUM`` are never candidates. Within a pair exceeding ``threshold``,
    the column with the higher (or equal) null rate on the full frame is the
    one dropped. Prints how many columns were removed, if any.
    """
    protected = {"case_id", "target", "WEEK_NUM"}
    candidates = [
        name for name in df.columns
        if df[name].dtype in NUMERIC_DTYPES and name not in protected
    ]
    if len(candidates) < 2:
        return df

    sample = df.select(candidates)
    if sample.height > sample_n:
        sample = sample.sample(n=sample_n, seed=42)
    values = sample.fill_null(0).to_numpy().astype(np.float32)
    with np.errstate(invalid="ignore"):
        abs_corr = np.abs(np.corrcoef(values, rowvar=False))
    # Constant columns give NaN coefficients; count them as uncorrelated.
    np.nan_to_num(abs_corr, copy=False, nan=0.0)

    total_rows = df.height
    missing_share = np.array(
        [df[name].null_count() / total_rows for name in candidates]
    )

    n_candidates = len(candidates)
    dropped = set()
    for left in range(n_candidates):
        if left in dropped:
            continue
        for right in range(left + 1, n_candidates):
            if right in dropped or not abs_corr[left, right] > threshold:
                continue
            if missing_share[left] >= missing_share[right]:
                # The left column loses; stop comparing it against others.
                dropped.add(left)
                break
            dropped.add(right)

    drop_names = sorted(candidates[idx] for idx in dropped)
    if not drop_names:
        return df
    print(f"  Dropped {len(drop_names)} correlated columns (|r|>{threshold})")
    return df.drop(drop_names)


def collapse_rare_categories(df, max_unique=200, keep_top=20):
    """Null out infrequent values of high-cardinality string columns.

    A string-like column with more than ``max_unique`` distinct values keeps
    only its ``keep_top`` most frequent values (as counted on ``df`` itself);
    every other value becomes null. Other columns are left untouched.
    """
    replacements = []
    for name in df.columns:
        series = df[name]
        if series.dtype not in STRING_DTYPES or series.n_unique() <= max_unique:
            continue
        by_frequency = series.value_counts().sort("count", descending=True)
        frequent = by_frequency.head(keep_top)[name].to_list()
        is_frequent = pl.col(name).is_in(frequent)
        replacements.append(
            pl.when(is_frequent).then(pl.col(name)).otherwise(None).alias(name)
        )
    if not replacements:
        return df
    return df.with_columns(replacements)

In [None]:
def gini_stability(week_num, target, score, w_falling=88.0, w_std=0.5):
    """Compute the Gini stability metric over weekly Gini coefficients.

    For every distinct week (ascending) that contains both classes, the Gini
    is ``2 * AUC - 1``. A straight line is fitted to the sequence of weekly
    Ginis (x = position in that sequence) and the score is::

        mean(gini) + w_falling * min(0, slope) - w_std * std(residuals)

    Args:
        week_num: week identifier per sample.
        target: binary label per sample.
        score: predicted score per sample.
        w_falling: weight of the (non-positive) slope penalty.
        w_std: weight of the residual standard-deviation penalty.

    Returns:
        dict with the keys ``stability_score``, ``mean_gini``,
        ``falling_rate`` and ``std_residuals``. With fewer than two usable
        weeks no line can be fitted: both penalties are reported as 0.0 and
        ``stability_score`` equals ``mean_gini`` (0.0 when no week is usable).
    """
    weeks = np.asarray(week_num)
    y_true = np.asarray(target)
    y_score = np.asarray(score)

    ginis = []
    for w in np.unique(weeks):  # np.unique already returns sorted values
        mask = weeks == w
        if len(np.unique(y_true[mask])) < 2:
            # AUC is undefined for a single-class week; leave it out.
            continue
        ginis.append(2.0 * roc_auc_score(y_true[mask], y_score[mask]) - 1.0)
    ginis = np.array(ginis)

    mean_gini = float(np.mean(ginis)) if len(ginis) else 0.0
    if len(ginis) < 2:
        # Same keys as the regular return so callers can read any field.
        return {
            "stability_score": mean_gini,
            "mean_gini": mean_gini,
            "falling_rate": 0.0,
            "std_residuals": 0.0,
        }

    x = np.arange(len(ginis))
    slope, intercept = np.polyfit(x, ginis, 1)
    falling_rate = float(min(0.0, slope))
    std_residuals = float(np.std(ginis - (slope * x + intercept)))
    return {
        "stability_score": float(
            mean_gini + w_falling * falling_rate - w_std * std_residuals
        ),
        "mean_gini": mean_gini,
        "falling_rate": falling_rate,
        "std_residuals": std_residuals,
    }


def build_stacking_ensemble(oof_scores, y, week_num, test_scores,
                            seeds=None, alpha=1.0, n_splits=5, cv_seed=42):
    """Blend base-model scores with a sigmoid-calibrated RidgeClassifier.

    For each seed, a ``CalibratedClassifierCV(RidgeClassifier(alpha))`` whose
    internal CV is a 5-fold ``StratifiedKFold`` seeded with that seed is

    * fitted per fold of an outer ``StratifiedGroupKFold`` (groups =
      ``week_num``, ``n_splits`` folds, seeded with ``cv_seed``) to produce
      out-of-fold meta predictions, and
    * fitted once on all of ``oof_scores`` to score ``test_scores``.

    Predictions are averaged over the seeds. Per-seed and ensemble metrics
    are printed.

    Returns:
        dict with ``oof_preds``, ``test_preds`` and ``oof_auc``.
    """
    seeds = seeds or [42, 123, 456]
    n_seeds = len(seeds)
    outer_cv = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=cv_seed)

    def new_meta(calibration_cv):
        # Fresh, unfitted meta-learner sharing the given calibration splitter.
        return CalibratedClassifierCV(
            RidgeClassifier(alpha=alpha), cv=calibration_cv, method="sigmoid")

    oof_ens = np.zeros(len(y))
    test_ens = np.zeros(len(test_scores))

    for seed in seeds:
        inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

        oof_seed = np.zeros(len(y))
        for fit_idx, hold_idx in outer_cv.split(oof_scores, y, week_num):
            fold_meta = new_meta(inner_cv)
            fold_meta.fit(oof_scores[fit_idx], y[fit_idx])
            oof_seed[hold_idx] = fold_meta.predict_proba(oof_scores[hold_idx])[:, 1]
        oof_ens += oof_seed / n_seeds

        full_meta = new_meta(inner_cv)
        full_meta.fit(oof_scores, y)
        test_ens += full_meta.predict_proba(test_scores)[:, 1] / n_seeds
        print(f"  Seed {seed}: AUC={roc_auc_score(y, oof_seed):.6f}")

    oof_auc = roc_auc_score(y, oof_ens)
    stab = gini_stability(week_num, y, oof_ens)
    print(f"\n  Ensemble ({len(seeds)}-seed avg): "
          f"AUC={oof_auc:.6f}  Stability={stab['stability_score']:.6f}")
    return {"oof_preds": oof_ens, "test_preds": test_ens, "oof_auc": oof_auc}

---

## Feature Engineering

In [None]:
# Table groups aggregated once (by case_id).
DEPTH1_NAMES = [
    "applprev_1",
    "credit_bureau_a_1",
    "credit_bureau_b_1",
    "person_1",
    "tax_registry_a_1",
    "tax_registry_b_1",
    "tax_registry_c_1",
]

# Table groups aggregated in two passes (num_group2, then num_group1).
DEPTH2_NAMES = [
    "applprev_2",
    "person_2",
    "credit_bureau_a_2",
]

# Candidate columns for the credit_bureau_a_1 row filter; the first one that
# is present in the table is the one used.
CLOSED_INDICATORS = [
    "dateofcredend_353D",
    "dateofcredstart_739D",
    "credlmt_228A",
    "contractst_964M",
]


def _load_optional_table(name, split):
    """Load and preprocess one table group; return None when it has no files."""
    try:
        raw = load_table_group(DATA_PATH, name, split=split)
    except FileNotFoundError:
        # Missing tables are tolerated, but reported: a table absent from only
        # one split changes the feature set of that split.
        print(f"  [{split}] skipping '{name}': no parquet files found")
        return None
    return preprocess_table(raw)


def build_split(split):
    """Assemble the feature frame for one split (``"train"`` or ``"test"``).

    Starts from the ``base`` table and left-joins on ``case_id``:

    1. the depth-0 tables ``static_0`` and ``static_cb_0``, followed by the
       date transformation and the domain ratios;
    2. every depth-1 table in ``DEPTH1_NAMES``, aggregated per ``case_id``.
       For ``credit_bureau_a_1`` only the rows where the first available
       ``CLOSED_INDICATORS`` column is non-null are aggregated;
    3. every depth-2 table in ``DEPTH2_NAMES``, aggregated in two passes.

    Table groups without parquet files are skipped with a printed notice.
    The frame shape is printed after each stage.

    Raises:
        FileNotFoundError: when the ``base`` table itself is missing.
    """
    df = load_table_group(DATA_PATH, "base", split=split)

    for tg in ["static_0", "static_cb_0"]:
        table = _load_optional_table(tg, split)
        if table is not None:
            df = df.join(table, on="case_id", how="left")

    df = handle_dates(df)
    df = create_domain_ratios(df)
    print(f"  [{split}] depth-0: {df.shape}")

    for name in DEPTH1_NAMES:
        table = _load_optional_table(name, split)
        if table is None:
            continue
        if name == "credit_bureau_a_1":
            avail = [c for c in CLOSED_INDICATORS if c in table.columns]
            if avail:
                table = table.filter(pl.col(avail[0]).is_not_null())
        df = df.join(aggregate_depth1(table), on="case_id", how="left")
    print(f"  [{split}] + depth-1: {df.shape}")

    for name in DEPTH2_NAMES:
        table = _load_optional_table(name, split)
        if table is None:
            continue
        df = df.join(aggregate_depth2(table), on="case_id", how="left")
    print(f"  [{split}] + depth-2: {df.shape}")
    return df


print("Building train features …")
train = build_split("train")
train = drop_correlated_columns(train, threshold=0.95)
# Remember which columns are about to be collapsed so the test split can be
# restricted to the categories that survive in train.
collapsed_cols = [
    c for c in train.columns
    if train[c].dtype in STRING_DTYPES and train[c].n_unique() > 200
]
train = collapse_rare_categories(train, max_unique=200, keep_top=20)
print(f"  Train final: {train.shape}\n")

print("Building test features …")
test = build_split("test")
# Apply train's category vocabulary to test. Collapsing test on its own
# frequencies could keep values the models never saw and null values they
# were trained on.
vocab_exprs = []
for c in collapsed_cols:
    if c not in test.columns or test[c].dtype != train[c].dtype:
        continue
    kept_values = train[c].drop_nulls().unique().to_list()
    vocab_exprs.append(
        pl.when(pl.col(c).is_in(kept_values)).then(pl.col(c))
        .otherwise(None).alias(c)
    )
if vocab_exprs:
    test = test.with_columns(vocab_exprs)

# Align test to the train schema: same columns, same order, minus the target.
train_cols = [c for c in train.columns if c != "target"]
missing = [c for c in train_cols if c not in test.columns]
if missing:
    print(f"  Adding {len(missing)} null columns missing from test")
    test = test.with_columns([pl.lit(None).cast(train[c].dtype).alias(c) for c in missing])
test = test.select(train_cols)
print(f"  Test final (aligned): {test.shape}")

gc.collect()

---

## Model Training — Tuned Hyperparameters, 5-Fold StratifiedGroupKFold

In [None]:
# Columns that identify or label a case and are never used as features.
META_COLS = {"case_id", "target", "WEEK_NUM"}
feature_cols = [name for name in train.columns if name not in META_COLS]
cat_cols = [
    name for name in feature_cols
    if train[name].dtype in (pl.String, pl.Utf8, pl.Categorical)
]

# Categoricals with more than 200 distinct values are withheld from the
# LightGBM / XGBoost feature set.
high_card = {name for name in cat_cols if train[name].n_unique() > 200}
lgb_cat_cols = [name for name in cat_cols if name not in high_card]
lgb_feature_cols = [name for name in feature_cols if name not in high_card]

# Switch to pandas and release the polars frames.
train_pd = train.to_pandas()
test_pd = test.to_pandas()
del train, test
gc.collect()

X = train_pd[feature_cols]
y = train_pd["target"].values
week_num = train_pd["WEEK_NUM"].values

# LightGBM inputs: categorical columns carried as pandas "category" dtype.
X_lgb = train_pd[lgb_feature_cols].copy()
X_test_lgb = test_pd[lgb_feature_cols].copy()
for frame in (X_lgb, X_test_lgb):
    for name in lgb_cat_cols:
        frame[name] = frame[name].astype("category")

# One splitter shared by all base models; groups are the WEEK_NUM values.
sgkf = StratifiedGroupKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

print(f"Features: {len(feature_cols)} total ({len(cat_cols)} cat)")
print(f"LGB/XGB features: {len(lgb_feature_cols)} ({len(lgb_cat_cols)} cat, "
      f"{len(high_card)} high-card excluded)")
print(f"Train: {len(X):,}  Test: {len(test_pd):,}")

In [None]:
print("Training CatBoost …\n")
cb_params = TUNED_PARAMS["catboost"]
oof_cb = np.zeros(len(X))
test_cb = np.zeros(len(test_pd))

# CatBoost expects categorical values to be strings or integers. The joins and
# the rare-category collapsing leave missing values in these columns, so they
# are mapped to an explicit category before fitting.
CB_MISSING = "__missing__"


def _catboost_frame(frame):
    """Return a shallow copy of ``frame`` whose ``cat_cols`` are gap-free strings."""
    out = frame.copy(deep=False)
    for c in cat_cols:
        as_obj = out[c].astype(object)
        out[c] = as_obj.where(as_obj.notna(), CB_MISSING).astype(str)
    return out


X_cb = _catboost_frame(X)
# Built once instead of re-selecting the test columns in every fold.
X_test_cb = _catboost_frame(test_pd[feature_cols])

for fold, (tr_idx, va_idx) in enumerate(sgkf.split(X, y, week_num)):
    model = CatBoostClassifier(
        iterations=1000, **cb_params,
        random_seed=SEED + fold, eval_metric="AUC",
        cat_features=cat_cols, allow_writing_files=False,
    )
    model.fit(
        X_cb.iloc[tr_idx], y[tr_idx],
        eval_set=(X_cb.iloc[va_idx], y[va_idx]),
        early_stopping_rounds=100, verbose=0,
    )
    oof_cb[va_idx] = model.predict_proba(X_cb.iloc[va_idx])[:, 1]
    # Test predictions are the plain average of the fold models.
    test_cb += model.predict_proba(X_test_cb)[:, 1] / N_SPLITS
    print(f"  Fold {fold+1}: AUC={roc_auc_score(y[va_idx], oof_cb[va_idx]):.6f}")

stab = gini_stability(week_num, y, oof_cb)
print(f"\n  CatBoost OOF: AUC={roc_auc_score(y, oof_cb):.6f}  "
      f"Stability={stab['stability_score']:.6f}")

In [None]:
print("Training LightGBM …\n")
# Work on a copy so TUNED_PARAMS itself is left untouched.
lgb_params = dict(TUNED_PARAMS["lightgbm"])
# LightGBM only applies row subsampling (`subsample` / `bagging_fraction`)
# when a bagging frequency greater than 0 is set; the default frequency of 0
# silently disables it. Enable per-iteration bagging unless the tuned
# parameters already choose a frequency.
if not {"subsample_freq", "bagging_freq"} & lgb_params.keys():
    lgb_params["subsample_freq"] = 1

oof_lgb = np.zeros(len(X))
test_lgb = np.zeros(len(test_pd))

for fold, (tr_idx, va_idx) in enumerate(sgkf.split(X, y, week_num)):
    model = lgb.LGBMClassifier(
        n_estimators=1000, **lgb_params,
        random_state=SEED + fold, verbose=-1,
    )
    model.fit(
        X_lgb.iloc[tr_idx], y[tr_idx],
        eval_set=[(X_lgb.iloc[va_idx], y[va_idx])],
        eval_metric="auc",
        callbacks=[lgb.early_stopping(100, verbose=False)],
    )
    oof_lgb[va_idx] = model.predict_proba(X_lgb.iloc[va_idx])[:, 1]
    # Test predictions are the plain average of the fold models.
    test_lgb += model.predict_proba(X_test_lgb)[:, 1] / N_SPLITS
    print(f"  Fold {fold+1}: AUC={roc_auc_score(y[va_idx], oof_lgb[va_idx]):.6f}")

stab = gini_stability(week_num, y, oof_lgb)
print(f"\n  LightGBM OOF: AUC={roc_auc_score(y, oof_lgb):.6f}  "
      f"Stability={stab['stability_score']:.6f}")

In [None]:
print("Training XGBoost (fold-safe encoding) …\n")
xgb_params = TUNED_PARAMS["xgboost"]
oof_xgb = np.zeros(len(X))
test_xgb = np.zeros(len(test_pd))

for fold, (tr_idx, va_idx) in enumerate(sgkf.split(X, y, week_num)):
    fold_seed = SEED + fold
    y_tr, y_va = y[tr_idx], y[va_idx]

    X_tr = train_pd[lgb_feature_cols].iloc[tr_idx].copy()
    X_val = train_pd[lgb_feature_cols].iloc[va_idx].copy()
    X_te = test_pd[lgb_feature_cols].copy()

    # The target encoder is fitted on the training part of the fold only and
    # then applied to the validation part and to the test set.
    encoder = CatBoostEncoder(cols=lgb_cat_cols, random_state=fold_seed)
    X_tr[lgb_cat_cols] = encoder.fit_transform(X_tr[lgb_cat_cols], y_tr)
    X_val[lgb_cat_cols] = encoder.transform(X_val[lgb_cat_cols])

    model = XGBClassifier(
        n_estimators=1000,
        **xgb_params,
        random_state=fold_seed,
        eval_metric="auc",
        tree_method="hist",
        early_stopping_rounds=100,
        verbosity=0,
    )
    model.fit(X_tr, y_tr, eval_set=[(X_val, y_va)], verbose=0)

    oof_xgb[va_idx] = model.predict_proba(X_val)[:, 1]

    X_te[lgb_cat_cols] = encoder.transform(X_te[lgb_cat_cols])
    # Test predictions are the plain average of the fold models.
    test_xgb += model.predict_proba(X_te)[:, 1] / N_SPLITS

    print(f"  Fold {fold+1}: AUC={roc_auc_score(y[va_idx], oof_xgb[va_idx]):.6f}")

stab = gini_stability(week_num, y, oof_xgb)
print(f"\n  XGBoost OOF: AUC={roc_auc_score(y, oof_xgb):.6f}  "
      f"Stability={stab['stability_score']:.6f}")

---

## Stacking Ensemble & Submission

In [None]:
print("Stacking ensemble …\n")

# One column per base model, in the order CatBoost, LightGBM, XGBoost.
base_oof = (oof_cb, oof_lgb, oof_xgb)
base_test = (test_cb, test_lgb, test_xgb)
oof_stack = np.column_stack(base_oof)
test_stack = np.column_stack(base_test)

result = build_stacking_ensemble(
    oof_stack,
    y,
    week_num,
    test_stack,
    seeds=ENSEMBLE_SEEDS,
    alpha=ENSEMBLE_ALPHA,
    n_splits=N_SPLITS,
    cv_seed=SEED,
)

# ── Build submission ──────────────────────────────────────────
# Scores are clipped into [0, 1] before being written out.
scores = np.clip(result["test_preds"], 0.0, 1.0)

submission = pd.DataFrame(
    {
        "case_id": test_pd["case_id"].astype(int),
        "score": scores,
    }
)

# Integrity checks on the file layout and the score range.
assert list(submission.columns) == ["case_id", "score"], "Bad columns"
assert submission["case_id"].nunique() == len(submission), "Duplicate case_id"
assert submission["score"].notna().all(), "Null scores"
assert submission["score"].between(0, 1).all(), "Score OOB"

submission.to_csv("submission.csv", index=False)

print(f"\nsubmission.csv saved  ({submission.shape[0]:,} rows)")
print(submission.describe())