In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s5e11/sample_submission.csv
/kaggle/input/playground-series-s5e11/train.csv
/kaggle/input/playground-series-s5e11/test.csv


In [2]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score
import optuna
from lightgbm import LGBMClassifier, early_stopping, log_evaluation

In [3]:
import os, warnings, gc, math, random
import numpy as np, pandas as pd
warnings.filterwarnings("ignore")
from pathlib import Path

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.isotonic import IsotonicRegression
from sklearn.preprocessing import LabelEncoder

from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier, Pool

In [4]:
# --------------- Utils
def seed_everything(seed=42):
    random.seed(seed); np.random.seed(seed); os.environ["PYTHONHASHSEED"]=str(seed)
SEED = 42
seed_everything(SEED)

In [5]:
# --------------- Locate dataset dir (train.csv, test.csv, sample_submission.csv)
def find_data_dir(root="/kaggle/input"):
    cands = []
    for d, _, files in os.walk(root):
        fs = set(files)
        if {"train.csv","test.csv","sample_submission.csv"}.issubset(fs):
            cands.append(d)
    if not cands:
        raise FileNotFoundError("Cannot find dataset folder with train/test/sample_submission.")
    return sorted(cands, key=len)[0]

DATA_DIR = find_data_dir()
print("DATA_DIR:", DATA_DIR)

train = pd.read_csv(f"{DATA_DIR}/train.csv")
test  = pd.read_csv(f"{DATA_DIR}/test.csv")
sub   = pd.read_csv(f"{DATA_DIR}/sample_submission.csv")

TARGET = "loan_paid_back" if "loan_paid_back" in train.columns else "target"
ID_COL = "id" if "id" in train.columns else train.columns[0]


DATA_DIR: /kaggle/input/playground-series-s5e11


In [6]:
# --------------- Basic cleaning
# Drop perfect duplicates in train (keep first)
if train.duplicated().any():
    train = train.drop_duplicates().reset_index(drop=True)
    
    # Identify dtypes
num_cols = [c for c in train.columns if c not in [TARGET, ID_COL] and pd.api.types.is_numeric_dtype(train[c])]
cat_cols = [c for c in train.columns if c not in [TARGET, ID_COL] and not pd.api.types.is_numeric_dtype(train[c])]# Also treat low-cardinality numerics as categories (robust to int-coded categories)
for c in num_cols.copy():
    if train[c].nunique() <= 12 and train[c].dtype != float:  # small buckets → likely categorical
        cat_cols.append(c); num_cols.remove(c)


# Align test columns
for c in cat_cols:
    train[c] = train[c].astype("category")
    test[c]  = test[c].astype("category")

# Missing handling
for c in num_cols:
    med = train[c].median()
    train[c] = train[c].fillna(med)
    test[c]  = test[c].fillna(med)
for c in cat_cols:
    train[c] = train[c].cat.add_categories(["__MISSING__"]).fillna("__MISSING__")
    test[c]  = test[c].cat.add_categories(["__MISSING__"]).fillna("__MISSING__")
    
    # Clip outliers (1% - 99%)
for c in num_cols:
    lo, hi = train[c].quantile(0.01), train[c].quantile(0.99)
    train[c] = train[c].clip(lo, hi)
    test[c]  = test[c].clip(lo, hi)
    
    # --------------- Finance-inspired automatic features (created only if source cols exist)
def add_finance_features(df):
    cols = df.columns
    def pick(name): 
        return [c for c in cols if name in c]
    # Heuristic picks
    inc  = pick("income") or pick("salary")
    amt  = pick("loan_amount") or pick("amount")
    rate = pick("interest_rate") or pick("rate")
    debt = pick("debt") + pick("debt_to_income")
    score= pick("credit_score") or pick("score")

    if inc and amt:
        a, i = amt[0], inc[0]
        df["loan_to_income"] = (df[a] / (df[i].replace(0, np.nan))).fillna(0)
        df["log_income"]     = np.log1p(df[i])
        df["log_amount"]     = np.log1p(df[a])

    if inc and amt and rate:
        a, i, r = amt[0], inc[0], rate[0]
        df["interest_burden"] = (df[a] * df[r]) / (df[i].replace(0, np.nan))
        df["interest_burden"] = df["interest_burden"].replace([np.inf, -np.inf], np.nan).fillna(0)

    if score:
        s = score[0]
        # quantile buckets as risk tiers
        df["credit_tier"] = pd.qcut(df[s].rank(method="first"), q=10, labels=False, duplicates="drop").astype("int16")

    if debt:
        d = debt[0]
        df["debt_ratio_logit"] = np.log1p(df[d] / (1 - np.clip(df[d], 1e-6, 1-1e-6))) if df[d].max()<=1.0 else np.log1p(df[d])

    return df

train = add_finance_features(train)
test  = add_finance_features(test)

# Update dtypes after feature add
new_num = [c for c in train.columns if c not in [TARGET, ID_COL] and pd.api.types.is_numeric_dtype(train[c])]
new_cat = [c for c in train.columns if c not in [TARGET, ID_COL] and not pd.api.types.is_numeric_dtype(train[c])]
for c in new_cat:
    train[c] = train[c].astype("category"); test[c] = test[c].astype("category")
num_cols, cat_cols = new_num, new_cat



In [7]:
# --------------- Adversarial Validation: train vs test shift → sample weights
def adversarial_weights(train_df, test_df, num_cols, cat_cols, seed=SEED):
    tmp_tr = train_df[[ID_COL] + num_cols + cat_cols].copy(); tmp_tr["is_train"] = 1
    tmp_te = test_df [[ID_COL] + num_cols + cat_cols].copy(); tmp_te["is_train"] = 0
    ad = pd.concat([tmp_tr, tmp_te], axis=0, ignore_index=True)

    # Encode cats with LabelEncoder (CatBoost can handle cats but here we want a simple LGBM)
    ad_enc = ad.copy()
    encoders = {}
    for c in cat_cols:
        le = LabelEncoder()
        ad_enc[c] = le.fit_transform(ad_enc[c].astype(str))
        encoders[c] = le

    feats = num_cols + cat_cols
    y_ad = ad_enc["is_train"].values
    X_ad = ad_enc[feats].values

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    oof = np.zeros(len(ad_enc))
    for tr_idx, va_idx in skf.split(X_ad, y_ad):
        model = LGBMClassifier(
            n_estimators=400,
            learning_rate=0.05,
            max_depth=-1,
            subsample=0.8, colsample_bytree=0.8,
            random_state=seed, objective="binary",
            min_child_samples=60
        )
        # >>> 删除 verbose 参数，其他不变
        model.fit(
            X_ad[tr_idx], y_ad[tr_idx],
            eval_set=[(X_ad[va_idx], y_ad[va_idx])],
            eval_metric="auc"
        )
        oof[va_idx] = model.predict_proba(X_ad[va_idx])[:, 1]

    # Probability of being train-sourced
    p_train = oof[:len(train_df)]
    # Higher prob(train) → downweight, using inverse propensity
    eps = 1e-3
    w = (1 - p_train + eps) / (p_train + eps)
    w = np.clip(w / np.mean(w), 0.25, 4.0)  # stabilized
    return w
    
    
    # --------------- Monotonic constraints for LGBM (numeric only)
# Spearman sign wrt target; categorical set to 0
def spearman_sign(x, y):
    return np.sign(pd.Series(x).rank().corr(pd.Series(y), method="spearman") or 0.0)

y = train[TARGET].values
mono_map = {}
for c in num_cols:
    mono_map[c] = int(spearman_sign(train[c].values, y))
    
  # Monotonic constraints (numeric only, by Spearman sign)
# =========================
def spearman_sign(x, y):
    return int(np.sign(pd.Series(x).rank().corr(pd.Series(y), method="spearman") or 0.0))

y = train[TARGET].values
mono_map = {c: spearman_sign(train[c].values, y) for c in num_cols}

features_order = num_cols + cat_cols
lgb_mono = [mono_map.get(c, 0) if c in num_cols else 0 for c in features_order]


# ============== 继续：计算对抗验证权重 ==============
# 若失败则回退为等权，避免 NameError
try:
    sample_weights = adversarial_weights(train, test, num_cols, cat_cols, seed=SEED)
    print("Computed adversarial sample weights.")
except Exception as e:
    print("Adversarial weighting failed → using uniform weights. Reason:", e)
    sample_weights = np.ones(len(train), dtype=float)

# 对齐长度以防错位
sample_weights = np.asarray(pd.Series(sample_weights).reset_index(drop=True))
if len(sample_weights) != len(train):
    print("Sample weights length mismatch; using uniform weights.")
    sample_weights = np.ones(len(train), dtype=float)
    
   # ============== 单调约束：按 Spearman 方向 ==============
y = train[TARGET].values
def spearman_sign(x, y):
    return int(np.sign(pd.Series(x).rank().corr(pd.Series(y), method="spearman") or 0.0))

mono_map = {c: spearman_sign(train[c].values, y) for c in num_cols}

features_order = num_cols + cat_cols
# LightGBM 需要与特征顺序等长的约束向量；非数值特征置 0（不约束）
lgb_mono = [mono_map.get(c, 0) if c in num_cols else 0 for c in features_order]


[LightGBM] [Info] Number of positive: 475195, number of negative: 203655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.019551 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2594
[LightGBM] [Info] Number of data points in the train set: 678850, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.700000 -> initscore=0.847298
[LightGBM] [Info] Start training from score 0.847298
[LightGBM] [Info] Number of positive: 475195, number of negative: 203655
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018424 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2593
[LightGBM] [Info] Number of data points in the train set: 678850, number of used features: 17
[LightGBM] [In

In [8]:
# ============== 交叉验证训练：CatBoost + LightGBM（单调） ==============
FOLDS = 5
skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=SEED)

oof_cb  = np.zeros(len(train))
oof_lgb = np.zeros(len(train))
pred_cb  = np.zeros(len(test))
pred_lgb = np.zeros(len(test))

for fold, (tr_idx, va_idx) in enumerate(skf.split(train[features_order], train[TARGET])):
    X_tr, X_va = train.iloc[tr_idx][features_order], train.iloc[va_idx][features_order]
    y_tr, y_va = train.iloc[tr_idx][TARGET],      train.iloc[va_idx][TARGET]
    w_tr = sample_weights[tr_idx]

    # ---- CatBoost（原生类别）
    pool_tr = Pool(X_tr, y_tr, cat_features=[features_order.index(c) for c in cat_cols], weight=w_tr)
    pool_va = Pool(X_va, y_va, cat_features=[features_order.index(c) for c in cat_cols])

    cb = CatBoostClassifier(
        iterations=3000,
        learning_rate=0.03,
        depth=6,
        eval_metric="AUC",
        loss_function="Logloss",
        l2_leaf_reg=3.0,
        random_state=SEED,
        verbose=False,
        early_stopping_rounds=200,
        border_count=128,
        task_type="CPU"
    )
    cb.fit(pool_tr, eval_set=pool_va, verbose=False)
    oof_cb[va_idx] = cb.predict_proba(pool_va)[:, 1]
    pred_cb += cb.predict_proba(Pool(test[features_order], cat_features=[features_order.index(c) for c in cat_cols]))[:, 1] / FOLDS

    # ---- LightGBM（单调约束 + 早停 + 静默）
    lgb = LGBMClassifier(
        n_estimators=6000,
        learning_rate=0.015,
        num_leaves=64,
        max_depth=-1,
        subsample=0.8,
        colsample_bytree=0.85,
        min_child_samples=80,
        reg_alpha=1.0, reg_lambda=2.0,
        objective="binary",
        random_state=SEED,
        monotone_constraints=lgb_mono,   # 注意逗号
        force_row_wise=True,
        verbosity=-1
    )
    lgb.fit(
        X_tr, y_tr,
        eval_set=[(X_va, y_va)],
        eval_metric="auc",
        categorical_feature=cat_cols,
        sample_weight=w_tr,
        callbacks=[
            early_stopping(stopping_rounds=300),  # 300 轮无提升即停
            log_evaluation(period=0)              # 关闭逐轮打印
        ]
    )
    oof_lgb[va_idx] = lgb.predict_proba(X_va)[:, 1]
    pred_lgb += lgb.predict_proba(test[features_order])[:, 1] / FOLDS

    auc_cb  = roc_auc_score(y_va, oof_cb[va_idx])
    auc_lgb = roc_auc_score(y_va, oof_lgb[va_idx])
    print(f"Fold {fold+1}: AUC CatBoost={auc_cb:.5f} | LGBM={auc_lgb:.5f}")

auc_cb  = roc_auc_score(train[TARGET], oof_cb)
auc_lgb = roc_auc_score(train[TARGET], oof_lgb)
print(f"\nOOF AUC - CatBoost: {auc_cb:.5f} | LGBM: {auc_lgb:.5f}")

Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[462]	valid_0's auc: 0.915834	valid_0's binary_logloss: 0.253156
Fold 1: AUC CatBoost=0.92061 | LGBM=0.91583
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[436]	valid_0's auc: 0.914849	valid_0's binary_logloss: 0.254564
Fold 2: AUC CatBoost=0.92041 | LGBM=0.91485
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[441]	valid_0's auc: 0.913647	valid_0's binary_logloss: 0.255545
Fold 3: AUC CatBoost=0.91876 | LGBM=0.91365
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[438]	valid_0's auc: 0.914538	valid_0's binary_logloss: 0.254004
Fold 4: AUC CatBoost=0.91959 | LGBM=0.91454
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[461]	valid_0's auc: 0.914379	valid_0's binary_logloss: 0.253077
Fold 5: AUC Ca

In [9]:
# ============== 融合 + 概率校准（Isotonic） ==============
w_cb  = auc_cb  / (auc_cb + auc_lgb + 1e-9)
w_lgb = auc_lgb / (auc_cb + auc_lgb + 1e-9)
print(f"Blend Weights → CatBoost: {w_cb:.3f}, LGBM: {w_lgb:.3f}")

oof_blend  = w_cb * oof_cb + w_lgb * oof_lgb
pred_blend = w_cb * pred_cb + w_lgb * pred_lgb
print("OOF AUC (Blend):", roc_auc_score(train[TARGET], oof_blend))

cal = IsotonicRegression(out_of_bounds="clip")
cal.fit(oof_blend, train[TARGET])
oof_cal = cal.transform(oof_blend)
pred_cal = cal.transform(pred_blend)
print("OOF AUC (Calibrated):", roc_auc_score(train[TARGET], oof_cal))

Blend Weights → CatBoost: 0.501, LGBM: 0.499
OOF AUC (Blend): 0.9180598940429525
OOF AUC (Calibrated): 0.9181680548120736


In [10]:
# ============== 高置信伪标签 + 精简 LGBM 再训练（自洽兜底 v2） ==============
from sklearn.isotonic import IsotonicRegression
from sklearn.metrics import roc_auc_score

# 若缺少 blend，则尝试重建
if ('oof_blend' not in locals()) or ('pred_blend' not in locals()):
    built = False
    if ('oof_cb' in locals()) and ('oof_lgb' in locals()) and ('pred_cb' in locals()) and ('pred_lgb' in locals()):
        if 'auc_cb' not in locals():
            auc_cb = roc_auc_score(train[TARGET], oof_cb)
        if 'auc_lgb' not in locals():
            auc_lgb = roc_auc_score(train[TARGET], oof_lgb)
        w_cb  = auc_cb / (auc_cb + auc_lgb + 1e-9)
        w_lgb = 1.0 - w_cb
        oof_blend  = w_cb * oof_cb  + w_lgb * oof_lgb
        pred_blend = w_cb * pred_cb + w_lgb * pred_lgb
        print("Rebuilt oof_blend/pred_blend using OOF AUC weights.")
        built = True
    else:
        # 只有 test 预测时的回退
        if ('pred_cb' in locals()) and ('pred_lgb' in locals()):
            pred_blend = 0.5 * pred_cb + 0.5 * pred_lgb
            oof_blend  = None
            print("Built pred_blend from test preds only (no OOF available).")
        elif 'pred_lgb' in locals():
            pred_blend = pred_lgb; oof_blend = None; print("Using LGBM test preds only.")
        elif 'pred_cb' in locals():
            pred_blend = pred_cb;  oof_blend = None; print("Using CatBoost test preds only.")
        else:
            raise RuntimeError("No predictions found. Please run the CV training cell first.")

# 概率校准（若无 OOF 则跳过）
try:
    if oof_blend is not None:
        cal = IsotonicRegression(out_of_bounds="clip")
        cal.fit(oof_blend, train[TARGET])
        oof_cal = cal.transform(oof_blend)
        pred_cal = cal.transform(pred_blend)
        print("Isotonic calibration done.")
    else:
        raise ValueError("No OOF to calibrate.")
except Exception as e:
    print("Calibration skipped → using uncalibrated blend. Reason:", e)
    oof_cal = oof_blend if 'oof_blend' in locals() else None
    pred_cal = pred_blend

# 选择高置信样本做伪标签；若太少自动放宽阈值；若仍无则跳过伪标签
hi_pos = (pred_cal >= 0.99)
hi_neg = (pred_cal <= 0.01)
pseudo = test.loc[hi_pos | hi_neg, features_order].copy()

if pseudo.shape[0] == 0:
    print("No high-confidence rows at 0.99/0.01 → try 0.98/0.02.")
    hi_pos = (pred_cal >= 0.98)
    hi_neg = (pred_cal <= 0.02)
    pseudo = test.loc[hi_pos | hi_neg, features_order].copy()

if pseudo.shape[0] == 0:
    print("Still none; skip pseudo-label stage.")
    final_pred = pred_cal
    out = sub.copy()
    out[out.columns[-1]] = final_pred
    out.to_csv("submission.csv", index=False)
    print("Saved submission.csv (no pseudo-labeling).")
else:
    pseudo[TARGET] = (pred_cal[hi_pos | hi_neg] > 0.5).astype(int)
    print(f"Pseudo-labeled rows: {len(pseudo)}")

    aug = pd.concat([train[features_order + [TARGET]], pseudo], ignore_index=True)
    oof_aug = np.zeros(len(train))
    pred_aug = np.zeros(len(test))

    # 若 skf 未定义则补一个
    if 'skf' not in locals():
        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

    for fold, (tr_idx, va_idx) in enumerate(skf.split(train[features_order], train[TARGET])):
        X_tr2 = aug[features_order]; y_tr2 = aug[TARGET]
        X_va2 = train.iloc[va_idx][features_order]; y_va2 = train.iloc[va_idx][TARGET]

        lgb2 = LGBMClassifier(
            n_estimators=4000, learning_rate=0.02, num_leaves=96,
            subsample=0.85, colsample_bytree=0.9, min_child_samples=60,
            reg_alpha=0.5, reg_lambda=1.5, objective="binary",
            random_state=SEED+7, monotone_constraints=lgb_mono,
            force_row_wise=True, verbosity=-1
        )
        lgb2.fit(
            X_tr2, y_tr2,
            eval_set=[(X_va2, y_va2)],
            eval_metric="auc",
            categorical_feature=cat_cols,
            callbacks=[early_stopping(stopping_rounds=200), log_evaluation(period=0)]
        )
        oof_aug[va_idx] = lgb2.predict_proba(X_va2)[:, 1]
        pred_aug += lgb2.predict_proba(test[features_order])[:, 1] / 5

    print("OOF AUC (Aug LGBM):", roc_auc_score(train[TARGET], oof_aug))

    w_base = 0.6
    final_pred = w_base * pred_cal + (1 - w_base) * pred_aug

    out = sub.copy()
    out[out.columns[-1]] = final_pred
    out.to_csv("submission.csv", index=False)
    print("Saved submission.csv with shape:", out.shape)

# 简要报告
def quick_report():
    print("\n=== QUICK REPORT ===")
    try:
        print(f"Train shape: {train.shape} | Test shape: {test.shape}")
        print(f"Target positive rate: {train[TARGET].mean():.4f}")
        print(f"Cat cols: {len(cat_cols)} | Num cols: {len(num_cols)} | Total feats: {len(features_order)}")
        if 'oof_cb' in locals():  print(f"OOF AUC - CB:  {roc_auc_score(train[TARGET], oof_cb):.5f}")
        if 'oof_lgb' in locals(): print(f"OOF AUC - LGBM:{roc_auc_score(train[TARGET], oof_lgb):.5f}")
        if 'oof_cal' in locals() and oof_cal is not None:
            print(f"Blend(cal): {roc_auc_score(train[TARGET], oof_cal):.5f}")
        if 'oof_aug' in locals():
            print(f"AugLGBM:   {roc_auc_score(train[TARGET], oof_aug):.5f}")
    except Exception as e:
        print("Report skipped:", e)
quick_report()


Isotonic calibration done.
Pseudo-labeled rows: 42262
Training until validation scores don't improve for 200 rounds
Did not meet early stopping. Best iteration is:
[4000]	valid_0's auc: 0.9214	valid_0's binary_logloss: 0.244493
Training until validation scores don't improve for 200 rounds
Did not meet early stopping. Best iteration is:
[4000]	valid_0's auc: 0.920758	valid_0's binary_logloss: 0.245462
Training until validation scores don't improve for 200 rounds
Did not meet early stopping. Best iteration is:
[4000]	valid_0's auc: 0.919524	valid_0's binary_logloss: 0.246887
Training until validation scores don't improve for 200 rounds
Did not meet early stopping. Best iteration is:
[4000]	valid_0's auc: 0.920533	valid_0's binary_logloss: 0.24512
Training until validation scores don't improve for 200 rounds
Did not meet early stopping. Best iteration is:
[4000]	valid_0's auc: 0.920725	valid_0's binary_logloss: 0.24377
OOF AUC (Aug LGBM): 0.9205868074165878
Saved submission.csv with shape