In [None]:

# 05_hybrid_model_xgb_llm.ipynb
%pip install -q numpy pandas scikit-learn xgboost joblib

import numpy as np, pandas as pd, joblib
from pathlib import Path
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, f1_score

# Directory layout, relative to the notebook's working directory.
PROC = Path("data/processed")    # parquet splits are read from here
EMB = Path("data/embeddings")    # .npy embedding matrices are read from here
MODELS = Path("models")          # pickled models are loaded from / saved to here

# Create the model directory (and any missing parents); a no-op if it exists.
MODELS.mkdir(parents=True, exist_ok=True)

# LendingClub Hybrid
# Processed tabular splits, read in train / valid / test order.
lc_train, lc_valid, lc_test = (
    pd.read_parquet(PROC / parquet_name)
    for parquet_name in ("lc_train.parquet", "lc_valid.parquet", "lc_test.parquet")
)

# Previously trained text-risk classifier.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load model files from a trusted source.
text_model = joblib.load(MODELS / "text_risk_model.pkl")

# Embedding matrices, one per split.
# NOTE(review): assumes row i of each matrix corresponds to row i of the
# matching parquet split -- TODO confirm against the embedding notebook.
X_train_emb, X_valid_emb, X_test_emb = (
    np.load(EMB / npy_name)
    for npy_name in ("lc_train_emb.npy", "lc_valid_emb.npy", "lc_test_emb.npy")
)

# Column 1 of predict_proba is kept as a single text-derived score per row
# (presumably the probability of the positive class -- verify with the
# text model's class ordering).
train_text_score, valid_text_score, test_text_score = (
    text_model.predict_proba(split_emb)[:, 1]
    for split_emb in (X_train_emb, X_valid_emb, X_test_emb)
)

# Tabular columns to use, restricted to those actually present in the training
# split; the resulting order follows lc_train's column order.
# NOTE(review): zip_code, term and emp_length must already be numeric in the
# processed parquet for the stacked matrix to be numeric -- TODO confirm.
_LC_TABULAR_CANDIDATES = [
    "annual_inc", "dti", "loan_amnt", "zip_code",
    "term", "emp_length", "revol_util", "int_rate",
]
NUM_COLS_LC = [col for col in lc_train.columns if col in _LC_TABULAR_CANDIDATES]

# Hybrid design matrices: the tabular columns followed by the text score as
# the last column.
Xtr, Xva, Xte = (
    np.column_stack([split_frame[NUM_COLS_LC].values, split_score])
    for split_frame, split_score in (
        (lc_train, train_text_score),
        (lc_valid, valid_text_score),
        (lc_test, test_text_score),
    )
)

# Targets come from the "default" column of each split.
ytr, yva, yte = (
    split_frame["default"].values for split_frame in (lc_train, lc_valid, lc_test)
)

# Hyperparameters of the hybrid (tabular + text score) classifier.
_LC_XGB_PARAMS = dict(
    n_estimators=500,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.9,
    random_state=42,
    n_jobs=-1,
)
xgb = XGBClassifier(**_LC_XGB_PARAMS)

# The validation split is passed as eval_set, but no early stopping is
# configured here, so it is only evaluated during training (silently, since
# verbose=False).
xgb.fit(Xtr, ytr, eval_set=[(Xva, yva)], verbose=False)

def report(name, X, y, *, model=None, threshold=0.5):
    """Print ROC-AUC and F1 of a fitted binary classifier on one dataset.

    Args:
        name: Label printed in front of the metrics.
        X: Feature matrix handed to ``predict_proba``.
        y: True labels, one per row of ``X``.
        model: Fitted classifier exposing ``predict_proba``. When omitted,
            the module-level ``xgb`` model is used, so existing
            ``report(name, X, y)`` calls behave exactly as before.
        threshold: Scores strictly greater than this value are counted as
            the positive class when computing F1.

    Returns:
        None. The metrics are printed, not returned.
    """
    # Resolve the default at call time so the current global model is used.
    clf = xgb if model is None else model
    # Column 1 of predict_proba is used as the score for both metrics.
    scores = clf.predict_proba(X)[:, 1]
    auc = roc_auc_score(y, scores)
    f1 = f1_score(y, (scores > threshold).astype(int))
    print(f"{name}: AUC={auc:.3f} | F1={f1:.3f}")

# Print metrics for the validation split first, then the test split.
for _split_label, _split_X, _split_y in (
    ("LendingClub VALID", Xva, yva),
    ("LendingClub TEST", Xte, yte),
):
    report(_split_label, _split_X, _split_y)

# Persist the fitted hybrid model and echo where it was written.
_hybrid_model_path = MODELS / "hybrid_model_lc.pkl"
joblib.dump(xgb, _hybrid_model_path)
print("Saved:", _hybrid_model_path)

# Optional: German tabular-only baseline
def select_german_columns(
    frame,
    target_candidates=("Risk", "class", "creditability", "target", "default"),
):
    """Choose the target column and the numeric feature columns of *frame*.

    Args:
        frame: DataFrame holding features and (possibly) a target column.
        target_candidates: Column names tried, in order, as the target.

    Returns:
        ``(feature_cols, ycol)``. ``ycol`` is the first candidate present in
        ``frame.columns``, or ``None`` when none is present. ``feature_cols``
        lists the numeric columns in frame order, with ``ycol`` excluded so
        the label can never be fed to the model as a feature.
    """
    ycol = next((c for c in target_candidates if c in frame.columns), None)
    feature_cols = [
        c for c in frame.columns
        if c != ycol and pd.api.types.is_numeric_dtype(frame[c])
    ]
    return feature_cols, ycol


# Best-effort section: any failure (missing files, bad labels, ...) is
# reported and the baseline is skipped rather than aborting the notebook.
try:
    ge_train = pd.read_parquet(PROC/"ge_train.parquet")
    ge_valid = pd.read_parquet(PROC/"ge_valid.parquet")
    ge_test  = pd.read_parquet(PROC/"ge_test.parquet")

    # The target is excluded from the features; a numeric target would
    # otherwise be selected as a feature and leak the label into training.
    NUM_COLS_GE, ycol = select_german_columns(ge_train)

    if ycol is None:
        # Say so explicitly instead of skipping without any output.
        print("German baseline skipped: no target column found")
    else:
        # NOTE(review): XGBClassifier presumably needs the target encoded as
        # 0/1 integers -- confirm the processed data is encoded that way;
        # if not, fit raises and the except branch below reports it.
        g_xgb = XGBClassifier(n_estimators=400, max_depth=5, learning_rate=0.05, subsample=0.9, colsample_bytree=0.9, random_state=42)
        g_xgb.fit(
            ge_train[NUM_COLS_GE].values,
            ge_train[ycol].values,
            eval_set=[(ge_valid[NUM_COLS_GE].values, ge_valid[ycol].values)],
            verbose=False,
        )
        # Column 1 of predict_proba is the score used for AUC and (at 0.5) F1.
        pro = g_xgb.predict_proba(ge_test[NUM_COLS_GE].values)[:,1]
        auc = roc_auc_score(ge_test[ycol].values, pro)
        f1  = f1_score(ge_test[ycol].values, (pro>0.5).astype(int))
        print(f"German TEST: AUC={auc:.3f} | F1={f1:.3f}")
        joblib.dump(g_xgb, MODELS/"german_tabular_model.pkl")
        print("Saved:", MODELS/"german_tabular_model.pkl")
except Exception as e:
    print("German baseline skipped:", e)
