In [None]:

# 04_text_risk_model.ipynb
%pip install -q numpy pandas scikit-learn joblib

import numpy as np, pandas as pd, joblib
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, f1_score
from pathlib import Path

# Input locations (label tables and precomputed embedding arrays) plus the
# output directory for trained models, which is created up front if missing.
PROC = Path("data/processed")
EMB = Path("data/embeddings")
MODELS = Path("models")
MODELS.mkdir(parents=True, exist_ok=True)

# Embedding arrays, one per split.
X_train = np.load(EMB / "lc_train_emb.npy")
X_valid = np.load(EMB / "lc_valid_emb.npy")
X_test = np.load(EMB / "lc_test_emb.npy")

# Only the label column is used here, so ask the parquet reader for that
# column alone instead of materialising every column of each table.
y_train = pd.read_parquet(PROC / "lc_train.parquet", columns=["default"])["default"].values
y_valid = pd.read_parquet(PROC / "lc_valid.parquet", columns=["default"])["default"].values
y_test = pd.read_parquet(PROC / "lc_test.parquet", columns=["default"])["default"].values

# Fail before fitting (not after) if an embedding file and its label table
# have drifted out of sync; a valid/test mismatch would otherwise only
# surface at metric time, once training has already been paid for.
for _split, _X, _y in (
    ("train", X_train, y_train),
    ("valid", X_valid, y_valid),
    ("test", X_test, y_test),
):
    if len(_X) != len(_y):
        raise ValueError(
            f"{_split}: {len(_X)} embedding rows but {len(_y)} labels"
        )

# Logistic regression on the embeddings; max_iter is raised to 400 from the
# library default to give the solver more iterations to converge.
clf = LogisticRegression(max_iter=400, n_jobs=-1)
clf.fit(X_train, y_train)

def report(m, X, y, name, threshold=0.5):
    """Print ROC AUC and F1 for a fitted binary classifier on one split.

    Args:
        m: Fitted estimator exposing ``predict_proba``; column 1 of its
            output is used as the score for the positive class.
        X: Feature matrix passed to ``m.predict_proba``.
        y: True binary labels, aligned row-for-row with ``X``.
        name: Label for the split, printed as the line prefix.
        threshold: Scores strictly greater than this value are counted as
            positive predictions when computing F1. Defaults to 0.5, which
            reproduces the previous hard-coded behaviour.

    Returns:
        None. The metrics are printed as ``"<name>: AUC=… | F1=…"``.
    """
    scores = m.predict_proba(X)[:, 1]
    # AUC is computed from the raw scores and does not depend on the cut-off.
    auc = roc_auc_score(y, scores)
    # F1 needs hard labels, so binarise the scores at the chosen threshold.
    preds = (scores > threshold).astype(int)
    f1 = f1_score(y, preds)
    print(f"{name}: AUC={auc:.3f} | F1={f1:.3f}")

# Score the fitted model on the held-out splits, validation first.
for split_label, split_X, split_y in (
    ("VALID", X_valid, y_valid),
    ("TEST", X_test, y_test),
):
    report(clf, split_X, split_y, split_label)

# Persist the classifier and echo where it was written.
model_path = MODELS / "text_risk_model.pkl"
joblib.dump(clf, model_path)
print("Saved:", model_path)
