In [1]:
# ============================================================
# âœ… TALENT classical model test (RandomForest) using DataFeeder
# ============================================================

import sys
from pathlib import Path
from argparse import Namespace
import numpy as np

# ------------------------------------------------------------
# 0) Path setup (TALENT installed via pip, src local)
# ------------------------------------------------------------
repo_root = Path.cwd().parents[0]
src_path = repo_root / "src"
if str(src_path) not in sys.path:
    sys.path.insert(0, str(src_path))

# ------------------------------------------------------------
# 1) Imports
# ------------------------------------------------------------
from src.data.data_feeder import DataFeeder

try:
    from TALENT.model.classical_methods.randomforest import RandomForestMethod as RFClass
except ImportError:
    from TALENT.model.classical_methods.randomforest import RandomForest as RFClass

# ------------------------------------------------------------
# 2) Load preprocessed dataset
# ------------------------------------------------------------
data_dir = repo_root / "data" / "processed" / "pd" / "0001.gmsc"
if not (data_dir / "y.npy").exists():
    raise FileNotFoundError(f"Preprocessed files not found in {data_dir}")

X_num = np.load(data_dir / "N.npy", allow_pickle=True)
y = np.load(data_dir / "y.npy", allow_pickle=True)
X_cat = np.load(data_dir / "C.npy", allow_pickle=True) if (data_dir / "C.npy").exists() else None

print(f"âœ… Dataset loaded: X_num={X_num.shape}, X_cat={'None' if X_cat is None else X_cat.shape}, "
    f"y={y.shape}, default_rate={y.mean():.3f}")

# ------------------------------------------------------------
# 3) Build splits via DataFeeder
# ------------------------------------------------------------
feeder = DataFeeder(
    test_size=0.2,
    val_size=0.2,
    cv_splits=1,
    seed=42,
    task_type="pd",
    sampling=None,
)

(train_val, test, _info) = feeder.prepare(X_num=X_num, y=y, X_cat=X_cat)[1]
Xn_tr, Xc_tr, y_tr, Xn_val, Xc_val, y_val = train_val
Xn_te, Xc_te, y_te = test

print(f"âœ… Splits â†’ Train={Xn_tr.shape}, Val={Xn_val.shape}, Test={Xn_te.shape}")

# ------------------------------------------------------------
# 4) Wrap data into TALENT dicts
# ------------------------------------------------------------
N = {"train": Xn_tr, "val": Xn_val, "test": Xn_te}
C = {"train": Xc_tr, "val": Xc_val, "test": Xc_te} if Xc_tr is not None else None
Y = {"train": y_tr,  "val": y_val,  "test": y_te}
y_info = {"task_type": "binclass"}

# ------------------------------------------------------------
# 5) Build args Namespace (all required fields)
# ------------------------------------------------------------
save_dir = repo_root / "res" / "tmp_talent_rf"
save_dir.mkdir(parents=True, exist_ok=True)

args = Namespace(
    seed=42,
    save_path=str(save_dir),
    # Preprocessing/encoding settings:
    cat_policy="ordinal",
    num_policy="standard",
    num_nan_policy="mean",
    cat_nan_policy="new",      # must be "new" for classical models
    normalization="standard",  # âœ… added: required by data_norm_process()
    # Config (as TALENT expects)
    config={
        "model": {
            "n_estimators": 200,
            "max_depth": None,
            "n_jobs": -1,
        },
        "fit": {
            "n_bins": 10,  # âœ… required
        },
    },
)

# ------------------------------------------------------------
# 6) Train + evaluate RandomForest
# ------------------------------------------------------------
is_regression = False
model = RFClass(args, is_regression)

_ = model.fit(data=(N, C, Y), info=y_info, train=True)

vres, metric_names, test_logits = model.predict(data=(N, C, Y), info=y_info, model_name="RandomForest")

print("\nðŸ“Š TALENT Metrics on TEST:")
for name, val in zip(metric_names, vres):
    try:
        val = float(val)
    except Exception:
        pass
    print(f"{name:>15}: {val:.5f}")


ModuleNotFoundError: No module named 'src'