In [None]:
# ==========================================================
# 1. Imports and environment setup
# ==========================================================
import sys
from pathlib import Path
import numpy as np
from pprint import pprint

# Ensure we can import from the src folder (project root = parent of notebooks/).
# NOTE: this uses the current working directory, so it presumably expects the
# kernel to be started inside notebooks/ - confirm when running from elsewhere.
PROJECT_ROOT = Path.cwd().parent
# Guard the append so that re-running this cell does not keep adding the same
# directory to sys.path.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

# Must come after the sys.path tweak above, otherwise `src` is not importable.
from src.data.data_feeder import DataFeeder

# Display settings for cleaner numpy output
np.set_printoptions(precision=3, suppress=True, linewidth=120)

print("🚀 Starting DataFeeder test within notebook...\n")


# ==========================================================
# 2. Helper function for inspecting folds
# ==========================================================
def inspect_folds(folds: dict, description: str):
    """
    Print the structure and array shapes of every fold in ``folds``.

    Parameters
    ----------
    folds : dict
        Mapping ``fold_id -> ((N, C, y), info)``. ``N`` (numerical features)
        and ``C`` (categorical features) are either ``None`` or mappings with
        the keys ``'train'``, ``'val'`` and ``'test'``; ``y`` (targets) is
        indexed with the same three keys unconditionally. Any individual
        split value may be ``None``.
    description : str
        Title printed in the banner above the per-fold reports.

    Returns
    -------
    None
        The report is written to standard output only.
    """
    splits = ("train", "val", "test")

    def safe_shape(x):
        # A split can be None; report that instead of failing on `.shape`.
        return None if x is None else x.shape

    def safe_mean(x):
        # Same None tolerance as safe_shape: np.mean(None) raises TypeError,
        # so a missing split is reported as "n/a" rather than crashing.
        return "n/a" if x is None else f"{np.mean(x):.3f}"

    def print_shapes(parts):
        # One bullet per split, split name left-aligned to 5 characters.
        for split in splits:
            print(f"  • {split:<5}: {safe_shape(parts[split])}")

    print("\n==============================")
    print(f" {description}")
    print("==============================")

    for fold_id, ((N, C, y), info) in folds.items():
        print(f"\n📂 Fold {fold_id}:")
        print("Info:")
        pprint(info)

        # --- Numerical Features ---
        if N is not None:
            print("\nNumerical features:")
            print_shapes(N)
        else:
            print("\nNumerical features: None")

        # --- Categorical Features ---
        if C is not None:
            print("\nCategorical features:")
            print_shapes(C)
        else:
            print("\nCategorical features: None")

        # --- Targets ---
        print("\nTargets:")
        print_shapes(y)
        print(
            f"  Target mean (train): {safe_mean(y['train'])}"
            f" | (test): {safe_mean(y['test'])}"
        )

    print("\n✅ Fold inspection complete.\n")


# ==========================================================
# 3. TEST CASE 1 — PD (classification)
# ==========================================================
print("🧪 Testing PD (classification) dataset — single split (cv=1)")

# Single train/val/test split; this is the only configuration in the notebook
# that passes `sampling`.
feeder_pd_single = DataFeeder(
    task="pd",
    dataset="0014.hmeq",    # Replace with a dataset available locally
    test_size=0.2,
    val_size=0.2,
    cv_splits=1,
    seed=42,
    sampling=0.3,           # optional resampling for imbalance
)
folds_pd_single = feeder_pd_single.prepare()
inspect_folds(folds_pd_single, "PD dataset — single split (cv=1)")


print("🧪 Testing PD (classification) dataset — cross-validation (cv=3)")

# Same dataset and seed as above, but with three CV splits and no `sampling`.
feeder_pd_cv = DataFeeder(
    task="pd",
    dataset="0014.hmeq",
    test_size=0.2,          # ignored in CV mode
    val_size=0.2,
    cv_splits=3,
    seed=42,
)
folds_pd_cv = feeder_pd_cv.prepare()
inspect_folds(folds_pd_cv, "PD dataset — 3-fold CV")


# ==========================================================
# 4. TEST CASE 2 — LGD (regression)
# ==========================================================
print("🧪 Testing LGD (regression) dataset — single split (cv=1)")

# Single train/val/test split for the regression task.
feeder_lgd_single = DataFeeder(
    task="lgd",
    dataset="0001.heloc",   # Replace with a regression dataset you have
    test_size=0.25,
    val_size=0.2,
    cv_splits=1,
    seed=123,
)
folds_lgd_single = feeder_lgd_single.prepare()
inspect_folds(folds_lgd_single, "LGD dataset — single split (cv=1)")


print("🧪 Testing LGD (regression) dataset — cross-validation (cv=3)")

# Same dataset and seed as above, but with three CV splits.
feeder_lgd_cv = DataFeeder(
    task="lgd",
    dataset="0001.heloc",
    test_size=0.25,         # ignored in CV mode
    val_size=0.2,
    cv_splits=3,
    seed=123,
)
folds_lgd_cv = feeder_lgd_cv.prepare()
inspect_folds(folds_lgd_cv, "LGD dataset — 3-fold CV")


# ==========================================================
# 5. Summary of test results
# ==========================================================
# NOTE: these lines are printed unconditionally; nothing is asserted here.
# Reaching this point only means that none of the calls above raised.
print("\n🎯 Summary of DataFeeder Tests:")
print("- ✅ DataFeeder successfully loaded and preprocessed all datasets.")
print("- ✅ Output structure is fully TALENT-compatible (dicts with train/val/test).")
print("- ✅ Single-split and CV modes both operational.")
print("- ✅ Feature and target shapes verified per split.")
print("- ✅ Target distributions printed for sanity checking.")
print("\n🎉 All DataFeeder tests completed successfully within notebook.\n")


🚀 Starting DataFeeder test within notebook...

🧪 Testing PD (classification) dataset — single split (cv=1)

 PD dataset — single split (cv=1)

📂 Fold 1:
Info:
{'n_cat_features': 2, 'n_num_features': 10, 'task_type': 'classification'}

Numerical features:
  • train: (2536, 10)
  • val  : (634, 10)
  • test : (1192, 10)

Categorical features:
  • train: (2536, 2)
  • val  : (634, 2)
  • test : (1192, 2)

Targets:
  • train: (2536,)
  • val  : (634,)
  • test : (1192,)
  Target mean (train): 0.300 | (test): 0.200

✅ Fold inspection complete.

🧪 Testing PD (classification) dataset — cross-validation (cv=3)

 PD dataset — 3-fold CV

📂 Fold 1:
Info:
{'n_cat_features': 2, 'n_num_features': 10, 'task_type': 'classification'}

Numerical features:
  • train: (3178, 10)
  • val  : (795, 10)
  • test : (1987, 10)

Categorical features:
  • train: (3178, 2)
  • val  : (795, 2)
  • test : (1987, 2)

Targets:
  • train: (3178,)
  â€¢ val  : (79