In [1]:
# ================================================================
# TALENT PROCESSED DATA EXPLORATION NOTEBOOK
# ================================================================
# This notebook:
#   • Automatically finds all processed datasets in data/processed/{pd, lgd}
#   • Uses ProcessedDataInspector to run TALENT compatibility checks
#   • Pretty-prints detailed reports per dataset
#   • Produces a final summary table
# ================================================================

# ---------------------------------------------------------------
# 1️⃣ Setup imports and paths
# ---------------------------------------------------------------
import sys
from pathlib import Path
import pandas as pd

# Determine the repo root: when the notebook is run from the notebooks/ folder
# the root is one level up; otherwise the working directory is taken as root.
def find_repo_root(start: Path) -> Path:
    """Return the repository root inferred from the directory *start*.

    Only the final path component is compared against ``"notebooks"``.
    A substring test on the whole path string would also match unrelated
    names such as ``/home/me/notebooks_archive/repo`` and wrongly step one
    level up from the actual root.

    Args:
        start: Absolute directory the notebook is executed from.

    Returns:
        ``start.parent`` if *start* is itself named ``notebooks``,
        otherwise *start* unchanged.
    """
    return start.parent if start.name == "notebooks" else start


repo_root = find_repo_root(Path().resolve())
src_path = repo_root / "src"

# repo_root on sys.path is what lets the `src.` package import below resolve.
# src_path is inserted second, so it ends up ahead of repo_root in sys.path.
for import_dir in (repo_root, src_path):
    if str(import_dir) not in sys.path:
        sys.path.insert(0, str(import_dir))

print("✅ Import paths added:")
for p in [repo_root, src_path]:
    print(f" - {p}")

# Import our inspector
from src.data.processed_data_exploration import ProcessedDataInspector


# ---------------------------------------------------------------
# 2️⃣ Discover all processed datasets
# ---------------------------------------------------------------
processed_root = repo_root / "data" / "processed"
print(f"\n📁 Using processed data root: {processed_root}")

tasks = ["pd", "lgd"]

# Map each task to the sorted names of the sub-directories found under it.
# The task folder is tested with is_dir() rather than exists(): a stray *file*
# named "pd" or "lgd" would pass exists() and then make iterdir() raise
# NotADirectoryError. Tasks without a folder are simply left out of the dict.
datasets = {
    t: sorted(d.name for d in (processed_root / t).iterdir() if d.is_dir())
    for t in tasks
    if (processed_root / t).is_dir()
}

print("\n📦 Discovered processed datasets:")
for t, names in datasets.items():
    print(f"  {t.upper()}: {len(names)} datasets found → {names}")


# ---------------------------------------------------------------
# 3️⃣ Inspect each processed dataset
# ---------------------------------------------------------------
# One entry per successfully inspected dataset: whatever
# ProcessedDataInspector.pretty_print() returns (presumably a dict-like
# report, since the list is later fed to pd.DataFrame — confirm in the class).
reports = []
# (task, dataset name) pairs whose inspection raised, so the closing message
# does not claim success when some datasets were skipped.
failed = []

for task, ds_list in datasets.items():
    for ds_name in ds_list:
        print("\n" + "=" * 70)
        print(f"🔍 Inspecting processed dataset: {ds_name} [{task.upper()}]")
        print("=" * 70)
        try:
            inspector = ProcessedDataInspector(ds_name, task, processed_root=processed_root)
            report = inspector.pretty_print()
        except Exception as e:
            # Notebook-level boundary: report the problem and keep going so
            # one broken dataset does not abort the whole sweep.
            print(f"❌ Failed to inspect {ds_name}: {e}")
            failed.append((task, ds_name))
        else:
            reports.append(report)

if failed:
    print(f"\n⚠️ {len(failed)} processed dataset(s) could not be inspected: {failed}")
else:
    print("\n✅ All processed datasets checked.")


# ---------------------------------------------------------------
# 4️⃣ Build summary DataFrame
# ---------------------------------------------------------------
if reports:
    # One row per inspected dataset, restricted to the headline columns and
    # shown in this fixed order.
    df_summary = pd.DataFrame(reports)
    df_summary = df_summary[
        ["dataset", "task", "n_samples", "n_num_features", "n_cat_features", "issues"]
    ]
    print("\n📊 TALENT processed data summary:")
    # Lift pandas' column-width limit for this render only; with the default
    # limit the `issues` text is cut off with "..." and cannot be read.
    # `display` is the function IPython/Jupyter injects into the namespace.
    with pd.option_context("display.max_colwidth", None):
        display(df_summary)
else:
    print("⚠️ No valid processed datasets found — nothing to summarize.")


✅ Import paths added:
 - /Users/anderas/Documents/PhD/Projects/1. TabPFN/TabPFNCredit
 - /Users/anderas/Documents/PhD/Projects/1. TabPFN/TabPFNCredit/src

📁 Using processed data root: /Users/anderas/Documents/PhD/Projects/1. TabPFN/TabPFNCredit/data/processed

📦 Discovered processed datasets:
  PD: 14 datasets found → ['0001.gmsc', '0002.taiwan_creditcard', '0003.vehicle_loan', '0004.lendingclub', '0005.Case Study', '0006.myhom', '0007.hackerearth', '0008.cobranded', '0009.german', '0010.bank_status', '0011.thomas', '0012.loan_default', '0013.home_credit', '0014.hmeq']
  LGD: 5 datasets found → ['0001.heloc', '0002.loss2', '0003.axa', '0004.base_model', '0005.base_modelisation']

🔍 Inspecting processed dataset: 0001.gmsc [PD]

📦 Dataset: 0001.gmsc [PD]
📁 Path: /Users/anderas/Documents/PhD/Projects/1. TabPFN/TabPFNCredit/data/processed/pd/0001.gmsc
🧮 Samples: 150000 | Num features: 10 | Cat features: 0
⚠️ Issues:
   - ✅ All TALENT compatibility checks passed.
---------------------------

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  np.subtract(arr, avg, out=arr, casting='unsafe', where=where)
  sqr = np.multiply(arr, arr, out=arr, where=where)



📦 Dataset: 0012.loan_default [PD]
📁 Path: /Users/anderas/Documents/PhD/Projects/1. TabPFN/TabPFNCredit/data/processed/pd/0012.loan_default
🧮 Samples: 105471 | Num features: 759 | Cat features: 0
⚠️ Issues:
   - N contains infinite values (e.g., at indices [[4, 383], [4, 612], [5, 383], [5, 612], [6, 383]])
   - 1 constant columns in N (first few indices: [663])
----------------------------------------------------------------------

🔍 Inspecting processed dataset: 0013.home_credit [PD]

📦 Dataset: 0013.home_credit [PD]
📁 Path: /Users/anderas/Documents/PhD/Projects/1. TabPFN/TabPFNCredit/data/processed/pd/0013.home_credit
🧮 Samples: 307511 | Num features: 104 | Cat features: 16
⚠️ Issues:
   - C contains 764371 entries with -1 (missing category placeholders).
----------------------------------------------------------------------

🔍 Inspecting processed dataset: 0014.hmeq [PD]

📦 Dataset: 0014.hmeq [PD]
📁 Path: /Users/anderas/Documents/PhD/Projects/1. TabPFN/TabPFNCredit/data/processed/p

Unnamed: 0,dataset,task,n_samples,n_num_features,n_cat_features,issues
0,0001.gmsc,pd,150000,10,0,[✅ All TALENT compatibility checks passed.]
1,0002.taiwan_creditcard,pd,30000,23,0,[✅ All TALENT compatibility checks passed.]
2,0003.vehicle_loan,pd,233154,32,3,[C contains 7661 entries with -1 (missing cate...
3,0004.lendingclub,pd,9578,12,1,[✅ All TALENT compatibility checks passed.]
4,0005.Case Study,pd,150000,10,1,[C contains 82 entries with -1 (missing catego...
5,0006.myhom,pd,7000,7,1,[✅ All TALENT compatibility checks passed.]
6,0007.hackerearth,pd,532428,35,8,[✅ All TALENT compatibility checks passed.]
7,0008.cobranded,pd,80000,47,0,[✅ All TALENT compatibility checks passed.]
8,0009.german,pd,999,7,13,[✅ All TALENT compatibility checks passed.]
9,0010.bank_status,pd,100000,16,0,[✅ All TALENT compatibility checks passed.]
