In [3]:
# --- TEST 1: Run full TALENT preprocessing on 0001.gmsc (PD) ---

import sys, os
from pathlib import Path
import numpy as np
import pandas as pd

# ----------------------------------------------------------
# 1. Configure project paths
# ----------------------------------------------------------
# Resolve the project root. If __file__ is defined (e.g. the code is run as a
# script) the root is two levels above that file; in a notebook kernel
# __file__ is normally undefined, so fall back to the parent of the current
# working directory (assumes the kernel runs in <project>/notebooks).
project_root = Path(__file__).resolve().parents[1] if "__file__" in locals() else Path.cwd().parents[0]

# Make both the project root and its src/ folder importable. Only append
# entries that are not already present so re-running this cell does not keep
# growing sys.path with duplicates.
src_path = project_root / "src"
for _entry in (str(project_root), str(src_path)):
    if _entry not in sys.path:
        sys.path.append(_entry)

print(f"✅ Added project root: {project_root}")
print(f"✅ Added src path: {src_path}")

# ----------------------------------------------------------
# 2. Import TALENT preprocessing
# ----------------------------------------------------------
from src.data.preprocessing import preprocess_dataset

# ----------------------------------------------------------
# 3. Set up dataset configuration
# ----------------------------------------------------------
# Which task folder and which dataset file this smoke test targets.
task, dataset = "pd", "0001.gmsc"

# The raw CSV is looked up at <project_root>/data/raw/<task>/<dataset>.csv.
raw_path = project_root.joinpath("data", "raw", task, f"{dataset}.csv")
print(f"📂 Expected dataset path: {raw_path}")

# Stop early with an actionable message when the input file is absent.
if not raw_path.exists():
    raise FileNotFoundError(
        f"❌ Dataset not found! Please ensure this file exists:\n{raw_path}\n"
        f"or rename your file accordingly (TALENT expects '{dataset}.csv')."
    )

# ----------------------------------------------------------
# 4. Run TALENT preprocessing
# ----------------------------------------------------------
print(f"\n🚀 Running TALENT preprocessing for {dataset} ({task})...\n")
# preprocess_dataset returns the numerical block (N), the categorical block
# (C), the target vector (y) and a metadata dict (info). N and C may be None.
N, C, y, info = preprocess_dataset(task, dataset)

# ----------------------------------------------------------
# 5. Inspect results
# ----------------------------------------------------------
print("\n✅ Preprocessing successful!")
print(f"\n📊 Dataset info:\n{info}\n")

# Preview at most the first 5 rows x 5 columns of each feature block.
# `display` is the rich-output helper that IPython/Jupyter injects.
if N is not None:
    print(f"🔢 Numerical features: shape={N.shape}, dtype={N.dtype}")
    display(pd.DataFrame(N[:5, :5], columns=info['numerical_cols'][:5]))
else:
    print("No numerical features found (N=None).")

if C is not None:
    print(f"\n🧮 Categorical features: shape={C.shape}, dtype={C.dtype}")
    # Slice the columns as well as the rows: the column labels are truncated
    # to 5, so passing every column of C would raise a shape-mismatch
    # ValueError for datasets with more than 5 categorical features.
    display(pd.DataFrame(C[:5, :5], columns=info['categorical_cols'][:5]))
else:
    print("\nNo categorical features found (C=None).")

print(f"\n🎯 Target vector (y): shape={y.shape}, dtype={y.dtype}")
print("First 10 targets:", y[:10])


✅ Added project root: /Users/anderas/Documents/PhD/Projects/1. TabPFN/TabPFNCredit
✅ Added src path: /Users/anderas/Documents/PhD/Projects/1. TabPFN/TabPFNCredit/src
📂 Expected dataset path: /Users/anderas/Documents/PhD/Projects/1. TabPFN/TabPFNCredit/data/raw/pd/0001.gmsc.csv

🚀 Running TALENT preprocessing for 0001.gmsc (pd)...


✅ Preprocessing successful!

📊 Dataset info:
{'dataset_name': '0001.gmsc', 'task_type': 'classification', 'n_samples': 150000, 'n_num_features': 10, 'n_cat_features': 0, 'numerical_cols': ['RevolvingUtilizationOfUnsecuredLines', 'age', 'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio', 'MonthlyIncome', 'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate', 'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse', 'NumberOfDependents'], 'categorical_cols': []}

🔢 Numerical features: shape=(150000, 10), dtype=float32


Unnamed: 0,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome
0,0.766127,45.0,2.0,0.802982,9120.0
1,0.957151,40.0,0.0,0.121876,2600.0
2,0.65818,38.0,1.0,0.085113,3042.0
3,0.23381,30.0,0.0,0.03605,3300.0
4,0.907239,49.0,1.0,0.024926,63588.0



No categorical features found (C=None).

🎯 Target vector (y): shape=(150000,), dtype=int64
First 10 targets: [1 0 0 0 0 0 0 0 0 0]


In [5]:
import os
import time
import json
import numpy as np
import pandas as pd
from pathlib import Path
import sys
import logging

# -------------------------------------------------------------------
# Silence the logging module: every record at CRITICAL level or below
# (i.e. all standard levels) is suppressed process-wide from here on.
# -------------------------------------------------------------------
logging.disable(level=logging.CRITICAL)

# Import TALENT preprocessing
from src.data.preprocessing import preprocess_dataset

# Detect project paths.
# If the kernel's working directory is the "notebooks" folder itself, the
# project root is its parent; otherwise the working directory is taken as the
# root. Compare the final path component only: a substring test on the whole
# path would also match unrelated ancestors (e.g. ".../my_notebooks_repo/...")
# and wrongly step one level up.
_cwd = Path.cwd()
PROJECT_ROOT = _cwd.parent if _cwd.name == "notebooks" else _cwd
RAW_DIR = PROJECT_ROOT / "data" / "raw"          # input CSV files
PROC_DIR = PROJECT_ROOT / "data" / "processed"   # saved preprocessing outputs

# -------------------------------------------------------------------
# Dataset definitions
# -------------------------------------------------------------------
# Names of the datasets processed under the "pd" task, in run order.
PD_DATASETS = [
    "0001.gmsc",
    "0002.taiwan_creditcard",
    "0003.vehicle_loan",
    "0004.lendingclub",
    "0005.Case Study",
    "0006.myhom",
    "0007.hackerearth",
    "0008.cobranded",
    "0009.german",
    "0010.bank_status",
    "0011.thomas",
    "0012.loan_default",
    "0013.home_credit",
    "0014.hmeq",
]

# Names of the datasets processed under the "lgd" task, in run order.
LGD_DATASETS = [
    "0001.heloc",
    "0002.loss2",
    "0003.axa",
    "0004.base_model",
    "0005.base_modelisation",
]

# -------------------------------------------------------------------
# Helper function to validate dataset outputs
# -------------------------------------------------------------------
def validate_dataset(task, dataset):
    """Check the processed artifacts stored for one dataset.

    Looks in ``PROC_DIR / task / dataset`` for ``y.npy`` and ``info.json``
    (both required) plus ``N.npy`` and ``C.npy`` (both optional), then checks
    that each feature array has as many rows as ``y`` and holds no missing
    values.

    Args:
        task: Name of the task sub-directory under ``PROC_DIR``.
        dataset: Name of the dataset directory under that task.

    Returns:
        A dict with keys ``"dataset"``, ``"task"``, ``"status"`` and
        ``"issues"`` (list of human-readable findings). ``"status"`` is the
        most severe finding, ordered OK < WARN < MISMATCH < FAIL. When both
        required files exist the dict also carries ``"n_samples"``,
        ``"n_num"`` and ``"n_cat"``; if either is missing the FAIL result is
        returned without them.

    Raises:
        Any exception raised by ``json.load`` or ``np.load`` while reading an
        existing file (e.g. invalid JSON, or an object array that cannot be
        loaded with the default ``allow_pickle=False``).
    """
    dataset_dir = PROC_DIR / task / dataset
    results = {"dataset": dataset, "task": task, "status": "✅ OK", "issues": []}

    # Statuses ordered from least to most severe; position gives the rank.
    severity = ["✅ OK", "⚠️ WARN", "⚠️ MISMATCH", "❌ FAIL"]

    def flag(status, issue):
        # Record the issue and raise the overall status, never lowering it, so
        # a later NaN warning cannot mask an earlier row mismatch.
        results["issues"].append(issue)
        if severity.index(status) > severity.index(results["status"]):
            results["status"] = status

    # Check for key files
    for name in ["y.npy", "info.json"]:
        if not (dataset_dir / name).exists():
            flag("❌ FAIL", f"Missing {name}")

    # Nothing further can be validated without both required files; return
    # the FAIL result instead of crashing on the missing one.
    if results["status"] == "❌ FAIL":
        return results

    # Load outputs. info.json is parsed only to confirm it is readable JSON;
    # its contents are not used by the checks below.
    with open(dataset_dir / "info.json") as f:
        json.load(f)
    y = np.load(dataset_dir / "y.npy")
    N = np.load(dataset_dir / "N.npy") if (dataset_dir / "N.npy").exists() else None
    C = np.load(dataset_dir / "C.npy") if (dataset_dir / "C.npy").exists() else None

    # Validate lengths
    n_rows = len(y)
    if N is not None and N.shape[0] != n_rows:
        flag("⚠️ MISMATCH", f"N rows {N.shape[0]} != y {n_rows}")
    if C is not None and C.shape[0] != n_rows:
        flag("⚠️ MISMATCH", f"C rows {C.shape[0]} != y {n_rows}")

    # Check for NaNs. pd.isna is used instead of np.isnan because it also
    # accepts non-numeric arrays (np.isnan raises TypeError on string dtypes).
    if N is not None and pd.isna(N).any():
        flag("⚠️ WARN", "NaN in N")
    if C is not None and pd.isna(C).any():
        flag("⚠️ WARN", "NaN in C")

    results.update({
        "n_samples": n_rows,
        "n_num": N.shape[1] if N is not None else 0,
        "n_cat": C.shape[1] if C is not None else 0,
    })
    return results

# -------------------------------------------------------------------
# Preprocess and validate all datasets
# -------------------------------------------------------------------
# One summary row (dict) per dataset, successful or not.
all_results = []

for task, datasets in [("pd", PD_DATASETS), ("lgd", LGD_DATASETS)]:
    print(f"\n=== Processing {task.upper()} datasets ({len(datasets)}) ===")
    for dataset in datasets:
        # perf_counter is a monotonic, high-resolution clock meant for
        # measuring intervals; time.time() can jump if the system clock is
        # adjusted mid-run.
        start = time.perf_counter()
        try:
            N, C, y, info = preprocess_dataset(task, dataset)
            # Time only the preprocessing call, not the validation below.
            duration = time.perf_counter() - start
            # validate_dataset reads the artifacts back from PROC_DIR.
            result = validate_dataset(task, dataset)
            result["time_sec"] = round(duration, 2)
            all_results.append(result)
            print(f"   ✅ {dataset:20s} | n={len(y):6d} | num={len(info['numerical_cols']):3d} | cat={len(info['categorical_cols']):3d} | {duration:.2f}s")
        except Exception as e:
            # Deliberately broad so one bad dataset does not abort the batch.
            # Note this also catches errors raised by validate_dataset, not
            # only by preprocess_dataset; the failure is kept as an ERROR row.
            print(f"   ❌ {dataset:20s} | ERROR: {e}")
            all_results.append({"dataset": dataset, "task": task, "status": f"ERROR: {e}", "issues": []})

# -------------------------------------------------------------------
# Summarize results
# -------------------------------------------------------------------
# Collect the per-dataset rows into one table and show it together with a
# count of each distinct status string.
results_df = pd.DataFrame(all_results)
print("\n✅ Summary of all datasets:\n")
display(results_df)
print(results_df["status"].value_counts())

# Keep every row whose status mentions a failure, error, mismatch or warning.
fails = results_df.loc[results_df["status"].str.contains("FAIL|ERROR|MISMATCH|WARN")]
if fails.empty:
    print("\n🎉 All datasets processed successfully!")
else:
    print("\n⚠️ Issues detected:")
    display(fails)



=== Processing PD datasets (14) ===
   ✅ 0001.gmsc            | n=150000 | num= 10 | cat=  0 | 0.12s
   ✅ 0002.taiwan_creditcard | n= 30000 | num= 23 | cat=  0 | 0.05s
   ❌ 0003.vehicle_loan    | ERROR: "['AVERAGE.ACCT.AGE', 'CREDIT.HISTORY.LENGTH'] not in index"
   ✅ 0004.lendingclub     | n=  9578 | num= 12 | cat=  1 | 0.02s
   ✅ 0005.Case Study      | n=150000 | num= 10 | cat=  1 | 0.12s
   ✅ 0006.myhom           | n=  7000 | num=  7 | cat=  1 | 0.01s
   ✅ 0007.hackerearth     | n=532428 | num= 27 | cat= 10 | 2.78s


  df = pd.read_csv(dataset_path)


   ✅ 0008.cobranded       | n= 80000 | num= 47 | cat=  0 | 0.69s
   ❌ 0009.german          | ERROR: Length mismatch: Expected axis has 1 elements, new values have 0 elements
   ❌ 0010.bank_status     | ERROR: Object arrays cannot be loaded when allow_pickle=False
   ❌ 0011.thomas          | ERROR: Expected 'BAD' in Thomas dataset.


  df = pd.read_csv(dataset_path)


   ❌ 0012.loan_default    | ERROR: "['f678'] not in index"
   ✅ 0013.home_credit     | n=307511 | num=104 | cat= 16 | 2.36s
   ✅ 0014.hmeq            | n=  5960 | num= 10 | cat=  2 | 0.01s

=== Processing LGD datasets (5) ===
   ✅ 0001.heloc           | n= 67898 | num=  9 | cat=  0 | 0.09s
   ✅ 0002.loss2           | n=  4638 | num= 39 | cat= 23 | 0.07s
   ✅ 0003.axa             | n=  2545 | num=  2 | cat=  0 | 0.01s


  df = pd.read_csv(dataset_path)


   ❌ 0004.base_model      | ERROR: "['TaxFreeTurnover', 'Parent_TaxFreeTurnover', 'FCLT_DebtSubordinatedPrct', 'FCLT_BalloonTotalAmountPrct', 'top_ncj'] not in index"
   ❌ 0005.base_modelisation | ERROR: "['PCRU', 'topbaloispur', 'SIGNE_SPREAD_E', 'MY1_SIGNE_SPREAD_E', 'PY1_SIGNE_SPREAD_E'] not in index"

✅ Summary of all datasets:



Unnamed: 0,dataset,task,status,issues,n_samples,n_num,n_cat,time_sec
0,0001.gmsc,pd,⚠️ WARN,[NaN in N],150000.0,10.0,0.0,0.12
1,0002.taiwan_creditcard,pd,✅ OK,[],30000.0,23.0,0.0,0.05
2,0003.vehicle_loan,pd,"ERROR: ""['AVERAGE.ACCT.AGE', 'CREDIT.HISTORY.L...",[],,,,
3,0004.lendingclub,pd,✅ OK,[],9578.0,12.0,1.0,0.02
4,0005.Case Study,pd,⚠️ WARN,[NaN in N],150000.0,10.0,1.0,0.12
5,0006.myhom,pd,⚠️ WARN,[NaN in N],7000.0,7.0,1.0,0.01
6,0007.hackerearth,pd,⚠️ WARN,[NaN in N],532428.0,27.0,10.0,2.78
7,0008.cobranded,pd,⚠️ WARN,[NaN in N],80000.0,47.0,0.0,0.69
8,0009.german,pd,ERROR: Length mismatch: Expected axis has 1 el...,[],,,,
9,0010.bank_status,pd,ERROR: Object arrays cannot be loaded when all...,[],,,,


status
⚠️ WARN                                                                                                                                       9
✅ OK                                                                                                                                          3
ERROR: "['AVERAGE.ACCT.AGE', 'CREDIT.HISTORY.LENGTH'] not in index"                                                                           1
ERROR: Length mismatch: Expected axis has 1 elements, new values have 0 elements                                                              1
ERROR: Object arrays cannot be loaded when allow_pickle=False                                                                                 1
ERROR: Expected 'BAD' in Thomas dataset.                                                                                                      1
ERROR: "['f678'] not in index"                                                                                                   

Unnamed: 0,dataset,task,status,issues,n_samples,n_num,n_cat,time_sec
0,0001.gmsc,pd,⚠️ WARN,[NaN in N],150000.0,10.0,0.0,0.12
2,0003.vehicle_loan,pd,"ERROR: ""['AVERAGE.ACCT.AGE', 'CREDIT.HISTORY.L...",[],,,,
4,0005.Case Study,pd,⚠️ WARN,[NaN in N],150000.0,10.0,1.0,0.12
5,0006.myhom,pd,⚠️ WARN,[NaN in N],7000.0,7.0,1.0,0.01
6,0007.hackerearth,pd,⚠️ WARN,[NaN in N],532428.0,27.0,10.0,2.78
7,0008.cobranded,pd,⚠️ WARN,[NaN in N],80000.0,47.0,0.0,0.69
8,0009.german,pd,ERROR: Length mismatch: Expected axis has 1 el...,[],,,,
9,0010.bank_status,pd,ERROR: Object arrays cannot be loaded when all...,[],,,,
10,0011.thomas,pd,ERROR: Expected 'BAD' in Thomas dataset.,[],,,,
11,0012.loan_default,pd,"ERROR: ""['f678'] not in index""",[],,,,
