In [1]:
# ================================================================
# RAW DATA EXPLORATION (Notebook-Friendly Runner)
# ================================================================
# This script:
#   - Automatically finds all raw datasets (PD + LGD)
#   - Runs RawDataInspector on each
#   - Prints concise summaries to console
#   - Keeps all reports in memory for later analysis
# ================================================================

# ---------------------------------------------------------------
# 1️⃣ Environment Setup — fix imports
# ---------------------------------------------------------------
import sys
from pathlib import Path
import pandas as pd

# ---------------------------------------------------------------
# 1️⃣ Fix Python path (works from notebooks/ or from the repo root)
# ---------------------------------------------------------------
def find_repo_root(start, marker="src"):
    """Return the nearest directory at or above *start* that contains *marker*.

    Args:
        start: Directory where the upward search begins.
        marker: Name of a sub-directory that identifies the repo root
            (the import below needs ``src``).

    Returns:
        The first directory, walking from *start* towards the filesystem
        root, that has a *marker* sub-directory.  If none is found, the
        parent of *start* is returned, which is the original
        "one level up from notebooks/" behaviour.
    """
    start = Path(start)
    for candidate in (start, *start.parents):
        if (candidate / marker).is_dir():
            return candidate
    return start.parent


# A script run has __file__, so search from the file's own directory;
# a notebook kernel does not, so search from the current working directory.
start_dir = Path(__file__).resolve().parent if "__file__" in globals() else Path.cwd()
repo_root = find_repo_root(start_dir)

# Add the repo root to Python's module search path (only once, so re-running
# the cell does not keep prepending the same entry).
if str(repo_root) not in sys.path:
    sys.path.insert(0, str(repo_root))
    print(f"✅ Added repo root to sys.path: {repo_root}")
else:
    print(f"ℹ️ Repo root already in sys.path: {repo_root}")

# Now imports from 'src' will work
from src.data.raw_data_exploration import RawDataInspector

# ---------------------------------------------------------------
# 2️⃣ Setup
# ---------------------------------------------------------------
raw_root = repo_root.joinpath("data", "raw")
print(f"📁 Using raw data root: {raw_root}")

# ---------------------------------------------------------------
# 3️⃣ Discover available datasets
# ---------------------------------------------------------------
# For each task folder, collect the sorted stems of its *.csv files.
# A task whose folder does not exist maps to an empty list.
datasets = {
    task_key: (
        sorted(csv_file.stem for csv_file in (raw_root / task_key).glob("*.csv"))
        if (raw_root / task_key).exists()
        else []
    )
    for task_key in ("pd", "lgd")
}

print("📦 Discovered datasets:")
for task_key, stems in datasets.items():
    print(f"  {task_key.upper()}: {len(stems)} datasets found → {stems}")

# ---------------------------------------------------------------
# 4️⃣ Run inspector on each dataset
# ---------------------------------------------------------------
reports = []
banner = "=" * 70

# Walk every (task, dataset) pair in discovery order.
task_dataset_pairs = (
    (task_key, stem) for task_key, stems in datasets.items() for stem in stems
)

for task, dataset_name in task_dataset_pairs:
    print("\n" + banner)
    print(f"🔍 Inspecting {dataset_name} [{task.upper()}]")
    print(banner)

    # Skip datasets whose CSV is missing at the expected location.
    dataset_path = raw_root / task / f"{dataset_name}.csv"
    if not dataset_path.exists():
        print(f"❌ Expected file not found: {dataset_path}")
        continue

    try:
        # The task is passed explicitly to avoid auto-inference issues.
        inspector = RawDataInspector(dataset_name, task=task, raw_root=str(raw_root))
        report = inspector.summarize()
        inspector.pretty_print()
    except FileNotFoundError as missing:
        print(f"❌ FileNotFoundError: {missing}")
    except pd.errors.EmptyDataError:
        print(f"⚠️ Warning: {dataset_name} appears to be empty or malformed.")
    except Exception as err:
        # Top-level boundary: report the failure and move on to the next dataset.
        print(f"❌ Unexpected error while inspecting {dataset_name}: {type(err).__name__} → {err}")
    else:
        # Keep the report only when summarising and printing both succeeded.
        reports.append(report)

print("\n✅ Exploration complete.")

# ---------------------------------------------------------------
# 5️⃣ (Optional) Convert to DataFrame for tabular viewing
# ---------------------------------------------------------------
if reports:
    # Columns shown in the overview table, in display order.
    summary_columns = [
        "dataset", "task", "n_rows", "n_columns",
        "columns_with_missing", "rows_with_missing",
        "constant_columns", "memory_usage_MB",
    ]
    summary_df = pd.DataFrame(reports)[summary_columns]
    print("\n📊 Summary table of all datasets:")

    # `display` is only injected into the namespace by IPython/Jupyter.
    # Import it explicitly and fall back to plain text so the same code
    # also finishes cleanly when executed as a regular script.
    try:
        from IPython.display import display as show_table
    except ImportError:
        show_table = None

    if show_table is not None:
        show_table(summary_df)
    else:
        print(summary_df.to_string())
else:
    print("\n⚠️ No successful inspections — check dataset paths or encodings.")


✅ Added repo root to sys.path: /Users/anderas/Documents/PhD/Projects/1. TabPFN/TabPFNCredit
📁 Using raw data root: /Users/anderas/Documents/PhD/Projects/1. TabPFN/TabPFNCredit/data/raw
📦 Discovered datasets:
  PD: 14 datasets found → ['0001.gmsc', '0002.taiwan_creditcard', '0003.vehicle_loan', '0004.lendingclub', '0005.Case Study', '0006.myhom', '0007.hackerearth', '0008.cobranded', '0009.german', '0010.bank_status', '0011.thomas', '0012.loan_default', '0013.home_credit', '0014.hmeq']
  LGD: 5 datasets found → ['0001.heloc', '0002.loss2', '0003.axa', '0004.base_model', '0005.base_modelisation']

🔍 Inspecting 0001.gmsc [PD]
🔍 Dataset: 0001.gmsc  |  Task: PD
📊 Shape: 150000 rows × 11 columns
🧩 Dtypes: {dtype('int64'): 7, dtype('float64'): 4}
❓ Missing: 2 cols, 29731 rows
💾 Memory: 12.59 MB
⚠️ Issues: 609 duplicate rows detected (first few indices: [1669, 7823, 7920, 8840, 10869, 14067, 14465, 14874, 15346, 15544]); 2 columns contain missing values: ['MonthlyIncome', 'NumberOfDependents']

  self.df = pd.read_csv(self.file_path)


🔍 Dataset: 0008.cobranded  |  Task: PD
📊 Shape: 80000 rows × 49 columns
🧩 Dtypes: {dtype('O'): 36, dtype('float64'): 10, dtype('int64'): 3}
❓ Missing: 10 cols, 46413 rows
💾 Memory: 171.8 MB
⚠️ Issues: 1 high-cardinality columns (>90% unique): ['application_key']; 10 columns contain missing values: ['mvar2', 'mvar3', 'mvar4', 'mvar5', 'mvar21', 'mvar22', 'mvar23', 'mvar24', 'mvar33', 'mvar44'] | Example row indices with NaN: [0, 2, 3, 4, 5, 6, 7, 9, 10, 14]
------------------------------------------------------------

🔍 Inspecting 0009.german [PD]
🔍 Dataset: 0009.german  |  Task: PD
📊 Shape: 999 rows × 21 columns
🧩 Dtypes: {dtype('O'): 13, dtype('int64'): 8}
❓ Missing: 0 cols, 0 rows
💾 Memory: 0.81 MB
⚠️ Issues: 1 high-cardinality columns (>90% unique): ['1169']
------------------------------------------------------------

🔍 Inspecting 0010.bank_status [PD]
🔍 Dataset: 0010.bank_status  |  Task: PD
📊 Shape: 100514 rows × 19 columns
🧩 Dtypes: {dtype('float64'): 12, dtype('O'): 7}
❓ Missin

  self.df = pd.read_csv(self.file_path)


🔍 Dataset: 0012.loan_default  |  Task: PD
📊 Shape: 105471 rows × 771 columns
🧩 Dtypes: {dtype('float64'): 653, dtype('int64'): 99, dtype('O'): 19}
❓ Missing: 525 cols, 53531 rows
💾 Memory: 748.54 MB
⚠️ Issues: 11 constant columns: ['f33', 'f34', 'f35', 'f37', 'f38', 'f678', 'f700', 'f701', 'f702', 'f736']...; 73 high-cardinality columns (>90% unique): ['id', 'f3', 'f44', 'f54', 'f139', 'f216', 'f217', 'f278', 'f363', 'f364']...; 525 columns contain missing values: ['f7', 'f8', 'f14', 'f15', 'f17', 'f18', 'f19', 'f20', 'f21', 'f22']... | Example row indices with NaN: [0, 1, 3, 5, 12, 20, 22, 23, 24, 32]
------------------------------------------------------------

🔍 Inspecting 0013.home_credit [PD]
🔍 Dataset: 0013.home_credit  |  Task: PD
📊 Shape: 307511 rows × 122 columns
🧩 Dtypes: {dtype('float64'): 65, dtype('int64'): 41, dtype('O'): 16}
❓ Missing: 67 cols, 298909 rows
💾 Memory: 536.69 MB
⚠️ Issues: 1 high-cardinality columns (>90% unique): ['SK_ID_CURR']; 67 columns contain missing 

  self.df = pd.read_csv(self.file_path)


🔍 Dataset: 0002.loss2  |  Task: LGD
📊 Shape: 4802 rows × 72 columns
🧩 Dtypes: {dtype('float64'): 43, dtype('O'): 25, dtype('int64'): 4}
❓ Missing: 34 cols, 4802 rows
💾 Memory: 8.67 MB
⚠️ Issues: 18 high-cardinality columns (>90% unique): ['UPB_At_Resolution', 'Unpaid_Interest', 'Total_Debt', '_adv_interest1M', '_adv_interest', '_ELAO', '_Accrued_int', '_EAD', '_Net_sales_Proceeds', '_Miclaimbal']...; 34 columns contain missing values: ['State', 'UPB_At_Resolution', 'Servicing_Loss', 'REO_Sales_Price', 'Original_Appraised_Value', 'REO_Appraisal_Amount', 'REO_Appraisal_Date', 'Analysis_Age', 'Analyst', 'Recourse_Type']... | Example row indices with NaN: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
------------------------------------------------------------

🔍 Inspecting 0003.axa [LGD]
🔍 Dataset: 0003.axa  |  Task: LGD
📊 Shape: 2545 rows × 8 columns
🧩 Dtypes: {dtype('float64'): 8}
❓ Missing: 0 cols, 0 rows
💾 Memory: 0.16 MB
⚠️ Issues: 120 duplicate rows detected (first few indices: [16, 45, 46, 92, 11

Unnamed: 0,dataset,task,n_rows,n_columns,columns_with_missing,rows_with_missing,constant_columns,memory_usage_MB
0,0001.gmsc,pd,150000,11,2,29731,0,12.59
1,0002.taiwan_creditcard,pd,30000,25,0,0,0,5.72
2,0003.vehicle_loan,pd,233154,41,1,7661,1,152.98
3,0004.lendingclub,pd,9578,14,0,0,0,1.6
4,0005.Case Study,pd,150000,12,3,29797,0,21.74
5,0006.myhom,pd,7000,10,1,245,0,0.9
6,0007.hackerearth,pd,532428,45,21,532428,0,679.44
7,0008.cobranded,pd,80000,49,10,46413,0,171.8
8,0009.german,pd,999,21,0,0,0,0.81
9,0010.bank_status,pd,100514,19,19,64091,0,59.12
