In [None]:

# 02_preprocessing.ipynb
%pip install -q pandas numpy scikit-learn pyarrow

import pandas as pd, numpy as np
from sklearn.model_selection import train_test_split
from pathlib import Path

# Input and output locations, relative to the current working directory.
RAW = Path("data/raw")
PROC = Path("data/processed")
# Create the output folder (and any missing parents) before anything is written to it.
PROC.mkdir(parents=True, exist_ok=True)

# LendingClub
# Load the raw export, derive a binary `default` target, clean the text and
# numeric feature columns, then write stratified 80/10/10 train/valid/test
# splits as parquet files under PROC.
lc = pd.read_csv(RAW / "lendingclub.csv", low_memory=False)

# Only loans whose status is in one of these two sets are kept; every other
# status is dropped.
positive = {"Charged Off", "Default"}
negative = {"Fully Paid"}
lc = lc[lc["loan_status"].isin(positive | negative)].copy()
# Vectorised membership test instead of a per-row Python lambda.
lc["default"] = lc["loan_status"].isin(positive).astype("int64")

# Free-text field: use "description" if present, otherwise "desc", otherwise
# an empty string for every row.
_text_col = next((c for c in ("description", "desc") if c in lc.columns), None)
lc["description"] = lc[_text_col] if _text_col is not None else ""
lc["description"] = lc["description"].fillna("").astype(str)


def _coerce_numeric(col):
    """Convert *col* to numbers, salvaging text-formatted values.

    Values pandas can parse directly are kept as-is. For the remaining
    non-null values (for example strings shaped like "36 months", "13.5%" or
    "940xx") the first number found in the text is used. Values that contain
    no number at all become NaN.
    """
    parsed = pd.to_numeric(col, errors="coerce")
    needs_fallback = parsed.isna() & col.notna()
    if not needs_fallback.any():
        return parsed
    first_number = col.astype(str).str.extract(r"(-?\d+(?:\.\d+)?)", expand=False)
    return parsed.fillna(pd.to_numeric(first_number, errors="coerce"))


NUM_COLS_LC = [c for c in lc.columns if c in ["annual_inc","dti","loan_amnt","zip_code","term","emp_length","revol_util","int_rate"]]
if "emp_length" in NUM_COLS_LC:
    # "< 1 year" would otherwise be read as 1 and collide with "1 year"; it is
    # treated as 0 years (the convention in the LendingClub data dictionary).
    lc["emp_length"] = lc["emp_length"].replace({"< 1 year": "0"})
for c in NUM_COLS_LC:
    # Plain `to_numeric(errors="coerce")` would turn a text-formatted column
    # into all-NaN, which median imputation cannot repair.
    lc[c] = _coerce_numeric(lc[c])

# Split first (80% train, then the remaining 20% halved into valid/test),
# stratified on the target so each split keeps the same default rate.
lc_train, temp = train_test_split(lc, test_size=0.2, random_state=42, stratify=lc["default"])
lc_valid, lc_test = train_test_split(temp, test_size=0.5, random_state=42, stratify=temp["default"])

# Impute with medians learned from the training split only, so that no
# statistics from the validation/test rows leak into training.
if NUM_COLS_LC:
    # dropna(): a column with no usable training values has no median to fill with.
    lc_medians = lc_train[NUM_COLS_LC].median(numeric_only=True).dropna()
    lc_train = lc_train.fillna(lc_medians)
    lc_valid = lc_valid.fillna(lc_medians)
    lc_test = lc_test.fillna(lc_medians)

lc_train.to_parquet(PROC / "lc_train.parquet", index=False)
lc_valid.to_parquet(PROC / "lc_valid.parquet", index=False)
lc_test.to_parquet(PROC / "lc_test.parquet", index=False)

# German Credit
# Optional dataset: processed only if a matching CSV exists under RAW.
# Files inside a "*german*" sub-folder take priority over top-level files;
# each group is sorted so the chosen file does not depend on filesystem order.
g_candidates = sorted(RAW.glob("*german*/*.csv")) + sorted(RAW.glob("*german*.csv"))
if g_candidates:
    g = pd.read_csv(g_candidates[0])

    # First recognised target column name, or None if the file has none.
    ycol = next((c for c in ["Risk", "class", "creditability", "target"] if c in g.columns), None)
    if ycol:
        # Encode the target as 0 = good, 1 = bad. Both the textual labels and
        # the 1/2 numeric coding are accepted.
        label_map = {"good": 0, "bad": 1, "1": 0, "2": 1}
        raw_labels = (
            g[ycol]
            .astype(str)
            .str.strip()
            .str.lower()
            # A float-typed column renders 1/2 as "1.0"/"2.0".
            .str.replace(r"\.0$", "", regex=True)
        )
        mapped = raw_labels.map(label_map)
        unmapped = mapped.isna()
        if unmapped.any():
            # Previously these rows were silently relabelled as 0 ("good"),
            # which corrupts the target; fail loudly instead.
            unknown = sorted(raw_labels[unmapped].unique())
            raise ValueError(
                f"Unrecognised values in target column {ycol!r} of "
                f"{g_candidates[0]}: {unknown}; expected one of {sorted(label_map)}"
            )
        g[ycol] = mapped.astype("int64")

    NUM_COLS_GE = [c for c in g.columns if pd.api.types.is_numeric_dtype(g[c])]

    # 80/10/10 split, stratified on the target when one was found.
    g_train, g_temp = train_test_split(g, test_size=0.2, random_state=42, stratify=g[ycol] if ycol else None)
    g_valid, g_test = train_test_split(g_temp, test_size=0.5, random_state=42, stratify=g_temp[ycol] if ycol else None)

    # Impute numeric gaps with training-split medians only, to avoid leaking
    # validation/test statistics into training.
    if NUM_COLS_GE:
        ge_medians = g_train[NUM_COLS_GE].median(numeric_only=True).dropna()
        g_train = g_train.fillna(ge_medians)
        g_valid = g_valid.fillna(ge_medians)
        g_test = g_test.fillna(ge_medians)

    g_train.to_parquet(PROC / "ge_train.parquet", index=False)
    g_valid.to_parquet(PROC / "ge_valid.parquet", index=False)
    g_test.to_parquet(PROC / "ge_test.parquet", index=False)

# Final status line pointing at the output directory.
print(f"Saved processed splits to: {PROC}")
