In [None]:
import pandas as pd
from pathlib import Path

In [None]:
# Resolve every location from the parent of the current working directory so
# that no absolute, machine-specific path is baked into the notebook.
# NOTE(review): this assumes the notebook is launched from a folder one level
# below the project root — confirm against the repository layout.
PROJECT_ROOT = Path.cwd().parent
DATA_PATH = PROJECT_ROOT.joinpath("data", "data.csv")
SHORTLIST_PATH = PROJECT_ROOT.joinpath("data", "feature_shortlist.csv")

# Read the full dataset into a DataFrame.
df = pd.read_csv(DATA_PATH)

In [None]:
# Name of the target column; it is removed from the frame below to form X.
TARGET_COL = "Bankrupt?"

# Stop immediately if the loaded frame does not contain the target column.
if TARGET_COL not in df.columns:
    raise ValueError(f"Expected target column '{TARGET_COL}' not found in the dataset.")

# Every column except the target, plus the plain list of their names.
X = df.drop(TARGET_COL, axis=1)
all_features = X.columns.to_list()

In [None]:
# Small, accounting-led feature shortlist (kept compact for interpretability).
# Each record is (dataset column name, accounting bucket, rationale); keeping the
# three pieces side by side stops them drifting out of alignment.
# Column names are spelled exactly as in the dataset, leading space included.
shortlist_records = [
    (
        " ROA(A) before interest and % after tax",
        "Profitability",
        "After-tax profitability; persistent weakness is a common distress signal",
    ),
    (
        " Operating Profit Rate",
        "Profitability",
        "Operating performance signal (less influenced by capital structure)",
    ),
    (
        " Operating Gross Margin",
        "Profitability",
        "Margin strength and pricing power proxy",
    ),
    (
        " Working Capital/Equity",
        "Liquidity",
        "Liquidity buffer relative to equity",
    ),
    (
        " Current Liabilities/Liability",
        "Liquidity",
        "Short-term pressure within the liability structure",
    ),
    (
        " Total debt/Total net worth",
        "Leverage",
        "Debt burden relative to net worth",
    ),
    (
        " Interest-bearing debt interest rate",
        "Leverage",
        "Cost of debt; often rises as credit quality deteriorates",
    ),
    (
        " Cash flow rate",
        "Cash Flow",
        "Cash generation indicator",
    ),
    (
        " Operating Funds to Liability",
        "Cash Flow",
        "Operating funds available to cover obligations",
    ),
    (
        " Retained Earnings to Total Assets",
        "Earnings Quality",
        "Long-run capital accumulation / retained profitability",
    ),
]

# Unzip the records into three equally long lists, one per output column.
locked_features, shortlist_buckets, shortlist_reasons = map(list, zip(*shortlist_records))

# Every shortlisted name must match a dataset column exactly; report all misses at once.
missing = [name for name in locked_features if name not in all_features]
if missing:
    not_found_header = "Some selected features were not found in the dataset columns:\n"
    raise ValueError(not_found_header + "\n".join(missing))

feature_map = pd.DataFrame({
    "feature": locked_features,
    "accounting_bucket": shortlist_buckets,
    "why_it_matters": shortlist_reasons,
})

# Write the shortlist to disk without the index column.
feature_map.to_csv(SHORTLIST_PATH, index=False)

print("Saved feature shortlist to:", SHORTLIST_PATH)
print("\nFeatures included:")
# One "- <name>" line per shortlisted feature.
print(*(f"- {name}" for name in locked_features), sep="\n")