In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

In [None]:
# Input locations.
# The project root is taken to be the parent of the current working directory,
# i.e. the notebook is expected to be launched from a first-level sub-folder.
PROJECT_ROOT = Path.cwd().parent
DATA_PATH = PROJECT_ROOT.joinpath("data", "data.csv")
SHORTLIST_PATH = PROJECT_ROOT.joinpath("data", "feature_shortlist.csv")

# Read the full dataset first, then the feature shortlist.
df, feature_map = (pd.read_csv(csv_path) for csv_path in (DATA_PATH, SHORTLIST_PATH))

In [None]:
# Target column used across the project
TARGET_COL = "Bankrupt?"
if TARGET_COL not in df.columns:
    raise ValueError(f"Expected target column '{TARGET_COL}' not found in the dataset.")

# The shortlist file must list its feature names in a "feature" column; fail
# with the same kind of explicit error as above instead of a bare KeyError.
if "feature" not in feature_map.columns:
    raise ValueError("Expected column 'feature' not found in the feature shortlist.")

# Keep each shortlisted name once (first occurrence wins) and never include the
# target itself: a repeated label would give df_small duplicate columns, and
# df_small[TARGET_COL] / df_small[col] would then return a DataFrame instead of
# a Series, breaking the boolean masks used further down.
features = [
    name
    for name in feature_map["feature"].drop_duplicates().tolist()
    if name != TARGET_COL
]

# Confirm the shortlist matches the dataset columns
missing = [f for f in features if f not in df.columns]
if missing:
    # str() each entry so a non-string value (e.g. NaN from a blank CSV row)
    # is reported as missing rather than crashing the join with a TypeError.
    raise ValueError(
        "Some shortlist features are missing from the dataset:\n"
        + "\n".join(str(f) for f in missing)
    )

# Work on an independent copy holding only the target and the shortlisted features.
df_small = df[[TARGET_COL] + features].copy()

print("Subset shape:", df_small.shape)
print("\nTarget distribution:\n", df_small[TARGET_COL].value_counts())


In [None]:
# Correlation scan (directional signal only)
# Correlation of every numeric column with the target, sorted ascending.
# Two kinds of entries are removed before ranking:
#   * the target's correlation with itself (normally 1.0), which would
#     otherwise always occupy a slot in the "risk direction" list;
#   * NaN correlations (e.g. from constant columns), which sort_values()
#     places last and which would otherwise crowd real signals out of tail().
corrs = (
    df_small.corr(numeric_only=True)[TARGET_COL]
    .drop(labels=TARGET_COL)
    .dropna()
    .sort_values()
)
print("\nMost negative correlations (protective direction):\n", corrs.head(8))
print("\nMost positive correlations (risk direction):\n", corrs.tail(8))

In [None]:
# Quick class-wise distributions for a few key ratios
# One figure per feature, with the two target classes overlaid on the same axes.
for col in features[:3]:
    fig, ax = plt.subplots()
    values = df_small[col]

    hist_kwargs = {"bins": 40, "alpha": 0.7, "ax": ax}
    # Bin both classes over the same value range so their bin edges coincide.
    # Letting each class pick its own min/max yields different bin widths per
    # class, and the overlaid bar heights are then not comparable.
    # Skipped when the column has no non-null values (nothing to span).
    if values.notna().any():
        hist_kwargs["range"] = (values.min(), values.max())

    # Label each series explicitly so the legend does not depend on draw order.
    for class_value, class_label in ((0, "Non-bankrupt"), (1, "Bankrupt")):
        values[df_small[TARGET_COL] == class_value].hist(label=class_label, **hist_kwargs)

    ax.set_title(f"Distribution by class: {col}")
    ax.set_xlabel(col)
    ax.set_ylabel("Count")
    ax.legend()
    plt.show()
    # Release the figure so repeated runs of the loop do not accumulate open figures.
    plt.close(fig)
