In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    roc_auc_score,
    classification_report,
    confusion_matrix,
    RocCurveDisplay
)

import matplotlib.pyplot as plt

In [None]:
# Input locations. The project root is taken to be the parent of the current
# working directory, and both CSV files are read from its "data" folder.
PROJECT_ROOT = Path.cwd().parent
DATA_PATH = PROJECT_ROOT.joinpath("data", "data.csv")
SHORTLIST_PATH = PROJECT_ROOT.joinpath("data", "feature_shortlist.csv")

# Load the full dataset first, then the table listing the shortlisted features.
df = pd.read_csv(DATA_PATH)
feature_map = pd.read_csv(SHORTLIST_PATH)

In [None]:
# Target column used across the project
TARGET_COL = "Bankrupt?"
if TARGET_COL not in df.columns:
    raise ValueError(f"Expected target column '{TARGET_COL}' not found in the dataset.")

# Feature names come from the "feature" column of the shortlist. Repeated names
# are dropped (first occurrence wins, order preserved) so that no column is
# selected twice when building X.
features = list(dict.fromkeys(feature_map["feature"].tolist()))

# Confirm shortlist features exist (names must match exactly)
missing = [f for f in features if f not in df.columns]
if missing:
    # str() keeps the message buildable even if the shortlist holds a
    # non-string entry (for example NaN coming from a blank row).
    raise ValueError(
        "Some shortlist features are missing from the dataset:\n"
        + "\n".join(map(str, missing))
    )

# The target must never be one of the predictors: that would put the label
# itself into X and make every downstream metric meaningless.
if TARGET_COL in features:
    raise ValueError(
        f"Target column '{TARGET_COL}' must not appear in the feature shortlist."
    )

# Work on copies so later cleanup does not modify the loaded frame.
X = df[features].copy()
y = df[TARGET_COL].copy()

In [None]:
# Defensive cleanup: fill missing values, refuse to continue on infinities.
#
# Missing entries are replaced by the median of their (numeric) column; nothing
# happens when the frame has no missing values.
# NOTE(review): the medians are computed from all rows, and the train/test
# split happens afterwards, so test rows influence the imputed values.
# Consider imputing inside the model pipeline instead.
if X.isna().to_numpy().any():
    column_medians = X.median(numeric_only=True)
    X = X.fillna(column_medians)

# Infinite values are not repaired automatically; stop so they can be inspected.
numeric_part = X.select_dtypes(include=[np.number])
inf_by_column = np.isinf(numeric_part).any()
if inf_by_column.any():
    raise ValueError("Infinite values found in features. Please inspect the dataset.")


In [None]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both parts, which matters because the classes are
# imbalanced; the fixed seed makes the split repeatable.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# Baseline: standardize the features, then fit a class-weighted logistic
# regression. Keeping the scaler inside the pipeline means it is fitted on the
# training rows only.
classifier = LogisticRegression(
    solver="lbfgs",
    class_weight="balanced",
    max_iter=2000,
)
model = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("logreg", classifier),
    ]
)

model.fit(X_train, y_train)

In [None]:
# Evaluation on the held-out test set.
# Column 1 of predict_proba is used as the positive-class score.
# NOTE(review): this assumes the positive class is the second entry of
# model.classes_ and that labels are 0/1 (needed for the 0.5 threshold below
# to produce comparable labels) — confirm against the target encoding.
proba_test = model.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, proba_test)
pred_test = (proba_test >= 0.5).astype(int)

cm = confusion_matrix(y_test, pred_test)
report = classification_report(y_test, pred_test, digits=4)

print("Test ROC-AUC:", round(auc, 4))
print("\nConfusion matrix:\n", cm)
print("\nClassification report:\n", report)

# Create the axes explicitly and hand them to the display. Without `ax`,
# RocCurveDisplay opens its own figure, so a preceding bare plt.figure() would
# be left empty and rendered as a stray blank figure.
fig, ax = plt.subplots()
RocCurveDisplay.from_predictions(y_test, proba_test, ax=ax)
ax.set_title("ROC curve (test set)")
plt.show()

In [None]:
# Inspect the fitted coefficients. The model is trained on standardized
# features, so the sign of a coefficient gives the direction of its effect on
# that standardized scale.
logreg = model.named_steps["logreg"]
coefs = pd.Series(logreg.coef_[0], index=features).sort_values()

# Split by sign, then list the eight most extreme values on each side.
negative_coefs = coefs.loc[coefs < 0]
positive_coefs = coefs.loc[coefs > 0]

print("\nStrongest negative coefficients (risk-reducing):")
print(negative_coefs.sort_values().iloc[:8])

print("\nStrongest positive coefficients (risk-increasing):")
print(positive_coefs.sort_values(ascending=False).iloc[:8])