In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

In [None]:
# Build the dataset location from the working directory so no absolute,
# machine-specific path is needed: the project root is the parent of the
# current working directory, and the CSV sits in its "data" folder.
PROJECT_ROOT = Path.cwd().parent
DATA_PATH = PROJECT_ROOT.joinpath("data", "data.csv")

df = pd.read_csv(DATA_PATH)

# Summarise what was loaded: dimensions, leading column names, leading dtypes.
print("Dataset shape:", df.shape)
print(
    "\nFirst 15 columns:",
    df.columns[:15].tolist(),
    f"... (total: {df.shape[1]})",
)
print("\nDtypes (sample):\n", df.dtypes.head())

In [None]:
# Label column that is split off from the feature columns below.
TARGET_COL = "Bankrupt?"

# Stop immediately with a readable message if the label column is absent.
if TARGET_COL not in df.columns:
    raise ValueError(
        f"Expected target column '{TARGET_COL}' not found in the dataset."
    )

# X holds every column except the label; y holds the label column itself.
X = df.drop(columns=TARGET_COL)
y = df[TARGET_COL]

# Frequency of each label value (missing labels, if any, get their own row).
class_counts = y.value_counts(dropna=False)
print("\nTarget distribution:\n", class_counts)

# Mean of the label column; this is the share of positive rows provided the
# label is coded 0/1 -- NOTE(review): confirm the coding against the data.
bankruptcy_rate = float(y.mean())
print("Bankruptcy rate:", bankruptcy_rate)

In [None]:
# Data-quality check: number of missing entries per feature column,
# ordered from the most affected column to the least.
missing_counts = (
    X.isna()
    .sum()
    .sort_values(ascending=False)
)
print("\nMissing values (top 10):\n", missing_counts.iloc[:10])

In [None]:
# Keep only the numeric feature columns, then report whether any cell in
# them is +inf or -inf (reduced first per column, then across columns).
numeric_X = X.select_dtypes(include=np.number)
has_infinite = np.isinf(numeric_X).any().any()
print("\nAny infinite values?", bool(has_infinite))
