## Notebook Objectives

1. Prepare data for charge prediction
2. Engineer length-of-stay (LOS) and demographic features
3. Train multiple regression models
4. Evaluate predictive performance
5. Identify key charge drivers

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
# Path to the processed discharge CSV, relative to the notebook's working directory.
DATA_PATH = Path("../data/processed/hospital_inpatient_discharges_cleaned.csv")

# Read the full file into a DataFrame using pandas' default parsing options.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)

In [None]:
# Preview the first five rows to sanity-check the columns that were loaded.
df.head(n=5)

In [None]:
# Model charges on the log(1 + x) scale; the new column is the regression target.
target = "log_total_charges"
df[target] = np.log1p(df["total_charges"])

In [None]:
# Columns the model would like to use as predictors.
requested_features = [
    "length_of_stay",
    "age",
    "gender",
    "hospital_name",
]

# Keep only the columns that are actually present, but report the ones that
# were dropped so a renamed or missing column does not silently change the model.
missing_features = [f for f in requested_features if f not in df.columns]
if missing_features:
    print(f"Skipping features not found in the data: {missing_features}")

features = [f for f in requested_features if f in df.columns]
if not features:
    # Fail here with a clear message instead of a cryptic error during fitting.
    raise ValueError("None of the requested feature columns are present in df.")

features

In [None]:
# Design matrix and target vector.
X, y = df[features], df[target]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Route every feature to exactly one transformer. ColumnTransformer drops
# columns it is not given (remainder="drop" by default), so matching on a fixed
# list of dtype names would silently discard e.g. int32, bool, category or
# string-dtype columns from the model.
numeric_features = X.select_dtypes(include="number").columns.tolist()

# Everything that is not numeric is treated as categorical and one-hot encoded.
categorical_features = [col for col in X.columns if col not in numeric_features]

In [None]:
# Numeric columns are standardised.
numeric_transformer = Pipeline([("scaler", StandardScaler())])

# Categorical columns are one-hot encoded; handle_unknown="ignore" keeps
# prediction from failing on categories that were not seen during fit.
categorical_transformer = Pipeline(
    [("onehot", OneHotEncoder(handle_unknown="ignore"))]
)

# Apply each transformer to its own column group and concatenate the results.
column_steps = [
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_steps)

In [None]:
# Ordinary least squares baseline on the log-charge target.
# clone() gives this pipeline its own unfitted copy of the preprocessor:
# Pipeline.fit fits its steps in place, so sharing one ColumnTransformer
# instance would let any later fit overwrite this model's fitted encoder.
lr_model = Pipeline(steps=[
    ("preprocessor", clone(preprocessor)),
    ("model", LinearRegression())
])

lr_model.fit(X_train, y_train)

In [None]:
# L2-regularised linear model (alpha=1.0) on the log-charge target.
# clone() gives this pipeline its own unfitted copy of the preprocessor:
# Pipeline.fit fits its steps in place, so sharing one ColumnTransformer
# instance would let any later fit overwrite this model's fitted encoder.
ridge_model = Pipeline(steps=[
    ("preprocessor", clone(preprocessor)),
    ("model", Ridge(alpha=1.0))
])

ridge_model.fit(X_train, y_train)

In [None]:
# Random forest with 100 trees, seeded for reproducibility and trained on all cores.
# clone() gives this pipeline its own unfitted copy of the preprocessor:
# Pipeline.fit fits its steps in place, so sharing one ColumnTransformer
# instance would let any later fit overwrite this model's fitted encoder
# (which is also what the feature-importance names are read from).
rf_model = Pipeline(steps=[
    ("preprocessor", clone(preprocessor)),
    ("model", RandomForestRegressor(
        n_estimators=100,
        random_state=42,
        n_jobs=-1
    ))
])

rf_model.fit(X_train, y_train)

In [None]:
# Gradient-boosted trees with library-default hyperparameters, seeded for reproducibility.
# clone() gives this pipeline its own unfitted copy of the preprocessor:
# Pipeline.fit fits its steps in place, so sharing one ColumnTransformer
# instance would let any later fit overwrite this model's fitted encoder.
gb_model = Pipeline(steps=[
    ("preprocessor", clone(preprocessor)),
    ("model", GradientBoostingRegressor(random_state=42))
])

gb_model.fit(X_train, y_train)

In [None]:
def evaluate_model(model, X_test, y_test):
    """Score a fitted model on held-out data.

    Args:
        model: Fitted estimator (or pipeline) exposing ``predict``.
        X_test: Feature rows to predict on.
        y_test: True target values for those rows.

    Returns:
        dict with keys ``"MAE"``, ``"RMSE"`` and ``"R2"``, computed on
        whatever scale ``y_test`` is expressed in.
    """
    y_hat = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_hat)
    # RMSE is the square root of the mean squared error.
    rmse = np.sqrt(mean_squared_error(y_test, y_hat))
    r2 = r2_score(y_test, y_hat)

    return {"MAE": mae, "RMSE": rmse, "R2": r2}

In [None]:
# Display label -> fitted pipeline, in the order the rows should appear.
fitted_models = {
    "Linear Regression": lr_model,
    "Ridge Regression": ridge_model,
    "Random Forest": rf_model,
    "Gradient Boosting": gb_model,
}

# Score every model on the same held-out split (metrics are on the log-charge scale).
results = {
    label: evaluate_model(estimator, X_test, y_test)
    for label, estimator in fitted_models.items()
}

# One row per model, one column per metric.
pd.DataFrame(results).T

In [None]:
# Pull the fitted forest and its preprocessor out of the pipeline.
fitted_preprocessor = rf_model.named_steps["preprocessor"]
rf = rf_model.named_steps["model"]

# Names of the transformed columns, in the order the forest saw them.
feature_names = fitted_preprocessor.get_feature_names_out()
importances = rf.feature_importances_

# Pair each transformed feature with its importance, largest first.
feature_importance = pd.DataFrame(
    {"feature": feature_names, "importance": importances}
).sort_values("importance", ascending=False)

feature_importance.head(10)

In [None]:
# Random-forest predictions on the log1p scale.
pred_log = rf_model.predict(X_test)

# Invert log1p with expm1 to compare on the original charge scale.
pred_actual, actual = np.expm1(pred_log), np.expm1(y_test)

comparison = pd.DataFrame(
    {"actual_charges": actual, "predicted_charges": pred_actual}
)
comparison.head()

## Key Findings

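* Length of stay (LOS) is the **dominant driver** of total charges
* Tree-based models outperform linear models on the held-out test set (metrics are computed on log-transformed charges)
* Log-transforming total charges (`log1p`) significantly improves model stability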
* Hospital-level effects capture pricing and case-mix differences
* Models support budgeting and financial planning