In [11]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score

df = pd.read_csv("../data/hospital_length_of_stay_clean.csv")
df.head()


Unnamed: 0,age,sex,admitting_diagnosis,number_of_comorbidities,heart_rate_on_admission,systolic_bp,lab_abnormality_count,length_of_stay,heart_rate_on_admission_missing,systolic_bp_missing,lab_abnormality_count_missing
0,93,M,respiratory,0,97,146,2,9.555334,0,0,0
1,59,M,gi,4,124,141,1,10.582524,0,0,0
2,67,M,cardiac,1,97,123,1,19.227728,0,0,1
3,73,F,cardiac,3,73,143,2,17.072762,0,0,0
4,53,F,cardiac,3,79,136,4,13.766479,0,0,0


In [12]:
df["log_length_of_stay"] = np.log1p(df["length_of_stay"])

numeric_features = [
    "age",
    "number_of_comorbidities",
    "heart_rate_on_admission",
    "systolic_bp",
    "lab_abnormality_count",
    "heart_rate_on_admission_missing",
    "systolic_bp_missing",
    "lab_abnormality_count_missing",
]

categorical_features = [
    "sex",
    "admitting_diagnosis",
]

TARGET = "log_length_of_stay"

X = pd.get_dummies(df[numeric_features + categorical_features], columns=categorical_features, drop_first=True)
X.head()
y = df[TARGET]

In [13]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42
)

lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred_lr = lr.predict(X_test)

mae = mean_absolute_error(y_test, y_pred_lr)
rmse = root_mean_squared_error(y_test, y_pred_lr)
r2 = r2_score(y_test, y_pred_lr)

print("Linear Regression (log LOS)")
print("MAE:", mae)
print("RMSE:", rmse)
print("R²:", r2)


Linear Regression (log LOS)
MAE: 0.16319072164298404
RMSE: 0.23709152204743084
R²: 0.3277539782489711


In [None]:
y_test_days = np.expm1(y_test)
y_pred_days = np.expm1(y_pred_lr)

mae_days = mean_absolute_error(y_test_days, y_pred_days)
print("MAE (days):", mae_days)

coef = pd.Series(lr.coef_, index=X.columns).sort_values()
coef

MAE (days): 2.2160128991544306


admitting_diagnosis_minor         -0.137000
admitting_diagnosis_ortho         -0.088965
admitting_diagnosis_gi            -0.035013
systolic_bp                       -0.002030
sex_M                              0.001584
heart_rate_on_admission            0.002619
heart_rate_on_admission_missing    0.003863
age                                0.003967
admitting_diagnosis_respiratory    0.006816
systolic_bp_missing                0.006998
lab_abnormality_count_missing      0.024525
lab_abnormality_count              0.069409
number_of_comorbidities            0.071487
dtype: float64

In [16]:
coef = pd.Series(lr.coef_, index=X.columns)

coef_numeric = coef[coef.index.isin(numeric_features)]
coef_missing = coef[coef.index.str.contains("_missing")]
coef_diagnosis = coef[coef.index.str.contains("admitting_diagnosis")]
coef_sex = coef[coef.index.str.contains("sex_")]

print("Numeric features:")
print(coef_numeric.sort_values(), "\n")

print("Missing flags:")
print(coef_missing.sort_values(), "\n")

print("Diagnosis dummies:")
print(coef_diagnosis.sort_values(), "\n")

print("Sex:")
print(coef_sex.sort_values())


Numeric features:
systolic_bp                       -0.002030
heart_rate_on_admission            0.002619
heart_rate_on_admission_missing    0.003863
age                                0.003967
systolic_bp_missing                0.006998
lab_abnormality_count_missing      0.024525
lab_abnormality_count              0.069409
number_of_comorbidities            0.071487
dtype: float64 

Missing flags:
heart_rate_on_admission_missing    0.003863
systolic_bp_missing                0.006998
lab_abnormality_count_missing      0.024525
dtype: float64 

Diagnosis dummies:
admitting_diagnosis_minor         -0.137000
admitting_diagnosis_ortho         -0.088965
admitting_diagnosis_gi            -0.035013
admitting_diagnosis_respiratory    0.006816
dtype: float64 

Sex:
sex_M    0.001584
dtype: float64
