# Predicting Hospital Length of Stay Using Gradient Boosting Models

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, mean_squared_error


# Check and install necessary packages
# Import the three gradient-boosting libraries used below. If any one of the
# imports fails, all three packages are installed with pip and imported again.
try:
    import xgboost as xgb
    import lightgbm as lgb
    import catboost as cb
except ImportError:
    import sys
    print("Installing missing packages...")
    # IPython shell escape (not plain Python syntax): runs pip via the
    # interpreter at sys.executable.
    !{sys.executable} -m pip install xgboost lightgbm catboost
    # Second import attempt; these statements are outside the try block, so an
    # ImportError raised here is not caught.
    import xgboost as xgb
    import lightgbm as lgb
    import catboost as cb


# Location of the merged dataset. This is a machine-specific absolute path;
# point it at the actual file when running elsewhere.
merged_data_path = "/home/anna/Desktop/Master_thesis/output_data/merged_data"

# Read the merged table as CSV and remove the identifier, discharge and
# diagnosis columns so that they are not used as model inputs.
df = pd.read_csv(merged_data_path).drop(
    columns=[
        "patient_id",
        "case_id",
        "discharge_type",
        "diagnosis_code",
        "diagnosis_text",
        "diagnosis_category",
    ]
)

# Separate the regression target from the predictor columns.
target = "length_of_stay_days"
y = df[target]
X = df.drop(columns=[target])

# Partition the predictor columns by dtype: object columns are treated as
# categorical, float64/int64 columns as numerical.
# NOTE(review): a column with any other dtype (e.g. bool, category, int32)
# ends up in neither list - confirm the dataset contains no such columns.
cat_features = X.select_dtypes(include="object").columns.to_list()
num_features = X.select_dtypes(include=["float64", "int64"]).columns.to_list()

# Categorical branch: fill missing entries with the most frequent value, then
# one-hot encode (handle_unknown="ignore" is set for categories that were not
# present when the encoder was fitted).
cat_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Numerical branch: fill missing entries with the column median, then
# standardize.
num_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Send each list of columns through its own branch; the numerical branch is
# listed first, followed by the categorical branch.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_features),
        ("cat", cat_pipeline, cat_features),
    ]
)

# Hold out 20% of the rows as a test set; the fixed seed makes the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Each model below uses the same settings (100 estimators, learning rate 0.1,
# seed 42) and is wrapped in a pipeline that runs the preprocessing first.
# NOTE(review): the three pipelines hold the very same `preprocessor` object,
# so every fit() call appears to refit it on X_train - harmless here because
# the training data is identical, but worth confirming this is intended.

# XGBoost
xgb_model = xgb.XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
)
xgb_pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", xgb_model)])
xgb_pipeline.fit(X_train, y_train)

# LightGBM
lgb_model = lgb.LGBMRegressor(
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
)
lgb_pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", lgb_model)])
lgb_pipeline.fit(X_train, y_train)

# CatBoost (verbose=0 is passed to the regressor, unlike the other two models)
cb_model = cb.CatBoostRegressor(
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
    verbose=0,
)
cb_pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", cb_model)])
cb_pipeline.fit(X_train, y_train)

# Test-set evaluation helper
def evaluate(model_pipeline, name):
    """Print the test-set MAE and RMSE of a fitted pipeline.

    Predictions are made on the file-level ``X_test`` and compared with the
    file-level ``y_test``.

    Args:
        model_pipeline: Fitted object exposing ``predict``.
        name: Label printed in front of the metrics.

    Returns:
        None. The metrics are printed, rounded to two decimals.
    """
    predictions = model_pipeline.predict(X_test)
    squared_error = mean_squared_error(y_test, predictions)
    rmse = np.sqrt(squared_error)
    mae = mean_absolute_error(y_test, predictions)
    print(f"{name} - MAE: {mae:.2f}, RMSE: {rmse:.2f}")

# Print a header followed by the test-set metrics of each fitted pipeline,
# in the order the models were trained.
print("Model Performance:")
evaluate(model_pipeline=xgb_pipeline, name="XGBoost")
evaluate(model_pipeline=lgb_pipeline, name="LightGBM")
evaluate(model_pipeline=cb_pipeline, name="CatBoost")


Installing missing packages...
Collecting xgboost
  Downloading xgboost-3.0.0-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-manylinux_2_28_x86_64.whl.metadata (17 kB)
Collecting catboost
  Downloading catboost-1.2.7-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Collecting nvidia-nccl-cu12 (from xgboost)
  Downloading nvidia_nccl_cu12-2.26.2-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (2.0 kB)
Collecting graphviz (from catboost)
  Downloading graphviz-0.20.3-py3-none-any.whl.metadata (12 kB)
Downloading xgboost-3.0.0-py3-none-manylinux_2_28_x86_64.whl (253.9 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 253.9/253.9 MB 5.6 MB/s eta 0:00:00
Downloading lightgbm-4.6.0-py3-none-manylinux_2_28_x86_64.whl (3.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m00:01[