# Predicting Hospital Length of Stay Using Gradient Boosting Models

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, mean_squared_error


# Import the three gradient-boosting libraries; if any one of them cannot be
# imported, install all three with pip and then retry the imports.
try:
    import xgboost as xgb
    import lightgbm as lgb
    import catboost as cb
except ImportError:
    # A single failing import above lands here, so the install request always
    # names all three packages (versions are not pinned).
    import sys
    print("Installing missing packages...")
    # IPython shell escape: {sys.executable} is interpolated so that pip runs
    # under the same interpreter as this kernel.
    !{sys.executable} -m pip install xgboost lightgbm catboost
    # Second attempt now that the install command has run.
    import xgboost as xgb
    import lightgbm as lgb
    import catboost as cb


# Where the merged dataset lives.
# NOTE(review): absolute, machine-specific location -- adjust before running elsewhere.
merged_data_path = "/home/anna/Desktop/Master_thesis/output_data/merged_data"

# Read the CSV and, in the same step, discard the columns that are not used as
# predictors (record identifiers plus the discharge/diagnosis fields).
df = pd.read_csv(merged_data_path).drop(
    columns=[
        "patient_id",
        "case_id",
        "discharge_type",
        "diagnosis_code",
        "diagnosis_text",
        "diagnosis_category",
    ]
)

# Regression target vs. predictors
target = "length_of_stay_days"
y = df[target]
X = df.drop(columns=[target])

# Group predictor columns by dtype (64-bit numeric vs. object/string)
cat_features = list(X.select_dtypes(include=["object"]).columns)
num_features = list(X.select_dtypes(include=["float64", "int64"]).columns)

# Numeric columns: fill gaps with the column median, then standardise
num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode (handle_unknown="ignore" so unseen categories do not raise)
cat_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group through its own sub-pipeline
preprocessor = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_features),
    ("cat", cat_pipeline, cat_features),
])

# Split data: 80/20 hold-out with a fixed seed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Pipeline.fit() fits its steps in place. Passing the very same `preprocessor`
# object to all three pipelines would therefore make them share one fitted
# transformer, so re-fitting any one pipeline would silently change the others.
# Each pipeline gets its own unfitted copy; `preprocessor` itself remains an
# unfitted template.
from sklearn.base import clone

# Train XGBoost
xgb_model = xgb.XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
xgb_pipeline = Pipeline([
    ("preprocessor", clone(preprocessor)),
    ("model", xgb_model)
])
xgb_pipeline.fit(X_train, y_train)

# Train LightGBM (same number of trees and learning rate as XGBoost)
lgb_model = lgb.LGBMRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
lgb_pipeline = Pipeline([
    ("preprocessor", clone(preprocessor)),
    ("model", lgb_model)
])
lgb_pipeline.fit(X_train, y_train)

# Train CatBoost (same settings; verbose=0 silences per-iteration logging)
cb_model = cb.CatBoostRegressor(n_estimators=100, learning_rate=0.1, random_state=42, verbose=0)
cb_pipeline = Pipeline([
    ("preprocessor", clone(preprocessor)),
    ("model", cb_model)
])
cb_pipeline.fit(X_train, y_train)

# Evaluate models
def evaluate(model_pipeline, name, X=None, y=None):
    """Print the MAE and RMSE of a fitted pipeline on a hold-out set.

    Parameters
    ----------
    model_pipeline
        Fitted estimator/pipeline exposing ``predict``.
    name : str
        Label printed in front of the metrics.
    X, y : optional
        Features and true target values to score against. When omitted, the
        notebook-level ``X_test`` / ``y_test`` split is used, which matches
        the previous two-argument behaviour.

    Returns
    -------
    None
        The metrics are only printed, rounded to two decimals.
    """
    # Fall back to the global hold-out split so existing calls keep working.
    if X is None:
        X = X_test
    if y is None:
        y = y_test

    y_pred = model_pipeline.predict(X)
    mae = mean_absolute_error(y, y_pred)
    # mean_squared_error returns the MSE; take the root to get RMSE.
    rmse = np.sqrt(mean_squared_error(y, y_pred))
    print(f"{name} - MAE: {mae:.2f}, RMSE: {rmse:.2f}")

print("Model Performance:")
# Score the fitted pipelines in a fixed order: XGBoost, LightGBM, CatBoost
for fitted_pipeline, label in (
    (xgb_pipeline, "XGBoost"),
    (lgb_pipeline, "LightGBM"),
    (cb_pipeline, "CatBoost"),
):
    evaluate(fitted_pipeline, label)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.039077 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9275
[LightGBM] [Info] Number of data points in the train set: 215216, number of used features: 52
[LightGBM] [Info] Start training from score 5.950352
Model Performance:
XGBoost - MAE: 3.96, RMSE: 7.30
LightGBM - MAE: 3.95, RMSE: 7.30
CatBoost - MAE: 4.03, RMSE: 7.37


## Apply log transformation to the target variable to reduce the effect of outliers

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, mean_squared_error
import xgboost as xgb
import lightgbm as lgb
import catboost as cb

# Where the merged dataset lives.
# NOTE(review): absolute, machine-specific location -- adjust before running elsewhere.
merged_data_path = "/home/anna/Desktop/Master_thesis/output_data/merged_data"

# Read the CSV and, in the same step, discard the columns that are not used as
# predictors (record identifiers plus the discharge/diagnosis fields).
df = pd.read_csv(merged_data_path).drop(
    columns=[
        "patient_id",
        "case_id",
        "discharge_type",
        "diagnosis_code",
        "diagnosis_text",
        "diagnosis_category",
    ]
)

# Regression target vs. predictors; the target is modelled as log(1 + days)
target = "length_of_stay_days"
y = np.log1p(df[target])
X = df.drop(columns=[target])

# Group predictor columns by dtype (64-bit numeric vs. object/string)
cat_features = list(X.select_dtypes(include=["object"]).columns)
num_features = list(X.select_dtypes(include=["float64", "int64"]).columns)

# Numeric columns: fill gaps with the column median, then standardise
num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode (handle_unknown="ignore" so unseen categories do not raise)
cat_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group through its own sub-pipeline
preprocessor = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_features),
    ("cat", cat_pipeline, cat_features),
])

# Split data: 80/20 hold-out with a fixed seed so the split is reproducible
# (y is on the log1p scale here)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Pipeline.fit() fits its steps in place. Passing the very same `preprocessor`
# object to all three pipelines would therefore make them share one fitted
# transformer, so re-fitting any one pipeline would silently change the others.
# Each pipeline gets its own unfitted copy; `preprocessor` itself remains an
# unfitted template.
from sklearn.base import clone

# Train XGBoost
xgb_model = xgb.XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
xgb_pipeline = Pipeline([
    ("preprocessor", clone(preprocessor)),
    ("model", xgb_model)
])
xgb_pipeline.fit(X_train, y_train)

# Train LightGBM (same number of trees and learning rate as XGBoost)
lgb_model = lgb.LGBMRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
lgb_pipeline = Pipeline([
    ("preprocessor", clone(preprocessor)),
    ("model", lgb_model)
])
lgb_pipeline.fit(X_train, y_train)

# Train CatBoost (same settings; verbose=0 silences per-iteration logging)
cb_model = cb.CatBoostRegressor(n_estimators=100, learning_rate=0.1, random_state=42, verbose=0)
cb_pipeline = Pipeline([
    ("preprocessor", clone(preprocessor)),
    ("model", cb_model)
])
cb_pipeline.fit(X_train, y_train)

# Evaluate models
def evaluate(model_pipeline, name, X=None, y=None):
    """Print MAE and RMSE, on the original scale, for a log1p-target model.

    Both the predictions and the true values are mapped back with ``expm1``
    (the inverse of ``log1p``) before the errors are computed, so the printed
    metrics are in the units of the untransformed target.

    Parameters
    ----------
    model_pipeline
        Fitted estimator/pipeline exposing ``predict`` and trained on a
        log1p-transformed target.
    name : str
        Label printed in front of the metrics.
    X, y : optional
        Features and log1p-scale true targets to score against. When omitted,
        the notebook-level ``X_test`` / ``y_test`` split is used, which
        matches the previous two-argument behaviour.

    Returns
    -------
    None
        The metrics are only printed, rounded to two decimals.
    """
    # Fall back to the global hold-out split so existing calls keep working.
    if X is None:
        X = X_test
    if y is None:
        y = y_test

    y_pred = np.expm1(model_pipeline.predict(X))  # Reverse log transformation
    y_true = np.expm1(y)  # Reverse log transformation
    mae = mean_absolute_error(y_true, y_pred)
    # mean_squared_error returns the MSE; take the root to get RMSE.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    print(f"{name} - MAE: {mae:.2f}, RMSE: {rmse:.2f}")

print("Model Performance:")
# Score the fitted pipelines in a fixed order: XGBoost, LightGBM, CatBoost
for fitted_pipeline, label in (
    (xgb_pipeline, "XGBoost"),
    (lgb_pipeline, "LightGBM"),
    (cb_pipeline, "CatBoost"),
):
    evaluate(fitted_pipeline, label)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014356 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 9275
[LightGBM] [Info] Number of data points in the train set: 215216, number of used features: 52
[LightGBM] [Info] Start training from score 1.612096
Model Performance:
XGBoost - MAE: 3.62, RMSE: 7.55
LightGBM - MAE: 3.60, RMSE: 7.54
CatBoost - MAE: 3.67, RMSE: 7.63
