In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error
from scipy.stats import randint
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
import joblib

# **Pre-Processing**

In [None]:
# Location of the raw export. NOTE(review): this looks like a mounted Google
# Drive path, so the cell presumably only runs where that mount exists - confirm.
csv_path = '/content/drive/MyDrive/hospital_data_sampleee.csv'

# "utf-8-sig" decodes UTF-8 and discards a leading byte-order mark if present,
# so the first column header does not pick up a stray BOM character.
df = pd.read_csv(csv_path, encoding="utf-8-sig")
df.head()

Unnamed: 0,Date,Medication Revenue,Lab Cost,Consultation Revenue,Doctor Type,Financial Class,Patient Type,Entry Time,Post-Consultation Time,Completion Time,Patient ID
0,11/6/2019,$600.00,$-,$-,ANCHOR,MEDICARE,OUTPATIENT,9:38:34 AM,10:55:50 AM,10:58:02 AM,C10004
1,11/6/2019,$452.70,$-,$-,ANCHOR,MEDICARE,OUTPATIENT,1:46:52 PM,2:37:19 PM,2:51:14 PM,C10014
2,11/5/2019,$444.00,$-,$-,ANCHOR,HMO,OUTPATIENT,10:41:25 AM,11:25:26 AM,11:26:09 AM,C10015
3,11/8/2019,$378.00,$-,$-,ANCHOR,HMO,OUTPATIENT,12:58:50 PM,6:03:12 PM,6:10:58 PM,C10028
4,11/1/2019,$351.00,$10.00,$-,ANCHOR,MEDICARE,OUTPATIENT,10:13:21 AM,11:06:46 AM,11:09:28 AM,C10035


In [None]:
def _normalize_header(label):
    """Trim a column label, remove newlines, lower-case it, and turn spaces into underscores."""
    return label.strip().replace("\n", "").lower().replace(" ", "_")


# Hyphens are left untouched, so "Post-Consultation Time" becomes
# "post-consultation_time" (hyphen kept, space replaced).
df.columns = [_normalize_header(label) for label in df.columns]

In [None]:
# The time columns hold clock-only strings such as "9:38:34 AM". Parsed without
# an explicit format, pandas cannot infer one, falls back to per-element parsing,
# and anchors every value to the day the notebook is executed. Anything derived
# from the date part of entry_time (e.g. its weekday) then describes the run
# date rather than the visit. Parse with an explicit format and re-anchor each
# clock time to the visit date instead.
# Values that do not match CLOCK_FORMAT raise ValueError; missing values become NaT.
CLOCK_FORMAT = "%I:%M:%S %p"

df["date"] = pd.to_datetime(df["date"])
visit_midnight = df["date"].dt.normalize()

# (parsed column, raw column). The raw post-consultation header keeps its hyphen
# after header normalization, so the parsed copy is stored under an
# underscore-only name and the raw text column is left in place.
for parsed_col, raw_col in (
    ("entry_time", "entry_time"),
    ("post_consultation_time", "post-consultation_time"),
):
    clock = pd.to_datetime(df[raw_col], format=CLOCK_FORMAT)
    # Offset since midnight, independent of whatever date the parser attached.
    time_of_day = clock - clock.dt.normalize()
    df[parsed_col] = visit_midnight + time_of_day

  df["entry_time"] = pd.to_datetime(df["entry_time"])
  df["post_consultation_time"] = pd.to_datetime(df["post-consultation_time"])


In [None]:
# Waiting time in minutes: entry until the end of consultation.
elapsed = df["post_consultation_time"] - df["entry_time"]
df["waiting_time"] = elapsed.dt.total_seconds() / 60

# Keep only non-negative waits; rows whose wait is NaN (a missing timestamp)
# fail the comparison and are removed as well.
df = df[df["waiting_time"].ge(0)]

In [None]:
# Arrival-time features. Column creation order is kept (hour, dayofweek, minute)
# so the feature matrix layout does not change.
df["entry_hour"] = df["entry_time"].dt.hour
# The raw entry time is a clock reading with no calendar date, so the weekday
# must come from the visit date; taking it from a clock-only timestamp would
# yield the weekday of whatever date the parser filled in.
df["entry_dayofweek"] = df["date"].dt.dayofweek
df["entry_minute"] = df["entry_time"].dt.minute

In [None]:
# Calendar features from the visit date; each column is named after the
# datetime accessor attribute it is read from.
for part in ("year", "month", "dayofweek"):
    df[part] = getattr(df["date"].dt, part)

In [None]:
# Currency columns arrive as text such as "$600.00" or the placeholder "$-".
money_cols = ["medication_revenue", "lab_cost", "consultation_revenue"]

for col in money_cols:
    # Raw string: "\$" inside a normal string literal is an invalid escape
    # sequence and makes Python emit a warning when the cell is compiled.
    stripped = df[col].replace(r"[\$,]", "", regex=True).str.strip()
    # Whatever is still non-numeric (e.g. the "-" left over from "$-") is
    # coerced to NaN and then recorded as 0.0.
    df[col] = pd.to_numeric(stripped, errors="coerce").fillna(0.0)

  df[col] = pd.to_numeric(df[col].replace('[\$,]', '', regex=True).str.strip(), errors='coerce').fillna(0.0)


In [None]:
# Identifier and raw timestamp columns that are not passed on as model
# features; the engineered features are computed in the earlier cells.
columns_to_discard = [
    "patient_id",
    "post_consultation_time",
    "entry_time",
    "completion_time",
    "date",
]
df = df.drop(columns=columns_to_discard)

In [None]:
# Remove any remaining text-typed column whose name mentions "time".
# "waiting_time" also matches the name test but is numeric, so it survives.
leftover_time_cols = [
    name
    for name in df.columns
    if "time" in name and df[name].dtype == "object"
]

for col in leftover_time_cols:
    print("Dropping:", col)
    df = df.drop(columns=[col])

Dropping: post-consultation_time


In [None]:
# Target: waiting time in minutes. Features: every other remaining column.
target_name = "waiting_time"
y = df[target_name]
X = df.drop(columns=[target_name])

In [None]:
# Sanity check: list the dtype of every feature column before the columns are
# divided into categorical and numeric groups.
print(X.dtypes)

medication_revenue      float64
lab_cost                float64
consultation_revenue    float64
doctor_type              object
financial_class          object
patient_type             object
entry_hour                int32
entry_dayofweek           int32
entry_minute              int32
year                      int32
month                     int32
dayofweek                 int32
dtype: object


In [None]:
# Columns routed to the one-hot encoder. Casting to str gives the encoder
# uniform string categories (missing values become the text "nan").
categorical_cols = ["doctor_type", "financial_class", "patient_type"]
X[categorical_cols] = X[categorical_cols].astype(str)

In [None]:
# Everything that is not declared categorical is passed through as-is, in the
# original column order. Membership is decided by name only, not by dtype.
numeric_cols = []
for name in X.columns:
    if name in categorical_cols:
        continue
    numeric_cols.append(name)

In [None]:
# Train on log(1 + minutes). The evaluation cells undo this with np.expm1, so
# the reported errors are back in minutes.
y_log = np.log1p(y)

In [None]:
# Shared feature preprocessing: one-hot encode the categorical columns and pass
# the remaining columns through unchanged. Categories not seen during fit are
# ignored at predict time instead of raising.
one_hot = OneHotEncoder(handle_unknown="ignore")
column_steps = [
    ("cat", one_hot, categorical_cols),
    ("num", "passthrough", numeric_cols),
]
preprocess = ColumnTransformer(transformers=column_steps)

# **Random Forest**

In [None]:
# Fixed-configuration random forest pipeline (200 trees, unlimited depth).
# NOTE(review): `model` is not referenced by any later cell visible here; the
# tuned `pipeline` / `search` objects are what get fitted.
baseline_forest = RandomForestRegressor(
    n_estimators=200,
    max_depth=None,
    random_state=42,
)
model = Pipeline(
    steps=[
        ("preprocess", preprocess),
        ("rf", baseline_forest),
    ]
)

In [None]:
# Base estimator for the randomized search; its hyperparameters are supplied by
# the search. n_jobs=-1 asks for all available cores when building trees.
rf = RandomForestRegressor(random_state=42, n_jobs=-1)

In [None]:
# The step name "regressor" is what the "regressor__*" keys of the search space
# refer to.
rf_steps = [("preprocess", preprocess), ("regressor", rf)]
pipeline = Pipeline(steps=rf_steps)

In [None]:
# Hold out 20% of the rows for evaluation. The split is performed on the
# log-transformed target; the fixed seed makes it repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y_log, test_size=0.2, random_state=42)

In [None]:
# Random-forest search space. scipy's randint(low, high) draws integers from
# the half-open range [low, high).
param_dist = dict(
    regressor__n_estimators=randint(100, 500),
    regressor__max_depth=randint(5, 30),
    regressor__min_samples_split=randint(2, 10),
    regressor__min_samples_leaf=randint(1, 5),
    regressor__max_features=["sqrt", "log2"],
)

In [None]:
# 20 sampled configurations x 3-fold CV, ranked by (negated) mean absolute
# error. Scores are computed on the log-transformed target used for training.
rf_search_options = {
    "n_iter": 20,
    "cv": 3,
    "scoring": 'neg_mean_absolute_error',
    "verbose": 2,
    "random_state": 42,
}
search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_dist,
    **rf_search_options,
)

In [None]:
# Run the randomized search on the training split (20 candidates x 3 folds =
# 60 fits, each logged because verbose=2).
search.fit(X_train, y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] END regressor__max_depth=11, regressor__max_features=log2, regressor__min_samples_leaf=1, regressor__min_samples_split=8, regressor__n_estimators=206; total time=   2.1s
[CV] END regressor__max_depth=11, regressor__max_features=log2, regressor__min_samples_leaf=1, regressor__min_samples_split=8, regressor__n_estimators=206; total time=   2.1s
[CV] END regressor__max_depth=11, regressor__max_features=log2, regressor__min_samples_leaf=1, regressor__min_samples_split=8, regressor__n_estimators=206; total time=   2.0s
[CV] END regressor__max_depth=12, regressor__max_features=sqrt, regressor__min_samples_leaf=1, regressor__min_samples_split=8, regressor__n_estimators=221; total time=   2.6s
[CV] END regressor__max_depth=12, regressor__max_features=sqrt, regressor__min_samples_leaf=1, regressor__min_samples_split=8, regressor__n_estimators=221; total time=   3.8s
[CV] END regressor__max_depth=12, regressor__max_features=sqrt, 

In [None]:
# Predict on the held-out rows. The model outputs log1p(minutes), so invert
# with expm1 to get minutes.
y_pred_log = search.predict(X_test)
y_pred = np.expm1(y_pred_log)

In [None]:
# Score on the original minute scale: y_test holds log1p(minutes), so invert it
# once and reuse the result for every metric.
y_test_minutes = np.expm1(y_test)

mae_rft = mean_absolute_error(y_test_minutes, y_pred)
# mean_squared_error returns the MSE (minutes squared); take the square root so
# that the value reported as "RMSE ... minutes" really is a root-mean-squared
# error in minutes.
rmse_rft = np.sqrt(mean_squared_error(y_test_minutes, y_pred))
r2_rft = r2_score(y_test_minutes, y_pred)

In [None]:
# Random-forest hold-out metrics, one per line.
rf_report = [
    f"Mean Absolute Error after tuning: {mae_rft:.2f} minutes",
    f"RMSE: {rmse_rft:.2f} minutes",
    f"R² Score: {r2_rft:.4f}",
]
print("\n".join(rf_report))

Mean Absolute Error after tuning: 22.53 minutes
RMSE: 1849.22 minutes
R² Score: 0.0429


# **Gradient Boosting**

In [None]:
# Base gradient-boosting estimator; its hyperparameters are supplied by the
# randomized search.
gbr = GradientBoostingRegressor(random_state=42)

In [None]:
# Same preprocessing as the random-forest pipeline, with the gradient-boosting
# estimator under the step name "regressor".
gbr_steps = [("preprocess", preprocess), ("regressor", gbr)]
pipeline_gbr = Pipeline(steps=gbr_steps)

In [None]:
# Gradient-boosting search space. List entries are sampled uniformly;
# randint(low, high) draws integers from [low, high).
param_dist_gbr = dict(
    regressor__n_estimators=randint(100, 400),
    regressor__learning_rate=[0.01, 0.05, 0.1, 0.2],
    regressor__max_depth=randint(2, 6),
    regressor__min_samples_split=randint(2, 10),
    regressor__min_samples_leaf=randint(1, 5),
)

In [None]:
# Same search protocol as the random forest: 20 sampled configurations, 3-fold
# CV, ranked by (negated) mean absolute error on the log-scale target.
gbr_search_options = {
    "n_iter": 20,
    "cv": 3,
    "scoring": "neg_mean_absolute_error",
    "verbose": 2,
    "random_state": 42,
}
search_gbr = RandomizedSearchCV(
    pipeline_gbr,
    param_distributions=param_dist_gbr,
    **gbr_search_options,
)

In [None]:
# Re-create the 80/20 split. The arguments and seed match the split used for
# the random forest, so the same rows land in train and test.
X_train, X_test, y_train, y_test = train_test_split(X, y_log, test_size=0.2, random_state=42)

In [None]:
# Run the gradient-boosting search on the training split (20 candidates x 3
# folds = 60 fits).
search_gbr.fit(X_train, y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] END regressor__learning_rate=0.1, regressor__max_depth=5, regressor__min_samples_leaf=1, regressor__min_samples_split=8, regressor__n_estimators=206; total time=   6.7s
[CV] END regressor__learning_rate=0.1, regressor__max_depth=5, regressor__min_samples_leaf=1, regressor__min_samples_split=8, regressor__n_estimators=206; total time=   5.8s
[CV] END regressor__learning_rate=0.1, regressor__max_depth=5, regressor__min_samples_leaf=1, regressor__min_samples_split=8, regressor__n_estimators=206; total time=   5.9s
[CV] END regressor__learning_rate=0.2, regressor__max_depth=2, regressor__min_samples_leaf=1, regressor__min_samples_split=8, regressor__n_estimators=221; total time=   3.5s
[CV] END regressor__learning_rate=0.2, regressor__max_depth=2, regressor__min_samples_leaf=1, regressor__min_samples_split=8, regressor__n_estimators=221; total time=   2.7s
[CV] END regressor__learning_rate=0.2, regressor__max_depth=2, regres

In [None]:
# Predict on the held-out rows and convert from log1p(minutes) back to minutes.
y_pred_log_gbr = search_gbr.predict(X_test)
y_pred_gbr = np.expm1(y_pred_log_gbr)

In [None]:
# Score on the original minute scale: y_test holds log1p(minutes), so invert it
# once and reuse the result for every metric.
y_test_minutes = np.expm1(y_test)

mae_gbr = mean_absolute_error(y_test_minutes, y_pred_gbr)
# mean_squared_error returns the MSE (minutes squared); take the square root so
# that the value reported as "RMSE ... minutes" really is a root-mean-squared
# error in minutes.
rmse_gbr = np.sqrt(mean_squared_error(y_test_minutes, y_pred_gbr))
r2_gbr = r2_score(y_test_minutes, y_pred_gbr)

In [None]:
# Gradient-boosting hold-out metrics, one per line.
gbr_report = [
    f"MAE: {mae_gbr:.2f} minutes",
    f"RMSE: {rmse_gbr:.2f} minutes",
    f"R² Score: {r2_gbr:.4f}",
]
print("\n".join(gbr_report))

MAE: 22.35 minutes
RMSE: 1842.53 minutes
R² Score: 0.0463


# **XGBoost (Extreme Gradient Boosting)**

In [None]:
# Base XGBoost regressor with a squared-error objective; the remaining
# hyperparameters are supplied by the randomized search.
xgb_options = {
    "objective": "reg:squarederror",
    "random_state": 42,
    "n_jobs": -1,
}
xgb = XGBRegressor(**xgb_options)

In [None]:
# Same preprocessing as the other pipelines, with the XGBoost estimator under
# the step name "regressor".
xgb_steps = [("preprocess", preprocess), ("regressor", xgb)]
pipeline_xgb = Pipeline(steps=xgb_steps)

In [None]:
# XGBoost search space. List entries are sampled uniformly; randint(low, high)
# draws integers from [low, high).
param_dist_xgb = dict(
    regressor__n_estimators=randint(200, 800),
    regressor__learning_rate=[0.01, 0.05, 0.1, 0.2],
    regressor__max_depth=randint(3, 10),
    regressor__subsample=[0.7, 0.8, 1.0],
    regressor__colsample_bytree=[0.6, 0.8, 1.0],
)

In [None]:
# Same search protocol as the other models: 20 sampled configurations, 3-fold
# CV, ranked by (negated) mean absolute error on the log-scale target.
xgb_search_options = {
    "n_iter": 20,
    "cv": 3,
    "scoring": "neg_mean_absolute_error",
    "random_state": 42,
    "verbose": 2,
}
search_xgb = RandomizedSearchCV(
    pipeline_xgb,
    param_distributions=param_dist_xgb,
    **xgb_search_options,
)

In [None]:
# Re-create the 80/20 split. The arguments and seed match the earlier splits,
# so the same rows land in train and test.
X_train, X_test, y_train, y_test = train_test_split(X, y_log, test_size=0.2, random_state=42)

In [None]:
# Run the XGBoost search on the training split (20 candidates x 3 folds = 60
# fits).
search_xgb.fit(X_train, y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] END regressor__colsample_bytree=1.0, regressor__learning_rate=0.2, regressor__max_depth=7, regressor__n_estimators=470, regressor__subsample=1.0; total time=   2.0s
[CV] END regressor__colsample_bytree=1.0, regressor__learning_rate=0.2, regressor__max_depth=7, regressor__n_estimators=470, regressor__subsample=1.0; total time=   1.6s
[CV] END regressor__colsample_bytree=1.0, regressor__learning_rate=0.2, regressor__max_depth=7, regressor__n_estimators=470, regressor__subsample=1.0; total time=   1.4s
[CV] END regressor__colsample_bytree=0.6, regressor__learning_rate=0.01, regressor__max_depth=9, regressor__n_estimators=321, regressor__subsample=1.0; total time=   1.8s
[CV] END regressor__colsample_bytree=0.6, regressor__learning_rate=0.01, regressor__max_depth=9, regressor__n_estimators=321, regressor__subsample=1.0; total time=   1.9s
[CV] END regressor__colsample_bytree=0.6, regressor__learning_rate=0.01, regressor__max

In [None]:
# Predict on the held-out rows and convert from log1p(minutes) back to minutes.
y_pred_log_xgb = search_xgb.predict(X_test)
y_pred_xgb = np.expm1(y_pred_log_xgb)

In [None]:
# Score on the original minute scale: y_test holds log1p(minutes), so invert it
# once and reuse the result for every metric.
y_test_minutes = np.expm1(y_test)

mae_xgb = mean_absolute_error(y_test_minutes, y_pred_xgb)
# mean_squared_error returns the MSE (minutes squared); take the square root so
# that the value reported as "RMSE ... minutes" really is a root-mean-squared
# error in minutes.
rmse_xgb = np.sqrt(mean_squared_error(y_test_minutes, y_pred_xgb))
r2_xgb = r2_score(y_test_minutes, y_pred_xgb)

In [None]:
# XGBoost hold-out metrics, one per line.
xgb_report = [
    f"MAE: {mae_xgb:.2f} minutes",
    f"RMSE: {rmse_xgb:.2f} minutes",
    f"R² Score: {r2_xgb:.4f}",
]
print("\n".join(xgb_report))

MAE: 22.35 minutes
RMSE: 1836.65 minutes
R² Score: 0.0494


# **Artifacts**

In [None]:
# Take the best pipeline found by the XGBoost search (preprocessing + tuned
# regressor). Its predictions are on the log1p(minutes) scale, so consumers
# must apply np.expm1 to get minutes.
best_model = search_xgb.best_estimator_

In [None]:
# Persist the selected pipeline to the current working directory. The call
# returns the list of file names written, which the notebook displays.
joblib.dump(best_model, "hospital_waiting_model.pkl")

['hospital_waiting_model.pkl']