

**Importing Necessary Libraries**


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

**Loading the Data**

In [None]:
# Read the raw ER wait-time records from the CSV file (path is relative to the
# notebook's working directory).
data_path = 'ER Wait Time Dataset.csv'
df = pd.read_csv(data_path)

**Initial Data Inspection**

In [None]:
# Structural overview of the loaded frame: dtypes and non-null counts,
# a preview of the first rows, and the full list of column labels.
print("---Data Info ---")
df.info()

first_rows = df.head()
print("\n--- First 5 Rows ---")
print(first_rows)

column_names = list(df.columns)
print("\n--- Column Names ---")
print(column_names)

**Data Cleaning and Preparation**

In [None]:

# Parse the visit date so the `.dt` accessor can be used on it afterwards.
df['Visit Date'] = pd.to_datetime(df['Visit Date'])

# Impute missing values with the column median.
# The result is assigned back to the column instead of calling
# `df[col].fillna(..., inplace=True)`: that chained in-place form raises a
# FutureWarning in pandas 2.x and no longer modifies `df` once Copy-on-Write
# is enabled (the default in pandas 3.0), which would silently leave the NaNs.
median_wait_time = df['Total Wait Time (min)'].median()
df['Total Wait Time (min)'] = df['Total Wait Time (min)'].fillna(median_wait_time)

median_ratio = df['Nurse-to-Patient Ratio'].median()
df['Nurse-to-Patient Ratio'] = df['Nurse-to-Patient Ratio'].fillna(median_ratio)

# NOTE(review): 'Total Wait Time (min)' is the modelling target further down,
# and both medians are computed on the full dataset before the train/test
# split. Consider dropping rows with a missing target and fitting the imputer
# on the training split only -- confirm with the analysis owner.

# Confirm what is still missing after imputation.
print("--- Missing Values After Cleaning ---")
print(df.isnull().sum())

**Feature Engineering**

In [None]:

# Derive calendar features from the parsed visit timestamp:
# the weekday name and the month number.
visit_dt = df['Visit Date'].dt
df['DayOfWeek'] = visit_dt.day_name()
df['Month'] = visit_dt.month

# Show the frame with the two new columns appended.
print("--- Data with New Features ---")
print(df.head())

**Exploratory Data Analysis (EDA)**

In [None]:

# Apply the whitegrid theme to this and all subsequent seaborn/matplotlib figures.
sns.set(style="whitegrid")

# Histogram (40 bins) with a KDE overlay of the total wait time.
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(data=df, x='Total Wait Time (min)', kde=True, bins=40, ax=ax)
ax.set_title('Distribution of Total ER Wait Time')
ax.set_xlabel('Total Wait Time (minutes)')
ax.set_ylabel('Frequency')
plt.show()


# Box plot of wait time per weekday, with the days forced into calendar order
# (Monday first) rather than order of appearance in the data.
weekday_order = [
    'Monday', 'Tuesday', 'Wednesday', 'Thursday',
    'Friday', 'Saturday', 'Sunday',
]

fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(
    data=df,
    x='DayOfWeek',
    y='Total Wait Time (min)',
    order=weekday_order,
    ax=ax,
)
ax.set_title('ER Wait Time by Day of the Week')
ax.set_xlabel('Day of the Week')
ax.set_ylabel('Total Wait Time (minutes)')
plt.show()


# Pairwise correlations between the numeric columns only.
numeric_df = df.select_dtypes(include='number')
correlation_matrix = numeric_df.corr()

# Annotated heatmap of the correlation matrix (two decimals per cell).
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    ax=ax,
)
ax.set_title('Correlation Matrix of Numerical Features')
plt.show()

**Model Preparation**

In [None]:

# Column to predict.
target = 'Total Wait Time (min)'

# Predictor columns, in the order they are selected from the frame.
features = [
    'Hospital Name',
    'Region',
    'Season',
    'Time of Day',
    'Urgency Level',
    'Specialist Availability',
    'Facility Size (Beds)',
    'Nurse-to-Patient Ratio',
    'DayOfWeek',
    'Month',
]

# Columns that are passed to the model as-is; every other predictor is
# one-hot encoded. Deriving the categorical list from `features` keeps the
# two lists in sync and preserves the original encoding order.
passthrough_features = ['Facility Size (Beds)', 'Nurse-to-Patient Ratio', 'Month']
categorical_features = [
    name for name in features if name not in passthrough_features
]

y = df[target]

# One-hot encode the categorical predictors, dropping the first level of each
# to avoid a redundant indicator column.
X = pd.get_dummies(
    df[features],
    columns=categorical_features,
    drop_first=True,
)

# 80/20 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("--- Data Preparation Complete with New Features ---")
print("Shape of your new training features (X_train):", X_train.shape)

**Model Building and Training**

In [None]:

# Random forest baseline: 100 trees, fixed seed, all CPU cores.
rf_settings = {
    'n_estimators': 100,
    'random_state': 42,
    'n_jobs': -1,
}
model = RandomForestRegressor(**rf_settings)

# Fit on the training split only; the test split is kept for evaluation.
model.fit(X_train, y_train)

print("--- Model Training Complete ---")
print("The model has learned the patterns from the training data.")

**Model Evaluation**

In [None]:

# Score the random forest on the held-out test split.
y_pred = model.predict(X_test)

# R², MAE and RMSE (square root of the mean squared error).
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mse)

# Emit the report as one print call, one metric per line.
print(
    "--- Model Performance Evaluation ---",
    f"R-squared (R²): {r2:.3f}",
    f"Mean Absolute Error (MAE): {mae:.3f} minutes",
    f"Root Mean Squared Error (RMSE): {rmse:.3f} minutes",
    sep="\n",
)

**Feature Importance Analysis and Visualization**

In [None]:

# Importance score the fitted random forest assigned to each encoded feature.
importances = model.feature_importances_
feature_names = X_train.columns

# One row per feature, most important first, so the bar chart reads top-down.
feature_importance_df = (
    pd.DataFrame({'feature': feature_names, 'importance': importances})
    .sort_values(by='importance', ascending=False)
)

plt.figure(figsize=(12, 8))
# `palette` without `hue` is deprecated in seaborn 0.13 and scheduled for
# removal; mapping `hue` to the same variable as `y` and hiding the legend
# reproduces the per-bar colouring without the warning.
# NOTE(review): `legend=False` on barplot assumes seaborn >= 0.13 -- confirm
# the environment's seaborn version.
sns.barplot(
    x='importance',
    y='feature',
    hue='feature',
    data=feature_importance_df,
    palette='viridis',
    legend=False,
)
plt.title('Key Drivers of ER Wait Time (Feature Importance)', fontsize=16)
plt.xlabel('Importance Score', fontsize=12)
plt.ylabel('Features', fontsize=12)
plt.show()


## **Trying XGBoost and LightGBM**

In [None]:
pip install xgboost lightgbm

# **Train and Evaluate with XGBoost**

In [None]:
import xgboost as xgb


# Early stopping needs a monitoring set. Using the test set for that leaks
# test information into model selection (the number of boosting rounds) and
# makes the reported test metrics optimistic, so a validation set is carved
# out of the training data instead and the test set is only used for scoring.
X_fit_xgb, X_val_xgb, y_fit_xgb, y_val_xgb = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

# Up to 1000 boosting rounds; training stops once the validation metric has
# not improved for 5 consecutive rounds.
xgb_model = xgb.XGBRegressor(n_estimators=1000,
                             learning_rate=0.05,
                             early_stopping_rounds=5,
                             random_state=42)

xgb_model.fit(X_fit_xgb, y_fit_xgb,
              eval_set=[(X_val_xgb, y_val_xgb)],
              verbose=False)

# Final, unbiased evaluation on the untouched test split.
y_pred_xgb = xgb_model.predict(X_test)

r2_xgb = r2_score(y_test, y_pred_xgb)
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
rmse_xgb = np.sqrt(mean_squared_error(y_test, y_pred_xgb))

print("--- XGBoost Model Performance ---")
print(f"R-squared (R²): {r2_xgb:.3f}")
print(f"Mean Absolute Error (MAE): {mae_xgb:.3f} minutes")
print(f"Root Mean Squared Error (RMSE): {rmse_xgb:.3f} minutes")

# **Train and Evaluate with LightGBM**

In [None]:
import lightgbm as lgb


# Early stopping needs a monitoring set. Using the test set for that leaks
# test information into model selection (the number of boosting rounds) and
# makes the reported test metrics optimistic, so a validation set is carved
# out of the training data instead and the test set is only used for scoring.
X_fit_lgb, X_val_lgb, y_fit_lgb, y_val_lgb = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

# Up to 1000 boosting rounds; the early-stopping callback halts training once
# validation RMSE has not improved for 5 consecutive rounds.
lgb_model = lgb.LGBMRegressor(n_estimators=1000,
                              learning_rate=0.05,
                              random_state=42)

lgb_model.fit(X_fit_lgb, y_fit_lgb,
              eval_set=[(X_val_lgb, y_val_lgb)],
              eval_metric='rmse',
              callbacks=[lgb.early_stopping(5, verbose=False)])

# Final, unbiased evaluation on the untouched test split.
y_pred_lgb = lgb_model.predict(X_test)

r2_lgb = r2_score(y_test, y_pred_lgb)
mae_lgb = mean_absolute_error(y_test, y_pred_lgb)
rmse_lgb = np.sqrt(mean_squared_error(y_test, y_pred_lgb))

print("\n--- LightGBM Model Performance ---")
print(f"R-squared (R²): {r2_lgb:.3f}")
print(f"Mean Absolute Error (MAE): {mae_lgb:.3f} minutes")
print(f"Root Mean Squared Error (RMSE): {rmse_lgb:.3f} minutes")

# **Hyperparameter Tuning with RandomizedSearchCV (LightGBM)**
*Let's tune our best model so far, LightGBM. This code will automatically search for the best settings and then train a final optimized model.*

In [None]:
from sklearn.model_selection import RandomizedSearchCV
import lightgbm as lgb
import numpy as np


# Candidate values sampled by the randomized search (one list per hyperparameter).
param_grid = {
    'n_estimators': [100, 200, 500, 1000],
    'learning_rate': [0.01, 0.05, 0.1],
    'num_leaves': [20, 31, 40, 50],
    'max_depth': [-1, 10, 20],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'subsample': [0.6, 0.8, 1.0],
    'reg_alpha': [0, 0.1, 0.5],
    'reg_lambda': [0, 0.1, 0.5]
}

# LightGBM only applies `subsample` (bagging_fraction) when `subsample_freq`
# (bagging_freq) is non-zero. With the default of 0 the `subsample` values in
# the search space would be silently ignored, so bagging is enabled on every
# iteration here. `subsample=1.0` remains equivalent to no bagging.
lgb_base = lgb.LGBMRegressor(random_state=42, subsample_freq=1)

# 100 random configurations, each scored by 5-fold cross-validated MAE on the
# training split. The scorer is negated because scikit-learn maximizes scores.
random_search = RandomizedSearchCV(estimator=lgb_base,
                                   param_distributions=param_grid,
                                   n_iter=100,
                                   cv=5,
                                   scoring='neg_mean_absolute_error',
                                   n_jobs=-1,
                                   random_state=42,
                                   verbose=1)

print("--- Starting Hyperparameter Tuning ---")
random_search.fit(X_train, y_train)
print("--- Tuning Complete ---")

print("\nBest Parameters Found:")
print(random_search.best_params_)

# Flip the sign back so the value reads as a plain MAE in minutes.
best_score = -random_search.best_score_
print(f"\nBest Cross-Validated MAE from Search: {best_score:.3f} minutes")

# `best_estimator_` is the best configuration refitted on the whole training
# split (RandomizedSearchCV's default `refit=True`); score it on the test split.
best_model = random_search.best_estimator_
y_pred_best = best_model.predict(X_test)

r2_best = r2_score(y_test, y_pred_best)
mae_best = mean_absolute_error(y_test, y_pred_best)
rmse_best = np.sqrt(mean_squared_error(y_test, y_pred_best))

print("\n--- Tuned Model Performance on Test Set ---")
print(f"R-squared (R²): {r2_best:.3f}")
print(f"Mean Absolute Error (MAE): {mae_best:.3f} minutes")
print(f"Root Mean Squared Error (RMSE): {rmse_best:.3f} minutes")