### Import Libraries

In [1]:
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
import mlflow.pyfunc

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, accuracy_score

from prophet import Prophet
from statsmodels.tsa.statespace.sarimax import SARIMAX
import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm
Importing plotly failed. Interactive plots will not work.


### Set display option to show Max Row & Column

In [2]:
pd.set_option('display.max_rows', None)

pd.set_option('display.max_columns', None)

In [3]:
cleaned_df = pd.read_csv(r'C:\Users\ASUS PC\Desktop\AMDARI INTERNSHIP\Med_Optix\Med_Optix\Model\clened_df.csv')

In [4]:
cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 194096 entries, 0 to 194095
Data columns (total 33 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   admission_id            194096 non-null  int64  
 1   date                    194096 non-null  object 
 2   hospital_id             194096 non-null  int64  
 3   ward_code               194096 non-null  object 
 4   arrival_source          194096 non-null  object 
 5   triage_level            194096 non-null  float64
 6   wait_minutes            194096 non-null  int64  
 7   length_of_stay_days     194096 non-null  int64  
 8   outcome                 194096 non-null  object 
 9   age                     194096 non-null  int64  
 10  sex                     194096 non-null  object 
 11  procedure_flag          194096 non-null  int64  
 12  base_beds               194096 non-null  int64  
 13  effective_capacity      194096 non-null  int64  
 14  occupancy           

In [5]:
def quick_overview(df, df_name):
  print(f"{df_name} dataframe information")
  print("="*60)
  print(df.info())
  print("\n")
  print(f"{df_name} missing values information")
  print("="*60)
  print(df.isnull().sum())
  print("\n")
  print(f"{df_name} describtion information")
  print("="*60)
  print(df.describe())
  print(f'Structure of {df_name} dataset')
  print('='*60)
  print(f"Row:{df.shape[0]}, Columns:{df.shape[1]}")

In [6]:
quick_overview(cleaned_df,'cleaned_df')

cleaned_df dataframe information
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 194096 entries, 0 to 194095
Data columns (total 33 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   admission_id            194096 non-null  int64  
 1   date                    194096 non-null  object 
 2   hospital_id             194096 non-null  int64  
 3   ward_code               194096 non-null  object 
 4   arrival_source          194096 non-null  object 
 5   triage_level            194096 non-null  float64
 6   wait_minutes            194096 non-null  int64  
 7   length_of_stay_days     194096 non-null  int64  
 8   outcome                 194096 non-null  object 
 9   age                     194096 non-null  int64  
 10  sex                     194096 non-null  object 
 11  procedure_flag          194096 non-null  int64  
 12  base_beds               194096 non-null  int64  
 13  effective_capacity      194096 non-null  

### Define target

In [7]:

target = 'admissions' 

In [8]:
cleaned_df['date'] = pd.to_datetime(cleaned_df['date'])

In [9]:
cleaned_df['date'].dtype

dtype('<M8[ns]')

### Sort Data by date

In [10]:
cleaned_df = cleaned_df.sort_values('date').reset_index(drop=True)

### Create a 'month' feature from the date


In [11]:

cleaned_df['month'] = cleaned_df['date'].dt.to_period('M').dt.to_timestamp()

In [12]:
cleaned_df.head()

Unnamed: 0,admission_id,date,hospital_id,ward_code,arrival_source,triage_level,wait_minutes,length_of_stay_days,outcome,age,sex,procedure_flag,base_beds,effective_capacity,occupancy,overflow,admissions,discharges,staffing_index,avg_wait_minutes,admission_rate_per_bed,discharge_rate_per_bed,occupancy_rate,hospital_name,city,country,latitude,longitude,founded_year,ward_id,ward_name,month,week
0,1,2022-10-28,1,ED,referral,4.0,189,0,discharged,48,M,0,30,34,34,26,51,17,0.999,219,1.7,0.567,1.0,Helsinki Central Hospital,Helsinki,Finland,63.6698,25.0721,1979,01-ED,Emergency Department,2022-10-01,2022-10-24/2022-10-30
1,129564,2022-10-28,4,ICU,transfer,3.0,138,5,discharged,75,F,0,20,23,22,2,10,1,0.992,130,0.5,0.05,0.957,Oulu Regional Hospital,Oulu,Finland,60.4406,25.1527,2003,04-ICU,Intensive Care Unit,2022-10-01,2022-10-24/2022-10-30
2,185348,2022-10-28,5,SURG,self,3.0,151,2,discharged,41,F,1,40,46,42,2,25,8,0.972,123,0.625,0.2,0.913,Kuopio Medical Center,Kuopio,Finland,61.654,28.4874,2003,05-SURG,Surgery,2022-10-01,2022-10-24/2022-10-30
3,30013,2022-10-28,1,SURG,self,3.0,38,3,discharged,55,F,0,40,40,33,0,15,11,0.951,33,0.375,0.275,0.825,Helsinki Central Hospital,Helsinki,Finland,63.6698,25.0721,1979,01-SURG,Surgery,2022-10-01,2022-10-24/2022-10-30
4,30014,2022-10-28,1,SURG,transfer,3.0,38,3,discharged,56,F,0,40,40,33,0,15,11,0.951,33,0.375,0.275,0.825,Helsinki Central Hospital,Helsinki,Finland,63.6698,25.0721,1979,01-SURG,Surgery,2022-10-01,2022-10-24/2022-10-30


### Sort by month to maintain correct time order


In [13]:

cleaned_df = cleaned_df.sort_values('month').reset_index(drop=True)

### Handle Missing Value in Triage_Level Column using group mode imputation


In [14]:
cleaned_df['triage_level'] = pd.to_numeric(cleaned_df['triage_level'], errors='coerce')

cleaned_df['triage_level'] = (
    cleaned_df.groupby('arrival_source')['triage_level'].transform(lambda x: x.fillna(x.mode()[0] if not x.mode().empty else np.nan))  #  Fill with group mode
    .fillna(-1)  # Fallback fill for any remaining NaNs
)

In [15]:
cleaned_df.head()

Unnamed: 0,admission_id,date,hospital_id,ward_code,arrival_source,triage_level,wait_minutes,length_of_stay_days,outcome,age,sex,procedure_flag,base_beds,effective_capacity,occupancy,overflow,admissions,discharges,staffing_index,avg_wait_minutes,admission_rate_per_bed,discharge_rate_per_bed,occupancy_rate,hospital_name,city,country,latitude,longitude,founded_year,ward_id,ward_name,month,week
0,1,2022-10-28,1,ED,referral,4.0,189,0,discharged,48,M,0,30,34,34,26,51,17,0.999,219,1.7,0.567,1.0,Helsinki Central Hospital,Helsinki,Finland,63.6698,25.0721,1979,01-ED,Emergency Department,2022-10-01,2022-10-24/2022-10-30
1,38799,2022-10-30,2,ED,ambulance,2.0,332,0,discharged,68,F,0,30,34,34,22,50,32,0.87,264,1.667,1.067,1.0,Tampere City Hospital,Tampere,Finland,63.2868,22.6592,2008,02-ED,Emergency Department,2022-10-01,2022-10-24/2022-10-30
2,137759,2022-10-30,4,MED,self,3.0,150,4,discharged,55,M,0,50,58,58,19,29,18,0.921,197,0.58,0.36,1.0,Oulu Regional Hospital,Oulu,Finland,60.4406,25.1527,2003,04-MED,General Medicine,2022-10-01,2022-10-24/2022-10-30
3,38797,2022-10-30,2,ED,transfer,3.0,332,0,discharged,66,F,0,30,34,34,22,50,32,0.87,264,1.667,1.067,1.0,Tampere City Hospital,Tampere,Finland,63.2868,22.6592,2008,02-ED,Emergency Department,2022-10-01,2022-10-24/2022-10-30
4,38796,2022-10-30,2,ED,ambulance,3.0,332,0,discharged,70,M,0,30,34,34,22,50,32,0.87,264,1.667,1.067,1.0,Tampere City Hospital,Tampere,Finland,63.2868,22.6592,2008,02-ED,Emergency Department,2022-10-01,2022-10-24/2022-10-30


In [16]:
cleaned_df.tail()

Unnamed: 0,admission_id,date,hospital_id,ward_code,arrival_source,triage_level,wait_minutes,length_of_stay_days,outcome,age,sex,procedure_flag,base_beds,effective_capacity,occupancy,overflow,admissions,discharges,staffing_index,avg_wait_minutes,admission_rate_per_bed,discharge_rate_per_bed,occupancy_rate,hospital_name,city,country,latitude,longitude,founded_year,ward_id,ward_name,month,week
194091,90540,2025-10-09,3,ED,self,4.0,242,0,discharged,60,M,0,30,34,34,40,66,30,1.054,267,2.2,1.0,1.0,Turku University Hospital,Turku,Finland,64.6781,27.328,1998,03-ED,Emergency Department,2025-10-01,2025-10-06/2025-10-12
194092,90541,2025-10-09,3,ED,self,4.0,242,0,discharged,39,F,0,30,34,34,40,66,30,1.054,267,2.2,1.0,1.0,Turku University Hospital,Turku,Finland,64.6781,27.328,1998,03-ED,Emergency Department,2025-10-01,2025-10-06/2025-10-12
194093,90542,2025-10-09,3,ED,self,3.0,242,0,discharged,45,F,0,30,34,34,40,66,30,1.054,267,2.2,1.0,1.0,Turku University Hospital,Turku,Finland,64.6781,27.328,1998,03-ED,Emergency Department,2025-10-01,2025-10-06/2025-10-12
194094,90536,2025-10-09,3,ED,self,3.0,242,0,discharged,43,M,0,30,34,34,40,66,30,1.054,267,2.2,1.0,1.0,Turku University Hospital,Turku,Finland,64.6781,27.328,1998,03-ED,Emergency Department,2025-10-01,2025-10-06/2025-10-12
194095,194096,2025-10-26,5,SURG,referral,3.0,95,5,discharged,48,M,1,40,40,39,0,20,15,0.877,81,0.5,0.375,0.975,Kuopio Medical Center,Kuopio,Finland,61.654,28.4874,2003,05-SURG,Surgery,2025-10-01,2025-10-20/2025-10-26


In [17]:
cleaned_df['triage_level'].isnull().sum()

np.int64(0)

### Separating Numeric and Categorical Columns

identifying and separating numeric and non-numeric (categorical) columns from `merged_df` for targeted analysis:

In [18]:
num_cols = cleaned_df.select_dtypes(include=[np.number]).columns.to_list()
cat_cols = cleaned_df.select_dtypes(exclude=[np.number]).columns.to_list()

### Remove target and date-related columns from features

In [19]:
num_cols = [col for col in num_cols if col not in ['admissions']]
cat_cols = [col for col in cat_cols if col not in ['date', 'month']]

### Define Numerical Transformation Pipeline

In [20]:
num_trans = Pipeline(steps= [
                     ('imputer',SimpleImputer(strategy='mean')),
                     ('rscaler', RobustScaler()),
                     ('scaler', StandardScaler())
                     ]
)

### Define Categorical Transformation Pipeline

In [21]:
cat_trans = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))


])

### Combine numerical and categorical pipelines into a single preprocessor


In [22]:
preprocessor = ColumnTransformer(transformers= [
    ('num',num_trans, num_cols),
    ('cat', cat_trans, cat_cols)


])

### Split Data by Date (Train/Val/Test)

In [23]:
train_size = 0.8
test_size = 0.2

# Sort again to be safe
cleaned_df = cleaned_df.sort_values('month')

train_size = int(len(cleaned_df) * 0.8)


train = cleaned_df.iloc[:train_size]

test = cleaned_df.iloc[train_size:]

### Define Target and Feature

In [24]:
target_col = 'admissions'

X_train = train.drop(columns=[target_col])
y_train = train[target_col]


X_test = test.drop(columns=[target_col])
y_test = test[target_col]

### Define Metric Function

In [25]:
def evaluate_model(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mape = mean_absolute_percentage_error(y_true, y_pred) * 100
    return mae, mse, rmse, mape

### RANDOM FOREST MODEL with MLflow

In [26]:
mlflow.set_experiment("Hospital_Admissions_Forecast")

with mlflow.start_run(run_name="RandomForestRegressor"):
    rf_model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('rf', RandomForestRegressor(n_estimators=200, random_state=42))
    ])

    rf_model.fit(X_train, y_train)
    predt = rf_model.predict(X_test)

    mae = mean_absolute_error(y_test, predt)
    mse = mean_squared_error(y_test, predt)
    rmse = np.sqrt(mse)
    mape = np.mean(np.abs((y_test - predt) / y_test)) * 100

    mlflow.log_params({"n_estimators": 200})
    mlflow.log_metrics({"MAE": mae, "MSE": mse, "RMSE": rmse, "MAPE": mape})
    mlflow.sklearn.log_model(
        sk_model=rf_model,
        name="RandomForestRegressor",
        input_example=X_test.iloc[:1]
    )

Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 733.32it/s] 


In [27]:
print(mae)

0.008146960329726945


In [28]:
print(rmse)

0.1596613273101342


## Prophet Model (for Monthly Forecasting)

#### Prophet only needs the date (month) and target columns.

In [29]:
cleaned_df['date'] = pd.to_datetime(cleaned_df['date'], errors='coerce')
cleaned_df['month'] = cleaned_df['date'].dt.to_period('M').dt.to_timestamp()

In [30]:
cleaned_df = cleaned_df.dropna(subset=['month'])

In [31]:
print(cleaned_df['month'].min(), cleaned_df['month'].max())

2022-10-01 00:00:00 2025-10-01 00:00:00


In [32]:
cleaned_df = cleaned_df[cleaned_df['month'].dt.year < 2100]

In [33]:


# Keep only the relevant columns
prophet_train = train[['month', target_col]].rename(columns={'month': 'ds', target_col: 'y'})

# Convert 'ds' to datetime and filter realistic years
prophet_train['ds'] = pd.to_datetime(prophet_train['ds'], errors='coerce')
prophet_train = prophet_train[prophet_train['ds'].dt.year < 2100]
input_sample = prophet_train[['ds']].head(1)

with mlflow.start_run(run_name="Prophet"):
    prophet_model = Prophet(daily_seasonality=False, weekly_seasonality= False, yearly_seasonality=True)
    prophet_model.fit(prophet_train)

    # Predict next len(test) months safely
    future = prophet_model.make_future_dataframe(periods=len(test))
    forecast = prophet_model.predict(future)
    # Extract predictions for test period
    # Get the last N predictions (where N = length of test)
    predtt = forecast['yhat'].tail(len(test)).values
    #predtt = forecast['yhat'].iloc[-len(test):].values
    y_test = test['admissions'].values

    from sklearn.metrics import mean_absolute_error, mean_squared_error
    mae = mean_absolute_error(y_test, predtt)
    print(f"Prediction error (MAE): {mae:.2f}")
    mse = mean_squared_error(y_test, predtt)
    print(f"Prediction error (MSE): {mse:.2f}")
    rmse = np.sqrt(mse)
    print(f"Prediction error (RMSE): {rmse:.2f}")
    mape = np.mean(np.abs((y_test - predtt) / y_test)) * 100
    print(f"Prediction error (MAPE): {mape:.1f}")
    mlflow.log_params({"daily_seasonality": False, "weekly_seasonality": False, "yearly_seasonality":True })
    mlflow.log_metrics({"MAE": mae, "MSE": mse, "RMSE": rmse, "MAPE": mape})
    mlflow.prophet.log_model(prophet_model=prophet_model, 
    name="ProphetModel",
    input_example=input_sample)


21:28:17 - cmdstanpy - INFO - Chain [1] start processing
21:28:27 - cmdstanpy - INFO - Chain [1] done processing


Prediction error (MAE): 16.26
Prediction error (MSE): 365.49
Prediction error (RMSE): 19.12
Prediction error (MAPE): 90.2


TypeError: log_model() got an unexpected keyword argument 'prophet_model'

## SARIMAX Model (for Seasonal Patterns)

In [None]:
with mlflow.start_run(run_name="SARIMAX"):
    sarimax_model = SARIMAX(train['admissions'],
                            order=(1,1,1),
                            seasonal_order=(1,1,1,12),
                            enforce_stationarity=False,
                            enforce_invertibility=False)
    
    sarimax_result = sarimax_model.fit(disp=False)
    pred = sarimax_result.forecast(steps=len(test))

    y_test = test['admissions']

    mae = mean_absolute_error(y_test, pred)
    print(f"Prediction error (MAE): {mae:.2f}")
    mse = mean_squared_error(y_test, pred)
    print(f"Prediction error (MSE): {mse:.2f}")
    rmse = np.sqrt(mse)
    print(f"Prediction error (RMSE): {rmse:.2f}")
    mape = np.mean(np.abs((y_test - pred) / y_test)) * 100
    print(f"Prediction error (MAPE): {mape:.2f}")

    mlflow.log_params({"order": (1,1,1), "seasonal_order": (1,1,1,12)})
    mlflow.log_metrics({"MAE": mae, "MSE": mse, "RMSE": rmse, "MAPE": mape})
    mlflow.statsmodels.log_model(sarimax_result, "SARIMAXModel")

: 