<h1>Validation </h1>

This notebook was used to validate the model. It can only be run if a model output file is provided. For instance, the output file currently referenced in this notebook would represent the output of the 2016 scenario with 25 runs.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import r2_score
import scipy.stats as stats
# Global plot styling applied to every figure produced in this notebook.
sns.set(style="whitegrid",palette="colorblind",font_scale=1.5)
# NOTE(review): set_context("paper") is called without font_scale, so it
# re-derives the font sizes at the default scale and appears to override the
# font_scale=1.5 requested on the previous line — confirm which was intended.
sns.set_context("paper")

In [None]:
# Read the simulation's admission log and rescale gestational age in one pass.
# Dividing by 7 expresses the column in weeks (presumably the log stores it in
# days — TODO confirm against the model output format).
simulation_df = pd.read_csv(
    "../../model/outputs/admission_log_hospitalScenario_-1_runs_25.csv"
).assign(gestational_age=lambda log: log["gestational_age"] / 7)

In [None]:
# Load the observed dataset that the simulation output is compared against
# (plotted as "Actual" / "Observed" further down).
df  = pd.read_csv("example_dataset.csv")

In [None]:
# Parse start_date into pandas datetimes.
# NOTE(review): the previous comment here said "filter start_date before
# 15-10-2017", but no filtering is performed — only the dtype conversion.
# Confirm whether a date filter is still required for the validation.
df["start_date"] = pd.to_datetime(df["start_date"])


In [None]:
df.head()

In [None]:
# Lowercase the simulation's ward names so they can be compared with
# df["ward_level"] in the per-ward loops below
# (presumably ward_level is already lowercase — TODO confirm).
simulation_df["ward"] = simulation_df["ward"].str.lower()

In [None]:
# Wards present in the simulation output; each one gets its own comparison
# in the loops that follow.
# Missing ward values are dropped first: NaN never compares equal to itself,
# so a NaN entry would only select empty frames and break the per-ward
# statistics (e.g. an R^2 computed on zero samples).
wards = simulation_df["ward"].dropna().unique()


In [None]:
# For every ward, compare observed and simulated length of stay as a function
# of gestational age: print summary statistics and draw both curves on one axes.
for ward in wards:
    # Restrict both datasets to the current ward. The observed data identifies
    # the ward in "ward_level", the simulation output in "ward".
    df_filtered = df[df["ward_level"]==ward]
    df_simulation_filtered = simulation_df[simulation_df["ward"]==ward]

    # Label the summaries so the two tables (and the wards) can be told apart
    # in the cell output.
    print(f"--- {ward}: actual length of stay ---")
    print(df_filtered["length_of_stay"].describe())
    print(f"--- {ward}: simulated length of stay ---")
    print(df_simulation_filtered["length_of_stay"].describe())

    # Explicit figure/axes so every artist lands on this ward's plot.
    fig, ax = plt.subplots(figsize=(8,6),dpi=500)
    # Line plots (not scatter plots): seaborn aggregates rows sharing the same
    # gestational age into a single point per x value (mean with an error band
    # by default).
    # NOTE(review): assumes df["gestational_age"] is already expressed in weeks,
    # like the converted simulation column — confirm.
    sns.lineplot(data=df_filtered,y='length_of_stay',x='gestational_age',label="Actual",ax=ax)
    sns.lineplot(data=df_simulation_filtered,y='length_of_stay',x='gestational_age',label="Simulated",ax=ax)
    # Title each figure with its ward; otherwise the figures are indistinguishable.
    ax.set_title(f"Length of stay by gestational age ({ward})")
    ax.set_ylabel("Length of stay (Days)")
    ax.set_xlabel("Gestational age (weeks)")
    ax.legend()
    # Uncomment to write the figure to disk:
    #fig.savefig(f"length_of_stay_{ward}.png")
    plt.show()
    # Release the high-dpi figure so figures do not accumulate across wards.
    plt.close(fig)

In [None]:
df

In [None]:
# For every ward, compare the observed and simulated length-of-stay
# distributions with a QQ plot (sorted samples paired by rank) and report the
# R^2 between the paired quantiles.
for ward in wards:
    df_filtered = df[df["ward_level"]==ward]
    df_simulation_filtered = simulation_df[simulation_df["ward"]==ward]
    length_of_stay_filtered = df_filtered['length_of_stay'].dropna()
    length_of_stay_simulation = df_simulation_filtered['length_of_stay'].dropna()

    # Both samples must have the same size to be paired rank by rank, so use
    # the size of the smaller dataset.
    sample_size = min(len(length_of_stay_filtered), len(length_of_stay_simulation))
    if sample_size < 2:
        # R^2 is undefined for fewer than two points (and r2_score raises on
        # empty input), so skip wards without enough data instead of failing.
        print(f"{ward}: not enough data for a QQ comparison (n={sample_size})")
        continue

    # Draw equally sized random samples (fixed seed for reproducibility), then
    # sort and ONLY THEN reset the index. pandas aligns Series on their index
    # labels when building a DataFrame, so the index has to be positional
    # after sorting; resetting before sorting would let the alignment undo the
    # sort and pair the values randomly instead of by rank.
    length_of_stay_filtered_sample = (
        length_of_stay_filtered
        .sample(n=sample_size, random_state=1)
        .sort_values()
        .reset_index(drop=True)
    )
    length_of_stay_simulation_sample = (
        length_of_stay_simulation
        .sample(n=sample_size, random_state=1)
        .sort_values()
        .reset_index(drop=True)
    )

    # QQ plot: row i holds the i-th smallest observed and simulated values.
    df_combined = pd.DataFrame({'Observed': length_of_stay_filtered_sample, 'Simulated': length_of_stay_simulation_sample})
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.scatterplot(x='Simulated', y='Observed', data=df_combined, ax=ax)
    # Identity line (perfect agreement), spanning the range of both samples.
    lower = min(df_combined['Observed'].min(), df_combined['Simulated'].min())
    upper = max(df_combined['Observed'].max(), df_combined['Simulated'].max())
    ax.plot([lower, upper], [lower, upper], color='red', linestyle='--')
    ax.set_title(f"QQ plot of length of stay ({ward})")
    ax.set_xlabel("Simulated length of stay")
    ax.set_ylabel("Observed length of stay")
    plt.show()
    plt.close(fig)
    print(ward)
    # Calculate R^2 score between the rank-paired observed and simulated values
    r2 = r2_score(length_of_stay_filtered_sample, length_of_stay_simulation_sample)
    print(f'R^2 score: {r2}')

In [None]:
# NOTE(review): df_combined is the variable left over from the final iteration
# of the loop above, so this summary covers only the last ward processed.
df_combined.describe()