<h1> Length of Stay Medium Care </h1>

This notebook analyzes the length of stay (LoS) of Medium Care patients in the dataset and fits the ridge regression models from which the reported coefficients are obtained. When run with the example data, it is expected to raise an error at the correlation-matrix cells.

In [None]:
# Core numerical / data-frame / plotting stack used throughout the notebook.
import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt
# Global seaborn look: white grid theme with the compact "paper" scaling.
sns.set_theme(style="whitegrid")
sns.set_context("paper")
# NOTE(review): despine() acts on the current figure/axes and no plot has been
# drawn yet at this point — confirm this call has the intended effect.
sns.despine(left=True,right=True)
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides pandas chained-assignment and deprecation warnings.
import warnings
warnings.filterwarnings('ignore')
# statsmodels (array and formula APIs); not referenced by the cells visible in this notebook.
import statsmodels.api as sm
import statsmodels.formula.api as smf
# scikit-learn: ridge regression plus evaluation helpers.
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
# Coefficient of determination, printed for each fitted model.
from sklearn.metrics import r2_score


In [None]:
# Load the dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("example_dataset.csv")

In [None]:
# Preview the 30 Medium-ward rows with the largest length_of_stay.
is_medium_ward = df["ward_level"] == "Medium"
df.loc[is_medium_ward].sort_values(by="length_of_stay", ascending=False).head(30)

In [None]:
# Restrict the analysis to rows whose ward_level is "Medium".
df_ward = df.loc[df["ward_level"].eq("Medium")]

In [None]:
# Admission-indication flags. Defined first so that relevant_criteria can reuse
# the list instead of repeating the same fifteen names by hand.
admission_criteria = [
    'gestational_age_indicator',
    'birth_weight_indicator',
    'cardiovascular_indicator',
    'jaundice_indicator',
    'hypoglycemia_indicator',
    'infection_indicator',
    'asphyxia_indicator',
    'seizure_indicator',
    'feeding_indicator',
    'congenital_abnormality_indicator',
    'withdrawal_indicator',
    'maternal_medication_indicator',
    'psycho_indicator',
    'postIC_indicator',
    'others_indicator',
]

# Columns kept for the analysis, in their original order.
# 'gestational_age' is listed exactly once: selecting the same label twice would
# create two identically named columns, after which df["gestational_age"]
# returns a DataFrame instead of a Series and breaks the row filters.
relevant_criteria = [
    # patient / birth characteristics
    'number_of_children',
    'gestational_age',
    'sex',
    'birth_weight',
    'c_section',
    'stay_number',
    # target
    'length_of_stay',
    # treatments and respiratory-support day counts
    'thrombocytopenia_treatment',
    'antibiotics_treatment',
    'hypothermia',
    'antenatal_steroids_treatment',
    'phototherapy',
    'polycythemia_treatment',
    'sepsis_treatment',
    'anemia_treatment',
    'oxygen_days',
    'hfo_days',
    'cpap_days',
    # admission indications
    *admission_criteria,
    # bookkeeping columns
    'month',
    'ward',
    'ward_level',
]

In [None]:
# Keep only the analysis columns of the Medium-care rows.
# - Duplicated column labels are dropped (first occurrence kept), so that
#   df_filtered["<col>"] is always a Series.
# - An explicit copy is taken because later cells add and overwrite columns on
#   df_filtered (e.g. 'predicted_los'); writing to a slice of df_ward is not
#   guaranteed to take effect.
df_filtered = (
    df_ward[relevant_criteria]
    .loc[:, lambda frame: ~frame.columns.duplicated()]
    .copy()
)

In [None]:

    
# Correlation of every remaining column with length_of_stay, all Medium-care
# rows together. The ward identifiers are removed before correlating.
# NOTE(review): DataFrame.corr() raises on non-numeric columns in pandas >= 2.0
# unless numeric_only=True is passed — confirm every remaining column is numeric.
df_filtered_corr = df_filtered.drop(columns=["ward", "ward_level"])

full_corr = df_filtered_corr.corr()
corr_matrix = full_corr[['length_of_stay']]
print(corr_matrix)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Medium')
plt.show()

In [None]:
# Summary statistics of the observed length of stay.
df_filtered.loc[:, "length_of_stay"].describe()

In [None]:
# Line plot of length of stay against gestational age for the Medium-care rows.
# NOTE(review): the x label says "weeks", but the cut-offs applied to
# gestational_age elsewhere in this notebook (224, 259) look like days — confirm the unit.
fig, ax = plt.subplots(figsize=(8, 6), dpi=400)
sns.lineplot(data=df_ward, x='gestational_age', y='length_of_stay', ax=ax)

ax.set_xlabel("Gestational Age (weeks)")
ax.set_ylabel("Length of Stay (days)")
ax.set_title("Length of Stay vs Gestational Age for Medium Care")

plt.show()

In [None]:
# Correlation with length_of_stay, computed separately for three gestational-age
# bands: <= first cut, between the cuts, > second cut.
gestational_age_cut = 224
second_gestational_age_cut = 259

# Work on a view with unique column labels so that "gestational_age" is a Series
# even if the column selection upstream contained the label more than once.
corr_source = df_filtered.loc[:, ~df_filtered.columns.duplicated()]
band_age = corr_source["gestational_age"]
band_masks = [
    band_age <= gestational_age_cut,
    (band_age > gestational_age_cut) & (band_age <= second_gestational_age_cut),
    band_age > second_gestational_age_cut,
]

# Columns excluded from the correlation matrix itself.
excluded_columns = ["ward", "ward_level", 'month', 'gestational_age', 'number_of_children']

for i, band_mask in enumerate(band_masks):
    # Select the band's rows FIRST and drop columns afterwards: the band is
    # defined by gestational_age, which is one of the excluded columns, so
    # dropping before filtering raises KeyError.
    df_filtered_corr = corr_source[band_mask].drop(columns=excluded_columns)
    print("Gestational age cut:", gestational_age_cut, i)
    corr_matrix = df_filtered_corr.corr()[['length_of_stay']]
    print(corr_matrix)
    plt.figure(figsize=(8, 6), dpi=400)
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
    plt.title('Medium Care')
    plt.show()
    


<h3>Ridge Regression </h3>

In [None]:
# Column dtypes and non-null counts of the modelling frame.
df_filtered.info()

In [None]:
# Piecewise ridge regression of length_of_stay.
# One Ridge(alpha=1) model is fitted per gestational-age band, each with its own
# feature list; in-sample predictions are written to df_filtered['predicted_los'].
ward = "High"  # NOTE(review): not used in this cell, and this notebook filters on "Medium" — confirm or remove
gestational_age_cut = 217
second_gestational_age_cut = 266
x_criteria_1 = ["gestational_age", 'phototherapy', 'oxygen_days', 'cpap_days', "others_indicator", "postIC_indicator", 'stay_number']
x_criteria_2 = ["gestational_age", 'phototherapy', 'birth_weight', 'oxygen_days', 'c_section']
x_criteria_3 = ['oxygen_days', 'cpap_days', 'c_section', 'phototherapy', 'sepsis_treatment', 'antibiotics_treatment']


def _fit_ridge_band(frame, band_mask, features):
    """Fit a ridge model on one gestational-age band and report it.

    Parameters
    ----------
    frame : DataFrame that is read for training and updated in place.
    band_mask : boolean Series (indexed like ``frame``) selecting the band's rows.
    features : list of column names used as predictors.

    Returns
    -------
    (model, band_frame) : the fitted ``Ridge`` estimator and the band's rows.

    Side effects: writes the in-sample predictions to ``frame['predicted_los']``
    for the band's rows and prints, in this order, the coefficient table, the
    in-sample R^2 and the intercept.
    """
    band_frame = frame[band_mask]
    band_X = band_frame[features]
    band_y = band_frame['length_of_stay']

    model = Ridge(alpha=1, fit_intercept=True)
    model.fit(band_X, band_y)
    fitted = model.predict(band_X)
    frame.loc[band_mask, 'predicted_los'] = fitted

    print(pd.DataFrame(model.coef_, index=features, columns=['coefficients']))
    # R^2 is computed on the raw model output, before any clamping.
    print(r2_score(band_y, fitted))
    print(model.intercept_)
    return model, band_frame


def _clamp_predicted_los(frame, support_columns=('oxygen_days', 'hfo_days', 'cpap_days')):
    """Raise ``predicted_los`` to at least the longest support duration and at least 1.

    A prediction below the row-wise maximum of ``support_columns`` is replaced by
    that maximum; afterwards any prediction below 1 is set to 1. Rows without a
    prediction (NaN) are left untouched. The operation is idempotent.
    """
    support_max = frame[list(support_columns)].max(axis=1)
    below_support = frame['predicted_los'] < support_max
    frame.loc[below_support, 'predicted_los'] = support_max[below_support]
    frame.loc[frame['predicted_los'] < 1, 'predicted_los'] = 1


# Row masks for the three bands: <= first cut, between the cuts, > second cut.
band_age = df_filtered["gestational_age"]
band_mask_1 = band_age <= gestational_age_cut
band_mask_2 = (band_age > gestational_age_cut) & (band_age <= second_gestational_age_cut)
band_mask_3 = band_age > second_gestational_age_cut

model_1, df_filtered_1 = _fit_ridge_band(df_filtered, band_mask_1, x_criteria_1)
model_2, df_filtered_2 = _fit_ridge_band(df_filtered, band_mask_2, x_criteria_2)
model_3, df_filtered_3 = _fit_ridge_band(df_filtered, band_mask_3, x_criteria_3)

# Clamp once, after all three bands have predictions. Because the clamp is
# idempotent, this gives the same result as clamping after each band.
_clamp_predicted_los(df_filtered)

# Mean predicted vs. mean observed length of stay per gestational age.
# NOTE(review): the x label says "weeks" while the cut-offs above (217, 266) look like days — confirm the unit.
plt.figure(figsize=(8, 6), dpi=400)
plt.title('Actual vs Predicted LoS for Medium Care')
plt.ylabel("LoS (days)")
plt.xlabel("gestational age (weeks)")
sns.lineplot(data=df_filtered, x='gestational_age', y='predicted_los', label='predicted_los')
sns.lineplot(data=df_filtered, x='gestational_age', y='length_of_stay', label='actual_los')
plt.legend()
plt.show()


In [None]:


# Summary statistics of predicted vs. observed length of stay for each of the
# three gestational-age bands, separated by a blank line.
band_age = df_filtered["gestational_age"]
band_selectors = (
    band_age <= gestational_age_cut,
    (band_age > gestational_age_cut) & (band_age <= second_gestational_age_cut),
    band_age > second_gestational_age_cut,
)
for position, selector in enumerate(band_selectors):
    if position:
        print()
    band_rows = df_filtered[selector]
    print(band_rows["predicted_los"].describe())
    print(band_rows["length_of_stay"].describe())

In [None]:
# Summary statistics over all rows: predicted first, then observed.
for los_column in ("predicted_los", "length_of_stay"):
    print(df_filtered[los_column].describe())

<h3>Lasso Regression </h3>

In [None]:
# Estimators for the lasso section.
from sklearn.linear_model import Lasso,LassoCV
# Evaluation metrics (both are already imported in the first cell of the notebook).
from sklearn.metrics import r2_score,mean_squared_error

In [None]:
# Add log(1 + x) versions of three day-count columns (target column -> source column).
# NOTE(review): these source column names do not appear in relevant_criteria, from
# which df_filtered is built — confirm they exist in df_filtered before running.
log1p_columns = {
    "ln_n_o2dg": "n_o2dg",
    "ln_n_dag_hhhfnc_cpap_nippv": "n_dag_hhhfnc_cpap_nippv",
    "ln_n_dag_conv_hfo": "n_dag_conv_hfo",
}
for target_column, source_column in log1p_columns.items():
    df_filtered[target_column] = np.log1p(df_filtered[source_column])

In [None]:
# Display the list of columns selected for the analysis.
relevant_criteria

In [None]:
# Replace missing values of 'n_opname_ind_overigind' with 0.
# The result is assigned back to the frame: calling fillna(inplace=True) on a
# selected column is a chained operation that is not guaranteed to update
# df_filtered (it has no effect under pandas Copy-on-Write).
df_filtered["n_opname_ind_overigind"] = df_filtered["n_opname_ind_overigind"].fillna(0)

In [None]:
# Replace missing values: 'hoftiezer' -> 50, 'n_klinisch_sepsisvroeg' -> 0.
# Results are assigned back to the frame: calling fillna(inplace=True) on a
# selected column is a chained operation that is not guaranteed to update
# df_filtered (it has no effect under pandas Copy-on-Write).
df_filtered["hoftiezer"] = df_filtered["hoftiezer"].fillna(50)
df_filtered["n_klinisch_sepsisvroeg"] = df_filtered["n_klinisch_sepsisvroeg"].fillna(0)

In [None]:
# Show the rows with amddd above 290, longest n_opnduur first.
late_gestation = df_filtered["amddd"] > 290
df_filtered.loc[late_gestation].sort_values(by="n_opnduur", ascending=False)

In [None]:
# Quadratic term of amddd, used as a predictor in the third lasso model.
df_filtered["squared_amddd"] = df_filtered["amddd"].pow(2)

In [None]:
# Piecewise lasso regression of 'n_opnduur' on three 'amddd' bands:
#   band 1: amddd <= gestational_age_cut, band 2: gestational_age_cut < amddd < 260, band 3: amddd >= 260.
# In-sample predictions are written to df_filtered['predicted_n_opnduur'].
# NOTE(review): most column names used here (amddd, n_opnduur, n_o2dg, ...) do not
# appear in relevant_criteria, from which df_filtered is built — confirm they exist.
# NOTE(review): 'ward' and 'x_criteria' are assigned but not used in this cell.
ward = "medium"
gestational_age_cut = 224
x_criteria = []

# Predictor lists, one per band.
x_criteria_1 =["amddd", 'n_opname_ind_asfyxie','n_opname_ind_postichc','n_antibiotica','n_o2dg','n_dag_hhhfnc_cpap_nippv','n_opname_ind_overigind',"c_section"]
x_criteria_2 = ["amddd",'n_hyperbilli','geboortegew','n_o2dg',"c_section"]

x_criteria_3 = ["squared_amddd",'geboortegew','n_o2dg','n_opname_ind_infectie','n_opname_ind_asfyxie',"c_section"]
# --- Band 1: amddd <= gestational_age_cut, Lasso with alpha=0.2 ---


model_1 = Lasso(alpha=0.2,fit_intercept=True)
# Training rows of band 1.
df_filtered_1 = df_filtered[df_filtered["amddd"] <= gestational_age_cut]
X = df_filtered_1[x_criteria_1]

y = df_filtered_1['n_opnduur']
model_1.fit(X,y)

# Store the in-sample predictions on the band's rows.
df_filtered.loc[df_filtered['amddd'] <= gestational_age_cut, 'predicted_n_opnduur'] = (model_1.predict(X))


# Report: coefficient table, in-sample R^2, in-sample MSE, intercept.
coefficients = pd.DataFrame(model_1.coef_,index=x_criteria_1,columns=['coefficients'])
print(coefficients)
print(r2_score(y,model_1.predict(X)))
print(mean_squared_error(y,model_1.predict(X)))
# Intercept of the band-1 model.
print(model_1.intercept_)
# --- Band 2: gestational_age_cut < amddd < 260, Lasso with alpha=0.1 ---
df_filtered_2 = df_filtered[(df_filtered["amddd"] > gestational_age_cut) & (df_filtered["amddd"] < 260)]
X = df_filtered_2[x_criteria_2]

y = df_filtered_2['n_opnduur']
model_2 = Lasso(alpha=0.1,fit_intercept=True)
model_2.fit(X,y)
df_filtered.loc[(df_filtered["amddd"] > gestational_age_cut) & (df_filtered["amddd"] < 260), 'predicted_n_opnduur'] = (model_2.predict(X))

coefficients = pd.DataFrame(model_2.coef_,index=x_criteria_2,columns=['coefficients'])
print(coefficients)
print(r2_score(y,model_2.predict(X)))
print(mean_squared_error(y,model_2.predict(X)))
# Intercept of the band-2 model.
print(model_2.intercept_)

# --- Band 3: amddd >= 260, Lasso with alpha=0.1 ---
model_3 = Lasso(alpha=0.1,fit_intercept=True)
# Training rows of band 3.
df_filtered_3 = df_filtered[(df_filtered["amddd"] >= 260) ]
X = df_filtered_3[x_criteria_3]

y = df_filtered_3['n_opnduur']
model_3.fit(X,y)

# Store the in-sample predictions on the band's rows.
df_filtered.loc[ (df_filtered["amddd"] >= 260), 'predicted_n_opnduur'] = (model_3.predict(X))


# Report: coefficient table, in-sample R^2, in-sample MSE, intercept.
coefficients = pd.DataFrame(model_3.coef_,index=x_criteria_3,columns=['coefficients'])
print(coefficients)
print(r2_score(y,model_3.predict(X)))
print(mean_squared_error(y,model_3.predict(X)))
# Intercept of the band-3 model.
print(model_3.intercept_)
# Predicted vs. observed 'n_opnduur' against 'amddd' over all rows.
plt.figure(figsize=(8,6))
sns.lineplot(data=df_filtered,x='amddd',y='predicted_n_opnduur',label='predicted_los')
sns.lineplot(data=df_filtered,x='amddd',y='n_opnduur',label='actual_los')
plt.legend()
plt.show()


In [None]:
# Same predicted-vs-observed plot, restricted to rows with amddd >= 260.
df_filtered_260 = df_filtered.loc[df_filtered["amddd"] >= 260]

fig, ax = plt.subplots(figsize=(8, 6))
for y_column, curve_label in (('predicted_n_opnduur', 'predicted_los'), ('n_opnduur', 'actual_los')):
    sns.lineplot(data=df_filtered_260, x='amddd', y=y_column, label=curve_label, ax=ax)
ax.legend()
plt.show()


In [None]:
# Absolute error between the observed stay and the prediction truncated to whole days.
predicted_whole_days = df_filtered["predicted_n_opnduur"].astype(int)
df_filtered["diff"] = np.abs(df_filtered["n_opnduur"] - predicted_whole_days)

df_filtered["diff"].describe()

In [None]:
# A predicted stay cannot be shorter than the longest respiratory-support duration:
# where 'predicted_n_opnduur' is below the row-wise maximum of the three day-count
# columns, replace it with that maximum.
legacy_support_columns = ['n_o2dg', 'n_dag_conv_hfo', 'n_dag_hhhfnc_cpap_nippv']
longest_support = df_filtered[legacy_support_columns].max(axis=1)

# Rows whose prediction falls short of the longest support duration.
mask = df_filtered['predicted_n_opnduur'] < longest_support
df_filtered.loc[mask, 'predicted_n_opnduur'] = longest_support[mask]

In [None]:
# Predictions of zero or less are replaced by 1.
non_positive_prediction = df_filtered['predicted_n_opnduur'] <= 0
df_filtered.loc[non_positive_prediction, 'predicted_n_opnduur'] = 1

In [None]:
# Summary statistics of the predictions after truncation to integers.
df_filtered["predicted_n_opnduur"].astype(int).describe()

In [None]:
# Summary statistics of the observed 'n_opnduur' values, for comparison.
df_filtered["n_opnduur"].describe()

In [None]:
# Show the 30 rows with the largest absolute prediction error.
df_filtered.sort_values(by="diff",ascending=False).head(30)

In [None]:
# Distribution of the absolute prediction error, with a kernel density overlay.
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(data=df_filtered, x='diff', kde=True, ax=ax)
ax.set_title('Difference between actual and predicted')
plt.show()
