<h1> Length of Stay High Care</h1>

This notebook analyzes the length of stay (LoS) of High Care patients in the dataset and derives the ridge regression coefficients. When run on the example data, it is expected to raise an error at the correlation-matrix cells.


In [None]:
import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt
# Seaborn settings: white-grid theme at the "paper" context scale
sns.set_theme(style="whitegrid")
sns.set_context("paper")
# NOTE(review): no figure has been created yet at this point; presumably despine only
# affects the current figure, so confirm it has the intended effect on the later plots.
sns.despine(left=True,right=True)
# Silence all Python warnings from here on
import warnings
warnings.filterwarnings('ignore')
# statsmodels: array API (sm) and formula API (smf)
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
# R^2 score, printed after each ridge fit further down
from sklearn.metrics import r2_score


In [None]:
# Read the example dataset (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer="example_dataset.csv")

In [None]:
# Never truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None

In [None]:
# Rows with ward_level == "High" and gestational_age > 260, longest stays first.
high_late = df["ward_level"].eq("High") & df["gestational_age"].gt(260)
# Not the last expression of the cell, so this inspection view is computed but not displayed.
df.loc[high_late].sort_values("length_of_stay", ascending=False).head(30)

# Remove the five longest of those stays that exceed 30 days.
longest_index = (
    df.loc[high_late & df["length_of_stay"].gt(30)]
    .sort_values("length_of_stay", ascending=False)
    .head(5)
    .index
)
df = df.drop(index=longest_index)

In [None]:
# Keep only the rows whose ward_level is "High".
df_ward = df.loc[df["ward_level"].eq("High")]

In [None]:
# Show the 10 rows with the highest length of stay
df_ward.sort_values("length_of_stay", ascending=False).iloc[:10]

In [None]:
# Disabled: filter out the rows with the 3 largest n_opnduur values (n_opnduur is the legacy column name for length_of_stay)
#df_ward = df_ward[df_ward["n_opnduur"]<df_ward["n_opnduur"].nlargest(3).min()]

In [None]:
# Reason-for-admission indicator columns.
admission_criteria = [
    'gestational_age_indicator',
    'birth_weight_indicator',
    'cardiovascular_indicator',
    'jaundice_indicator',
    'hypoglycemia_indicator',
    'infection_indicator',
    'asphyxia_indicator',
    'seizure_indicator',
    'feeding_indicator',
    'congenital_abnormality_indicator',
    'withdrawal_indicator',
    'maternal_medication_indicator',
    'psycho_indicator',
    'postIC_indicator',
    'others_indicator',
]

# Columns kept for the analysis. Every name appears exactly once: a repeated
# name would produce two identically-labelled columns when this list is used to
# select from a DataFrame, and frame["gestational_age"] would then return a
# DataFrame instead of a Series, breaking the boolean filters built from it.
relevant_criteria = [
    # patient / birth characteristics
    'number_of_children',
    'gestational_age',
    'sex',
    'birth_weight',
    'c_section',
    'stay_number',
    # target variable
    'length_of_stay',
    # treatments and support days
    'thrombocytopenia_treatment',
    'antibiotics_treatment',
    'hypothermia',
    'antenatal_steroids_treatment',
    'phototherapy',
    'polycythemia_treatment',
    'sepsis_treatment',
    'anemia_treatment',
    'oxygen_days',
    'hfo_days',
    'cpap_days',
    # admission indicators, shared with admission_criteria to keep both lists in sync
    *admission_criteria,
    # administrative columns
    'month',
    'ward',
    'ward_level',
]

In [None]:
# Restrict the High-care rows to the columns listed in relevant_criteria.
df_filtered = df_ward.loc[:, relevant_criteria]

In [None]:

    
# Correlation of each feature with length_of_stay for the High-care subset.
# The ward columns are dropped before correlating.
df_filtered_corr = df_filtered.drop(columns=["ward","ward_level"])

# Correlate numeric/boolean columns only: DataFrame.corr() raises on
# non-numeric columns in pandas >= 2.0, and older versions silently ignored
# them, so selecting the dtypes explicitly behaves the same everywhere.
corr_matrix = (
    df_filtered_corr
    .select_dtypes(include=["number", "bool"])
    .corr()[['length_of_stay']]
)
print(corr_matrix)

fig, ax = plt.subplots(figsize=(8,6))
sns.heatmap(corr_matrix,annot=True,cmap='coolwarm',ax=ax)
ax.set_title('High')
plt.show()

In [None]:
# Summary statistics of the length of stay
df_filtered.loc[:, "length_of_stay"].describe()

In [None]:
fig, ax = plt.subplots(figsize=(8, 6), dpi=400)

# Line plot of length of stay against gestational age for the High-care rows
sns.lineplot(data=df_ward, y='length_of_stay', x='gestational_age', ax=ax)

# Axis labels. The thresholds applied to gestational_age elsewhere in this
# notebook (196, 259, 260) only make sense in days, so the axis is labelled in
# days rather than weeks.
ax.set_xlabel("Gestational Age (days)")
ax.set_ylabel("Length of Stay (days)")

ax.set_title("Length of Stay vs Gestational Age for High Care")

plt.show()

In [None]:
# Print the column dtypes and non-null counts of the selected columns
df_filtered.info()

In [None]:
gestational_age_cut = 196
second_cut = 259

# Work on a view without repeated column labels, so that selecting
# "gestational_age" always yields a single Series.
corr_source = df_filtered.loc[:, ~df_filtered.columns.duplicated()]

# The group masks are built BEFORE gestational_age is dropped; filtering on the
# column after dropping it raises a KeyError.
age = corr_source["gestational_age"]
age_groups = [
    age <= gestational_age_cut,                          # group 0: up to the first cut
    (age > gestational_age_cut) & (age <= second_cut),   # group 1: between the two cuts
    age > second_cut,                                    # group 2: above the second cut
]

# Columns left out of the per-group correlation matrices
excluded_columns = ["ward","ward_level",'gestational_age','month','number_of_children','stay_number']

for i, in_group in enumerate(age_groups):
    # Numeric/boolean columns only: DataFrame.corr() raises on non-numeric
    # columns in pandas >= 2.0.
    df_filtered_corr = (
        corr_source.loc[in_group]
        .drop(columns=excluded_columns)
        .select_dtypes(include=["number", "bool"])
    )

    print("Gestational age cut:",gestational_age_cut, i)
    corr_matrix = df_filtered_corr.corr()[['length_of_stay']]
    print(corr_matrix)

    fig, ax = plt.subplots(figsize=(8,6),dpi=400)
    sns.heatmap(corr_matrix,annot=True,cmap='coolwarm',ax=ax)
    ax.set_title('High Care')
    plt.show()
    # Release the figure so the loop does not accumulate open figures
    plt.close(fig)

In [None]:
# Replace every missing value with 0
df_filtered = df_filtered.fillna(value=0)

<h3>Ridge regression</h3>

In [None]:
# Placeholder column for the model predictions. It is created as float because
# the ridge predictions written into it later are floats; writing floats into
# an integer column relies on an implicit upcast that pandas has deprecated.
df_filtered['predicted_los'] = 0.0

In [None]:
ward = "High"
gestational_age_cut = 196
second_cut = 259

# Feature sets, one per gestational-age segment
# (<= first cut, between the two cuts, > second cut).
x_criteria_1 =["gestational_age",'phototherapy','anemia_treatment','oxygen_days','cpap_days',"antibiotics_treatment","others_indicator"]
x_criteria_2 = ["gestational_age",'birth_weight','oxygen_days','cpap_days','postIC_indicator','phototherapy','anemia_treatment']
x_criteria_3 =["gestational_age",'birth_weight','oxygen_days','cpap_days','postIC_indicator','antibiotics_treatment','sepsis_treatment','phototherapy','anemia_treatment','others_indicator']

# A predicted stay is never allowed to be shorter than the largest of these day counts.
support_columns = ['oxygen_days', 'hfo_days', 'cpap_days']


def fit_ridge_segment(frame, segment_mask, features, alpha=1):
    """Fit a ridge model of length_of_stay on one segment and store its predictions.

    The model is fitted on the rows of ``frame`` selected by ``segment_mask``
    and evaluated on those same rows (in-sample). Each prediction is raised to
    the row-wise maximum of ``support_columns`` when it falls below it, and then
    to 1 when it is below 1. The adjusted predictions are written to
    ``frame['predicted_los']`` for the segment rows only.

    Prints the coefficients, the R^2 of the adjusted predictions against the
    observed length of stay, and the intercept.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data containing ``features``, ``support_columns``, 'length_of_stay' and
        'predicted_los'. Modified in place.
    segment_mask : pandas.Series of bool
        Row selector, aligned with ``frame``.
    features : list of str
        Predictor column names.
    alpha : float, default 1
        Ridge regularisation strength.

    Returns
    -------
    sklearn.linear_model.Ridge or None
        The fitted model, or None when the segment contains no rows.
    """
    if not segment_mask.any():
        # Ridge.fit raises on zero samples; report and skip instead.
        print("No rows in this gestational-age segment; skipping fit")
        return None

    X = frame.loc[segment_mask, features]
    y = frame.loc[segment_mask, 'length_of_stay']
    model = Ridge(alpha=alpha, fit_intercept=True)
    model.fit(X, y)

    predicted = pd.Series(model.predict(X), index=X.index)
    lower_bound = frame.loc[segment_mask, support_columns].max(axis=1)
    predicted = predicted.mask(predicted < lower_bound, lower_bound)
    predicted = predicted.mask(predicted < 1, 1)
    # Positional assignment: the rows are in the same order as the mask selects them.
    frame.loc[segment_mask, 'predicted_los'] = predicted.to_numpy()

    print(pd.DataFrame(model.coef_, index=features, columns=['coefficients']))
    print(r2_score(y, predicted))
    print(model.intercept_)
    return model


# Drop repeated column labels so "gestational_age" is a single Series, and make
# sure the prediction column can hold float values.
df_filtered = df_filtered.loc[:, ~df_filtered.columns.duplicated()].copy()
df_filtered['predicted_los'] = df_filtered['predicted_los'].astype(float)

age = df_filtered['gestational_age']
segments = [
    (age <= gestational_age_cut, x_criteria_1),
    ((age > gestational_age_cut) & (age <= second_cut), x_criteria_2),
    (age > second_cut, x_criteria_3),
]
model_1, model_2, model_3 = [
    fit_ridge_segment(df_filtered, segment_mask, features)
    for segment_mask, features in segments
]

# Number of missing values left in predicted_los
print(df_filtered['predicted_los'].isna().sum())

fig, ax = plt.subplots(figsize=(8,6),dpi=400)
ax.set_title('Actual vs Predicted LoS for High Care')
sns.lineplot(data=df_filtered,x='gestational_age',y='predicted_los',label='predicted_los',ax=ax)
sns.lineplot(data=df_filtered,x='gestational_age',y='length_of_stay',label='actual_los',ax=ax)
ax.set_ylabel("LoS (days)")
# The cut points applied to gestational_age (196, 259) only make sense in days.
ax.set_xlabel("gestational age (days)")
ax.legend()
plt.show()


In [None]:
# Observed length of stay, all High-care rows
df_filtered.loc[:, "length_of_stay"].describe()

In [None]:
# Predicted length of stay, all High-care rows
df_filtered.loc[:, "predicted_los"].describe()

In [None]:
# Predicted length of stay, rows with gestational_age <= gestational_age_cut
df_filtered.loc[df_filtered["gestational_age"].le(gestational_age_cut), "predicted_los"].describe()

In [None]:
# Observed length of stay, rows with gestational_age <= gestational_age_cut
df_filtered.loc[df_filtered["gestational_age"].le(gestational_age_cut), "length_of_stay"].describe()

In [None]:
# Predicted length of stay, rows with gestational_age_cut < gestational_age <= second_cut
in_middle_segment = df_filtered["gestational_age"].gt(gestational_age_cut) & df_filtered["gestational_age"].le(second_cut)
df_filtered.loc[in_middle_segment, "predicted_los"].describe()

In [None]:
# Observed length of stay, rows with gestational_age_cut < gestational_age <= second_cut
in_middle_segment = df_filtered["gestational_age"].gt(gestational_age_cut) & df_filtered["gestational_age"].le(second_cut)
df_filtered.loc[in_middle_segment, "length_of_stay"].describe()

In [None]:
# Predicted length of stay, rows with gestational_age > second_cut
df_filtered.loc[df_filtered["gestational_age"].gt(second_cut), "predicted_los"].describe()

In [None]:
# Observed length of stay, rows with gestational_age > second_cut
df_filtered.loc[df_filtered["gestational_age"].gt(second_cut), "length_of_stay"].describe()