In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor 
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler 
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [21]:
# Load hospital stay dataset
data = pd.read_csv('Hospital-LOS.csv')

# Define target variable (length of stay in days) and feature set
X = data.drop(columns=['Stay (in days)', 'patientid']) # Drop target and unnecessary identifier
y = data['Stay (in days)'] # This is the target variable (dependent variable)

# Identify categorical and numerical features of the dataset
categorical_cols = X.select_dtypes(include=['object']).columns # Categorical variables
numeric_cols = X.select_dtypes(include=['number']).columns # Numeric variables

# Preprocessing: Encode categorical variables and standardize numerical variables
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols), # One-hot encode categorical features
    ('num', StandardScaler(), numeric_cols) # Standardize numeric features
])

In [23]:
# # Split dataset into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [25]:
# Define a pipeline with preprocessing and Random Forest regression model
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])

# Train the model on the training data
model.fit(X_train, y_train)

# Evaluate model performance using standard regression metrics
y_pred = model.predict(X_test)
print("MAE:", mean_absolute_error(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))
print("R² Score:", r2_score(y_test, y_pred)) # R² score to show model accuracy

MAE: 1.2405153187399511
MSE: 3.4202028539614244
R² Score: 0.9447086063740399


In [29]:
# Define a new patient profile for prediction
new_patient = {
    'Available Extra Rooms in Hospital': 3,
    'Department': 'gynecology',
    'Ward_Facility_Code': 'C',
    'doctor_name': 'Dr. Oliva',
    'staff_available': 15,
    'Age': '31-40',
    'gender': 'Female',
    'Type of Admission': 'Emergency',
    'Severity of Illness': 'Extreme',
    'health_conditions': 'diabetes',
    'Visitors with Patient': 2,
    'Insurance': 'yes'
}


In [31]:
# Convert patient data into a DataFrame for model input
new_patient_df = pd.DataFrame([new_patient])

# Predict length of hospital stay for new patient
predicted_stay = model.predict(new_patient_df)

print("Predicted Length of Stay (in days):", predicted_stay[0])


Predicted Length of Stay (in days): 8.032666666666666


In [33]:
# Load updated dataset with additional co-morbidities
data = pd.read_csv('Hospital_LOS_with_Comorbidities.csv')

# Update target and feature set with new variables
X = data.drop(columns=['Stay (in days)', 'patientid'])  # Drop target and patient ID
y = data['Stay (in days)']  # Target variable

# Identify categorical and numeric features in the updated dataset
categorical_cols = X.select_dtypes(include=['object']).columns
numeric_cols = X.select_dtypes(include=['number']).columns

# Update preprocessor to handle additional features
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols), # Handle categorical variables
    ('num', StandardScaler(), numeric_cols) # Normalize numeric variables
])

# Split updated dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define an updated model pipeline
updated_model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])

# Train the updated model
updated_model.fit(X_train, y_train)

# Evaluate updated model performance
y_pred = model.predict(X_test)
print("MAE:", mean_absolute_error(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))
print("R² Score:", r2_score(y_test, y_pred))

MAE: 1.2405153187399511
MSE: 3.4202028539614244
R² Score: 0.9447086063740399


In [35]:
# Define a new patient profile with additional co-morbidities
new_patient = {
    'Available Extra Rooms in Hospital': 3,
    'Department': 'gynecology',
    'Ward_Facility_Code': 'C',
    'doctor_name': 'Dr. Oliva',
    'staff_available': 15,
    'Age': '31-40',
    'gender': 'Female',
    'Type of Admission': 'Emergency',
    'Severity of Illness': 'Extreme',
    'health_conditions': 'diabetes',
    'Visitors with Patient': 2,
    'Insurance': 'yes',
    'Septicemia': 0,
    'CHF': 0,
    'Pneumonia': 1,
    'COPD_Bronchiectasis': 0,
    'Cardiac_Dysrhythmias': 0,
    'Acute_Cerebrovascular_Disease': 1,
    'Acute_Renal_Failure': 0,
    'Skin_Infections': 0,
    'UTI': 1

}

In [37]:
# Convert patient data into DataFrame format for prediction
new_patient_df = pd.DataFrame([new_patient])

# Predict length of stay for new patient with co-morbidities
predicted_stay = model.predict(new_patient_df)

print("Predicted Length of Stay (in days):", predicted_stay[0])


Predicted Length of Stay (in days): 8.032666666666666


# This was the stopping point. I have not made any changes to the code, so it's still not factoring in co-morbidities. 

In [4]:
# Streamlit-based dashboard for interactive hospital stay predictions
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
from model import predict_study_length  # Example function from your model file

# Set up dashboard title
st.title("Length of Study Prediction Dashboard")

# File uploader for users to input CSV files
uploaded_file = st.file_uploader("Upload a CSV", type=["csv"])
if uploaded_file:
    df = pd.read_csv(uploaded_file)
    st.write(df.head())

    # Generate predictions for uploaded dataset
    predictions = predict_study_length(df)  # This should be a function from model.py
    df['Predictions'] = predictions
    st.write(df)

    # Display predictions using line chart
    st.line_chart(df['Predictions'])


2025-02-09 16:26:11.244 
  command:

    streamlit run /opt/anaconda3/lib/python3.12/site-packages/ipykernel_launcher.py [ARGUMENTS]


In [11]:
pip install 

SyntaxError: invalid syntax (3737097518.py, line 1)