In [None]:
import pandas as pd
import os

# --- This is the "Bytes" part ---

# Relative path to the gzipped patients table. The dataset folder name appears
# twice because the data sits in a nested folder of the same name.
file_path_to_patients = './mimic-iv-clinical-database-demo-2.2/mimic-iv-clinical-database-demo-2.2/hosp/patients.csv.gz' 

try:
    # Load the gzipped CSV into a DataFrame
    df_patients = pd.read_csv(file_path_to_patients)
except FileNotFoundError:
    # Report the absolute path so a wrong working directory is easy to spot
    print(f"Error: Could not find the file at {os.path.abspath(file_path_to_patients)}")
    print("Please double-check the folder and file name.")
else:
    # Preview the first 5 rows as a sanity check
    print("Successfully loaded patients.csv.gz:")
    print(df_patients.head())

In [None]:
import pandas as pd
import os

# --- This is the "Bytes" part (Step 2) ---

# Define the paths to the new files
# (They are in the same 'hosp' folder as patients.csv.gz)
base_path = './mimic-iv-clinical-database-demo-2.2/mimic-iv-clinical-database-demo-2.2/hosp/'
file_path_diagnoses = os.path.join(base_path, 'diagnoses_icd.csv.gz')
file_path_dictionary = os.path.join(base_path, 'd_icd_diagnoses.csv.gz')

# ICD codes are identifiers, not numbers. Left to type inference, a run of
# purely numeric codes can be parsed as integers (dropping leading zeros and
# producing a mixed int/str column), which makes the later code matching
# between the two tables miss rows. Reading the column as text in BOTH tables
# keeps the codes comparable.
icd_code_dtype = {'icd_code': str}

try:
    # Load the per-admission diagnoses
    df_diagnoses = pd.read_csv(file_path_diagnoses, dtype=icd_code_dtype)
    print("Successfully loaded diagnoses_icd.csv.gz:")
    print(df_diagnoses.head())
    print("\n" + "="*50 + "\n") # Adding a separator

    # Load the dictionary that maps each ICD code to its title
    df_icd_dictionary = pd.read_csv(file_path_dictionary, dtype=icd_code_dtype)
    print("Successfully loaded d_icd_diagnoses.csv.gz:")
    print(df_icd_dictionary.head())

except FileNotFoundError:
    # Either file may be the missing one, so show both resolved paths
    print(f"Error: Could not find one of the files.")
    print(f"Looked for: {os.path.abspath(file_path_diagnoses)}")
    print(f"And: {os.path.abspath(file_path_dictionary)}")

In [None]:
# --- This is the "Bytes" part (Step 3) ---

# Depends on df_icd_dictionary, which is created by the Step 2 cell.
# Re-run Step 2 first if this cell reports that it is missing.

try:
    # Flag every dictionary row whose 'long_title' mentions 'Sepsis'.
    # case=False ignores letter case; na=False treats missing titles as "no match".
    title_mentions_sepsis = df_icd_dictionary['long_title'].str.contains('Sepsis', case=False, na=False)
    df_sepsis_codes = df_icd_dictionary[title_mentions_sepsis]

    print("Found the following ICD codes for 'Sepsis':")
    print(df_sepsis_codes)

except NameError:
    print("Error: 'df_icd_dictionary' not found. Please re-run the previous cell (Step 2).")
except Exception as e:
    print(f"An error occurred: {e}")

In [None]:
# --- This is the "Bytes" part (Step 4) ---

# Depends on df_diagnoses (Step 2) and df_sepsis_codes (Step 3).
# Re-run those cells first if this cell reports a missing name.

try:
    # 1. Collect the sepsis ICD codes into a plain list
    sepsis_icd_code_list = df_sepsis_codes['icd_code'].tolist()

    print(f"Searching for patients with these {len(sepsis_icd_code_list)} codes: {sepsis_icd_code_list}")
    print(f"\n{'=' * 50}\n")

    # 2. Keep only the diagnosis rows whose code is one of the sepsis codes
    diagnosis_is_sepsis = df_diagnoses['icd_code'].isin(sepsis_icd_code_list)
    df_sepsis_diagnoses = df_diagnoses[diagnosis_is_sepsis]

    # 3. Report how many rows matched and preview them
    print(f"Found {len(df_sepsis_diagnoses)} sepsis diagnoses in the demo set.")
    print(df_sepsis_diagnoses.head())

except NameError:
    print("Error: 'df_sepsis_codes' or 'df_diagnoses' not found. Please re-run the previous cells (Step 2 and 3).")
except Exception as e:
    print(f"An error occurred: {e}")

In [None]:
import pandas as pd
import os

# --- This is the "Bytes" part (Step 5) ---

# Depends on df_sepsis_diagnoses from Step 4.
# Re-run all previous cells if this cell reports that it is missing.

# Path to the ICU stays table (under the 'icu' folder of the dataset)
base_path = './mimic-iv-clinical-database-demo-2.2/mimic-iv-clinical-database-demo-2.2/'
file_path_icustays = os.path.join(base_path, 'icu/icustays.csv.gz')

try:
    # 1. Read the ICU stays table
    df_icustays = pd.read_csv(file_path_icustays)

    # 2. Every distinct hospital admission that has at least one ICU stay row
    icu_hadm_values = df_icustays['hadm_id'].unique()
    all_icu_hadm_ids = set(icu_hadm_values)
    print(f"Total unique hospital admissions in ICU: {len(all_icu_hadm_ids)}")

    # 3. Every distinct hospital admission carrying a sepsis diagnosis row
    sepsis_hadm_values = df_sepsis_diagnoses['hadm_id'].unique()
    sepsis_hadm_ids = set(sepsis_hadm_values)
    print(f"Total unique hospital admissions with Sepsis: {len(sepsis_hadm_ids)}")

    # 4. The "Negative" group: ICU admissions that are absent from the sepsis set
    non_sepsis_hadm_ids = all_icu_hadm_ids.difference(sepsis_hadm_ids)
    print(f"Total unique hospital admissions without Sepsis: {len(non_sepsis_hadm_ids)}")

except NameError:
    print("Error: 'df_sepsis_diagnoses' not found. Please re-run all previous cells first.")
except FileNotFoundError:
    print(f"Error: Could not find icustays.csv.gz at {os.path.abspath(file_path_icustays)}")
except Exception as e:
    print(f"An error occurred: {e}")

In [None]:
import pandas as pd
import os

# --- This is the "Bytes" part (Step 6) ---

# Depends on 'all_icu_hadm_ids' from Step 5.
# Re-run all previous cells if this cell reports that it is missing.

# The two event tables live in different folders of the dataset
base_path_hosp = './mimic-iv-clinical-database-demo-2.2/mimic-iv-clinical-database-demo-2.2/hosp/'
base_path_icu = './mimic-iv-clinical-database-demo-2.2/mimic-iv-clinical-database-demo-2.2/icu/'

file_path_labs = os.path.join(base_path_hosp, 'labevents.csv.gz')
file_path_vitals = os.path.join(base_path_icu, 'chartevents.csv.gz')

try:
    # 1. Lab events: read everything, then keep rows for ICU admissions only
    print(f"Loading {file_path_labs}...")
    df_labevents_all = pd.read_csv(file_path_labs)
    print(f"Loaded {len(df_labevents_all)} lab events.")

    lab_row_is_icu = df_labevents_all['hadm_id'].isin(all_icu_hadm_ids)
    df_labevents = df_labevents_all[lab_row_is_icu]
    print(f"Filtered lab events to {len(df_labevents)} rows (our ICU patients).")
    print(df_labevents.head())
    print(f"\n{'=' * 50}\n")

    # 2. Chart events (vitals): same read-then-filter approach
    print(f"Loading {file_path_vitals}...")
    df_vitals_all = pd.read_csv(file_path_vitals)
    print(f"Loaded {len(df_vitals_all)} vital events.")

    vital_row_is_icu = df_vitals_all['hadm_id'].isin(all_icu_hadm_ids)
    df_vitals = df_vitals_all[vital_row_is_icu]
    print(f"Filtered vital events to {len(df_vitals)} rows (our ICU patients).")
    print(df_vitals.head())

except NameError:
    print("Error: 'all_icu_hadm_ids' not found. Please re-run all previous cells first (especially Step 5).")
except FileNotFoundError:
    # Either file may be the missing one, so show both resolved paths
    print("Error: Could not find one of the files.")
    print(f"Looked for: {os.path.abspath(file_path_labs)}")
    print(f"And: {os.path.abspath(file_path_vitals)}")
except Exception as e:
    print(f"An error occurred: {e}")

In [None]:
import pandas as pd
import os

# --- This is the "Bytes" part (Step 7) ---

# Paths to the two lookup ("dictionary") tables that describe itemids
base_path_hosp = './mimic-iv-clinical-database-demo-2.2/mimic-iv-clinical-database-demo-2.2/hosp/'
base_path_icu = './mimic-iv-clinical-database-demo-2.2/mimic-iv-clinical-database-demo-2.2/icu/'

file_path_lab_dict = os.path.join(base_path_hosp, 'd_labitems.csv.gz')
file_path_vitals_dict = os.path.join(base_path_icu, 'd_items.csv.gz')

try:
    # 1. Lab items dictionary
    df_lab_dict = pd.read_csv(file_path_lab_dict)
    print("Successfully loaded d_labitems.csv.gz:")
    print(df_lab_dict.head())
    print(f"\n{'=' * 50}\n")

    # 2. Chart/vitals items dictionary
    df_vitals_dict = pd.read_csv(file_path_vitals_dict)
    print("Successfully loaded d_items.csv.gz:")
    print(df_vitals_dict.head())

except FileNotFoundError:
    # Either file may be the missing one, so show both resolved paths
    print("Error: Could not find one of the dictionary files.")
    print(f"Looked for: {os.path.abspath(file_path_lab_dict)}")
    print(f"And: {os.path.abspath(file_path_vitals_dict)}")
except Exception as e:
    print(f"An error occurred: {e}")

In [None]:
# --- This is the "Bytes" part (Step 8) ---

# Depends on df_lab_dict and df_vitals_dict from Step 7.
# Re-run that cell first if this cell reports a missing name.

try:
    print("--- Finding Vitals itemids (from d_items.csv.gz) ---")

    # Keywords for the vital signs of interest
    vital_labels_to_find = [
        'Heart Rate',
        'Systolic',     # Systolic Blood Pressure
        'Diastolic',    # Diastolic Blood Pressure
        'Temperature',
        'Respiratory Rate',
    ]

    # Join the keywords into one alternation pattern ('Heart Rate|Systolic|...')
    vitals_search_pattern = '|'.join(vital_labels_to_find)

    # Keep dictionary rows whose 'label' matches any keyword (case-insensitive)
    vital_label_matches = df_vitals_dict['label'].str.contains(vitals_search_pattern, case=False, na=False)
    df_found_vitals = df_vitals_dict[vital_label_matches]

    print("Found the following vital sign itemids:")
    print(df_found_vitals[['itemid', 'label', 'category', 'unitname']])
    print(f"\n{'=' * 50}\n")

    print("--- Finding Lab itemids (from d_labitems.csv.gz) ---")

    # Keywords for the lab tests of interest
    lab_labels_to_find = [
        'Lactate',
        'White Blood Cell',  # WBC Count
    ]

    # Same alternation-pattern approach as for the vitals
    labs_search_pattern = '|'.join(lab_labels_to_find)

    # Keep dictionary rows whose 'label' matches any keyword (case-insensitive)
    lab_label_matches = df_lab_dict['label'].str.contains(labs_search_pattern, case=False, na=False)
    df_found_labs = df_lab_dict[lab_label_matches]

    print("Found the following lab itemids:")
    print(df_found_labs[['itemid', 'label', 'category', 'fluid']])

except NameError:
    print("Error: 'df_lab_dict' or 'df_vitals_dict' not found. Please re-run the previous cell (Step 7).")
except Exception as e:
    print(f"An error occurred: {e}")

In [None]:
# --- This is the "Bytes" part (Step 9) ---

# Depends on df_lab_dict and df_vitals_dict from Step 7.
# Re-run that cell first if this cell reports a missing name.

try:
    # 1. The EXACT vitals labels to keep
    vital_labels_we_want = [
        'Heart Rate',
        'Non Invasive Blood Pressure systolic',
        'Non Invasive Blood Pressure diastolic',
        'Respiratory Rate',
        'Temperature Fahrenheit',  # both temperature scales are kept here
        'Temperature Celsius',     # and unified in a later step
    ]

    # 2. The EXACT lab labels to keep
    lab_labels_we_want = [
        'Lactate',
        'White Blood Cells',
    ]

    # 3. Restrict each dictionary to rows whose label is an exact match
    vital_label_selected = df_vitals_dict['label'].isin(vital_labels_we_want)
    df_vitals_final = df_vitals_dict[vital_label_selected]
    lab_label_selected = df_lab_dict['label'].isin(lab_labels_we_want)
    df_labs_final = df_lab_dict[lab_label_selected]

    print("--- Final Vitals 'Shopping List' ---")
    print(df_vitals_final[['itemid', 'label', 'category', 'unitname']])
    print(f"\n{'=' * 50}\n")

    print("--- Final Labs 'Shopping List' ---")
    print(df_labs_final[['itemid', 'label', 'category', 'fluid']])

    # 4. Keep the selected itemids as plain lists; later steps filter the
    #    event tables down to exactly these ids
    vital_itemids_to_keep = df_vitals_final['itemid'].tolist()
    lab_itemids_to_keep = df_labs_final['itemid'].tolist()

    print(f"\nVital ItemIDs to keep: {vital_itemids_to_keep}")
    print(f"Lab ItemIDs to keep: {lab_itemids_to_keep}")

except NameError:
    print("Error: 'df_lab_dict' or 'df_vitals_dict' not found. Please re-run the cell from Step 7.")
except Exception as e:
    print(f"An error occurred: {e}")

In [None]:
# --- This is the "Bytes" part (Step 10) ---

# Depends on df_vitals and df_labevents (Step 6) and on the itemid lists
# vital_itemids_to_keep and lab_itemids_to_keep (Step 9).
try:
    # 1. Vitals: keep only rows whose itemid is on the vitals list
    vital_itemid_selected = df_vitals['itemid'].isin(vital_itemids_to_keep)
    df_vitals_features = df_vitals[vital_itemid_selected]

    print("--- Filtered Vitals Data ---")
    print(f"Original vitals size (from Step 6): {len(df_vitals)} rows")
    print(f"New filtered vitals size: {len(df_vitals_features)} rows")
    print(df_vitals_features.head())
    print(f"\n{'=' * 50}\n")

    # 2. Labs: keep only rows whose itemid is on the labs list
    lab_itemid_selected = df_labevents['itemid'].isin(lab_itemids_to_keep)
    df_lab_features = df_labevents[lab_itemid_selected]

    print("--- Filtered Labs Data ---")
    print(f"Original labs size (from Step 6): {len(df_labevents)} rows")
    print(f"New filtered labs size: {len(df_lab_features)} rows")
    print(df_lab_features.head())

except NameError:
    print("Error: One or more DataFrames (df_vitals, df_labevents) or lists (vital_itemids_to_keep, lab_itemids_to_keep) were not found.")
    print("Please re-run Step 5, Step 6, and Step 9 to define them.")
except Exception as e:
    print(f"An error occurred: {e}")

In [None]:
# --- This is the "Bytes" part (Step 11) ---
# This is Feature Engineering

# We need:
# df_icustays (from Step 5)
# df_patients (from Step 1)
# sepsis_hadm_ids (from Step 5)

try:
    # 1. Get one row per hospital admission (hadm_id)
    # An admission can have several ICU stays; keep the first row for each
    df_icu_patients = df_icustays.drop_duplicates(subset=['hadm_id'], keep='first')

    # 2. Merge with df_patients to add age and gender
    # This creates our base feature matrix.
    # validate='many_to_one' makes pandas raise if a subject_id appears more
    # than once in df_patients, instead of silently duplicating admissions.
    df_feature_matrix = df_icu_patients.merge(df_patients, on='subject_id', validate='many_to_one')
    
    # 3. Create the 'sepsis_label' (our target variable)
    # .isin() checks if the hadm_id is in our 'sepsis_hadm_ids' set
    # .astype(int) converts True/False to 1/0
    df_feature_matrix['sepsis_label'] = df_feature_matrix['hadm_id'].isin(sepsis_hadm_ids).astype(int)

    # 4. Clean up and show the result
    # We only keep the columns we need
    df_feature_matrix = df_feature_matrix[['subject_id', 'hadm_id', 'gender', 'anchor_age', 'sepsis_label']]
    
    print("--- Successfully Built Base Feature Matrix ---")
    print(df_feature_matrix.head())
    
    print("\n" + "="*50 + "\n")
    print("Checking our 1s and 0s (Sepsis vs. No Sepsis):")
    print(df_feature_matrix['sepsis_label'].value_counts())

except NameError:
    print("Error: One or more DataFrames (df_icustays, df_patients, sepsis_hadm_ids) were not found.")
    # The three inputs are defined in Step 1 and Step 5 only
    print("Please re-run Step 1 and Step 5 to define them.")
except Exception as e:
    print(f"An error occurred: {e}")

In [None]:
import pandas as pd
import os
import numpy as np # We'll need numpy for NaN

# --- This is the "Bytes" part (Step 12) ---

# Inputs from earlier cells:
#   df_feature_matrix   (Step 11)
#   df_vitals_features  (Step 10)
#   df_lab_features     (Step 10)
#   lab_itemids_to_keep (Step 9)

try:
    # 1. Put every temperature reading on the Fahrenheit scale.
    # Work on a copy so the Step 10 frame is left untouched.
    df_vitals_processed = df_vitals_features.copy()

    # Rows recorded under itemid 223762 are treated as Celsius readings
    celsius_rows = df_vitals_processed['itemid'].eq(223762)

    # F = (C * 9/5) + 32
    celsius_readings = df_vitals_processed.loc[celsius_rows, 'valuenum']
    df_vitals_processed.loc[celsius_rows, 'valuenum'] = (celsius_readings * 9/5) + 32

    # 2. Translate itemids into feature names.
    # Both temperature ids share one name because they now share one unit.
    vital_id_to_name_map = {
        220045: 'HEART_RATE',
        220179: 'BP_SYSTOLIC',
        220180: 'BP_DIASTOLIC',
        220210: 'RESP_RATE',
        223761: 'TEMP_F',
        223762: 'TEMP_F',
    }

    # Lab ids from Step 9: the two listed ids are lactate, anything else is WBC
    lactate_itemids = (50813, 52442)
    lab_id_to_name_map = {
        lab_itemid: ('LACTATE' if lab_itemid in lactate_itemids else 'WBC')
        for lab_itemid in lab_itemids_to_keep
    }

    df_vitals_processed['feature_name'] = df_vitals_processed['itemid'].map(vital_id_to_name_map)

    df_labs_processed = df_lab_features.copy()
    df_labs_processed['feature_name'] = df_labs_processed['itemid'].map(lab_id_to_name_map)

    # 3. Aggregate: maximum value per (admission, feature), pivoted so each
    # feature becomes its own column, with '_max' appended to the column name
    # (e.g. 'HEART_RATE' -> 'HEART_RATE_max').
    vitals_grouped = df_vitals_processed.groupby(['hadm_id', 'feature_name'])['valuenum']
    df_vitals_agg = vitals_grouped.max().unstack().add_suffix('_max')

    labs_grouped = df_labs_processed.groupby(['hadm_id', 'feature_name'])['valuenum']
    df_labs_agg = labs_grouped.max().unstack().add_suffix('_max')

    # 4. Attach the aggregates to the base table. Left joins keep every
    # admission, leaving NaN where an admission has no value for a feature.
    df_final_features = (
        df_feature_matrix
        .merge(df_vitals_agg, on='hadm_id', how='left')
        .merge(df_labs_agg, on='hadm_id', how='left')
    )

    print("--- Successfully Built FINAL Feature Matrix ---")
    print(df_final_features.head())

    print(f"\n{'=' * 50}\n")
    print("Checking for missing values (NaN):")
    # Count of missing entries per column
    print(df_final_features.isnull().sum())

except NameError as e:
    print(f"Error: A required DataFrame or list was not found. {e}")
    print("Please re-run all steps from Step 9, 10, and 11.")
except Exception as e:
    print(f"An error occurred: {e}")

In [None]:
import pandas as pd
import os
import numpy as np

# --- This is the "Bytes" part (Step 13) ---
# This is Data Pre-processing

# We need df_final_features from the previous step (Step 12)
try:
    # 1. Calculate the median for LACTATE_max
    # .median() skips NaNs by default, so only observed values are used
    median_lactate = df_final_features['LACTATE_max'].median()
    
    print(f"Calculated median for LACTATE_max: {median_lactate}")
    
    # 2. Fill the missing values
    # Assign the filled column back instead of calling
    # df[col].fillna(..., inplace=True): that chained in-place form acts on a
    # temporary object, triggers a FutureWarning on recent pandas, and leaves
    # the DataFrame unchanged when Copy-on-Write is enabled.
    df_final_features['LACTATE_max'] = df_final_features['LACTATE_max'].fillna(median_lactate)
    
    print("\n" + "="*50 + "\n")
    print("Checking for missing values after imputation:")
    print(df_final_features.isnull().sum())
    
    print("\n" + "="*50 + "\n")
    print("--- Final, Cleaned, AI-Ready Dataset ---")
    print(df_final_features.head())

except NameError:
    print("Error: 'df_final_features' not found.")
    print("Please re-run all steps from Step 12.")
except Exception as e:
    print(f"An error occurred: {e}")

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# --- This is the "AI" part (Step 14) ---

# Depends on df_final_features as left by Step 13
try:
    # 1. One-hot encode 'gender'. With drop_first=True the first category's
    # column is dropped, so a two-valued gender yields a single indicator column.
    df_model_data = pd.get_dummies(df_final_features, columns=['gender'], drop_first=True)

    # 2. Features (X) are everything except the identifiers and the answer;
    # the target (y) is the answer column alone.
    non_feature_columns = ['subject_id', 'hadm_id', 'sepsis_label']
    X = df_model_data.drop(columns=non_feature_columns)
    y = df_model_data['sepsis_label']

    # 3. Hold out 20% of the rows for testing; the fixed random_state makes
    # the split repeatable.
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.2,
        random_state=42,
    )

    print(f"Total patients: {len(X)}")
    print(f"Training patients: {len(X_train)}")
    print(f"Testing patients: {len(X_test)}")
    print(f"\n{'=' * 50}\n")

    # 4. Build the classifier (max_iter raised to 1000 to give the solver
    # more iterations) and fit it on the training rows
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)

    print("--- Model Trained Successfully! ---")

    # 5. Predict on the held-out rows and score the predictions
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    print(f"\n{'=' * 50}\n")
    print(f"MODEL ACCURACY ON TEST DATA: {accuracy * 100:.2f}%")

except NameError:
    print("Error: 'df_final_features' not found.")
    print("Please re-run all steps from Step 12 & 13.")
except Exception as e:
    print(f"An error occurred: {e}")

In [29]:
import joblib

# --- This is the "Bytes" part (Step 15) ---
# Persist the trained model to disk

# Depends on 'model' and 'X_train' from Step 14
try:
    # Write the fitted model to 'sepsis_model.pkl'
    joblib.dump(model, 'sepsis_model.pkl')

    # Also store the training column labels, so a consumer of the model can
    # rebuild its inputs with the same names in the same order
    feature_columns = X_train.columns
    joblib.dump(feature_columns, 'model_columns.pkl')

    print("--- Model Saved Successfully! ---")
    print("Model saved to: sepsis_model.pkl")
    print("Columns saved to: model_columns.pkl")

except NameError:
    print("Error: 'model' or 'X_train' not found.")
    print("Please re-run Step 14 to train the model and define X_train.")
except Exception as e:
    print(f"An error occurred: {e}")

--- Model Saved Successfully! ---
Model saved to: sepsis_model.pkl
Columns saved to: model_columns.pkl
