In [None]:
from google.colab import drive
import os

# Mount Google Drive
drive.mount('/content/drive')

# Path to the folder containing the original data extracts
data_folder = '/content/drive/MyDrive/BANA 650 - Healthcare Analytics/BANA 650 Healthcare Project/18 Datasets (Interpolation x Windows)'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
dataset_files = [f for f in os.listdir(data_folder) if f.startswith('sepsis_') and f.endswith('.csv')]
print(f"Found datasets: {dataset_files}")


Found datasets: ['sepsis_data_interp_0_window_12.csv', 'sepsis_data_interp_0_window_3.csv', 'sepsis_data_interp_0_window_6.csv', 'sepsis_data_interp_1_window_12.csv', 'sepsis_data_interp_1_window_3.csv', 'sepsis_data_interp_1_window_6.csv', 'sepsis_data_interp_2_window_12.csv', 'sepsis_data_interp_2_window_3.csv', 'sepsis_data_interp_2_window_6.csv', 'sepsis_data_interp_3_window_12.csv', 'sepsis_data_interp_3_window_3.csv', 'sepsis_data_interp_3_window_6.csv', 'sepsis_data_interp_4_window_12.csv', 'sepsis_data_interp_4_window_3.csv', 'sepsis_data_interp_4_window_6.csv', 'sepsis_data_interp_5_window_12.csv', 'sepsis_data_interp_5_window_3.csv', 'sepsis_data_interp_5_window_6.csv']


In [None]:
import pandas as pd

# Function to calculate SIRS criteria
def calculate_sirs_criteria(row):
    """
    Determine whether a row meets SIRS criteria.
    """
    temp = row['Temperature'] < 96.8 or row['Temperature'] > 100.4  # Temperature
    heart_rate = row['Heart Rate'] > 90                        # Heart Rate
    resp_rate = row['Respiratory Rate'] > 20                   # Respiratory Rate
    wbc = row['White Blood Cell Count'] < 4 or row['White Blood Cell Count'] > 12  # White Blood Cell Count

    # Count how many criteria are met
    criteria_met = sum([temp, heart_rate, resp_rate, wbc])

    return criteria_met >= 2

# Function to assign sepsis labels
def assign_sepsis_label(data):
    """
    Assign `sepsis_label` based on SIRS criteria and charttime.
    """
    data['sirs_criteria_met'] = data.apply(calculate_sirs_criteria, axis=1)

    # Add a default label for sepsis onset
    data['sepsis_label'] = 0

    # Process each `hadm_id` group
    for hadm_id, group in data.groupby('hadm_id'):
        # Sort rows by charttime for chronological order
        group = group.sort_values(by='charttime')

        # Identify the first timestamp where SIRS criteria are met
        first_sirs_idx = group[group['sirs_criteria_met'] == 1].index.min()

        if pd.notna(first_sirs_idx):
            data.loc[(data['hadm_id'] == hadm_id) & (data.index >= first_sirs_idx), 'sepsis_label'] = 1

    data = data.drop(columns=['sirs_criteria_met'])

    return data

for file in dataset_files:
    print(f"Processing {file}...")

    data = pd.read_csv(os.path.join(data_folder, file))

    data['charttime'] = pd.to_datetime(data['charttime'])

    pivoted_data = data.pivot_table(
        index=['hadm_id', 'charttime'],  # Group by patient admission and timestamp
        columns='parameter',            # Create columns for each parameter
        values='value',
        aggfunc='mean'
    ).reset_index()

    pivoted_data = pivoted_data.sort_values(by=['hadm_id', 'charttime'])

    def fill_missing_values(group):
        return group.ffill().bfill()

    pivoted_data = pivoted_data.groupby('hadm_id', group_keys=False).apply(fill_missing_values)
    assert 'hadm_id' in pivoted_data.columns, "Error: hadm_id is missing from the processed data."

    pivoted_data = assign_sepsis_label(pivoted_data)

    output_path = os.path.join(data_folder, f'labeled_{file}')
    pivoted_data.to_csv(output_path, index=False)
    print(f"Saved labeled data to {output_path}")


Processing sepsis_data_interp_0_window_12.csv...


  pivoted_data = pivoted_data.groupby('hadm_id', group_keys=False).apply(fill_missing_values)


Saved labeled data to /content/drive/MyDrive/BANA 650 - Healthcare Analytics/BANA 650 Healthcare Project/18 Datasets (Interpolation x Windows)/labeled_sepsis_data_interp_0_window_12.csv
Processing sepsis_data_interp_0_window_3.csv...


  pivoted_data = pivoted_data.groupby('hadm_id', group_keys=False).apply(fill_missing_values)


Saved labeled data to /content/drive/MyDrive/BANA 650 - Healthcare Analytics/BANA 650 Healthcare Project/18 Datasets (Interpolation x Windows)/labeled_sepsis_data_interp_0_window_3.csv
Processing sepsis_data_interp_0_window_6.csv...


  pivoted_data = pivoted_data.groupby('hadm_id', group_keys=False).apply(fill_missing_values)


Saved labeled data to /content/drive/MyDrive/BANA 650 - Healthcare Analytics/BANA 650 Healthcare Project/18 Datasets (Interpolation x Windows)/labeled_sepsis_data_interp_0_window_6.csv
Processing sepsis_data_interp_1_window_12.csv...


  pivoted_data = pivoted_data.groupby('hadm_id', group_keys=False).apply(fill_missing_values)


Saved labeled data to /content/drive/MyDrive/BANA 650 - Healthcare Analytics/BANA 650 Healthcare Project/18 Datasets (Interpolation x Windows)/labeled_sepsis_data_interp_1_window_12.csv
Processing sepsis_data_interp_1_window_3.csv...


  pivoted_data = pivoted_data.groupby('hadm_id', group_keys=False).apply(fill_missing_values)


Saved labeled data to /content/drive/MyDrive/BANA 650 - Healthcare Analytics/BANA 650 Healthcare Project/18 Datasets (Interpolation x Windows)/labeled_sepsis_data_interp_1_window_3.csv
Processing sepsis_data_interp_1_window_6.csv...


  pivoted_data = pivoted_data.groupby('hadm_id', group_keys=False).apply(fill_missing_values)


Saved labeled data to /content/drive/MyDrive/BANA 650 - Healthcare Analytics/BANA 650 Healthcare Project/18 Datasets (Interpolation x Windows)/labeled_sepsis_data_interp_1_window_6.csv
Processing sepsis_data_interp_2_window_12.csv...


  pivoted_data = pivoted_data.groupby('hadm_id', group_keys=False).apply(fill_missing_values)


Saved labeled data to /content/drive/MyDrive/BANA 650 - Healthcare Analytics/BANA 650 Healthcare Project/18 Datasets (Interpolation x Windows)/labeled_sepsis_data_interp_2_window_12.csv
Processing sepsis_data_interp_2_window_3.csv...


  pivoted_data = pivoted_data.groupby('hadm_id', group_keys=False).apply(fill_missing_values)


Saved labeled data to /content/drive/MyDrive/BANA 650 - Healthcare Analytics/BANA 650 Healthcare Project/18 Datasets (Interpolation x Windows)/labeled_sepsis_data_interp_2_window_3.csv
Processing sepsis_data_interp_2_window_6.csv...


  pivoted_data = pivoted_data.groupby('hadm_id', group_keys=False).apply(fill_missing_values)


Saved labeled data to /content/drive/MyDrive/BANA 650 - Healthcare Analytics/BANA 650 Healthcare Project/18 Datasets (Interpolation x Windows)/labeled_sepsis_data_interp_2_window_6.csv
Processing sepsis_data_interp_3_window_12.csv...


  pivoted_data = pivoted_data.groupby('hadm_id', group_keys=False).apply(fill_missing_values)


Saved labeled data to /content/drive/MyDrive/BANA 650 - Healthcare Analytics/BANA 650 Healthcare Project/18 Datasets (Interpolation x Windows)/labeled_sepsis_data_interp_3_window_12.csv
Processing sepsis_data_interp_3_window_3.csv...


  pivoted_data = pivoted_data.groupby('hadm_id', group_keys=False).apply(fill_missing_values)


Saved labeled data to /content/drive/MyDrive/BANA 650 - Healthcare Analytics/BANA 650 Healthcare Project/18 Datasets (Interpolation x Windows)/labeled_sepsis_data_interp_3_window_3.csv
Processing sepsis_data_interp_3_window_6.csv...


  pivoted_data = pivoted_data.groupby('hadm_id', group_keys=False).apply(fill_missing_values)


Saved labeled data to /content/drive/MyDrive/BANA 650 - Healthcare Analytics/BANA 650 Healthcare Project/18 Datasets (Interpolation x Windows)/labeled_sepsis_data_interp_3_window_6.csv
Processing sepsis_data_interp_4_window_12.csv...


  pivoted_data = pivoted_data.groupby('hadm_id', group_keys=False).apply(fill_missing_values)


Saved labeled data to /content/drive/MyDrive/BANA 650 - Healthcare Analytics/BANA 650 Healthcare Project/18 Datasets (Interpolation x Windows)/labeled_sepsis_data_interp_4_window_12.csv
Processing sepsis_data_interp_4_window_3.csv...


  pivoted_data = pivoted_data.groupby('hadm_id', group_keys=False).apply(fill_missing_values)


Saved labeled data to /content/drive/MyDrive/BANA 650 - Healthcare Analytics/BANA 650 Healthcare Project/18 Datasets (Interpolation x Windows)/labeled_sepsis_data_interp_4_window_3.csv
Processing sepsis_data_interp_4_window_6.csv...


  pivoted_data = pivoted_data.groupby('hadm_id', group_keys=False).apply(fill_missing_values)


Saved labeled data to /content/drive/MyDrive/BANA 650 - Healthcare Analytics/BANA 650 Healthcare Project/18 Datasets (Interpolation x Windows)/labeled_sepsis_data_interp_4_window_6.csv
Processing sepsis_data_interp_5_window_12.csv...


  pivoted_data = pivoted_data.groupby('hadm_id', group_keys=False).apply(fill_missing_values)


Saved labeled data to /content/drive/MyDrive/BANA 650 - Healthcare Analytics/BANA 650 Healthcare Project/18 Datasets (Interpolation x Windows)/labeled_sepsis_data_interp_5_window_12.csv
Processing sepsis_data_interp_5_window_3.csv...


  pivoted_data = pivoted_data.groupby('hadm_id', group_keys=False).apply(fill_missing_values)


Saved labeled data to /content/drive/MyDrive/BANA 650 - Healthcare Analytics/BANA 650 Healthcare Project/18 Datasets (Interpolation x Windows)/labeled_sepsis_data_interp_5_window_3.csv
Processing sepsis_data_interp_5_window_6.csv...


  pivoted_data = pivoted_data.groupby('hadm_id', group_keys=False).apply(fill_missing_values)


Saved labeled data to /content/drive/MyDrive/BANA 650 - Healthcare Analytics/BANA 650 Healthcare Project/18 Datasets (Interpolation x Windows)/labeled_sepsis_data_interp_5_window_6.csv


In [None]:
pivoted_data.head()

parameter,hadm_id,charttime,Blood Oxygen Saturation (SO₂),CO₂ Partial Pressure (PaCO₂),Diastolic Blood Pressure,Heart Rate,Respiratory Rate,Systolic Blood Pressure,Temperature,White Blood Cell Count,pH Value,sepsis_label
0,100028,2142-12-30 05:20:00,,,,,,,,11.5,,0
1,100074,2176-04-12 07:00:00,,,52.0,103.0,22.0,83.0,99.679997,,,1
2,100074,2176-04-12 07:15:00,,,52.0,105.0,22.0,82.0,99.679997,,,1
3,100074,2176-04-12 07:30:00,,,59.0,100.0,22.0,102.0,99.679997,,,1
4,100074,2176-04-12 08:00:00,,,51.0,104.0,22.0,80.0,99.860001,,,1


In [None]:
print(pivoted_data.isnull().sum())


parameter
hadm_id                              0
charttime                            0
Blood Oxygen Saturation (SO₂)     6723
CO₂ Partial Pressure (PaCO₂)     18935
Diastolic Blood Pressure          4212
Heart Rate                        3369
Respiratory Rate                  3392
Systolic Blood Pressure           4394
Temperature                       9242
White Blood Cell Count           18956
pH Value                         19378
sepsis_label                         0
dtype: int64


In [None]:
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import os

# List of processed files
processed_files = [f for f in os.listdir(data_folder) if f.startswith('labeled_') and f.endswith('.csv')]

for file in processed_files:
    print(f"Normalizing {file}...")

    data = pd.read_csv(os.path.join(data_folder, file))

    # Select columns to normalize (exclude metadata and `sepsis_label`)
    parameter_columns = data.columns.difference(['hadm_id', 'charttime', 'sepsis_label'])

    scaler = MinMaxScaler()

    # Normalize only the parameter columns
    data[parameter_columns] = scaler.fit_transform(data[parameter_columns])

    # Save the normalized dataset
    output_path = os.path.join(data_folder, f'normalized_{file}')
    data.to_csv(output_path, index=False)
    print(f"Saved normalized data to {output_path}")


Normalizing labeled_sepsis_data_interp_0_window_12.csv...
Saved normalized data to /content/drive/MyDrive/BANA 650 - Healthcare Analytics/BANA 650 Healthcare Project/18 Datasets (Interpolation x Windows)/normalized_labeled_sepsis_data_interp_0_window_12.csv
Normalizing labeled_sepsis_data_interp_0_window_3.csv...
Saved normalized data to /content/drive/MyDrive/BANA 650 - Healthcare Analytics/BANA 650 Healthcare Project/18 Datasets (Interpolation x Windows)/normalized_labeled_sepsis_data_interp_0_window_3.csv
Normalizing labeled_sepsis_data_interp_0_window_6.csv...
Saved normalized data to /content/drive/MyDrive/BANA 650 - Healthcare Analytics/BANA 650 Healthcare Project/18 Datasets (Interpolation x Windows)/normalized_labeled_sepsis_data_interp_0_window_6.csv
Normalizing labeled_sepsis_data_interp_1_window_12.csv...
Saved normalized data to /content/drive/MyDrive/BANA 650 - Healthcare Analytics/BANA 650 Healthcare Project/18 Datasets (Interpolation x Windows)/normalized_labeled_sepsis_

In [None]:
print(data[['hadm_id', 'charttime','sepsis_label']].head())


   hadm_id           charttime  sepsis_label
0   100028 2142-12-30 05:20:00             0
1   100074 2176-04-12 07:00:00             1
2   100074 2176-04-12 07:15:00             1
3   100074 2176-04-12 07:30:00             1
4   100074 2176-04-12 08:00:00             1


In [None]:
print(data[parameter_columns].head())


   Blood Oxygen Saturation (SO₂)  CO₂ Partial Pressure (PaCO₂)  \
0                            NaN                           NaN   
1                            NaN                           NaN   
2                            NaN                           NaN   
3                            NaN                           NaN   
4                            NaN                           NaN   

   Diastolic Blood Pressure  Heart Rate  Respiratory Rate  \
0                       NaN         NaN               NaN   
1                  0.448718    0.580357          0.446809   
2                  0.448718    0.598214          0.446809   
3                  0.538462    0.553571          0.446809   
4                  0.435897    0.589286          0.446809   

   Systolic Blood Pressure  Temperature  White Blood Cell Count  pH Value  
0                      NaN          NaN                0.393103       NaN  
1                 0.237705     0.625000                     NaN       NaN  
2       

In [1]:
!jupyter nbconvert --to html BANA650_Modeling_Prep_AC.ipynb

[NbConvertApp] Converting notebook BANA650_Modeling_Prep_AC.ipynb to html
[NbConvertApp] Writing 330940 bytes to BANA650_Modeling_Prep_AC.html
