In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
df_SIRS_cleaned = pd.read_csv('//content/drive/MyDrive/BANA 650 - Healthcare Analytics/BANA 650 Healthcare Project/df_SIRS_cleaned.csv')

  df_SIRS_cleaned = pd.read_csv('//content/drive/MyDrive/BANA 650 - Healthcare Analytics/BANA 650 Healthcare Project/df_SIRS_cleaned.csv')


In [None]:
df_SIRS_cleaned.head()

Unnamed: 0,hadm_id,itemid,charttime,value,unit,icd9_code,short_title,subject_id,admittime,dischtime,age,parameter
0,157559,646,2129-08-02 08:00:00,96.0,,99592,Severe sepsis,6702,2129-04-18 14:51:00,2129-10-27 01:00:00,81,Blood Oxygen Saturation (SO₂)
1,129535,646,2138-07-09 14:00:00,98.0,,99591,Sepsis,22327,2138-07-04 15:25:00,2138-07-16 16:22:00,85,Blood Oxygen Saturation (SO₂)
2,137656,646,2120-12-24 15:00:00,100.0,,78552,Septic shock,31880,2120-12-22 19:55:00,2121-01-14 20:54:00,66,Blood Oxygen Saturation (SO₂)
3,139263,646,2164-04-04 01:00:00,100.0,,99592,Severe sepsis,21002,2164-04-03 15:49:00,2164-04-13 13:40:00,81,Blood Oxygen Saturation (SO₂)
4,147371,646,2156-12-25 21:00:00,100.0,,99592,Severe sepsis,18007,2156-12-18 22:47:00,2157-02-04 14:00:00,50,Blood Oxygen Saturation (SO₂)


### **Step 1: Ensure Data Types are Correct**

In [None]:
# Convert 'charttime' into datetime format
df_SIRS_cleaned['charttime'] = pd.to_datetime(df_SIRS_cleaned['charttime'])
df_SIRS_cleaned['admittime'] = pd.to_datetime(df_SIRS_cleaned['admittime'])
df_SIRS_cleaned['dischtime'] = pd.to_datetime(df_SIRS_cleaned['dischtime'])
print(df_SIRS_cleaned.dtypes)

hadm_id                 int64
itemid                  int64
charttime      datetime64[ns]
value                 float64
unit                   object
icd9_code               int64
short_title            object
subject_id              int64
admittime      datetime64[ns]
dischtime      datetime64[ns]
age                     int64
parameter              object
dtype: object


### **Step 2: Interpolation for Missing Values**

With the below chunk of code, we can see that there are nearly 19K missing values against the 'value' column. We will use similar interpolations methods used in the research paper to fill in missing values

In [None]:
# Count the number of missing values in each column
missing_values2 = df_SIRS_cleaned.isna().sum()
print(missing_values2)

hadm_id              0
itemid               0
charttime            0
value            25851
unit           4820948
icd9_code            0
short_title          0
subject_id           0
admittime            0
dischtime            0
age                  0
parameter            0
dtype: int64


In [None]:
def interpolate_data(data, interp_limit):
    """
    Interpolates missing values for the 'value' column
    while retaining all other columns.
    """
    # Ensure 'hadm_id' and 'itemid' exist in the DataFrame
    if 'hadm_id' not in data.columns or 'itemid' not in data.columns:
        raise ValueError("The required columns 'hadm_id' or 'itemid' are missing from the DataFrame.")

    # Create a copy to avoid modifying the original DataFrame directly
    data = data.copy()

    data['value'] = data['value'].replace(0, pd.NA)

    data['interpolated_value'] = data.groupby(['hadm_id', 'itemid'], group_keys=False)['value'].apply(
        lambda group: group.interpolate(method='linear', limit=interp_limit)
    )

    data['interpolated_value'] = data['interpolated_value'].combine_first(data['value'])

    data['value'] = data['interpolated_value']
    data = data.drop(columns='interpolated_value')

    return data


In [None]:
# Check that interpolate_data function works properly with an interpolation limit of 2
interpolated_data = interpolate_data(df_SIRS_cleaned, interp_limit=2)
print(interpolated_data.head())

  lambda group: group.interpolate(method='linear', limit=interp_limit)


   hadm_id  itemid           charttime  value unit  icd9_code    short_title  \
0   157559     646 2129-08-02 08:00:00   96.0  NaN      99592  Severe sepsis   
1   129535     646 2138-07-09 14:00:00   98.0  NaN      99591         Sepsis   
2   137656     646 2120-12-24 15:00:00  100.0  NaN      78552   Septic shock   
3   139263     646 2164-04-04 01:00:00  100.0  NaN      99592  Severe sepsis   
4   147371     646 2156-12-25 21:00:00  100.0  NaN      99592  Severe sepsis   

   subject_id           admittime           dischtime  age  \
0        6702 2129-04-18 14:51:00 2129-10-27 01:00:00   81   
1       22327 2138-07-04 15:25:00 2138-07-16 16:22:00   85   
2       31880 2120-12-22 19:55:00 2121-01-14 20:54:00   66   
3       21002 2164-04-03 15:49:00 2164-04-13 13:40:00   81   
4       18007 2156-12-18 22:47:00 2157-02-04 14:00:00   50   

                       parameter  
0  Blood Oxygen Saturation (SO₂)  
1  Blood Oxygen Saturation (SO₂)  
2  Blood Oxygen Saturation (SO₂)  
3  Blo

In [None]:
#Check that the new function handles missing values based on interpolation
missing_values = interpolated_data.isna().sum()
print(missing_values)

hadm_id              0
itemid               0
charttime            0
value           216633
unit           4820948
icd9_code            0
short_title          0
subject_id           0
admittime            0
dischtime            0
age                  0
parameter            0
dtype: int64


### **Step 3: Filter by Prediction Windows (3, 6, 12 Hours)**

We’ll now filter the data to keep only measurements that occur within a specific window before the last recorded charttime (which we treat as a proxy for sepsis onset). This function filters data to keep only the measurements within the last 3, 6, or 12 hours before the predicted sepsis onset.

In [None]:
# Create a function to filter data to keep only the measurements within the last 3, 6 or 12 hrs before the predicted sepsis onset
def filter_prediction_window(data, hours):
    """
    Filters data within the specified prediction window.
    """
    onset_times = data.groupby('hadm_id')['charttime'].max().reset_index()
    onset_times['window_start'] = onset_times['charttime'] - pd.Timedelta(hours=hours)

    filtered_data = pd.merge(data, onset_times[['hadm_id', 'window_start']], on='hadm_id')
    return filtered_data[filtered_data['charttime'] >= filtered_data['window_start']]

In [None]:
# Define interpolation levels and prediction windows
interpolations = [0, 1, 2, 3, 4, 5]
prediction_windows = [3, 6, 12]

processed_datasets = {}

# Loop through each interpolation level and prediction window
for interp in interpolations:
    if interp == 0:
        # Level 0: No interpolation, keep original missing values
        interp_data = df_SIRS_cleaned.copy()
    else:
        # Levels 1-5: Apply interpolation with the specified limit
        interp_data = interpolate_data(df_SIRS_cleaned, interp_limit=interp)

    missing_values = interp_data['value'].isnull().sum()
    print(f"Interpolation Level {interp}: Remaining missing values in 'value' column after initial interpolation = {missing_values}")

    # Apply forward and backward filling within each group after interpolation
    interp_data['value'] = (
        interp_data.groupby(['hadm_id', 'itemid'])['value']
        .apply(lambda group: group.ffill().bfill())
        .reset_index(level=[0, 1], drop=True)
    )

    missing_values_after_fill = interp_data['value'].isnull().sum()
    print(f"Interpolation Level {interp}: Remaining missing values in 'value' column after forward/backward fill = {missing_values_after_fill}")

    for window in prediction_windows:
        final_data = filter_prediction_window(interp_data, window)

        dataset_key = f'interp_{interp}_window_{window}'
        processed_datasets[dataset_key] = final_data

        # Save each dataset as a CSV file
        filename = f'sepsis_data_interp_{interp}_window_{window}.csv'
        final_data.to_csv(filename, index=False)
        print(f"Saved dataset: {filename} with {len(final_data)} rows")


Interpolation Level 0: Remaining missing values in 'value' column after initial interpolation = 25851
Interpolation Level 0: Remaining missing values in 'value' column after forward/backward fill = 16630
Saved dataset: sepsis_data_interp_0_window_3.csv with 83616 rows
Saved dataset: sepsis_data_interp_0_window_6.csv with 148180 rows
Saved dataset: sepsis_data_interp_0_window_12.csv with 277178 rows


  lambda group: group.interpolate(method='linear', limit=interp_limit)


Interpolation Level 1: Remaining missing values in 'value' column after initial interpolation = 216633


  .apply(lambda group: group.ffill().bfill())


Interpolation Level 1: Remaining missing values in 'value' column after forward/backward fill = 19795
Saved dataset: sepsis_data_interp_1_window_3.csv with 83616 rows
Saved dataset: sepsis_data_interp_1_window_6.csv with 148180 rows
Saved dataset: sepsis_data_interp_1_window_12.csv with 277178 rows


  lambda group: group.interpolate(method='linear', limit=interp_limit)


Interpolation Level 2: Remaining missing values in 'value' column after initial interpolation = 216633


  .apply(lambda group: group.ffill().bfill())


Interpolation Level 2: Remaining missing values in 'value' column after forward/backward fill = 19795
Saved dataset: sepsis_data_interp_2_window_3.csv with 83616 rows
Saved dataset: sepsis_data_interp_2_window_6.csv with 148180 rows
Saved dataset: sepsis_data_interp_2_window_12.csv with 277178 rows


  lambda group: group.interpolate(method='linear', limit=interp_limit)


Interpolation Level 3: Remaining missing values in 'value' column after initial interpolation = 216633


  .apply(lambda group: group.ffill().bfill())


Interpolation Level 3: Remaining missing values in 'value' column after forward/backward fill = 19795
Saved dataset: sepsis_data_interp_3_window_3.csv with 83616 rows
Saved dataset: sepsis_data_interp_3_window_6.csv with 148180 rows
Saved dataset: sepsis_data_interp_3_window_12.csv with 277178 rows


  lambda group: group.interpolate(method='linear', limit=interp_limit)


Interpolation Level 4: Remaining missing values in 'value' column after initial interpolation = 216633


  .apply(lambda group: group.ffill().bfill())


Interpolation Level 4: Remaining missing values in 'value' column after forward/backward fill = 19795
Saved dataset: sepsis_data_interp_4_window_3.csv with 83616 rows
Saved dataset: sepsis_data_interp_4_window_6.csv with 148180 rows
Saved dataset: sepsis_data_interp_4_window_12.csv with 277178 rows


  lambda group: group.interpolate(method='linear', limit=interp_limit)


Interpolation Level 5: Remaining missing values in 'value' column after initial interpolation = 216633


  .apply(lambda group: group.ffill().bfill())


Interpolation Level 5: Remaining missing values in 'value' column after forward/backward fill = 19795
Saved dataset: sepsis_data_interp_5_window_3.csv with 83616 rows
Saved dataset: sepsis_data_interp_5_window_6.csv with 148180 rows
Saved dataset: sepsis_data_interp_5_window_12.csv with 277178 rows


In [2]:
!jupyter nbconvert --to html BANA650_Preprocessing2.ipynb

[NbConvertApp] Converting notebook BANA650_Preprocessing2.ipynb to html
[NbConvertApp] Writing 328976 bytes to BANA650_Preprocessing2.html
