# Sepsis Project: (main.ipynb)

# Data Import

In [2]:
## You can run this notebook also in your local VS-code ##
# =====================================
# 1) Setup: Detect Colab and set project folder
# =====================================

import sys
import os

# Detect if running in Colab
is_colab = 'google.colab' in sys.modules
print("Running in Colab?", is_colab)

# If in Colab, mount Drive and set path
if is_colab:
    from google.colab import drive
    drive.mount('/content/drive', force_remount=True)
    #PROJECT_PATH = '/content/drive/MyDrive/Deep Learning S25 Course Project'
    PROJECT_PATH = '/content/drive/MyDrive/Deep Learning S25 Course Project'
else:
    # Local dev: use current folder or adjust if needed
    PROJECT_PATH = os.getcwd()

# Change working directory
os.chdir(PROJECT_PATH)

# Add to sys.path for custom imports
if os.getcwd() not in sys.path:
    sys.path.append(os.getcwd())

# Confirm
print("Current working directory:", os.getcwd())
print("sys.path includes this folder:", os.getcwd() in sys.path)

# Confirm contents
print("\nFolder contents:")
for item in os.listdir():
    print("-", item)

print("\nData folder contents:")
print(os.listdir("data"))

print("\nData_Preparation folder contents:")
print(os.listdir("Data_Preparation"))


Running in Colab? True
Mounted at /content/drive
Current working directory: /content/drive/MyDrive/Deep Learning S25 Course Project
sys.path includes this folder: True

Folder contents:
- data
- Data_Preparation
- models
- results
- Model_Definitions
- .ipynb_checkpoints
- Training_Pipeline
- main.ipynb

Data folder contents:
['training_set_A.csv', 'training_set_B.csv', 'training_set_AB.csv', '.ipynb_checkpoints', 'cleaned', 'preprocessed']

Data_Preparation folder contents:
['__pycache__', '.ipynb_checkpoints', 'feature_normalization.py', 'eda.py', 'data_parsing.py', '__init__.py', 'label_generation_split.py']


In [3]:
# =====================================
# 2) Check what's in your data folder
# =====================================
print("Data folder files:", os.listdir("data"))


Data folder files: ['training_set_A.csv', 'training_set_B.csv', 'training_set_AB.csv', '.ipynb_checkpoints', 'cleaned', 'preprocessed']


In [None]:
#Installing requirements
%pip install -q \
    imbalanced-learn \
    imblearn \
    matplotlib \
    numpy \
    pandas \
    scikit-learn \
    seaborn \
    torch \
    tqdm \
    ipywidgets \
    notebook \
    joblib

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m122.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m86.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m35.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Whether data preprocessing step should be computed again
# If false, load previously saved preprocessed data
RECOMPUTE_DATA_PREPROCESSING = True

In [None]:
# =====================================
# 3) Data Import and structure
# =====================================
import pandas as pd

# Correct path: use 'data/' now
DATA_FILE_AB = 'data/training_set_AB.csv'
DATA_FILE_A = 'data/training_set_A.csv'
DATA_FILE_B = 'data/training_set_B.csv'


# training_set_A: Data from Hospital System A ========>  data_A (Use for training)
# training_set_B: Data from Hospital System B ========>  data_B (Use for validation or testing)
# training_set_AB: Combined Data from Hospital System A and B ========> (We probably won't use it) - We still can use it in EDA
data_AB = None
data_A = None
data_B = None

if RECOMPUTE_DATA_PREPROCESSING:
  data_AB = pd.read_csv(DATA_FILE_AB)
  data_A = pd.read_csv(DATA_FILE_A)
  data_B = pd.read_csv(DATA_FILE_B)


In [None]:
data_AB.head()

AttributeError: 'NoneType' object has no attribute 'head'

# Data Preparation

## Label generation & dataset splitting

In [None]:
# (Asal) Run Label generation & dataset splitting

#!ls -l
from Data_Preparation import(
  stratified_group_k_fold
)

splits = None

if RECOMPUTE_DATA_PREPROCESSING:
  splits = stratified_group_k_fold(data_AB, k=5)

  for fold, (train_idx, test_idx) in enumerate(splits):
      train_df = data_AB.iloc[train_idx]
      test_df = data_AB.iloc[test_idx]

      # Count positive labels
      train_pos = train_df['SepsisLabel'].sum()
      test_pos = test_df['SepsisLabel'].sum()

      # Count total labels
      train_total = len(train_df)
      test_total = len(test_df)

      print(f"\nFold {fold+1}")
      print(f"Train size: {train_total}, Positive cases: {train_pos} ({100 * train_pos / train_total:.2f}%)")
      print(f"Test   size: {test_total}, Positive cases: {test_pos} ({100 * test_pos / test_total:.2f}%)")

else:
  print("Skipping data preprocessing!")


Skipping data preprocessing!


## Missing Value Imputation

In [None]:
# (Mitch) Run Data parsing and Handling missing data codes here
from Data_Preparation import parse_and_clean_data
from pathlib import Path
from tqdm.notebook import tqdm
import pandas as pd

CLEANED_DATA_DIR = Path('data/cleaned/')

#Strategy for handling missing values
IMPUTE = 'impute'
MASK = 'mask'
MASK_IMPUTE = 'mask-impute'

MISSING_VAL_STRATEGY = IMPUTE # 'mask' 'impute' 'mask-impute'
STRATEGIES_TO_LOAD = [MISSING_VAL_STRATEGY] # add any other strategies of interest to load them into the data_dict

TRAIN = 'train'
TEST = 'test'
data_dict = {}

if RECOMPUTE_DATA_PREPROCESSING:
  print("Preprocessing data!")
  for fold, (train_idx, test_idx) in enumerate(splits):
    for strategy in STRATEGIES_TO_LOAD:
      train_df = data_AB.iloc[train_idx]
      test_df = data_AB.iloc[test_idx]
      train_df_clean = parse_and_clean_data(df=train_df, missing_values=strategy)
      test_df_clean = parse_and_clean_data(df=test_df, missing_values=strategy)
      data_dict[fold] = {}
      data_dict[fold][TRAIN] = {}
      data_dict[fold][TEST] = {}
      data_dict[fold][TRAIN][strategy] = train_df_clean
      data_dict[fold][TEST][strategy] = test_df_clean

      train_fname = "_".join((str(fold), TRAIN, strategy))
      train_fname = ".".join((train_fname, "csv"))
      train_df_clean.to_csv(CLEANED_DATA_DIR.joinpath(train_fname), index=False)

      test_fname = "_".join((str(fold), TEST, strategy))
      test_fname = ".".join((test_fname, "csv"))
      test_df_clean.to_csv(CLEANED_DATA_DIR.joinpath(test_fname), index=False)
else:
  print("Loading preprocessed data!")
  fpaths = list(CLEANED_DATA_DIR.glob("*.csv"))
  for p in tqdm(fpaths):
    fold_str, split_set, strategy = (p.name.split(".")[0]).split("_")
    if strategy in STRATEGIES_TO_LOAD:
      curr_df = pd.read_csv(p)
      fold = int(fold_str)
      if fold in data_dict.keys():
        if split_set in data_dict[fold].keys():
          data_dict[fold][split_set].update({strategy : curr_df})
        else:
          data_dict[fold][split_set] = {}
          data_dict[fold][split_set][strategy] = curr_df
      else:
        data_dict[fold] = {}
        data_dict[fold][split_set] = {}
        data_dict[fold][split_set][strategy] = curr_df


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipython-input-4-2655099803.py", line 2, in <cell line: 0>
    from Data_Preparation import parse_and_clean_data
  File "<frozen importlib._bootstrap>", line 1176, in _find_and_load
  File "<frozen importlib._bootstrap>", line 1138, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 1078, in _find_spec
  File "<frozen importlib._bootstrap_external>", line 1507, in find_spec
  File "<frozen importlib._bootstrap_external>", line 1476, in _get_spec
  File "<frozen importlib._bootstrap_external>", line 1434, in _path_importer_cache
OSError: [Errno 107] Transport endpoint is not connected

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshel

In [None]:
%pwd

%ls -l data/cleaned/

total 1511550
-rw------- 1 root root  62134403 Jun 30 04:04 0_test_impute.csv
-rw------- 1 root root 247082356 Jun 30 04:04 0_train_impute.csv
-rw------- 1 root root  61679344 Jun 30 04:15 1_test_impute.csv
-rw------- 1 root root 247900861 Jun 30 04:15 1_train_impute.csv
-rw------- 1 root root  61877235 Jun 30 04:26 2_test_impute.csv
-rw------- 1 root root 246878361 Jun 30 04:26 2_train_impute.csv
-rw------- 1 root root  61737822 Jun 30 04:37 3_test_impute.csv
-rw------- 1 root root 247794396 Jun 30 04:37 3_train_impute.csv
-rw------- 1 root root  62149065 Jun 30 04:47 4_test_impute.csv
-rw------- 1 root root 248590371 Jun 30 04:47 4_train_impute.csv


In [None]:
data_dict[0][TRAIN][MISSING_VAL_STRATEGY].head()

Unnamed: 0,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,BaseExcess,HCO3,...,Fibrinogen,Platelets,Age,Gender,Unit1,Unit2,HospAdmTime,ICULOS,SepsisLabel,patient_id
0,83.5,98.0,37.0,121.0,80.0,62.0,18.0,33.0,0.0,24.0,...,250.0,180.0,44.0,1,0.0,1.0,-2.41,1,0,108754
1,69.0,95.5,36.5,125.0,94.5,75.0,19.5,33.0,0.0,24.0,...,250.0,180.0,44.0,1,0.0,1.0,-2.41,2,0,108754
2,75.0,97.5,37.0,132.0,95.5,75.0,20.0,33.0,0.0,24.0,...,250.0,180.0,44.0,1,0.0,1.0,-2.41,3,0,108754
3,68.0,97.0,37.0,127.0,87.0,66.0,20.0,33.0,0.0,24.0,...,250.0,180.0,44.0,1,0.0,1.0,-2.41,4,0,108754
4,70.0,98.0,37.0,127.0,89.0,68.0,19.0,33.0,0.0,24.0,...,250.0,180.0,44.0,1,0.0,1.0,-2.41,5,0,108754


In [None]:
data_dict[0][TEST][MISSING_VAL_STRATEGY].head()

Unnamed: 0,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,BaseExcess,HCO3,...,Fibrinogen,Platelets,Age,Gender,Unit1,Unit2,HospAdmTime,ICULOS,SepsisLabel,patient_id
0,83.5,98.0,37.0,122.0,81.0,62.0,18.0,33.0,0.0,24.0,...,252.0,183.0,28.0,0,1.0,0.0,-61.34,1,0,105253
1,83.5,98.0,37.0,122.0,81.0,62.0,18.0,33.0,0.0,24.0,...,252.0,183.0,28.0,0,1.0,0.0,-61.34,2,0,105253
2,83.5,98.0,37.0,122.0,81.0,62.0,18.0,33.0,1.0,25.7,...,252.0,183.0,28.0,0,1.0,0.0,-61.34,3,0,105253
3,83.5,98.0,37.0,122.0,81.0,62.0,18.0,33.0,-1.6,23.7,...,252.0,183.0,28.0,0,1.0,0.0,-61.34,4,0,105253
4,83.5,98.0,37.0,122.0,81.0,62.0,18.0,33.0,-1.15,24.1,...,252.0,183.0,28.0,0,1.0,0.0,-61.34,5,0,105253


Create RNN sequences

In [None]:
def generate_rnn_sequences(df, feature_cols, label_col='SepsisLabel', group_col='patient_id', time_col='ICULOS', n=24):
    """
    For each patient, creates overlapping sequences of length `n` to predict the next label.

    Returns:
        sequences: list of (X_seq, y_next) pairs
    """
    sequences = []

    for pid, group in df.groupby(group_col):
        #group = group.sort_values(by='index' if 'index' in group.columns else group.index)
        group = group.sort_values(by='index' if 'index' in group.columns else time_col)
        X = group[feature_cols].values
        y = group[label_col].values

        if len(X) <= n:
            continue  # skip short sequences

        for i in range(len(X) - n):
            X_seq = X[i:i+n]        # shape: (n, D)
            y_next = y[i+n]         # scalar: label at t+n
            sequences.append((X_seq, y_next))

    return sequences


Feature Normalization and Addressing Class Imbalance

In [None]:
# (Aparna) Run Feature Normalization and Addressing Class Imbalance codes here
import pickle
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import numpy as np
import joblib

#SEQ_LEN = 24
ID_COL = 'patient_id'
TIME_COL = 'ICULOS'
LABEL_COL = 'SepsisLabel'

if True:

    for fold in range(5):
        print(f"\n=== Fold {fold} ===")
        train_df = data_dict[fold]['train']['impute'].copy()
        test_df = data_dict[fold]['test']['impute'].copy()
        print(f"Input: \nTrain shape: {train_df.shape}, Test shape: {test_df.shape}")

        #feature_cols = train_df.drop(columns=[ID_COL, TIME_COL, LABEL_COL]).columns
        col_mask = [ID_COL, TIME_COL, LABEL_COL]
        feature_cols = [x for x in train_df.columns.to_list() if not x in col_mask]
        scaler = StandardScaler()
        scaler.fit(train_df[feature_cols])

        # Apply to both train and test
        train_df[feature_cols] = scaler.transform(train_df[feature_cols])
        test_df[feature_cols] = scaler.transform(test_df[feature_cols])

        # Sequence
        train_sequences = generate_rnn_sequences(train_df, feature_cols)
        test_sequences = generate_rnn_sequences(test_df, feature_cols)

        # Extract arrays
        X_train = np.array([x for x, y in train_sequences])
        y_train = np.array([y for x, y in train_sequences])
        X_test = np.array([x for x, y in test_sequences])
        y_test = np.array([y for x, y in test_sequences])

        # SMOTE on train only
        X_flat = X_train.reshape(X_train.shape[0], -1)
        smote = SMOTE(random_state=42)

        X_res, y_res, *_ = smote.fit_resample(X_flat, y_train)


        X_train_bal = X_res.reshape(-1, X_train.shape[1], X_train.shape[2])

        print( "Output: \n",
            f"Train: {len(X_train_bal)} ({np.sum(y_res)} positives), "
            f"train_pos_rate: {float(np.mean(y_res))} ",
            f"Test: {len(X_test)} ({np.sum(y_test)} positives), ",
            )

        # Store
        preprocessed_data = {
            'X_train': X_train_bal,
            'y_train': y_res,
            'X_test': X_test,
            'y_test': y_test,
            #'scaler': scaler,  # store for inverse_transform or debugging
        }

        path = f'data/preprocessed/time_series_data_compressed_{fold}.pkl.z'
        print( f"Writing data to {path}: ")
        joblib.dump(preprocessed_data, path, compress=3)
        print( f"Completed!! ")

        del X_train, y_train, X_test, y_test, X_train_bal, X_res, y_res, X_flat, smote, preprocessed_data

preprocessed_folds = {}
for fold in range(5):
    print( f"Reading alreay processed data for fold {fold}: ")
    preprocessed_folds[fold] = joblib.load(f'data/preprocessed/time_series_data_compressed_{fold}.pkl.z')





=== Fold 0 ===
Input: 
Train shape: (1239169, 42), Test shape: (313041, 42)


In [None]:


fold0 = preprocessed_folds[0]
print(f"X_train shape: {fold0['X_train'].shape}") #(batch_size, sequence_length, num_features)
print(f"y_train shape: {fold0['y_train'].shape}")
print(f"X_test shape: {fold0['X_test'].shape}")
print(f"y_test shape: {fold0['y_test'].shape}")
# Print one sequence and its label
print("Example sequence (X_train[0]):")
print(fold0['X_train'][0])
print("Corresponding label (y_train[0]):")
print(fold0['y_train'][0])


Exploratory data analysis (EDA)

In [None]:
# Ehsan

!ls -l
############################################
###  Test Code Cell Please Don't Change  ###
############################################
# (Ehsan) Run Exploratory data analysis (EDA) codes here
# Lactate is the most relevant criteria then the rest of the plotted variables are most relevant
# 1. Serum Lactate
# 2. White Blood Cell Count (WBC)
# 3. Blood Urea Nitrogen (BUN) / Creatinine
# 4. Mean Arterial Pressure (MAP) / Systolic BP (SBP)
# 5. Heart Rate (HR) & Respiratory Rate (Resp)

from Data_Preparation import run_eda, run_comprehensive_eda
# Example:
#run_eda(data_A, ['HCO3','Lactate','WBC','BUN','MAP','HR','Resp'])
#run_eda(data_B, ['HCO3','Lactate','WBC','BUN','MAP','HR','Resp'])
#run_eda(data_A, ['Lactate','WBC'])
#run_eda(data_B, ['Lactate','WBC'])
############################################
######### A more comprehensive EDA #########
############################################
#run_comprehensive_eda
# 1) Missingness
# 2) Correlation heatmap (drop rows with any missing in features)
# 3) Boxplots for each feature by label
# 4) KDE overlays (all features in one grid)
# 5) PCA scatter
# Example:

# 1) Automatically select all feature columns except the ones to drop:
to_drop = ['SepsisLabel', 'patient_id', 'Unit1', 'Unit2', 'HospAdmTime']
all_features = [col for col in data_A.columns if col not in to_drop]

# 2) Quick sanity-check
print("Running EDA on:", all_features)

# 3) Call your comprehensive EDA (here we run all steps 1–5):
from Data_Preparation.eda import run_comprehensive_eda
run_comprehensive_eda(data_A, all_features, steps=[1,2])


##### Other examples
#run_comprehensive_eda(data_AB, ['HCO3','Lactate','WBC','BUN','MAP','HR','Resp','O2Sat','Temp','pH','PTT','Glucose','Chloride','Bilirubin_direct'], steps = [1,2])
#run_comprehensive_eda(data_A, ['HCO3','Lactate','WBC','BUN','MAP','HR','Resp'])
#run_comprehensive_eda(data_B, ['HCO3','Lactate','WBC','BUN','MAP','HR','Resp'])


from Data_Preparation.eda import corr_difference_analysis

features = [c for c in data_A.columns
            if c not in ('SepsisLabel','patient_id','Unit1','Unit2','HospAdmTime')]

diff_matrix, top_changes = corr_difference_analysis(
    data_A,
    features,
    min_count=50,   # only include features with ≥50 non‐null in each label
    top_k=15,
    figsize=(8,6)
)


In [None]:
# Ehsan
############################################
###      Ready to run Dataset_A EDA      ###
############################################
###  Please Don't Change  ###
from Data_Preparation import run_eda, run_comprehensive_eda, corr_difference_analysis

features = [c for c in data_A.columns
            if c not in ('SepsisLabel','patient_id','Unit1','Unit2','HospAdmTime')]
#run_comprehensive_eda(data_A)
run_comprehensive_eda(data_A, all_features, steps=[1,2])
#corr_difference_analysis(data_A)
diff_matrix, top_changes = corr_difference_analysis(
    data_A,
    features,
    min_count=50,   # only include features with ≥50 non‐null in each label
    top_k=15,
    figsize=(8,6)
)

In [None]:
# Ehsan
############################################
###      Ready to run Dataset_B EDA      ###
############################################
###  Please Don't Change  ###
from Data_Preparation import run_eda, run_comprehensive_eda, corr_difference_analysis

features = [c for c in data_B.columns
            if c not in ('SepsisLabel','patient_id','Unit1','Unit2','HospAdmTime')]
#run_comprehensive_eda(data_B)
run_comprehensive_eda(data_B, all_features, steps=[1,2])
#corr_difference_analysis(data_B)
diff_matrix, top_changes = corr_difference_analysis(
    data_B,
    features,
    min_count=50,   # only include features with ≥50 non‐null in each label
    top_k=15,
    figsize=(8,6)
)

In [None]:
# Ehsan
############################################
###      Ready to run Dataset_AB EDA     ###
############################################
###  Please Don't Change  ###
from Data_Preparation import run_eda, run_comprehensive_eda, corr_difference_analysis

features = [c for c in data_AB.columns
            if c not in ('SepsisLabel','patient_id','Unit1','Unit2','HospAdmTime')]
#run_comprehensive_eda(data_AB)
run_comprehensive_eda(data_AB, all_features, steps=[1,2])
#corr_difference_analysis(data_AB)
diff_matrix, top_changes = corr_difference_analysis(
    data_AB,
    features,
    min_count=50,   # only include features with ≥50 non‐null in each label
    top_k=15,
    figsize=(8,6)
)