# Sepsis Project: (main.ipynb)

# Setup

## Environment

In [None]:
## You can run this notebook also in your local VS-code ##
# =====================================
# 1) Setup: Detect Colab and set project folder
# =====================================

import sys
import os

# Colab is detected by the presence of the `google.colab` module in sys.modules.
is_colab = 'google.colab' in sys.modules
print("Running in Colab?", is_colab)

if not is_colab:
    # Local dev: the directory the kernel was started in is the project root.
    PROJECT_PATH = os.getcwd()
else:
    # Colab: mount Google Drive and point at the shared project folder.
    from google.colab import drive
    drive.mount('/content/drive', force_remount=True)
    PROJECT_PATH = '/content/drive/MyDrive/Deep Learning S25 Course Project'

# Work from the project root so the relative paths used below resolve.
os.chdir(PROJECT_PATH)

# Put the project root on sys.path (once) so the local packages can be imported.
if os.getcwd() not in sys.path:
    sys.path.append(os.getcwd())

# Report where we ended up.
print("Current working directory:", os.getcwd())
print("sys.path includes this folder:", os.getcwd() in sys.path)

# Show what is in the project root.
print("\nFolder contents:")
for item in os.listdir():
    print("-", item)

# Listing the expected sub-folders doubles as an existence check:
# os.listdir raises if either folder is missing.
for heading, folder in (("Data", "data"), ("Data_Preparation", "Data_Preparation")):
    print(f"\n{heading} folder contents:")
    print(os.listdir(folder))


In [None]:
# =====================================
# 2) Check what's in your data folder
# =====================================
# Lists the entries of the relative "data" folder; relies on `os` having been
# imported and the working directory having been set by the setup cell.
print("Data folder files:", os.listdir("data"))


In [None]:
#Installing requirements
# NOTE(review): no versions are pinned, so the packages that get resolved depend
# on when and where this cell is run.
# NOTE(review): both `imbalanced-learn` and `imblearn` are listed; presumably one
# of them is redundant - confirm before removing either.
%pip install -q \
    imbalanced-learn \
    imblearn \
    matplotlib \
    numpy \
    pandas \
    scikit-learn \
    seaborn \
    torch \
    tqdm \
    ipywidgets \
    notebook \
    joblib

# Checking if cuda is available
# The bare expression on the last line is what the cell displays as its output.
import torch
torch.cuda.is_available()

## Data Import

In [None]:
# Whether data preprocessing step should be computed again
# If false, load previously saved preprocessed data
# When True, the cells below read the raw CSVs, build the CV splits and rewrite
# the cleaned CSVs under data/cleaned/.
RECOMPUTE_DATA_PREPROCESSING = False
# Only consulted when RECOMPUTE_DATA_PREPROCESSING is False: when True, the
# cleaned CSVs found in data/cleaned/ are loaded into `data_dict`.
LOAD_CLEAN_DATA = False

# Whether to load raw data for EDA
# Each flag makes the import cell read the matching raw CSV and enables the
# matching EDA cell further down.
RUN_EDA_AB = False
RUN_EDA_A = False
RUN_EDA_B = False

In [None]:
# =====================================
# 3) Data Import and structure
# =====================================
import pandas as pd

# Raw CSV locations, relative to the project root
DATA_FILE_AB = 'data/raw/training_set_AB.csv'
DATA_FILE_A = 'data/raw/training_set_A.csv'
DATA_FILE_B = 'data/raw/training_set_B.csv'


# training_set_A: Data from Hospital System A ========>  data_A (Use for training)
# training_set_B: Data from Hospital System B ========>  data_B (Use for validation or testing)
# training_set_AB: Combined Data from Hospital System A and B ========> (We probably won't use it) - We still can use it in EDA
# Each frame stays None unless its flag (or a full recompute) asks for it,
# so later cells can test `is None` to see whether raw data was loaded.
data_AB = None
data_A = None
data_B = None

if RECOMPUTE_DATA_PREPROCESSING or RUN_EDA_AB:
  data_AB = pd.read_csv(DATA_FILE_AB)

if RECOMPUTE_DATA_PREPROCESSING or RUN_EDA_A:
  data_A = pd.read_csv(DATA_FILE_A)

if RECOMPUTE_DATA_PREPROCESSING or RUN_EDA_B:
  data_B = pd.read_csv(DATA_FILE_B)

# Drop sparse columns. `dropna(thresh=...)` takes the minimum number of NON-null
# values a column needs in order to be kept, so with a fraction of 0.2 a column
# survives if at least 20% of its rows are populated, i.e. columns that are
# more than 80% missing are dropped.
# NOTE(review): the previous comment here said "60%+ missingness"; the code has
# always used 0.2. Change MIN_NON_NULL_FRACTION to 0.4 if 60% was the intent.
MIN_NON_NULL_FRACTION = 0.2

row_ct = None
threshold = None
data_AB_cleaned = None
if data_AB is not None:
  row_ct = data_AB.shape[0]
  threshold = int(row_ct * MIN_NON_NULL_FRACTION)
  data_AB_cleaned = data_AB.dropna(axis=1, thresh=threshold)
  # The 3 subtracted columns are presumably patient_id, ICULOS and SepsisLabel
  # (the non-feature columns excluded later in this notebook) - TODO confirm.
  print(f'Kept {len(data_AB_cleaned.columns.to_list()) - 3} feature columns.')


In [None]:
# Preview the combined raw frame, or say so when it was not loaded.
data_AB.head() if data_AB is not None else 'Skipped loading raw data!'

# Data Preparation

## Label generation & dataset splitting

In [None]:
# (Asal) Run Label generation & dataset splitting

#!ls -l
from Data_Preparation import(
  stratified_group_k_fold
)

# Stays None unless preprocessing is recomputed; the imputation cell below
# iterates over it in that case.
splits = None

if RECOMPUTE_DATA_PREPROCESSING:
  splits = stratified_group_k_fold(data_AB_cleaned, k=5)

  # Report the class balance of every fold as a sanity check on the stratification.
  for fold, (train_idx, test_idx) in enumerate(splits):
      # Slice the same frame the splits were computed from (and that the
      # imputation cell slices), rather than the uncleaned `data_AB`.
      train_df = data_AB_cleaned.iloc[train_idx]
      test_df = data_AB_cleaned.iloc[test_idx]

      # Count positive labels
      train_pos = train_df['SepsisLabel'].sum()
      test_pos = test_df['SepsisLabel'].sum()

      # Count total labels
      train_total = len(train_df)
      test_total = len(test_df)

      print(f"\nFold {fold+1}")
      print(f"Train size: {train_total}, Positive cases: {train_pos} ({100 * train_pos / train_total:.2f}%)")
      print(f"Test   size: {test_total}, Positive cases: {test_pos} ({100 * test_pos / test_total:.2f}%)")

else:
  print("Skipping data preprocessing!")


## Missing Value Imputation

In [None]:
# (Mitch) Run Data parsing and Handling missing data codes here
from Data_Preparation import parse_and_clean_data
from pathlib import Path
from tqdm.notebook import tqdm
import pandas as pd

# Cleaned per-fold CSVs are written to / read from this folder.
CLEANED_DATA_DIR = Path('data/cleaned/')
CLEANED_DATA_DIR.mkdir(parents=True, exist_ok=True)

#Strategy for handling missing values
IMPUTE = 'impute'
MASK = 'mask'
MASK_IMPUTE = 'mask-impute'

MISSING_VAL_STRATEGY = IMPUTE # 'mask' 'impute' 'mask-impute'
STRATEGIES_TO_LOAD = [MISSING_VAL_STRATEGY] # add any other strategies of interest to load them into the data_dict

TRAIN = 'train'
TEST = 'test'

# Nested lookup: data_dict[fold][TRAIN or TEST][strategy] -> cleaned DataFrame
data_dict = {}

if RECOMPUTE_DATA_PREPROCESSING:
  print("Preprocessing data!")
  for fold, (train_idx, test_idx) in enumerate(splits):
    # The row slices do not depend on the strategy, so take them once per fold.
    train_df = data_AB_cleaned.iloc[train_idx]
    test_df = data_AB_cleaned.iloc[test_idx]

    for strategy in STRATEGIES_TO_LOAD:
      train_df_clean = parse_and_clean_data(df=train_df, missing_values=strategy)
      test_df_clean = parse_and_clean_data(df=test_df, missing_values=strategy)

      # setdefault keeps entries stored for earlier strategies of this fold;
      # re-creating data_dict[fold] here would discard all but the last strategy.
      fold_entry = data_dict.setdefault(fold, {})
      fold_entry.setdefault(TRAIN, {})[strategy] = train_df_clean
      fold_entry.setdefault(TEST, {})[strategy] = test_df_clean

      # File names follow "<fold>_<split>_<strategy>.csv"; the loader below
      # parses the same pattern.
      train_fname = f"{fold}_{TRAIN}_{strategy}.csv"
      train_df_clean.to_csv(CLEANED_DATA_DIR.joinpath(train_fname), index=False)

      test_fname = f"{fold}_{TEST}_{strategy}.csv"
      test_df_clean.to_csv(CLEANED_DATA_DIR.joinpath(test_fname), index=False)
elif LOAD_CLEAN_DATA:
  print("Loading preprocessed data!")
  fpaths = list(CLEANED_DATA_DIR.glob("*.csv"))
  for p in tqdm(fpaths):
    # "<fold>_<split>_<strategy>.csv" -> its three components
    fold_str, split_set, strategy = (p.name.split(".")[0]).split("_")
    if strategy in STRATEGIES_TO_LOAD:
      curr_df = pd.read_csv(p)
      fold = int(fold_str)
      data_dict.setdefault(fold, {}).setdefault(split_set, {})[strategy] = curr_df
else:
  print("Skipped clean data loading!")


In [None]:
# Debug helpers: show the current working directory and list the cleaned-data folder.
%pwd

%ls -l data/cleaned/

In [None]:
# Preview fold 0's cleaned training frame for the selected strategy (only when cleaned data was requested).
data_dict[0][TRAIN][MISSING_VAL_STRATEGY].head() if LOAD_CLEAN_DATA else 'Skipped clean data loading!'

In [None]:
# Preview fold 0's cleaned test frame for the selected strategy (only when cleaned data was requested).
data_dict[0][TEST][MISSING_VAL_STRATEGY].head() if LOAD_CLEAN_DATA else 'Skipped clean data loading!'

## Create RNN sequences

In [None]:
def generate_rnn_sequences(df, feature_cols, label_col='SepsisLabel', group_col='patient_id', time_col='ICULOS', n=3):
    """
    Build sliding-window training pairs, one patient at a time.

    Rows are grouped by `group_col`; each group is ordered by its 'index'
    column when one exists, otherwise by `time_col`. Every run of `n`
    consecutive rows becomes one input window, paired with the label of the
    row that immediately follows the window.

    Args:
        df: DataFrame holding `group_col`, `label_col`, the feature columns
            and the ordering column.
        feature_cols: columns that make up each time step of a window.
        label_col: column the target is read from.
        group_col: column identifying which rows belong together.
        time_col: ordering column used when `df` has no 'index' column.
        n: number of time steps per window.

    Returns:
        sequences: list of (X_seq, y_next) pairs, where X_seq is an array of
        shape (n, len(feature_cols)) and y_next is the label at the step right
        after the window. Groups with `n` rows or fewer contribute nothing.
    """
    sequences = []

    for _, patient_rows in df.groupby(group_col):
        order_key = 'index' if 'index' in patient_rows.columns else time_col
        ordered = patient_rows.sort_values(by=order_key)
        features = ordered[feature_cols].values
        labels = ordered[label_col].values

        # A window needs n rows plus one more row to supply the target.
        n_windows = len(features) - n
        if n_windows <= 0:
            continue

        sequences.extend(
            (features[start:start + n], labels[start + n])
            for start in range(n_windows)
        )

    return sequences


## Feature Normalization and Addressing Class Imbalance

In [None]:
# When True, the next cell rebuilds the per-fold tensor datasets under
# data/preprocessed/ from `data_dict`; when False that cell does nothing.
REGENERATE_FOLDS = False

In [None]:
# (Aparna) Run Feature Normalization and Addressing Class Imbalance codes here
import pickle
from pathlib import Path
import torch
from torch.utils.data import TensorDataset
from Data_Preparation import(
    train_validate_split,
    center,
    smote_oversample_to_tensor
)
import numpy as np
import joblib

#SEQ_LEN = 24
ID_COL = 'patient_id'
TIME_COL = 'ICULOS'
LABEL_COL = 'SepsisLabel'

PREPROCESSED_DATA_DIR = Path('data/preprocessed')
PREPROCESSED_DATA_DIR.mkdir(parents=True, exist_ok=True)


def _build_sequences(frame, feature_cols):
    """Window `frame` per patient using this cell's ID/TIME/LABEL column names."""
    return generate_rnn_sequences(
        frame,
        feature_cols,
        label_col=LABEL_COL,
        group_col=ID_COL,
        time_col=TIME_COL,
    )


def _stack_sequences(seqs):
    """Turn a list of (X_seq, y_next) pairs into a feature array and a label array."""
    return (
        np.array([x for x, _ in seqs]),
        np.array([y for _, y in seqs]),
    )


if REGENERATE_FOLDS:

    for fold in range(5):
        # Output layout: data/preprocessed/fold_<k>/{train,test,validate}/
        FOLD_DIR = PREPROCESSED_DATA_DIR.joinpath('fold_' + str(fold))
        TRAIN_DIR = FOLD_DIR.joinpath('train')
        TEST_DIR = FOLD_DIR.joinpath('test')
        VAL_DIR = FOLD_DIR.joinpath('validate')
        for split_dir in (TRAIN_DIR, TEST_DIR, VAL_DIR):
            split_dir.mkdir(parents=True, exist_ok=True)

        print(f"\n=== Fold {fold} ===")
        # Work on copies so the frames stored in data_dict are left untouched.
        train_df = data_dict[fold]['train']['impute'].copy()
        test_df = data_dict[fold]['test']['impute'].copy()
        print(f"Input: \nTrain shape: {train_df.shape}, Test shape: {test_df.shape}")

        # Everything except the id / time / label columns is a model feature.
        #feature_cols = train_df.drop(columns=[ID_COL, TIME_COL, LABEL_COL]).columns
        col_mask = [ID_COL, TIME_COL, LABEL_COL]
        feature_cols = [x for x in train_df.columns.to_list() if x not in col_mask]

        # Apply standard scaler to center the data
        # NOTE(review): `center` is called separately on the train and test frames,
        # and on the whole train frame before the inner train/validate split. If it
        # fits its statistics on whatever frame it receives, test and validation data
        # are scaled with their own statistics - confirm against Data_Preparation.
        train_df[feature_cols] = center(train_df, feature_cols)
        test_df[feature_cols] = center(test_df, feature_cols)

        # k fold split for train/validate splitting within train set
        inner_splits = train_validate_split(train_df)
        for i_fold, (train_idx, val_idx) in enumerate(inner_splits):
            train_seqs = _build_sequences(train_df.iloc[train_idx], feature_cols)
            val_seqs = _build_sequences(train_df.iloc[val_idx], feature_cols)

            # Only the training windows are oversampled; validation keeps the
            # original class balance.
            X_train_tensor, y_train_tensor = smote_oversample_to_tensor(
                *_stack_sequences(train_seqs)
            )
            X_val, y_val = _stack_sequences(val_seqs)
            X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
            y_val_tensor = torch.tensor(y_val, dtype=torch.float32)

            # Free the Python-level sequence lists before writing to disk.
            del train_seqs, val_seqs, X_val, y_val

            train_path = TRAIN_DIR.joinpath(f'compressed_train_dataset_{i_fold}.pkl.z')
            val_path = VAL_DIR.joinpath(f'compressed_val_dataset_{i_fold}.pkl.z')

            print( f"Writing data to {train_path}: ")
            joblib.dump(TensorDataset(X_train_tensor, y_train_tensor), train_path, compress=3)

            print( f"Writing data to {val_path}: ")
            joblib.dump(TensorDataset(X_val_tensor, y_val_tensor), val_path, compress=3)

            del X_train_tensor, y_train_tensor, X_val_tensor, y_val_tensor


        # RNN Sequences for test set
        test_sequences = _build_sequences(test_df, feature_cols)
        X_test, y_test = _stack_sequences(test_sequences)
        X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
        y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

        fname = 'compressed_test_dataset.pkl.z'
        path = TEST_DIR.joinpath(fname)
        print( f"Writing data to {path}: ")
        joblib.dump(TensorDataset(X_test_tensor, y_test_tensor), path, compress=3)
        del test_sequences, X_test, y_test, X_test_tensor, y_test_tensor, test_df

        # Full (un-split) training set for this fold, oversampled like the inner folds.
        fname_train = 'compressed_train_dataset_full.pkl.z'
        train_path = TRAIN_DIR.joinpath(fname_train)
        train_seqs = _build_sequences(train_df, feature_cols)
        print( f"Writing data to {train_path}: ")
        joblib.dump(
            TensorDataset(*smote_oversample_to_tensor(*_stack_sequences(train_seqs))),
            train_path,
            compress=3
        )
        del train_seqs, train_df
        print('Completed!')

## Exploratory data analysis (EDA)

In [None]:
# Ehsan

# Debug helper: list the working directory.
!ls -l
############################################
###  Test Code Cell Please Don't Change  ###
############################################
# (Ehsan) Run Exploratory data analysis (EDA) codes here
# Lactate is the most relevant criteria then the rest of the plotted variables are most relevant
# 1. Serum Lactate
# 2. White Blood Cell Count (WBC)
# 3. Blood Urea Nitrogen (BUN) / Creatinine
# 4. Mean Arterial Pressure (MAP) / Systolic BP (SBP)
# 5. Heart Rate (HR) & Respiratory Rate (Resp)

from Data_Preparation import run_eda, run_comprehensive_eda
# Example:
#run_eda(data_A, ['HCO3','Lactate','WBC','BUN','MAP','HR','Resp'])
#run_eda(data_B, ['HCO3','Lactate','WBC','BUN','MAP','HR','Resp'])
#run_eda(data_A, ['Lactate','WBC'])
#run_eda(data_B, ['Lactate','WBC'])
############################################
######### A more comprehensive EDA #########
############################################
#run_comprehensive_eda
# 1) Missingness
# 2) Correlation heatmap (drop rows with any missing in features)
# 3) Boxplots for each feature by label
# 4) KDE overlays (all features in one grid)
# 5) PCA scatter
# Example:
if RUN_EDA_A:
    # 1) Automatically select all feature columns except the ones to drop:
    to_drop = ['SepsisLabel', 'patient_id', 'Unit1', 'Unit2', 'HospAdmTime']
    all_features = [col for col in data_A.columns if col not in to_drop]

    # 2) Quick sanity-check
    print("Running EDA on:", all_features)

    # 3) Call your comprehensive EDA (only steps 1 and 2 are requested here):
    from Data_Preparation.eda import run_comprehensive_eda
    run_comprehensive_eda(data_A, all_features, steps=[1,2])


##### Other examples
#run_comprehensive_eda(data_AB, ['HCO3','Lactate','WBC','BUN','MAP','HR','Resp','O2Sat','Temp','pH','PTT','Glucose','Chloride','Bilirubin_direct'], steps = [1,2])
#run_comprehensive_eda(data_A, ['HCO3','Lactate','WBC','BUN','MAP','HR','Resp'])
#run_comprehensive_eda(data_B, ['HCO3','Lactate','WBC','BUN','MAP','HR','Resp'])


# This import runs regardless of RUN_EDA_A; only the analysis below is gated.
from Data_Preparation.eda import corr_difference_analysis
if RUN_EDA_A:
    # Same exclusion list as `to_drop` above, rebuilt here as `features`.
    features = [c for c in data_A.columns
                if c not in ('SepsisLabel','patient_id','Unit1','Unit2','HospAdmTime')]

    diff_matrix, top_changes = corr_difference_analysis(
        data_A,
        features,
        min_count=50,   # only include features with >=50 non-null in each label
        top_k=15,
        figsize=(8,6)
    )


In [None]:
# Ehsan
############################################
###      Ready to run Dataset_A EDA      ###
############################################
###  Please Don't Change  ###
from Data_Preparation import run_eda, run_comprehensive_eda, corr_difference_analysis
if RUN_EDA_A:
    # Every column except the label, the patient id and the admission metadata.
    features = [c for c in data_A.columns
                if c not in ('SepsisLabel','patient_id','Unit1','Unit2','HospAdmTime')]
    #run_comprehensive_eda(data_A)
    # Use the list built in this cell instead of `all_features`, which only
    # exists if the test cell above happened to run first.
    run_comprehensive_eda(data_A, features, steps=[1,2])
    #corr_difference_analysis(data_A)
    diff_matrix, top_changes = corr_difference_analysis(
        data_A,
        features,
        min_count=50,   # only include features with >=50 non-null in each label
        top_k=15,
        figsize=(8,6)
    )

In [None]:
# Ehsan
############################################
###      Ready to run Dataset_B EDA      ###
############################################
###  Please Don't Change  ###
from Data_Preparation import run_eda, run_comprehensive_eda, corr_difference_analysis
if RUN_EDA_B:
    # Every column except the label, the patient id and the admission metadata.
    features = [c for c in data_B.columns
                if c not in ('SepsisLabel','patient_id','Unit1','Unit2','HospAdmTime')]
    #run_comprehensive_eda(data_B)
    # Use the list built from data_B in this cell. `all_features` is derived from
    # data_A and is only defined when RUN_EDA_A is True, so relying on it here
    # raises NameError (or uses the wrong columns) when only RUN_EDA_B is set.
    run_comprehensive_eda(data_B, features, steps=[1,2])
    #corr_difference_analysis(data_B)
    diff_matrix, top_changes = corr_difference_analysis(
        data_B,
        features,
        min_count=50,   # only include features with >=50 non-null in each label
        top_k=15,
        figsize=(8,6)
    )

In [None]:
# Ehsan
############################################
###      Ready to run Dataset_AB EDA     ###
############################################
###  Please Don't Change  ###
from Data_Preparation import run_eda, run_comprehensive_eda, corr_difference_analysis
if RUN_EDA_AB:
    # Every column except the label, the patient id and the admission metadata.
    features = [c for c in data_AB.columns
                if c not in ('SepsisLabel','patient_id','Unit1','Unit2','HospAdmTime')]
    #run_comprehensive_eda(data_AB)
    # Use the list built from data_AB in this cell. `all_features` is derived from
    # data_A and is only defined when RUN_EDA_A is True, so relying on it here
    # raises NameError (or uses the wrong columns) when only RUN_EDA_AB is set.
    run_comprehensive_eda(data_AB, features, steps=[1,2])
    #corr_difference_analysis(data_AB)
    diff_matrix, top_changes = corr_difference_analysis(
        data_AB,
        features,
        min_count=50,   # only include features with >=50 non-null in each label
        top_k=15,
        figsize=(8,6)
    )

# Baseline Models

## LSTM

In [3]:
#### APARNA ####
# Trains the baseline SepsisLSTM on the pre-processed AB split and prints its
# evaluation summary.
import os
import json
from Model_Definitions.lstm import (
    SepsisLSTM,
    SepsisTransformerDataset,
    train_eval_lstm,
    SepsisTransformerResult
)
import torch.nn as nn

# Define arguments (replace with your actual paths)

# NOTE(review): the Transformer data-preparation cell in this notebook writes its
# per-split output (including data.json) under data/preprocessed_transformer/<split>/.
# Confirm that data/preprocessed/AB is really where this cell's inputs live.
processed_data_dir="data/preprocessed/AB"
raw_data="data/raw/training_set_AB.csv"


# Load metadata and extract number of features
metadata_file = os.path.join(processed_data_dir, 'data.json')
with open(metadata_file, 'r') as f:
    metadata = json.load(f)
num_features = metadata['num_features']

# Prepare datasets
# Both datasets are built from a split sub-folder plus the raw CSV path.
train_ds = SepsisTransformerDataset(os.path.join(processed_data_dir, 'train'), raw_data)
test_ds = SepsisTransformerDataset(os.path.join(processed_data_dir, 'test'), raw_data)

# Initialize model, hyperparameters, and loss
model = SepsisLSTM(input_size=num_features)
train_params = {
    'batch_size': 16,
    'num_epochs': 10,
    'learning_rate': 1e-4
}
# BCEWithLogitsLoss applies the sigmoid itself, so the model is presumably
# expected to output raw logits - TODO confirm against SepsisLSTM.
criterion = nn.BCEWithLogitsLoss()

# Train and evaluate
# `test_ds` is passed as the evaluation set; the "val" figures in the printed
# log are therefore presumably computed on it - TODO confirm in train_eval_lstm.
print(f"\n--- Training LSTM  ---\n")
(
    epochs, train_loss, val_loss,
    fpr, tpr,
    prec, rec,
    threshold, f1, p, r,
    cm, auroc, auprc,
    sensitivity, specificity
) = train_eval_lstm(model, criterion, train_ds, test_ds, train_params)

# Print result
result = SepsisTransformerResult(threshold, f1, p, r, cm, auroc, auprc, sensitivity, specificity)
print(result)



--- Training LSTM  ---

Epoch  1 | train loss 0.2448 | val loss 0.2196 | val AUROC 0.7012
Epoch  2 | train loss 0.2144 | val loss 0.2140 | val AUROC 0.7266
Epoch  3 | train loss 0.2095 | val loss 0.2150 | val AUROC 0.7349
Epoch  4 | train loss 0.2042 | val loss 0.2126 | val AUROC 0.7390
Epoch  5 | train loss 0.1992 | val loss 0.2121 | val AUROC 0.7438
Epoch  6 | train loss 0.1952 | val loss 0.2118 | val AUROC 0.7447
Epoch  7 | train loss 0.1909 | val loss 0.2138 | val AUROC 0.7351
Epoch  8 | train loss 0.1871 | val loss 0.2156 | val AUROC 0.7341
Epoch  9 | train loss 0.1820 | val loss 0.2283 | val AUROC 0.7095
Epoch 10 | train loss 0.1784 | val loss 0.2212 | val AUROC 0.7155
╔══════════════════════════════════════════════╗
║          Sepsis LSTM Results               ║
╠══════════════════════════════════════════════╣
  Best Threshold (max F1) : 0.1312481015920639
  F1 Score                : 0.23625730994152047
  Precision (at max F1)   : 0.16543816543816545
  Recall (at max F1)      :

# Advanced Models

## GRU-D

Ehsan

In [None]:
######## EHSAN #########

# Test GRU-D preprocessing end-to-end without interactive plotting

import os
import sys
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

# Use a non-interactive backend to avoid kernel crashes
matplotlib.use('Agg')

# Add Data_Preparation folder to module search path (once, so that re-running
# the cell does not keep appending duplicates)
if './Data_Preparation' not in sys.path:
    sys.path.append('./Data_Preparation')
from grud_preprocessing import generate_grud_input

# Load data
raw_path = "data/raw/training_set_A.csv"
if not os.path.exists(raw_path):
    raise FileNotFoundError(f"File not found: {raw_path}")
df = pd.read_csv(raw_path)

# Select a single patient (the one owning the first row of the file) and put
# that patient's rows in time order, exactly as the batch-export cell does
# before calling generate_grud_input.
patient_id = df['patient_id'].iloc[0]
patient_df = (
    df[df['patient_id'] == patient_id]
    .sort_values('ICULOS')
    .reset_index(drop=True)
)

# Define feature columns (exclude metadata and labels)
exclude_cols = [
    'Age', 'Gender', 'Unit1', 'Unit2',
    'HospAdmTime', 'ICULOS', 'SepsisLabel', 'patient_id'
]
feature_cols = [col for col in patient_df.columns if col not in exclude_cols]

# Generate GRU-D inputs
X, M, Delta = generate_grud_input(patient_df, features=feature_cols)

# Print shapes and example values
print("GRU-D input shapes:")
print(f"  X shape   : {X.shape}")
print(f"  Mask shape: {M.shape}")
print(f"  Delta shape: {Delta.shape}")
print("\nSample mask values (first row, first 5 features):", M[0, :5])
print("Sample delta values (first row, first 5 features):", Delta[0, :5])

# Save the missingness mask plot to a file
# M is transposed for display; the axis labels below assume its rows are time
# steps and its columns are features - TODO confirm against generate_grud_input.
fig = plt.figure(figsize=(12, 6))
plt.imshow(M.T, aspect='auto', cmap='gray_r')
plt.colorbar(label='Mask Value')
plt.title(f'Missingness Mask for Patient {patient_id}')
plt.xlabel('Time Step')
plt.ylabel('Feature Index')
plt.tight_layout()
plt.savefig('grud_mask_preview.png')
# Release the figure; with a non-interactive backend nothing else closes it.
plt.close(fig)
print('Mask plot saved to grud_mask_preview.png')


In [None]:
######## EHSAN #########

# Generate GRU-D inputs for datasets A, B, and AB and save to separate folders

import os
import sys
import pandas as pd
import numpy as np
# NOTE(review): this rebinds the notebook-global `tqdm`, which an earlier cell
# imported from tqdm.notebook.
from tqdm import tqdm
import matplotlib

# Use non-interactive backend
matplotlib.use('Agg')

# Add preprocessing module to path
# NOTE(review): re-running this cell appends the same entry to sys.path again.
sys.path.append('./Data_Preparation')
from grud_preprocessing import generate_grud_input

# Map dataset identifiers to their CSV paths
datasets = {
    'A':  'data/raw/training_set_A.csv',
    'B':  'data/raw/training_set_B.csv',
    'AB': 'data/raw/training_set_AB.csv'
}

# Columns to exclude from features
exclude_cols = [
    'Age', 'Gender', 'Unit1', 'Unit2',
    'HospAdmTime', 'ICULOS', 'SepsisLabel', 'patient_id'
]

# Output layout: data/preprocessed_grud/<set name>/patient_<id>.npz
base_output_dir = "data/preprocessed_grud/"
os.makedirs(base_output_dir, exist_ok=True)

for set_name, csv_path in datasets.items():
    # Fail early with a clear message rather than inside pandas.
    if not os.path.exists(csv_path):
        raise FileNotFoundError(f"File not found: {csv_path}")
    df = pd.read_csv(csv_path)
    feature_cols = [c for c in df.columns if c not in exclude_cols]
    output_dir = os.path.join(base_output_dir, set_name)
    os.makedirs(output_dir, exist_ok=True)

    for patient_id, patient_df in tqdm(df.groupby('patient_id'),
                                       desc=f"Processing set {set_name}"):
        # Put each patient's rows in time order before building the inputs.
        patient_df = patient_df.sort_values('ICULOS').reset_index(drop=True)
        X, M, Delta = generate_grud_input(patient_df, features=feature_cols)
        # One compressed archive per patient, holding the three arrays under
        # the keys X, M and Delta.
        out_file = os.path.join(output_dir, f"patient_{patient_id}.npz")
        np.savez_compressed(out_file, X=X, M=M, Delta=Delta)


In [None]:
######## EHSAN #########

import importlib.util
import sys
import os

# 1) Manually load train_grud.py as a standalone module named "train_grud"
#    and register it in sys.modules so a plain `from train_grud import ...`
#    resolves to it.
train_grud_path = os.path.abspath("Training_Pipeline/train_grud.py")
spec_tg = importlib.util.spec_from_file_location("train_grud", train_grud_path)
train_grud = importlib.util.module_from_spec(spec_tg)
sys.modules["train_grud"] = train_grud
spec_tg.loader.exec_module(train_grud)

# 2) Read grud_pipeline.py, rewrite its import to use our train_grud module
pipeline_path = os.path.abspath("Training_Pipeline/grud_pipeline.py")
# Read through a context manager so the file handle is closed, and decode as
# UTF-8 (Python's default source encoding) instead of the platform default.
with open(pipeline_path, "r", encoding="utf-8") as pipeline_file:
    source = pipeline_file.read()
# Replace the package import with our standalone module.
# str.replace leaves `source` untouched if the exact line is not present.
source = source.replace(
    "from Training_Pipeline.train_grud import SepsisGrudDataset, GRUDModel",
    "from train_grud import SepsisGrudDataset, GRUDModel"
)

# 3) Execute the modified pipeline code in its own module namespace
spec_gp = importlib.util.spec_from_loader("grud_pipeline", loader=None)
grud_pipeline = importlib.util.module_from_spec(spec_gp)
# Compiling with the real path makes tracebacks point at grud_pipeline.py
# rather than at "<string>".
# NOTE(review): this executes file contents as code; only run it against a
# trusted checkout of the project.
exec(compile(source, pipeline_path, "exec"), grud_pipeline.__dict__)

# Now pull out the function
run_experiments = grud_pipeline.run_experiments


In [None]:
######## EHSAN #########
# clean notebook cell
# Calls `run_experiments`, which the previous cell pulls out of
# Training_Pipeline/grud_pipeline.py, for the three dataset splits.
# NOTE(review): epochs=2 looks like a smoke-test setting - confirm before
# reporting results from this run.
run_experiments(
    splits=['A','B','AB'],
    results_base='results/GRU_D',
    epochs=2,
    batch_size=32,
    lr=1e-3
)

## Transformer

### Data Preparation

In [1]:
#### MITCH
# Builds the per-patient Transformer inputs for datasets A, B and AB, and writes
# them (plus a small metadata file) under data/preprocessed_transformer/<set>/.
import os
import json
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib
from Data_Preparation import generate_transformer_input, stratified_group_k_fold, center

# Use non-interactive backend
matplotlib.use('Agg')

# Map dataset identifiers to their CSV paths
datasets = {
    'A':  'data/raw/training_set_A.csv',
    'B':  'data/raw/training_set_B.csv',
    'AB': 'data/raw/training_set_AB.csv'
}

# Columns to exclude from features
exclude_cols = [
    'Age', 'Gender', 'Unit1', 'Unit2', 'EtCO2',
    'HospAdmTime', 'ICULOS', 'SepsisLabel', 'patient_id'
]

# Calculate the percentage of missing values for each column
# concatenated_df = pd.read_csv(datasets['AB'])
# percent_missing = concatenated_df.isnull().sum() * 100 / len(concatenated_df)
# high_missingness_cols = concatenated_df.columns[percent_missing > 95]
# print(f'Columns with lots of missing data: {high_missingness_cols}')
# low_missingness_cols = concatenated_df.columns[percent_missing <= 95]
# print(f'Keeping the following columns:\n{low_missingness_cols}')


base_output_dir = "data/preprocessed_transformer/"
os.makedirs(base_output_dir, exist_ok=True)

for set_name, csv_path in datasets.items():
    # Fail early with a clear message rather than inside pandas.
    if not os.path.exists(csv_path):
        raise FileNotFoundError(f"File not found: {csv_path}")
    df = pd.read_csv(csv_path)
    #feature_cols = [c for c in df.columns if (c not in exclude_cols) and (c not in high_missingness_cols)]
    feature_cols = [c for c in df.columns if c not in exclude_cols]
    output_dir = os.path.join(base_output_dir, set_name)
    os.makedirs(output_dir, exist_ok=True)

    # initialize metadata dict
    data_metadata_dict = {}
    data_metadata_fname = 'data.json'

    # Split into train and test
    # Only the first of the 5 folds is used: it supplies one train/test split.
    splits = stratified_group_k_fold(df, 5)
    train_idx, test_idx = splits[0]
    train_df = df.iloc[train_idx].copy()
    test_df = df.iloc[test_idx].copy()

    # Apply standard scaler for feature normalization
    # NOTE(review): `center` is called separately on the train and test frames; if
    # it fits its statistics on the frame it receives, the test set is scaled with
    # its own statistics rather than the training ones - confirm.
    train_df[feature_cols] = center(train_df, feature_cols)
    test_df[feature_cols] = center(test_df, feature_cols)

    # save class balance metadata
    # Prevalence here is the fraction of rows (not patients) whose SepsisLabel sums in.
    data_metadata_dict['train_sepsis_prevalence'] = train_df['SepsisLabel'].sum() / len(train_df)
    data_metadata_dict['test_sepsis_prevalence'] = test_df['SepsisLabel'].sum() / len(test_df)

    # make directories to save data
    train_output_dir = os.path.join(output_dir, 'train')
    test_output_dir = os.path.join(output_dir, 'test')
    os.makedirs(train_output_dir, exist_ok=True)
    os.makedirs(test_output_dir, exist_ok=True)
    # Column means of the training frame, taken AFTER `center` has been applied to
    # the feature columns; the same object is passed for both train and test patients.
    # NOTE(review): this averages every column of train_df, not just feature_cols.
    xmean_train = train_df.mean()

    # Tracks whether 'num_features' has been recorded yet.
    collected_feature_ct = False

    # generate inputs for train set
    for patient_id, patient_df in tqdm(train_df.groupby('patient_id'), desc=f"Processing train set {set_name}"):
        patient_df = patient_df.sort_values('ICULOS').reset_index(drop=True)
        X = generate_transformer_input(patient_df, xmean_train, features=feature_cols)
        # Patients for which no input is produced are skipped.
        if X is None:
            continue
        # Record the feature count once, from the first patient that yields an input.
        if not collected_feature_ct:
            data_metadata_dict['num_features'] = X.shape[-1]
            collected_feature_ct = True
        out_file = os.path.join(train_output_dir, f"patient_{patient_id}.npz")
        np.savez_compressed(out_file, X=X)

    # generate inputs for test set
    for patient_id, patient_df in tqdm(test_df.groupby('patient_id'), desc=f"Processing test set {set_name}"):
        patient_df = patient_df.sort_values('ICULOS').reset_index(drop=True)
        X = generate_transformer_input(patient_df, xmean_train, features=feature_cols)
        if X is None:
            continue
        out_file = os.path.join(test_output_dir, f"patient_{patient_id}.npz")
        np.savez_compressed(out_file, X=X)

    # dump metadata to json file
    # Written per dataset, i.e. to data/preprocessed_transformer/<set_name>/data.json.
    with open(os.path.join(output_dir, data_metadata_fname), 'w') as f:
        json.dump(data_metadata_dict, f)

Processing train set A: 100%|██████████| 16267/16267 [00:47<00:00, 345.71it/s]
Processing test set A: 100%|██████████| 4069/4069 [00:11<00:00, 352.04it/s]
Processing train set B: 100%|██████████| 16000/16000 [00:43<00:00, 364.09it/s]
Processing test set B: 100%|██████████| 4000/4000 [00:11<00:00, 352.63it/s]
Processing train set AB: 100%|██████████| 32269/32269 [01:33<00:00, 343.53it/s]
Processing test set AB: 100%|██████████| 8067/8067 [00:23<00:00, 349.63it/s]


### Transformer Training and Evaluation

In [None]:
import json
import os
import torch
import torch.nn as nn

from Model_Definitions import (
    Sepsis_Predictor_Encoder,
    Sepsis_Predictor_Encoder_Hyperparameters
)

from Training_Pipeline import (
    SepsisTransformerDataset,
    SepsisTransformerResult,
    train_eval_transformer,
    Train_Hyperparameters
)

base_data_dir = "data/preprocessed_transformer/"
raw_data_dir = "data/raw"

# Which dataset split to train on: 'A', 'B' or 'AB'
split = 'A'
raw_csv_fname = 'training_set_' + split + '.csv'

train_dir = os.path.join(base_data_dir, split, 'train')
test_dir = os.path.join(base_data_dir, split, 'test')
raw_csv_path = os.path.join(raw_data_dir, raw_csv_fname)

# The data-preparation cell writes one data.json per split, inside the split's
# own folder, so the metadata has to be read from <base>/<split>/data.json.
metadata_file = os.path.join(base_data_dir, split, 'data.json')
with open(metadata_file, 'r') as f:
    metadata = json.load(f)
num_features = metadata['num_features']

train_ds = SepsisTransformerDataset(train_dir, raw_csv_path)
test_ds = SepsisTransformerDataset(test_dir, raw_csv_path)

# Encoder architecture settings
hyperparams = Sepsis_Predictor_Encoder_Hyperparameters(
    embedding_dim=64,
    feedforward_hidden_dim=128,
    n_heads=4,
    activation='relu',
    n_layers=6,
    dropout_p=0,
    pos_encoding_dropout_p=0
)

# Optimisation settings
train_params = Train_Hyperparameters(
    batch_size=16,
    num_epochs=10,
    learning_rate=1e-4
)

model = Sepsis_Predictor_Encoder(
    input_size=num_features,
    output_size=1,
    hyperparameters=hyperparams
)

# BCEWithLogitsLoss applies the sigmoid itself, so the model is presumably
# expected to emit a raw logit - TODO confirm against Sepsis_Predictor_Encoder.
criterion = nn.BCEWithLogitsLoss()

print('\n\n')
print(f'------ Starting Training: Split {split} ------\n')
print(f'{hyperparams}\n{train_params}\n')

# Unpack the evaluation tuple: loss curves, ROC/PR curve data, the best-threshold
# scores, the confusion matrix and the two area-under-curve metrics.
eval_result = train_eval_transformer(model, criterion, train_ds, test_ds, train_params)
loss_grid, roc_grid, prc_grid, best_thresh_scores, confusion_matrix, auroc, auprc = eval_result
epochs, train_loss, eval_loss = loss_grid
threshold, f1, precision, recall = best_thresh_scores

res = SepsisTransformerResult(
    best_thresh=threshold,
    f1=f1,
    precision=precision,
    recall=recall,
    confusion_matrix=confusion_matrix,
    auroc=auroc,
    auprc=auprc
)

print(f'\n{res}\n')