# Dataset preparation

Get the dataset and indices

In [None]:
import os
import sys

# Absolute path of the directory three levels above the current working
# directory; presumably the project root that holds the shared modules.
project_root = os.path.abspath(os.path.join(os.getcwd(), "../../.."))

# Re-running this cell must not keep growing sys.path with duplicate entries.
if project_root not in sys.path:
    sys.path.append(project_root)

### Data Preparation Based on Training Method

The training configuration file determines which dataset folder is created:

- If `training_method = LR`, then the `LR_data/` directory will be created.
- If `training_method = GRUD`, then the `GRUD_data/` directory will be created.

Make sure the `training_method` is correctly set in your config before running the data preparation step.


In [None]:
import yaml

# Read the training configuration; safe_load only constructs plain Python objects.
with open('train_config.yaml', 'r') as file:
    train_config = yaml.safe_load(file)
#TODO : fix here
# Any training_method other than 'LR' falls through to the GRUD data folder.
use_LR = train_config['train']['training_method'] == 'LR'
data_path = train_config['data']['data_dir']
train_frac = train_config['data']['f_train']
test_frac = train_config['data']['f_test']
early_stopping = train_config['data']['f_early_stop']

# Small tolerance so that fractions which sum to 1 only up to floating-point
# round-off are not rejected.
if train_frac + test_frac + early_stopping > 1 + 1e-9:
    raise ValueError("The sum of train_frac, test_frac, and early_stopping must be less or equal to 1.")

# os.path.join inserts the separator when data_dir lacks a trailing one; the
# final "" keeps `path` ending with a separator, as it did before.
path = os.path.join(data_path, "LR_data" if use_LR else "GRUD_data", "")

dataset_path = os.path.join(path, "dataset.pkl")
indices_path = os.path.join(path, "indices.pkl")

This function processes ICU patient data by filtering out stays with insufficient recorded hours and selecting only the first `WINDOW_SIZE` hours. It creates a binary target variable, `los_3`, indicating whether a patient stayed in the ICU for more than three days. The function returns the filtered time-series data and the corresponding target values.  


In [None]:
def filter_and_label_data(statics, data, window_size=24, gap_time=6, los_threshold_days=3):
    """
    Select eligible ICU stays, build the binary `los_3` label and truncate the time series.

    Args:
        statics (pd.DataFrame): Static per-stay table. Must provide the columns
            `max_hours` and `los_icu` and the index levels `subject_id` and `icustay_id`.
        data (pd.DataFrame): Time-series table whose index has the levels
            `subject_id`, `icustay_id` and `hours_in`.
        window_size (int): Number of initial hours kept per stay (rows with
            `hours_in < window_size`). Defaults to 24.
        gap_time (int): Buffer, in hours, added to `window_size` when deciding
            which stays are long enough. Defaults to 6.
        los_threshold_days (float): A stay is labelled 1.0 when `los_icu` is
            strictly greater than this value. Defaults to 3. The label column is
            always named `los_3`, whatever threshold is passed.

    Returns:
        data (pd.DataFrame): Rows of `data` belonging to eligible stays and to
            the first `window_size` hours.
        y (pd.DataFrame): Single float column `los_3`, indexed like the
            eligible rows of `statics`.

    Raises:
        AssertionError: If the subject IDs in the filtered data and in the
            labels are not the same set.

    Notes:
        - Only ICU stays with `max_hours > window_size + gap_time` are considered.
    """
    # Define target labels for stays that are long enough.
    eligible = statics[statics.max_hours > window_size + gap_time]
    y = (eligible[["los_icu"]] > los_threshold_days).astype(float)
    y = y.rename(columns={"los_icu": "los_3"})

    # Keep only ICU stays present in y and only the first `window_size` hours.
    stay_ids = data.index.get_level_values("icustay_id")
    hours = data.index.get_level_values("hours_in")
    keep = stay_ids.isin(y.index.get_level_values("icustay_id")) & (hours < window_size)
    data = data[keep]

    # Verify subject IDs match between data and labels.
    subj_ids_data = data.index.get_level_values("subject_id")
    subj_ids_y = y.index.get_level_values("subject_id")
    assert set(subj_ids_data) == set(subj_ids_y), "Subject ID pools differ!"

    return data, y

This function splits the data by subject into train and test sets.  
The function returns the input features and labels for both splits, avoiding information leakage.


In [None]:
def split_data_by_subject(data, y, train_frac, seed=1):
    """
    Splits preprocessed data and labels into training and holdout sets by subject ID.

    All rows of one subject end up on the same side of the split. The shuffle
    uses a private random generator, so the global NumPy random state is left
    untouched; the resulting permutation is the same one obtained by seeding
    the global generator with `seed`.

    Args:
        data (pd.DataFrame): Preprocessed time-series data with a MultiIndex
            containing a `subject_id` level.
        y (pd.DataFrame): Corresponding labels with a MultiIndex containing a
            `subject_id` level.
        train_frac (float): Fraction of subjects to assign to the training set
            (the subject count is truncated to an integer).
        seed (int): Random seed for reproducibility.

    Returns:
        train_data (pd.DataFrame): Training portion of the data.
        holdout_data (pd.DataFrame): Holdout (test) portion of the data.
        y_train (pd.DataFrame): Training labels.
        y_holdout (pd.DataFrame): Holdout labels.
    """
    subj_ids = data.index.get_level_values("subject_id").unique()

    # Local generator instead of np.random.seed(): same shuffle, no global side effect.
    rng = np.random.RandomState(seed)
    shuffled = rng.permutation(subj_ids)

    n_train = int(train_frac * len(shuffled))
    train_subj = shuffled[:n_train]
    test_subj = shuffled[n_train:]

    data_subj = data.index.get_level_values("subject_id")
    label_subj = y.index.get_level_values("subject_id")

    train_data = data[data_subj.isin(train_subj)]
    holdout_data = data[data_subj.isin(test_subj)]
    y_train = y[label_subj.isin(train_subj)]
    y_holdout = y[label_subj.isin(test_subj)]

    return train_data, holdout_data, y_train, y_holdout


Normalizes all `"mean"` columns in train and test DataFrames using the mean and standard deviation computed from the training set.  
Ensures consistent scaling without information leakage.


In [None]:
def data_normalization(train_df, test_df):
    """
    Z-score the 'mean' columns of both frames with statistics taken from the training frame.

    Args:
        train_df (pd.DataFrame): Training set with MultiIndex columns.
        test_df (pd.DataFrame): Test/holdout set with the same structure.

    Returns:
        train_df_norm (pd.DataFrame): Normalized copy of the training set.
        test_df_norm (pd.DataFrame): Normalized copy of the test set.
    """
    # Selector for every column whose second level is 'mean'.
    mean_selector = pd.IndexSlice[:, "mean"]

    # Centre and scale come from the training data only.
    train_mean_block = train_df.loc[:, mean_selector]
    center = train_mean_block.mean(axis=0)
    # The small constant avoids division by zero for constant columns.
    scale = train_mean_block.std(axis=0) + 1e-8

    normalized = []
    for frame in (train_df, test_df):
        scaled = frame.copy()  # inputs are left untouched
        scaled.loc[:, mean_selector] = (frame.loc[:, mean_selector] - center) / scale
        normalized.append(scaled)

    train_df_norm, test_df_norm = normalized
    return train_df_norm, test_df_norm

This function imputes missing values in time-series clinical data.  
It forward-fills "mean" values within each ICU stay, then fills remaining gaps with group means and zeros.  
It also creates binary masks and computes the time since each variable was last measured. It drops the "std" columns and adds a "time_since_measured" column for each variable.


In [None]:
def simple_imputer(dataframe, ID_COLS):
    """
    Impute the "mean" columns and derive "mask" and "time_since_measured" columns.

    Args:
        dataframe (pd.DataFrame): Table with MultiIndex columns whose second
            level contains "mean" and "count" entries. The renames below
            address that level by the name "Aggregation Function". The input
            is not modified; a copy is processed.
        ID_COLS (list[str]): Grouping keys passed to `groupby`.

    Returns:
        pd.DataFrame: For each variable a "mean", a "mask" and a
            "time_since_measured" column, with columns sorted.
    """
    idx = pd.IndexSlice
    df = dataframe.copy()

    # Drop extra levels if necessary
    if len(df.columns.names) > 2:
        df.columns = df.columns.droplevel(("label", "LEVEL1", "LEVEL2"))

    # Extract relevant columns
    df_out = df.loc[:, idx[:, ["mean", "count"]]].copy()

    # Handle 'mean' columns
    mean_cols = idx[:, "mean"]
    count_cols = idx[:, "count"]

    # Group-level means (per ICU stay)
    icustay_means = df_out.loc[:, mean_cols].groupby(ID_COLS).transform("mean")

    # Forward fill within ICU stays, then fill with group mean, then with 0
    df_out.loc[:, mean_cols] = (
        df_out.loc[:, mean_cols]
        .groupby(ID_COLS).ffill()
        .fillna(icustay_means)
        .fillna(0)
    )

    # Convert 'count' columns to binary mask
    df_out.loc[:, count_cols] = (df.loc[:, count_cols] > 0).astype(float)
    df_out.rename(columns={"count": "mask"}, level="Aggregation Function", inplace=True)

    # Calculate time since last measurement
    mask_cols = idx[:, "mask"]
    is_absent = 1 - df_out.loc[:, mask_cols]
    # Running count of absent rows per column, taken over the whole frame in row order.
    # NOTE(review): this cumsum is not grouped by ID_COLS, so counters do not
    # reset between stays - confirm this is intended.
    hours_of_absence = is_absent.cumsum()
    # Counter value at the most recent observed row, carried forward; the difference
    # is the number of absent rows since then (NaN before the first observed row).
    # Assumes one row per hour for the result to be in hours - TODO confirm.
    time_since_measured = hours_of_absence - hours_of_absence.where(is_absent == 0).ffill()
    time_since_measured.rename(columns={"mask": "time_since_measured"}, level="Aggregation Function", inplace=True)

    # Add to output and fill remaining NaNs
    df_out = pd.concat([df_out, time_since_measured], axis=1)
    # Rows with no earlier observation get the fixed placeholder 100.
    df_out.loc[:, idx[:, "time_since_measured"]] = df_out.loc[:, idx[:, "time_since_measured"]].fillna(100)

    # Ensure column order is consistent
    df_out.sort_index(axis=1, inplace=True)

    return df_out


This function returns index-based splits for training, testing, and early stopping, which are required for working with PyTorch `Dataset` objects using `Subset`. It ensures that splits are reproducible and compatible with indexable dataset structures.


In [None]:
def get_data_indices(dataset, train_frac, test_frac, early_stop_frac):
    """
    Build consecutive, non-overlapping index ranges for train, test and early stopping.

    The ranges are laid out back to back in that order, starting at index 0.
    Each range length is the fraction times `len(dataset)`, truncated to an integer.

    Args:
        dataset (torch Dataset or tensor-like): Any object supporting len().
        train_frac (float): Fraction of data for training.
        test_frac (float): Fraction of data for testing.
        early_stop_frac (float): Fraction of data for early stopping.

    Returns:
        data_indices (List[int]): All indices in the dataset.
        train_indices (List[int]): Training indices.
        test_indices (List[int]): Testing indices.
        early_stop_indices (List[int]): Early stopping indices.
    """
    total = len(dataset)
    data_indices = list(range(total))

    # Cumulative end positions of the three consecutive segments.
    train_end = int(train_frac * total)
    test_end = train_end + int(test_frac * total)
    early_end = test_end + int(early_stop_frac * total)

    return (
        data_indices,
        data_indices[:train_end],
        data_indices[train_end:test_end],
        data_indices[test_end:early_end],
    )


Scales continuous features in train and test sets using `StandardScaler` fitted only on the training data.  
Ensures features have zero mean and unit variance for models sensitive to feature magnitude.

In [None]:
from sklearn.preprocessing import StandardScaler

def standard_scaler(flat_train, flat_test):
    """
    Standardize the float64/int64 columns of both frames with a scaler fitted on the training frame.

    Args:
        flat_train (pd.DataFrame): Training features; determines which columns
            are scaled and provides the fitting statistics.
        flat_test (pd.DataFrame): Test features, transformed with the fitted scaler.

    Returns:
        train_scaled (pd.DataFrame): Copy of `flat_train` with scaled columns.
        test_scaled (pd.DataFrame): Copy of `flat_test` with scaled columns.
    """
    # Columns to scale are chosen once, from the training frame's dtypes.
    numeric_columns = flat_train.select_dtypes(include=["float64", "int64"]).columns

    # Fit on training data only so no test statistics leak into the scaling.
    fitted = StandardScaler().fit(flat_train[numeric_columns])

    # Work on copies; the caller's frames stay unchanged.
    train_scaled, test_scaled = flat_train.copy(), flat_test.copy()
    train_scaled[numeric_columns] = fitted.transform(flat_train[numeric_columns])
    test_scaled[numeric_columns] = fitted.transform(flat_test[numeric_columns])

    return train_scaled, test_scaled


Loads the MIMIC-III dataset from the specified path.  
Returns the time-series data (`vitals_labs`) and static patient information (`patients`).

In [None]:
def load_mimic_data(data_path):
    """
    Read the `patients` and `vitals_labs` tables from `all_hourly_data.h5` in `data_path`.

    Args:
        data_path (str): Directory expected to contain `all_hourly_data.h5`.

    Returns:
        statics: Table stored under the key "patients".
        data: Table stored under the key "vitals_labs".

    Raises:
        FileNotFoundError: If `all_hourly_data.h5` does not exist in `data_path`.
    """
    hdf_file = os.path.join(data_path, "all_hourly_data.h5")

    if os.path.exists(hdf_file):
        print("Loading data...")
        vitals_labs = pd.read_hdf(hdf_file, "vitals_labs")
        patients = pd.read_hdf(hdf_file, "patients")
        # Static table first, time series second.
        return patients, vitals_labs

    raise FileNotFoundError(
        "Please download the MIMIC-III dataset from https://physionet.org/content/mimiciii/1.4/ "
        "and save it in the specified path."
    )

Prepares the dataset for modeling by filtering, labeling, normalizing, imputing,  
and optionally flattening the time-series data for logistic regression input.

In [None]:
def process_data_for_model(data, statics, ID_COLS, train_frac, use_LR):
    """
    Run filtering, subject-wise splitting, normalization and imputation; flatten when `use_LR` is set.

    Args:
        data (pd.DataFrame): Time-series table.
        statics (pd.DataFrame): Static per-stay table.
        ID_COLS (list[str]): Identifier keys used for grouping and as the pivot index.
        train_frac (float): Fraction of subjects assigned to the training split.
        use_LR (bool): When true, each split is pivoted to one row per
            `ID_COLS` combination with one column per feature and `hours_in`
            value, and all four outputs get a fresh positional index.

    Returns:
        train, holdout, label_train, label_holdout: Features and labels of the
            training and holdout splits.
    """
    print("Filtering and labeling data...")
    filtered, labels = filter_and_label_data(statics, data)
    train_data, holdout_data, y_train, y_holdout = split_data_by_subject(filtered, labels, train_frac)

    print("Normalizing data...")
    train_data, holdout_data = data_normalization(train_data, holdout_data)

    print("Imputing missing values...")
    imputed = []
    for frame in tqdm((train_data, holdout_data), desc="Imputation"):
        imputed.append(simple_imputer(frame, ID_COLS))
    train_data, holdout_data = imputed

    # Sequence models keep the long (per-hour) layout as is.
    if not use_LR:
        return train_data, holdout_data, y_train, y_holdout

    print("Flattening data for LR...")
    pivoted = [
        frame.pivot_table(index=ID_COLS, columns=["hours_in"])
        for frame in tqdm((train_data, holdout_data), desc="Pivoting")
    ]
    flat_train, flat_holdout = pivoted

    # NOTE(review): after the index is dropped, features and labels are paired
    # by position only - confirm both are in the same stay order.
    reindexed = [
        frame.reset_index(drop=True)
        for frame in tqdm((flat_train, flat_holdout, y_train, y_holdout), desc="Resetting Index")
    ]
    train, holdout, label_train, label_holdout = reindexed

    return train, holdout, label_train, label_holdout

Validates, scales, and concatenates the processed data and labels.  
Creates a PyTorch-compatible dataset and saves it along with train/test/early-stop indices.

In [None]:

def build_and_save_dataset(train, holdout, label_train, label_holdout, dataset_path, indices_path, use_LR, train_frac, test_frac, early_stop_frac):
    """
    Validate, scale and concatenate the splits, then pickle the dataset and its index splits.

    Args:
        train (pd.DataFrame): Training features.
        holdout (pd.DataFrame): Holdout features.
        label_train (pd.DataFrame): Training labels.
        label_holdout (pd.DataFrame): Holdout labels.
        dataset_path (str): Destination file for the pickled dataset.
        indices_path (str): Destination file for the pickled index dictionary.
        use_LR (bool): When true the features are converted directly to a float
            tensor; otherwise they are passed through `to_3D_tensor`.
        train_frac (float): Fraction of samples used for the training indices.
        test_frac (float): Fraction of samples used for the test indices.
        early_stop_frac (float): Fraction of samples used for the early-stopping indices.

    Returns:
        None. Writes two pickle files as a side effect.

    Raises:
        AssertionError: If any input contains missing values or the
            concatenated features/labels are not numeric.
    """
    print("Checking data integrity...")
    named_frames = (
        ("train", train),
        ("holdout", holdout),
        ("label_train", label_train),
        ("label_holdout", label_holdout),
    )
    for name, df in named_frames:
        assert not df.isnull().values.any(), f"Missing values found in {name}."

    print("Scaling data...")
    train_df, holdout_df = standard_scaler(train, holdout)

    print("Creating dataset...")
    # Training rows come first, so the leading indices refer to training data.
    data_x = pd.concat((train_df, holdout_df), axis=0)
    data_y = pd.concat((label_train, label_holdout), axis=0)

    assert np.issubdtype(data_x.values.dtype, np.number), "Non-numeric data found in features."
    assert np.issubdtype(data_y.values.dtype, np.number), "Non-numeric data found in labels."

    y_tensor = from_numpy(data_y.values).float()

    # Only the feature tensor differs between the two model types.
    if use_LR:
        data_tensor = from_numpy(data_x.values).float()
    else:
        data_tensor = to_3D_tensor(data_x)
    dataset = MIMICUserDataset(data_tensor, y_tensor)

    print("Splitting dataset into indices...")
    data_indices, train_indices, test_indices, early_stop_indices = get_data_indices(
        data_tensor, train_frac, test_frac, early_stop_frac
    )

    # Create the parent directory of each output file. A bare file name has an
    # empty dirname, which os.makedirs would reject, so it is skipped.
    for target in (dataset_path, indices_path):
        parent_dir = os.path.dirname(target)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    print("Saving dataset...")
    with open(dataset_path, "wb") as file:
        pickle.dump(dataset, file)

    with open(indices_path, "wb") as file:
        pickle.dump({
            "data_indices": data_indices,
            "train_indices": train_indices,
            "test_indices": test_indices,
            "early_stop_indices": early_stop_indices,
        }, file)

    print("Dataset and indices saved.")

In [None]:
import os
import pickle
import numpy as np
import pandas as pd
from torch import from_numpy
from tqdm import tqdm
from mimic_data_handler import MIMICUserDataset, to_3D_tensor


# Build the dataset only when at least one of the two artifacts is missing.
# NOTE(review): the else-branch only prints a message; nothing is loaded in this cell.
if not os.path.exists(dataset_path) or not os.path.exists(indices_path):
    print("Creating dataset...")
    ID_COLS = ["subject_id", "hadm_id", "icustay_id"]
    statics, data = load_mimic_data(data_path)

    # Split fractions come from the 'data' section of the training config.
    train_frac = train_config['data']['f_train']
    test_frac = train_config['data']['f_test']
    early_stop_frac = train_config['data']['f_early_stop']

    train, holdout, label_train, label_holdout = process_data_for_model(
        data=data,
        statics=statics,
        ID_COLS=ID_COLS,
        train_frac=train_frac,
        use_LR=use_LR,
    )

    build_and_save_dataset(
        train=train,
        holdout=holdout,
        label_train=label_train,
        label_holdout=label_holdout,
        dataset_path=dataset_path,
        indices_path=indices_path,
        use_LR=use_LR,
        train_frac=train_frac,
        test_frac=test_frac,
        early_stop_frac=early_stop_frac,
    )
else:
    print(f"Dataset and indices already exist. Loading from: {dataset_path}")