<a href="https://colab.research.google.com/github/Zfeng0207/FIT3199-FYP/blob/dev%2Fzfeng/multi-label-baseline-model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import os
os.chdir('/content/drive/MyDrive/Colab Notebooks (1)/ECG-MIMIC-main')

In [3]:
!pip install -qqqq mlflow torchmetrics pytorch_lightning iterative-stratification

# Dataset Labeling

#### Multihot encode: Setting up target binary labels

In [4]:
import numpy as np
import ast

def multihot_encode(diagnoses, icd_codes):
    """
    Multi-hot encodes diagnoses based on ICD codes.

    Args:
        diagnoses (str): A string representation of diagnoses (e.g., "['I251', 'I48', 'I503']").
        icd_codes (tuple or list): A list or tuple of target ICD codes.

    Returns:
        np.ndarray: A multi-hot encoded array.
    """
    num_classes = len(icd_codes)
    res = np.zeros(num_classes, dtype=np.float32)

    # Evaluate the string as a list using ast.literal_eval
    try:
        diagnoses_list = ast.literal_eval(diagnoses)
    except (SyntaxError, ValueError):
        diagnoses_list = []  # Handle cases where evaluation fails

    # Iterate through diagnoses_list and encode if it starts with any of the target codes
    for diag in diagnoses_list:
        for i, code in enumerate(icd_codes):  # Iterate through icd_codes with index
            if diag.startswith(code):
                res[i] = 1
                break  # Exit inner loop after finding a match
    return res

In [5]:
import numpy as np
import pandas as pd
import ast

# dataframe with 300,000 rows
# df_full = pd.read_csv("src/data/label_df.csv")

# dataframe with 800,000 rows
df_full = pd.read_csv("src/data/records_w_diag_icd10.csv")


df_full['label_train'] = df_full['all_diag_all'].apply(
    lambda x: str(list(set([code[:3] for code in ast.literal_eval(x)])))
)


In [9]:
df_labels =  df_full[["filename",
            "study_id",
            "patient_id",
            "ecg_time",
            "label_train",
            "all_diag_all"]]

target_icd_codes = (
 "I25", "E87", "E11"
)

# target_icd_codes = (
# "I20", "I21", "I22", "I23", "I24", "I25", "I42", "E87", "I48", "I44", "I45", "E11", "J44", "J45"
# )

# this is not working correctly fix this!
df_labels['res'] = df_labels['label_train'].apply(lambda diagnoses: multihot_encode(diagnoses, target_icd_codes))

# df_labels['stroke_yn'] = df_labels['res'].apply(lambda x: 1 if 1 in x else 0)

df_labels = df_labels[df_labels['res'].apply(lambda x: len(x) > 0)]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_labels['res'] = df_labels['label_train'].apply(lambda diagnoses: multihot_encode(diagnoses, target_icd_codes))


## Number of sparse target class

In [10]:
df = df_labels.copy()

In [11]:
import numpy as np
import pandas as pd

def calculate_mean_positive_rate(df, label_col="res"):
    """
    Calculates the mean positive rate per label for a pandas DataFrame.

    Args:
        df (pd.DataFrame): The DataFrame containing the multi-hot labels.
        label_col (str): The column name containing the multi-hot labels.

    Returns:
        np.ndarray: An array containing the mean positive rate for each label.
    """

    # Extract labels and ensure they are NumPy arrays with consistent shape
    labels = df[label_col].apply(lambda x: np.array(x, dtype=np.float32)).values
    labels = np.vstack(labels)  # Stack the labels into a 2D array


    # Calculate mean positive rate per label
    mean_positive_rate = labels.mean(axis=0)

    return mean_positive_rate

# Assuming 'df' is your DataFrame
mean_positive_rates = calculate_mean_positive_rate(df)

# Print the results
print("Mean positive rate per label:", mean_positive_rates)

Mean positive rate per label: [0.14026262 0.1120426  0.11796609]


In [12]:
def count_empty_labels(df, label_col="res"):
    """
    Counts how many samples in the DataFrame have all-zero labels.

    Args:
        df (pd.DataFrame): DataFrame containing the dataset
        label_col (str): Column name containing the multi-hot labels

    Returns:
        int: Number of rows with all-zero labels
    """
    empty_count = 0

    for label_str in df[label_col]:
        if label_str.sum() == 0:
            empty_count += 1

    return empty_count


# Dataset and Model Configurations

In [13]:
from dataclasses import dataclass
import os
import platform

# You can define ROOT_PATH somewhere above
ROOT_PATH = "/content/drive/MyDrive/Colab Notebooks (1)/ECG-MIMIC-main/src"
@dataclass
class DatasetConfig:
    # ECG-specific
    NUM_LEADS:    int = 12  # 12 ECG channels (leads)
    NUM_CLASSES:  int = 3  # 12 ICD disease codes
    VALID_PCT:  float = 0.1

    # Dataset file and folder paths
    TRAIN_CSV:   str = os.path.join(ROOT_PATH, "data/train.csv")  # Your preprocessed split CSV
    TEST_CSV:    str = os.path.join(ROOT_PATH, "data/test.csv")
    MEMMAP_FILE: str = os.path.join(ROOT_PATH, "ecg_dataset", "data/memmap/memmap.npy")
    MEMMAP_META: str = os.path.join(ROOT_PATH, "ecg_dataset", "data/memmap/memmap_meta.npz")

@dataclass
class TrainingConfig:
    BATCH_SIZE:      int = 16
    NUM_EPOCHS:      int = 30  # Actual training epochs
    INIT_LR:       float = 1e-3
    NUM_WORKERS:     int = 7
    OPTIMIZER_NAME:  str = "Adam"
    WEIGHT_DECAY:  float = 1e-4
    USE_SCHEDULER:  bool = True
    SCHEDULER:       str = "multi_step_lr"  # or "cosine_annealing"
    F1_METRIC_THRESH: float = 0.5
    FREEZE_BACKBONE: bool = False

    # (Optional) model name (if you want to log it somewhere)
    MODEL_NAME:      str = "resnet18"


In [14]:
def encode_label(label: list, num_classes=10):
    """
    This functions converts labels into multi-hot encoding.
    Handles both single ICD codes and lists of codes.
    """
    target = torch.zeros(num_classes)

    # If label is a single code, make it a list
    if isinstance(label, str):
        label = [label]

    for l in label:
        # Check if 'l' contains brackets (indicating list within a string)
        if '[' in l or ']' in l:
            l = l.strip('[]').replace("'", "").split(",")  # Handle list-like strings
            for code in l:
                code = code.strip()  # Remove any whitespace around code
                if code in icd_to_index:
                    target[icd_to_index[code]] = 1.0
        else:
            l = l.strip()  # Remove any whitespace around code
            if l in icd_to_index:
                target[icd_to_index[l]] = 1.0
    return target


def decode_target(
    target: list,
    text_labels: bool = False,
    threshold: float = 0.4,
    cls_labels: dict = None,
):
    """This function converts the labels from
    probablities to outputs or string representations
    """

    result = []
    for i, x in enumerate(target):
        if x >= threshold:
            if text_labels:
                result.append(cls_labels[i] + "(" + str(i) + ")")
            else:
                result.append(str(i))
    return " ".join(result)


# This function is used for reversing the Normalization step performed
# during image preprocessing.
# Note the mean and std values must match the ones used.

def denormalize(tensors, *, mean, std):
    """Denormalizes image tensors using mean and std provided
    and clip values between 0 and 1"""

    for c in range(DatasetConfig.CHANNELS):
        tensors[:, c, :, :].mul_(std[c]).add_(mean[c])

    return torch.clamp(tensors, min=0.0, max=1.0)

In [15]:
# Create a dictionary mapping ICD codes to index
# icd_to_index = {code: idx for idx, code in enumerate(target_icd_codes)}


# Dataset

In [16]:
import numpy as np
from torch.utils.data import Dataset


class ECGDataset(Dataset):
    def __init__(self, dataframe, memmap, memmap_meta, normalize=True, indices=None):  # Add indices argument
        self.df = dataframe.reset_index(drop=True)
        self.memmap = memmap
        self.memmap_meta = memmap_meta  # Store memmap_meta
        self.normalize = normalize
        self.num_classes = DatasetConfig.NUM_CLASSES
        self.indices = indices  # Store indices if provided
        """
        Args:
            memmap_path (str): Path to the .npy memory-mapped ECG file.
            meta_path (str): Path to the .npz metadata file.
        """
        self.starts = self.memmap_meta["start"]
        self.lengths = self.memmap_meta["length"]
        self.shape = tuple(self.memmap_meta["shape"][0])
        self.ecg_data = self.memmap.reshape(self.shape)

    def __len__(self):
        return len(self.indices) if self.indices is not None else len(self.df)

    def __getitem__(self, idx):
        # Get the actual index from the indices list if provided
        actual_idx = self.indices[idx] if self.indices is not None else idx

        # Access starts and lengths using actual_idx
        start_idx = self.starts[actual_idx]
        length = self.lengths[actual_idx]

        signal = self.ecg_data[start_idx:start_idx + length, :]

        # Calculate mean and std for each channel (axis=0)
        channel_means = signal.mean(axis=0)
        channel_stds = signal.std(axis=0) + 1e-6

        # Normalize across all values using channel-specific mean and std
        signal = (signal - channel_means) / channel_stds

        # Reshape to [length, 12]
        signal = signal.T

        # Convert signal to PyTorch tensor before checking for NaN/inf
        signal = torch.tensor(signal, dtype=torch.float32)
        label = self.df.loc[actual_idx]['res']  # shape: (length, 12)

        return signal, label


# Data Module

In [17]:
def keep_if_multiple_labels(label_array):
    return np.sum(label_array) >= 1

In [18]:
!pip install -U imbalanced-learn



In [19]:
from imblearn.over_sampling import RandomOverSampler, SMOTE

In [20]:
import pytorch_lightning as pl
from torch.utils.data import DataLoader
import numpy as np
import pandas as pd
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold, MultilabelStratifiedShuffleSplit

class ECGDataModule(pl.LightningDataModule):
    def __init__(self, dataframe, memmap_meta, memmap, batch_size, num_workers, pin_memory, valid_pct, label_filter_fn = keep_if_multiple_labels , normalize=True, shuffle_validation=False):
        """
        :param dataframe: Original unfiltered dataframe.
        :param label_filter_fn: A function that takes a label array and returns True/False to filter sparse labels.
        """
        super().__init__()
        self.original_df = dataframe
        self.memmap_meta = memmap_meta
        self.memmap = memmap
        self.normalize = normalize
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.pin_memory = pin_memory
        self.valid_pct = valid_pct
        self.shuffle_validation = shuffle_validation
        self.label_filter_fn = label_filter_fn

    def setup(self, stage=None):
        label_col = 'res'
        np.random.seed(42)

        # Parse and convert labels
        df = self.original_df.copy()
        df[label_col] = df[label_col].apply(lambda x: np.fromstring(x[1:-1], dtype=float, sep=' ') if isinstance(x, str) else x)

        # --- Step 1: Filter sparse labels ---
        mask = df[label_col].apply(self.label_filter_fn)
        df_filtered = df[mask].reset_index(drop=False)  # Keep original indices for alignment
        self.filtered_df = df_filtered  # Save for debugging

        # --- Step 2: Prepare Y matrix ---
        Y = np.vstack(df_filtered[label_col].values)

        # --- Step 3: Stratified split ---
        splitter = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
        train_val_idx, test_idx = next(splitter.split(df_filtered, Y))

        # Second stratified split for validation
        splitter_val = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=self.valid_pct, random_state=42)
        Y_train_val = Y[train_val_idx]
        train_idx, val_idx = next(splitter_val.split(df_filtered.iloc[train_val_idx], Y_train_val))

        # Map back to original indices for memmap alignment
        self.train_idx = df_filtered.iloc[train_val_idx].iloc[train_idx]['index'].to_numpy()
        self.val_idx = df_filtered.iloc[train_val_idx].iloc[val_idx]['index'].to_numpy()
        self.test_idx = df_filtered.iloc[test_idx]['index'].to_numpy()

        # --- Step 4: Build datasets ---
        self.train_ds = ECGDataset(
            dataframe=self.original_df,  # full dataframe
            memmap=self.memmap,
            memmap_meta=self.memmap_meta,
            normalize=self.normalize,
            indices=self.train_idx,
        )
        self.valid_ds = ECGDataset(
            dataframe=self.original_df,
            memmap=self.memmap,
            memmap_meta=self.memmap_meta,
            normalize=self.normalize,
            indices=self.val_idx,
        )
        self.test_ds = ECGDataset(
            dataframe=self.original_df,
            memmap=self.memmap,
            memmap_meta=self.memmap_meta,
            normalize=self.normalize,
            indices=self.test_idx,
        )

    def train_dataloader(self):
        return DataLoader(
            self.train_ds,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=self.num_workers,
            pin_memory=self.pin_memory,
        )

    def val_dataloader(self):
        return DataLoader(
            self.valid_ds,
            batch_size=self.batch_size,
            shuffle=self.shuffle_validation,
            num_workers=self.num_workers,
            pin_memory=self.pin_memory,
        )

    def test_dataloader(self):
        return DataLoader(
            self.test_ds,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=self.num_workers,
            pin_memory=self.pin_memory,
        )


In [21]:
import torchvision

def get_model(model_name: str, num_classes: int, freeze_backbone: bool= True):
    """A helper function to load and prepare any classification model
    available in Torchvision for transfer learning or fine-tuning."""

    model = getattr(torchvision.models, model_name)(weights="DEFAULT")

    if freeze_backbone:
        # Set all layer to be non-trainable
        for param in model.parameters():
            param.requires_grad = False

    model_childrens = [name for name, _ in model.named_children()]

    try:
        final_layer_in_features = getattr(model, f"{model_childrens[-1]}")[-1].in_features
    except Exception as e:
        final_layer_in_features = getattr(model, f"{model_childrens[-1]}").in_features

    new_output_layer = nn.Linear(
        in_features=final_layer_in_features,
        out_features=num_classes
    )

    try:
        getattr(model, f"{model_childrens[-1]}")[-1] = new_output_layer
    except:
        setattr(model, model_childrens[-1], new_output_layer)

    return model

**Function usage example:**

In [22]:
!pip install torchinfo

Collecting torchinfo
  Downloading torchinfo-1.8.0-py3-none-any.whl.metadata (21 kB)
Downloading torchinfo-1.8.0-py3-none-any.whl (23 kB)
Installing collected packages: torchinfo
Successfully installed torchinfo-1.8.0


In [23]:
# from torchinfo import summary
# import torch.nn as nn

# # Suppose your ECG signals are 1000 time steps long
# TIME_LENGTH = 1000

# model = get_model(
#     model_name=TrainingConfig.MODEL_NAME,    # Should be "resnet50"
#     num_classes=DatasetConfig.NUM_CLASSES,
#     freeze_backbone=False,
# )

# # Correctly modify the first convolutional layer to accept 12 channels
# model.conv1 = nn.Conv2d(in_channels=12, out_channels=64, kernel_size=(7, 1), stride=(2, 1), padding=(3, 0), bias=False) # Reassign the layer

# # Proper ECG input shape
# summary(
#     model,
#     input_size=(TrainingConfig.BATCH_SIZE, DatasetConfig.NUM_LEADS, TIME_LENGTH, 1),  # (batch, channels=12, time, width=1)
#     depth=2,
#     device="cpu",
#     col_names=["output_size", "num_params", "trainable"]
# )

In [24]:
# # Assuming 'df' is your DataFrame and 'res' is the column with labels
# class_frequencies = []
# for code in target_icd_codes:
#     # Count occurrences of the current code in the 'res' column
#     freq = df['res'].str.contains(code).sum()
#     class_frequencies.append(freq)

# # Convert the list to a PyTorch tensor
# class_frequencies = torch.tensor(class_frequencies, dtype=torch.float32)

# Model

## Simple LSTM Model

In [25]:
import torch
import torch.nn as nn
import pytorch_lightning as pl
import torch.nn.functional as F
from torchmetrics.classification import MultilabelAccuracy, MultilabelF1Score, MultilabelAUROC

class LSTMClassifier(pl.LightningModule):
    def __init__(self, input_size=12, hidden_size=64, num_layers=2, num_classes=2, lr=1e-3, f1_metric_threshold=0.5):
        super().__init__()
        self.save_hyperparameters()

        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True  # Using bidirectional LSTM
        )

        # Multi-label output layer
        self.fc = nn.Linear(hidden_size * 2, num_classes)
        self.loss_fn = nn.BCEWithLogitsLoss()
        self.lr = lr

        # Metrics
        self.train_acc = MultilabelAccuracy(num_labels=num_classes, threshold=f1_metric_threshold)
        self.train_f1 = MultilabelF1Score(num_labels=num_classes, average="macro", threshold=f1_metric_threshold)
        self.train_auc = MultilabelAUROC(num_labels=num_classes)

        self.val_acc = MultilabelAccuracy(num_labels=num_classes, threshold=f1_metric_threshold)
        self.val_f1 = MultilabelF1Score(num_labels=num_classes, average="macro", threshold=f1_metric_threshold)
        self.val_auc = MultilabelAUROC(num_labels=num_classes)

    def forward(self, x):
        # Adjust the input shape to match LSTM requirements
        x = x.permute(0, 2, 1)  # Permute to (batch_size, sequence_length, input_size)

        # Ensure input data type is float32
        x = x.type(torch.float32)

        # Pass the modified input to the LSTM
        out, _ = self.lstm(x)

        out = out[:, -1, :]  # Take the last hidden state
        logits = self.fc(out)
        return logits
    # def on_train_start(self):
    #     # Log model type as a parameter or tag
    #     mlflow.pytorch.log_model(self, "model") # Registers the model
    #     mlflow.log_param("model_type", "LSTM")  # Log as parameter
    #     mlflow.set_tag("model_type", "LSTM")

    def training_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = self.loss_fn(logits, y.float())
        probs = torch.sigmoid(logits)

        acc = self.train_acc(probs, y.int())
        f1 = self.train_f1(probs, y.int())
        auc = self.train_auc(probs, y.int())

        self.log("train_loss", loss, prog_bar=True)
        self.log("train_acc", acc, prog_bar=True)
        self.log("train_f1", f1, prog_bar=True)
        self.log("train_auc", auc, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = self.loss_fn(logits, y.float())
        probs = torch.sigmoid(logits)

        acc = self.val_acc(probs, y.int())
        f1 = self.val_f1(probs, y.int())
        auc = self.val_auc(probs, y.int())

        self.log("val_loss", loss, prog_bar=True)
        self.log("val_acc", acc, prog_bar=True)
        self.log("val_f1", f1, prog_bar=True)
        self.log("val_auc", auc, prog_bar=True)

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.lr)


In [26]:
import pytorch_lightning as pl
import torch

class Swish(pl.LightningModule):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        return x * torch.sigmoid(x)

## RNN-Attention model

### ConvNormPool

In [27]:
class ConvNormPool(pl.LightningModule):
    """Conv Skip-connection module"""
    def __init__(
        self,
        input_size,
        hidden_size,
        kernel_size,
        norm_type='bachnorm'
    ):
        super().__init__()

        self.kernel_size = kernel_size
        self.conv_1 = nn.Conv1d(
            in_channels=input_size,
            out_channels=hidden_size,
            kernel_size=kernel_size
        )
        self.conv_2 = nn.Conv1d(
            in_channels=hidden_size,
            out_channels=hidden_size,
            kernel_size=kernel_size
        )
        self.conv_3 = nn.Conv1d(
            in_channels=hidden_size,
            out_channels=hidden_size,
            kernel_size=kernel_size
        )
        self.swish_1 = Swish()
        self.swish_2 = Swish()
        self.swish_3 = Swish()
        if norm_type == 'group':
            self.normalization_1 = nn.GroupNorm(
                num_groups=8,
                num_channels=hidden_size
            )
            self.normalization_2 = nn.GroupNorm(
                num_groups=8,
                num_channels=hidden_size
            )
            self.normalization_3 = nn.GroupNorm(
                num_groups=8,
                num_channels=hidden_size
            )
        else:
            self.normalization_1 = nn.BatchNorm1d(num_features=hidden_size)
            self.normalization_2 = nn.BatchNorm1d(num_features=hidden_size)
            self.normalization_3 = nn.BatchNorm1d(num_features=hidden_size)

        self.pool = nn.MaxPool1d(kernel_size=2)

    def forward(self, input):
        conv1 = self.conv_1(input)
        x = self.normalization_1(conv1)
        x = self.swish_1(x)
        x = F.pad(x, pad=(self.kernel_size - 1, 0))

        x = self.conv_2(x)
        x = self.normalization_2(x)
        x = self.swish_2(x)
        x = F.pad(x, pad=(self.kernel_size - 1, 0))

        conv3 = self.conv_3(x)
        x = self.normalization_3(conv1+conv3)
        x = self.swish_3(x)
        x = F.pad(x, pad=(self.kernel_size - 1, 0))

        x = self.pool(x)
        return x


### CNN

In [28]:
class CNN(pl.LightningModule):
    def __init__(
        self,
        input_size = 1,
        hid_size = 256,
        kernel_size = 5,
        num_classes = 5,
    ):

        super().__init__()

        self.conv1 = ConvNormPool(
            input_size=input_size,
            hidden_size=hid_size,
            kernel_size=kernel_size,
        )
        self.conv2 = ConvNormPool(
            input_size=hid_size,
            hidden_size=hid_size//2,
            kernel_size=kernel_size,
        )
        self.conv3 = ConvNormPool(
            input_size=hid_size//2,
            hidden_size=hid_size//4,
            kernel_size=kernel_size,
        )
        self.avgpool = nn.AdaptiveAvgPool1d((1))
        self.fc = nn.Linear(in_features=hid_size//4, out_features=num_classes)

    def forward(self, input):
        x = self.conv1(input)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.avgpool(x)
        # print(x.shape) # num_features * num_channels
        x = x.view(-1, x.size(1) * x.size(2))
        x = F.softmax(self.fc(x), dim=1)
        return x


### RNN

In [29]:
class RNN(pl.LightningModule):
    """RNN module(cell type lstm or gru)"""
    def __init__(
        self,
        input_size,
        hid_size,
        num_rnn_layers=1,
        dropout_p = 0.2,
        bidirectional = False,
        rnn_type = 'lstm',
    ):
        super().__init__()

        if rnn_type == 'lstm':
            self.rnn_layer = nn.LSTM(
                input_size=input_size,
                hidden_size=hid_size,
                num_layers=num_rnn_layers,
                dropout=dropout_p if num_rnn_layers>1 else 0,
                bidirectional=bidirectional,
                batch_first=True,
            )

        else:
            self.rnn_layer = nn.GRU(
                input_size=input_size,
                hidden_size=hid_size,
                num_layers=num_rnn_layers,
                dropout=dropout_p if num_rnn_layers>1 else 0,
                bidirectional=bidirectional,
                batch_first=True,
            )
    def forward(self, input):
        outputs, hidden_states = self.rnn_layer(input)
        return outputs, hidden_states


### RNN Model

In [30]:
class RNNModel(pl.LightningModule):
    def __init__(
        self,
        input_size,
        hid_size,
        rnn_type,
        bidirectional,
        n_classes=5,
        kernel_size=5,
    ):
        super().__init__()

        self.rnn_layer = RNN(
            input_size=46,#hid_size * 2 if bidirectional else hid_size,
            hid_size=hid_size,
            rnn_type=rnn_type,
            bidirectional=bidirectional
        )
        self.conv1 = ConvNormPool(
            input_size=input_size,
            hidden_size=hid_size,
            kernel_size=kernel_size,
        )
        self.conv2 = ConvNormPool(
            input_size=hid_size,
            hidden_size=hid_size,
            kernel_size=kernel_size,
        )
        self.avgpool = nn.AdaptiveAvgPool1d((1))
        self.fc = nn.Linear(in_features=hid_size, out_features=n_classes)

    def forward(self, input):
        x = self.conv1(input)
        x = self.conv2(x)
        x, _ = self.rnn_layer(x)
        x = self.avgpool(x)
        x = x.view(-1, x.size(1) * x.size(2))
        x = F.sigmoid(self.fc(x), dim=1)#.squeeze(1)
        return x


### RNN Attention Model

In [31]:
import torch

def calculate_class_weights(df, label_col="res"):
    """
    Calculates class weights based on label frequencies.

    Args:
        df (pd.DataFrame): The DataFrame containing the multi-hot labels.
        label_col (str): The column name containing the multi-hot labels.

    Returns:
        torch.Tensor: A tensor containing the class weights.
    """
    label_counts = df[label_col].sum()  # Assuming 'res' is the column with multi-hot labels
    num_samples = len(df)
    class_weights = num_samples / (df.shape[1] * label_counts)
    return torch.tensor(class_weights, dtype=torch.float32) # Move to the same device as your model

# Assuming 'training_df' is your training DataFrame:
class_weights = calculate_class_weights(df)

In [32]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl
from torchmetrics.classification import MultilabelAccuracy, MultilabelF1Score, MultilabelAUROC

class RNNAttentionModel(pl.LightningModule):
    def __init__(
        self,
        hid_size =64,
        rnn_type = 'lstm',
        bidirectional=False,
        num_classes=DatasetConfig.NUM_CLASSES,
        input_size =12,
        kernel_size=5,
        lr=1e-3,
        f1_metric_threshold=0.5,
    ):
        super().__init__()
        self.save_hyperparameters()

        self.conv1 = ConvNormPool(
            input_size=input_size,
            hidden_size=hid_size,
            kernel_size=kernel_size,
        )
        self.conv2 = ConvNormPool(
            input_size=hid_size,
            hidden_size=hid_size,
            kernel_size=kernel_size,
        )

        self.rnn_layer = RNN(
            input_size=hid_size,
            hid_size=hid_size,
            rnn_type=rnn_type,
            bidirectional=bidirectional
        )

        self.attn = nn.Linear(hid_size, hid_size, bias=False)
        self.fc = nn.Linear(in_features=hid_size, out_features=num_classes)  # Multi-label output
        self.loss_fn = nn.BCEWithLogitsLoss(pos_weight=class_weights)
        self.lr = lr

        # Metrics
        self.train_acc = MultilabelAccuracy(num_labels=num_classes, threshold=f1_metric_threshold)
        self.train_f1 = MultilabelF1Score(num_labels=num_classes, average="macro", threshold=f1_metric_threshold)
        self.train_auc = MultilabelAUROC(num_labels=num_classes)

        self.val_acc = MultilabelAccuracy(num_labels=num_classes, threshold=f1_metric_threshold)
        self.val_f1 = MultilabelF1Score(num_labels=num_classes, average="macro", threshold=f1_metric_threshold)
        self.val_auc = MultilabelAUROC(num_labels=num_classes)

    def forward(self, input):
        # input = input.permute(0, 2, 1)  # Remove this line - permutation is done in the dataset
        x = self.conv1(input)
        x = self.conv2(x)
        x = x.permute(0, 2, 1)  # Permute before the RNN layer

        x_out, _ = self.rnn_layer(x)

        attn_weights = torch.softmax(self.attn(x_out), dim=1)
        x = torch.sum(attn_weights * x_out, dim=1)

        logits = self.fc(x)
        return logits

    def training_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = self.loss_fn(logits, y.float())
        probs = torch.sigmoid(logits)

        acc = self.train_acc(probs, y.int())
        f1 = self.train_f1(probs, y.int())
        auc = self.train_auc(probs, y.int())

        self.log("train_loss", loss, prog_bar=True)
        self.log("train_acc", acc, prog_bar=True)
        self.log("train_f1", f1, prog_bar=True)
        self.log("train_auc", auc, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = self.loss_fn(logits, y.float())
        probs = torch.sigmoid(logits)

        acc = self.val_acc(probs, y.int())
        f1 = self.val_f1(probs, y.int())
        auc = self.val_auc(probs, y.int())

        self.log("val_loss", loss, prog_bar=True)
        self.log("val_acc", acc, prog_bar=True)
        self.log("val_f1", f1, prog_bar=True)
        self.log("val_auc", auc, prog_bar=True)

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.lr)

# Dataset Initialization

In [33]:
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, LearningRateMonitor

# 1. Seed everything for reproducibility
pl.seed_everything(42, workers=True)

memmap_path = "src/data/memmap/memmap.npy"

memmap_data = np.memmap(memmap_path, dtype=np.float32, mode='r')
memmap_meta_path = "src/data/memmap/memmap_meta.npz"
memmap_meta = np.load(memmap_meta_path, allow_pickle=True)
# Instantiate the ECGDataModule
dm = ECGDataModule(
    dataframe=df,            # Your loaded DataFrame
    memmap=memmap_data,             # Your loaded memmap
    memmap_meta = memmap_meta,
    batch_size=TrainingConfig.BATCH_SIZE,
    num_workers=TrainingConfig.NUM_WORKERS,
    pin_memory=torch.cuda.is_available(),
    valid_pct=DatasetConfig.VALID_PCT,
)

# Prepare data (nothing to download for ECG, so will pass)
dm.prepare_data()

# Split dataset into training and validation sets
dm.setup()

# 4. Create ModelCheckpoint callback
model_checkpoint = ModelCheckpoint(
    monitor="valid/f1",        # Monitor validation F1 score
    mode="max",                # Maximize F1
    filename="ecg_epoch{epoch:03d}_vloss{valid/loss:.4f}_vf1{valid/f1:.4f}",
    auto_insert_metric_name=False,
    save_top_k=1,              # Save the best model only
)

# 5. Create Learning Rate Monitor callback
lr_monitor = LearningRateMonitor(logging_interval="epoch")


INFO:lightning_fabric.utilities.seed:Seed set to 42


In [34]:
# # To reload tensorBoard
# %reload_ext tensorboard

# # logs folder path
# %tensorboard --logdir=lightning_logs

**Train**

### Dataset checking before running model

In [35]:
# Assuming dm is your ECGDataModule instance
training_df = dm.filtered_df

# Inspect the shape
print("Shape of training DataFrame:", training_df.shape)


Shape of training DataFrame: (199208, 8)


In [36]:
mean_positive_rates = calculate_mean_positive_rate(training_df)

# Print the results
print("Mean positive rate per label:", mean_positive_rates)

Mean positive rate per label: [0.5633057  0.44997188 0.47376108]


# Training

In [37]:
# # Assuming 'dm' is your ECGDataModule instance
# train_loader = dm.train_dataloader()

# # 1. Using len() on the dataloader:
# num_batches = len(train_loader)
# print(f"Number of batches in train_dataloader: {num_batches}")

# # 2. Calculating total samples from batch size and num_batches:
# total_samples = num_batches * train_loader.batch_size
# print(f"Estimated total samples in training dataset: {total_samples}")

# # 3. Accessing the underlying dataset directly (more accurate):
# total_samples_accurate = len(train_loader.dataset)
# print(f"Actual total samples in training dataset: {total_samples_accurate}")


In [38]:
model = RNNAttentionModel()
# model = LSTMClassifier()

In [39]:
from torchinfo import summary
TIME_LENGTH = 1000

summary(
    model,
    input_size=(TrainingConfig.BATCH_SIZE, DatasetConfig.NUM_LEADS, TIME_LENGTH),  # (batch, channels=12, time, width=1)
    depth=2,
    device="cpu",
    col_names=["output_size", "num_params", "trainable"]
)

Layer (type:depth-idx)                   Output Shape              Param #                   Trainable
RNNAttentionModel                        [16, 3]                   --                        True
├─ConvNormPool: 1-1                      [16, 64, 500]             --                        True
│    └─Conv1d: 2-1                       [16, 64, 996]             3,904                     True
│    └─BatchNorm1d: 2-2                  [16, 64, 996]             128                       True
│    └─Swish: 2-3                        [16, 64, 996]             --                        --
│    └─Conv1d: 2-4                       [16, 64, 996]             20,544                    True
│    └─BatchNorm1d: 2-5                  [16, 64, 996]             128                       True
│    └─Swish: 2-6                        [16, 64, 996]             --                        --
│    └─Conv1d: 2-7                       [16, 64, 996]             20,544                    True
│    └─BatchNorm1d:

In [41]:
# Initializing the Trainer class object.
# It uses 'Tensorboard' as its default logger.
trainer = pl.Trainer(
    accelerator="auto", # Auto select the best hardware accelerator available
    devices="auto", # Auto select available devices for the accelerator (For eg. mutiple GPUs)
    strategy="auto", # Auto select the distributed training strategy.
    max_epochs=1, # Maximum number of epoch to train for.
    deterministic=True, # For deteministic and reproducible training.
    enable_model_summary=False, # Disable printing of model summary as we are using torchinfo.
    callbacks=[model_checkpoint, lr_monitor],  # Declaring callbacks to use.
    precision="16", # Using Mixed Precision training.
    logger=True, # Auto generate TensorBoard logs.
)

# Start training
trainer.fit(model, dm)

/usr/local/lib/python3.11/dist-packages/lightning_fabric/connector.py:571: `precision=16` is supported for historical reasons but its usage is discouraged. Please set your precision to 16-mixed instead!
INFO:pytorch_lightning.utilities.rank_zero:Using 16bit Automatic Mixed Precision (AMP)
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

RuntimeError: DataLoader worker (pid(s) 3420, 3422, 3423) exited unexpectedly

## 7 Inference

To perform inference, first, we need to load the best checkpoint saved during training. We can do it simply by executing the following: