In [22]:
import torch
from torch.utils.data import  Dataset, DataLoader # wraps an iterable around the dataset
import pandas as pd
import os
import numpy as np
import wfdb
import torch.nn.functional as F
from torch import nn
import torch.optim as optim

## Dataset Class

In [23]:
# Locate the PTB-XL downloads relative to the notebook's working directory.
current_directory = os.getcwd()                             # /e17-4yp-Comp.../python-scripts-resnet/PTB-XL
parent_directory = os.path.dirname(current_directory)       # /e17-4yp-Comp.../python-scripts-resnet

# Folder that contains both dataset downloads. Defaults to <parent_directory>/data;
# set the PTB_XL_DATA_ROOT environment variable to use another location
# without editing the notebook.
data_root = os.environ.get('PTB_XL_DATA_ROOT') or os.path.join(parent_directory, 'data')

# path of the 12sl features CSV
features_csv_path = os.path.join(data_root, 'ptb-xl-a-comprehensive-electrocardiographic-feature-dataset-1.0.1', 'features', '12sl_features.csv')

# path of the record500 folder
path_record = os.path.join(data_root, 'ptb-xl-a-large-publicly-available-electrocardiography-dataset-1.0.1', 'records500')

In [24]:
# Fractions of the usable records assigned to the train, validation and test splits.
train_percentage, validation_percentage, test_percentage = 0.7, 0.15, 0.15

In [25]:
class ECGDataSet_PTB_XL(Dataset):
    """Dataset pairing each PTB-XL ECG record with one regression target.

    Rows of the features CSV (``features_csv_path``) are kept only when the
    matching ``<id>_hr.hea`` header exists under ``path_record``. The remaining
    rows are shuffled and divided into train / validation / test parts using
    ``train_percentage`` and ``validation_percentage``; the test part is
    whatever is left over.

    Args:
        parameter: which target to expose. ``'hr'`` (heart rate derived from
            ``RR_Mean_Global``), ``'qrs'`` (``QRS_Dur_Global``), ``'qt'``
            (``QT_Int_Global``) or ``'pr'`` (``PR_Int_Global``).
        split: ``"train"``, ``"validate"`` or ``"test"``. Any other value keeps
            every usable row, unshuffled.
        seed: seed for the shuffle that defines the splits. Datasets built with
            the same seed share one partition, so their splits never overlap.
            Pass ``None`` for a fresh random partition on every construction.

    Raises:
        ValueError: if ``parameter`` is not one of the supported names.
    """

    # Feature column read for each supported ``parameter`` value.
    _TARGET_COLUMNS = {
        'hr': 'RR_Mean_Global',
        'qrs': 'QRS_Dur_Global',
        'qt': 'QT_Int_Global',
        'pr': 'PR_Int_Global',
    }

    def __init__(self, parameter='hr', split="train", seed=42):
        # Fail fast: an unknown parameter would otherwise leave ``self.y`` unset
        # and only surface later, inside ``__getitem__``.
        if parameter not in self._TARGET_COLUMNS:
            raise ValueError(
                f"Unknown parameter {parameter!r}; expected one of {sorted(self._TARGET_COLUMNS)}"
            )

        features = pd.read_csv(features_csv_path)

        # Keep only rows whose record header (.hea) is actually on disk.
        has_record = np.array(
            [
                os.path.exists(os.path.join(path_record, *self._record_parts(ecg_id)) + '.hea')
                for ecg_id in features['ecg_id'].values
            ],
            dtype=bool,
        )
        features = features.loc[has_record].reset_index(drop=True)

        # Seeded partition: identical for every instance built with the same seed,
        # which is what keeps the train / validate / test rows disjoint.
        train_indices, validation_indices, test_indices = self._split_indices(
            len(features), train_percentage, validation_percentage, seed
        )

        if split == "train":
            features = features.iloc[train_indices]
        elif split == "validate":
            features = features.iloc[validation_indices]
        elif split == "test":
            features = features.iloc[test_indices]
        # Any other split name falls through and keeps all rows.
        features = features.reset_index(drop=True)

        # Rows without a value for the chosen target cannot be used as samples.
        column = self._TARGET_COLUMNS[parameter]
        self.df = features.dropna(subset=[column])
        values = torch.tensor(self.df[column].values, dtype=torch.float32)

        if parameter == 'hr':
            # Mean RR interval is treated as milliseconds; 60 * 1000 / RR converts it to a rate per minute.
            self.y = 60 * 1000 / values
        else:
            self.y = values

        # Size of the dataset
        self.samples = self.df.shape[0]

    @staticmethod
    def _record_parts(ecg_id):
        """Return ``(folder_name, file_stem)`` for a record id, e.g. 1234 -> ('01000', '01234_hr')."""
        file_index = int(ecg_id)
        folder_name = str(file_index // 1000).zfill(2) + '000'
        file_name = str(file_index).zfill(5) + '_hr'
        return folder_name, file_name

    @staticmethod
    def _split_indices(total_samples, train_fraction, validation_fraction, seed):
        """Shuffle ``range(total_samples)`` with ``seed`` and cut it into (train, validation, test) index arrays."""
        num_train = int(train_fraction * total_samples)
        num_validation = int(validation_fraction * total_samples)
        # A local generator keeps the split reproducible without touching NumPy's global RNG state.
        indices = np.random.default_rng(seed).permutation(total_samples)
        return (
            indices[:num_train],
            indices[num_train:num_train + num_validation],
            indices[num_train + num_validation:],
        )

    def __getitem__(self, index):
        """Load one record and return ``(ecg_signals, target)``.

        ``ecg_signals`` is the float32 signal array read by ``wfdb.rdsamp``,
        divided by 6 and transposed; ``target`` is ``self.y[index]``.
        """
        # Positional lookup: ``self.df`` and ``self.y`` are aligned row by row.
        ecg_id = self.df['ecg_id'].values[index]
        ecg_record_path = os.path.join(path_record, *self._record_parts(ecg_id))

        # wfdb.rdsamp reads both the .dat signal file and the .hea header file
        ecg_record_data, _ = wfdb.rdsamp(ecg_record_path)

        ecg_signals = torch.tensor(ecg_record_data).float()
        ecg_signals = ecg_signals / 6  # normalization by a fixed constant
        # Transposed so the channel axis comes first -- presumably (samples, leads) -> (leads, samples); confirm against wfdb
        ecg_signals = ecg_signals.t()

        return ecg_signals, self.y[index]

    def __len__(self):
        """Return the number of samples left after splitting and NaN filtering."""
        return self.samples

## Resnet

In [26]:

class KanResInit(nn.Module):
    """Entry block: two Conv1d -> BatchNorm1d -> ReLU stages applied back to back.

    Neither convolution is padded, so the sequence length shrinks through the
    block. Only the first convolution uses ``stride``.
    """

    def __init__(self, in_channels, filterno_1, filterno_2, filtersize_1, filtersize_2, stride):
        super().__init__()
        # Stage 1: in_channels -> filterno_1
        self.conv1 = nn.Conv1d(in_channels, filterno_1, kernel_size=filtersize_1, stride=stride)
        self.bn1 = nn.BatchNorm1d(filterno_1)
        # Stage 2: filterno_1 -> filterno_2
        self.conv2 = nn.Conv1d(filterno_1, filterno_2, kernel_size=filtersize_2)
        self.bn2 = nn.BatchNorm1d(filterno_2)
        # One stateless activation shared by both stages
        self.relu = nn.ReLU()

    def forward(self, x):
        """Run both stages on ``x`` and return the activated output of the second."""
        hidden = self.relu(self.bn1(self.conv1(x)))
        return self.relu(self.bn2(self.conv2(hidden)))

In [27]:
class KanResModule(nn.Module):
    """Residual block: two 'same'-padded Conv1d -> BatchNorm1d -> ReLU stages plus a skip connection.

    Both convolutions use ``padding='same'``, so the sequence length is
    preserved and the input can be added back onto the output. The addition
    requires the output channel count (``filterno_2``) to be compatible with
    ``in_channels``.

    ``stride`` is forwarded to the first convolution; PyTorch rejects
    ``padding='same'`` combined with a stride other than 1.
    """

    def __init__(self, in_channels, filterno_1, filterno_2, filtersize_1, filtersize_2, stride):
        super(KanResModule, self).__init__()
        # padding='same' lets PyTorch work out the padding that keeps the
        # sequence length unchanged, so no manual computation is needed.
        self.conv1 = nn.Conv1d(in_channels, filterno_1, filtersize_1, stride=stride, padding='same')
        self.bn1 = nn.BatchNorm1d(filterno_1)
        self.conv2 = nn.Conv1d(filterno_1, filterno_2, filtersize_2, padding='same')
        self.bn2 = nn.BatchNorm1d(filterno_2)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return ``x`` plus the output of the two convolution stages applied to ``x``."""
        identity = x
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.conv2(x)
        x = self.bn2(x)
        x = self.relu(x)
        # Skip connection: add the untouched input back onto the block output.
        x = x + identity
        return x

In [28]:
class KanResWide_X2(nn.Module):
    """1-D residual network: init block, average pooling, eight residual modules, global pooling and a linear head.

    Args:
        input_shape: shape of one input sample; only ``input_shape[0]`` (the
            channel count) is used to size the first convolution.
        output_size: number of values predicted per sample.
    """

    def __init__(self, input_shape, output_size):
        super(KanResWide_X2, self).__init__()

        self.input_shape = input_shape
        self.output_size = output_size

        self.init_block = KanResInit(input_shape[0], 64, 32, 8, 3, 1)
        self.pool = nn.AvgPool1d(kernel_size=2)

        # Eight identical residual modules; nn.Sequential names them "0".."7".
        self.module_blocks = nn.Sequential(
            *(KanResModule(32, 64, 32, 50, 50, 1) for _ in range(8))
        )

        self.global_avg_pool = nn.AdaptiveAvgPool1d(1)
        self.fc = nn.Linear(32, output_size)

    def forward(self, x):
        """Map a batch of inputs to predictions.

        Returns a tensor of shape ``(batch,)`` when ``output_size`` is 1 and
        ``(batch, output_size)`` otherwise. The batch axis is always kept.
        """
        x = self.init_block(x)
        x = self.pool(x)
        x = self.module_blocks(x)
        x = self.global_avg_pool(x)
        # Flatten (batch, channels, 1) to (batch, channels) for the linear head.
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        # Squeeze only the last axis: a bare torch.squeeze(x) would also drop the
        # batch axis for a batch of one, producing a 0-d tensor that no longer
        # lines up with the (1,)-shaped target.
        x = torch.squeeze(x, dim=-1)
        return x


## Training and Validation

In [29]:
def train(dataloader, model, loss_fn, optimizer, device):
    """Train ``model`` for one pass over ``dataloader`` and return the mean batch loss.

    Args:
        dataloader: iterable yielding ``(X, y)`` batches of tensors.
        model: module to optimise; it is switched to training mode.
        loss_fn: callable taking ``(pred, y)`` and returning a scalar loss tensor.
        optimizer: optimizer bound to the model's parameters.
        device: device the batches are moved to before the forward pass.

    Returns:
        ``np.mean`` of the per-batch loss values (NaN if the loader yields nothing).
    """
    model.train()

    train_losses_epoch = []
    for X, y in dataloader:
        X, y = X.to(device), y.to(device)

        # Compute prediction error
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation; gradients are cleared after each step so the next
        # batch starts from zero.
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        train_losses_epoch.append(loss.item())

    return np.mean(train_losses_epoch)

In [30]:
def validate(dataloader, model, loss_fn, device):
    """Evaluate ``model`` on every batch of ``dataloader`` and return the mean batch loss.

    The model is put in evaluation mode and no gradients are tracked.
    """
    model.eval()
    batch_losses = []

    with torch.no_grad():
        for inputs, targets in dataloader:
            inputs = inputs.to(device)
            targets = targets.to(device)

            # Forward pass only; keep the scalar loss of this batch
            batch_loss = loss_fn(model(inputs), targets)
            batch_losses.append(batch_loss.item())

    return np.mean(batch_losses)

In [31]:
# Shape of one model input as (channels, time steps)
input_shape = (12, 5000)
# Values predicted per record, and how many epochs to train for
output_size, number_of_epochs = 1, 50
# Learning rate handed to the optimizer
learning_rate = 5e-4

# Targets to train on; 'qrs', 'qt' and 'hr' are currently left out
y_parameters = ['pr']

In [32]:
# Choose the compute backend: CUDA if present, otherwise MPS, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

In [33]:
# Train and validate one fresh model per target parameter.
for y_parameter in y_parameters:

    # ECG dataset
    train_dataset = ECGDataSet_PTB_XL(parameter=y_parameter, split='train')
    validate_dataset = ECGDataSet_PTB_XL(parameter=y_parameter, split='validate')

    # data loaders
    train_dataloader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True, num_workers=20)
    validate_dataloader = DataLoader(dataset=validate_dataset, batch_size=16, shuffle=False, num_workers=20)

    # model -- moved to the same device the batches are sent to inside
    # train()/validate(); leaving it on the CPU fails as soon as device != "cpu"
    model = KanResWide_X2(input_shape, output_size).to(device)

    optimizer = optim.NAdam(model.parameters(), lr=learning_rate)
    # Loss function for linear values (e.g., regression)
    loss_fn = nn.MSELoss()  # Mean Squared Error loss

    # Per-epoch history for this target
    train_losses = []
    val_losses = []
    epochs = []

    # Iterate over the configured epoch count; `epochs` is the history list,
    # not a number, so it cannot be passed to range()
    for epoch in range(number_of_epochs):
        print(f"Epoch {epoch+1}\n-------------------------------")
        epochs.append(epoch)

        # train
        train_loss = train(train_dataloader, model, loss_fn, optimizer, device)
        train_losses.append(train_loss)

        # validation
        val_loss = validate(validate_dataloader, model, loss_fn, device)
        val_losses.append(val_loss)

        print("Training Loss: ", train_loss, "\t", "Validation Loss: ", val_loss )

FileNotFoundError: [Errno 2] No such file or directory: '/storage/projects2/e17-4yp-compreh-ecg-analysis/e17-4yp-Comprehensive-ECG-analysis-with-Deep-Learning-on-GPU-accelerators/python-scripts-resnet/data/ptb-xl-a-comprehensive-electrocardiographic-feature-dataset-1.0.1/features/12sl_features.csv'