#### 1. Setup and Importing Libraries

In [1]:
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset # wraps an iterable around the dataset
from torchvision import datasets    # stores the samples and their corresponding labels
from torchvision.transforms import transforms  # transformations we can perform on our dataset
from torchvision.transforms import ToTensor
import pandas as pd
import numpy as np
import os

#### 2. Data Loader

In [2]:
class ECGDataSet(Dataset):
    """In-memory dataset pairing ECG recordings with a target value from ``train.csv``.

    Every row of ``train.csv`` names one recording stored as ``train/<id>.asc``.
    All recordings are parsed eagerly in the constructor and kept in ``self.x``;
    the matching targets are kept in ``self.y``.

    Attributes:
        x: float64 array of shape ``(n_records, NUM_LEADS, NUM_SAMPLES)``.
        y: array of shape ``(n_records, 1)`` taken from column ``TARGET_COLUMN``.
        samples: number of records (rows of ``train.csv``).
    """

    # Expected layout of every ``.asc`` recording: one line per time step,
    # one whitespace-separated integer per lead.
    NUM_LEADS = 8
    NUM_SAMPLES = 5000

    # Positional columns of ``train.csv`` used by the dataset.
    PATIENT_ID_COLUMN = 1
    TARGET_COLUMN = 12  # labelled "QT" by the notebook author

    def __init__(self, data_dir=None):
        """Load the CSV index and every recording it references.

        Args:
            data_dir: directory containing ``train.csv`` and the ``train/``
                folder of ``.asc`` files. Defaults to
                ``<parent of cwd>/data/deepfake-ecg-small``.

        Raises:
            FileNotFoundError: if the CSV or a referenced ``.asc`` file is missing.
            ValueError: if a recording does not have the expected shape.
        """
        if data_dir is None:
            parent_directory = os.path.dirname(os.getcwd())
            data_dir = os.path.join(parent_directory, 'data', 'deepfake-ecg-small')

        train_small_path = os.path.join(data_dir, 'train.csv')
        # read_csv uses the first row as column names, so it is not part of the data.
        xy = pd.read_csv(train_small_path)

        # Target values, kept 2-D so each sample's target has shape (1,).
        self.y = xy.iloc[:, [self.TARGET_COLUMN]].values
        patient_ids = xy.iloc[:, self.PATIENT_ID_COLUMN].values

        # ECG recordings; every slot is overwritten by a validated matrix below.
        self.x = np.empty((len(patient_ids), self.NUM_LEADS, self.NUM_SAMPLES))
        for i, patient_id in enumerate(patient_ids):
            asc_path = os.path.join(data_dir, 'train', str(patient_id) + '.asc')
            self.x[i] = self.read_file(asc_path)

        # Size of the dataset
        self.samples = xy.shape[0]

    def __getitem__(self, index):
        """Return the ``(recording, target)`` pair stored at ``index``."""
        return self.x[index], self.y[index]

    def __len__(self):
        """Return the total number of samples in the dataset."""
        return self.samples

    def read_file(self, filename):
        """Parse one ``.asc`` recording into a ``(NUM_LEADS, NUM_SAMPLES)`` float matrix.

        Blank lines are ignored. The file must otherwise contain exactly
        ``NUM_SAMPLES`` lines of ``NUM_LEADS`` integers each; anything else is
        rejected instead of returning a partially filled matrix.

        Args:
            filename: path of the ``.asc`` file to read.

        Returns:
            C-contiguous float64 array with one row per lead.

        Raises:
            ValueError: if the line count or a line's value count is wrong, or
                a value is not an integer.
        """
        with open(filename, 'r') as file:
            rows = [line.split() for line in file if line.strip()]

        if len(rows) != self.NUM_SAMPLES:
            raise ValueError(
                f"{filename}: expected {self.NUM_SAMPLES} data lines, found {len(rows)}"
            )

        bad_row = next(
            (idx for idx, values in enumerate(rows) if len(values) != self.NUM_LEADS),
            None,
        )
        if bad_row is not None:
            raise ValueError(
                f"{filename}: data line {bad_row + 1} has {len(rows[bad_row])} values, "
                f"expected {self.NUM_LEADS}"
            )

        # Parsed as (time step, lead); transpose so that each row is one lead.
        samples_by_lead = np.array(rows, dtype=int)
        return np.ascontiguousarray(samples_by_lead.T, dtype=np.float64)

In [3]:
# Build the ECG dataset. The constructor reads train.csv and parses every
# referenced .asc recording, so all signals are loaded into memory here.
dataset = ECGDataSet()

In [4]:
# Fetch the first sample; indexing the dataset returns a (signal, target) pair.
first_data = dataset[0]
# x: signal matrix of the first record, y: its target value(s)
x, y = first_data

In [5]:
# Signal matrix of the first record (one row per lead, one column per time step).
x

array([[-127., -162., -142., ...,  -89.,  -39.,  -93.],
       [  -1.,    0.,  -46., ...,  -18.,   22.,    5.],
       [ -33.,   -8.,  -27., ...,   44.,   71.,   82.],
       ...,
       [ -92.,  -86.,  -87., ...,   67.,   89.,  105.],
       [ -61.,  -67.,  -70., ...,   52.,   88.,   26.],
       [   2.,  -29.,  -25., ...,   69.,  128.,  115.]])

In [6]:
# Target of the first record, kept as a length-1 array (the column the class labels "QT").
y

array([434], dtype=int64)

In [7]:
# data loader
# Batches and shuffles the dataset so it can be iterated during training or evaluation.
# num_workers=0 loads batches in the main process: the dataset already holds every
# recording in memory, and worker processes started with "spawn" (the default on
# Windows) cannot import a Dataset class that is defined in a notebook cell.
dataloader = DataLoader(dataset=dataset, batch_size=16, shuffle=True, num_workers=0)

In [8]:
# Iterator over the batches produced by the data loader.
dataitter = iter(dataloader)

# Retrieve the first batch with the built-in next(); the Python 2 style
# `.next()` alias is no longer available on DataLoader iterators in recent
# PyTorch releases, so calling it raises AttributeError.
data = next(dataitter)

# Each batch is a (signals, targets) pair.
x, y = data

# print them
print(x)
print(y)