#### 1. Setup and Importing Libraries

In [60]:
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset # wraps an iterable around the dataset
from torchvision import datasets    # stores the samples and their corresponding labels
from torchvision.transforms import transforms  # transformations we can perform on our dataset
from torchvision.transforms import ToTensor
import pandas as pd
import numpy as np
import os

#### 2. Data Loader

In [91]:
class ECGDataSet(Dataset):
    
    def __init__(self):
        # data loading
        current_directory = os.getcwd()
        parent_directory = os.path.dirname(current_directory)
        train_small_path = os.path.join(parent_directory, 'data', 'deepfake-ecg-small', 'train.csv')
        xy = pd.read_csv(train_small_path)  # Skip the header row
        
        # QT
        self.y = torch.tensor(xy['qt'].values)
        patient_ids = xy['patid'].values

        # ECG reports
        self.x = []
        # read each asc file
        for patient_id in patient_ids:
            asc_path = os.path.join(parent_directory, 'data', 'deepfake-ecg-small', 'train', str(patient_id)+'.asc')
            ecg_data = np.loadtxt(asc_path)
            ecg_tensor = torch.from_numpy(ecg_data)
            ecg_tensor = ecg_tensor.permute(1, 0).unsqueeze(2)
            self.x.append(ecg_tensor)

        # Size of the dataset
        self.samples = xy.shape[0]

    def __getitem__(self, index):
        # Retrieve a sample from x and y based on the index
        return self.x[index], self.y[index]

    def __len__(self):
        # Return the total number of samples in the dataset
        return self.samples
    
    # def read_file(self, filename):
    #     # Read the file and extract the lines
    #     with open(filename, 'r') as file:
    #         lines = file.readlines()
    #         # Initialize an empty matrix
    #         matrix = np.empty((8, 5000))
    #         # Iterate over each line and fill the matrix
    #         for i, line in enumerate(lines):
    #         # Split the line into individual values
    #             values = line.split()
    #             # Convert the values to integers and store them in the matrix
    #             matrix[:, i] = np.array(values, dtype=int)

    #     return matrix

In [94]:
# ECG dataset
dataset = ECGDataSet()

ValueError: only one element tensors can be converted to Python scalars

In [63]:
# first data
first_data = dataset[0]
x, y = first_data

In [81]:
x

tensor([[[-127.],
         [-162.],
         [-142.],
         ...,
         [ -89.],
         [ -39.],
         [ -93.]],

        [[  -1.],
         [   0.],
         [ -46.],
         ...,
         [ -18.],
         [  22.],
         [   5.]],

        [[ -33.],
         [  -8.],
         [ -27.],
         ...,
         [  44.],
         [  71.],
         [  82.]],

        ...,

        [[ -92.],
         [ -86.],
         [ -87.],
         ...,
         [  67.],
         [  89.],
         [ 105.]],

        [[ -61.],
         [ -67.],
         [ -70.],
         ...,
         [  52.],
         [  88.],
         [  26.]],

        [[   2.],
         [ -29.],
         [ -25.],
         ...,
         [  69.],
         [ 128.],
         [ 115.]]], dtype=torch.float64)

In [65]:
y

tensor(434)

In [79]:
x.shape

torch.Size([8, 5000, 1])

In [82]:
y.shape

10000


In [72]:
# data loader
# It allows you to efficiently load and iterate over batches of data during the training or evaluation process.
dataloader = DataLoader(dataset=dataset, batch_size=4, shuffle=True, num_workers=2)

### CNN

In [84]:
# Load whole dataset with DataLoader
# shuffle: shuffle data, good for training
# num_workers: faster loading with multiple subprocesses
# !!! IF YOU GET AN ERROR DURING LOADING, SET num_workers TO 0 !!!
batch_size = 4
dataloader = DataLoader(dataset=dataset,
                          batch_size=batch_size,
                          shuffle=True,
                          num_workers=2)

# # convert to an iterator and look at one random sample
# dataiter = iter(train_loader)
# data = next(dataiter)
# features, labels = data
# print(features, labels)

In [85]:
# CNN model
class CNN(nn.Module):
    def __init__(self, num_classes):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv1d(8, 16, kernel_size=3, stride=1, padding=1)
        self.relu = nn.ReLU()
        self.maxpool = nn.MaxPool1d(kernel_size=2, stride=2)
        self.fc = nn.Linear(16 * 2500, num_classes)

    def forward(self, x):
        x = self.conv1(x)
        x = self.relu(x)
        x = self.maxpool(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        return x


In [86]:
# hyperparameters
num_classes = 1  # Number of output classes
learning_rate = 0.001
num_epochs = 1

In [87]:
model = CNN(num_classes)

# criterion = nn.CrossEntropyLoss()
criterion = nn.MSELoss()

# optimizer = optim.Adam(model.parameters(), lr=learning_rate)
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

In [88]:
for epoch in range(num_epochs):
    for batch_inputs, batch_labels in dataloader:
        # Forward pass
        outputs = model(batch_inputs)
        loss = criterion(outputs, batch_labels)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Print the loss after every epoch
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

RuntimeError: DataLoader worker (pid(s) 14660, 7284) exited unexpectedly