# Preprocessing steps for the project

In [34]:
# Import libraries required
import numpy as np
import torch
import pandas as pd
import matplotlib.pyplot as plt 
import os
import wfdb
import pickle
import sys
import glob

## Check for GPU 
Background checks to make sure CUDA is set up and linked to PyTorch. See below for setup:

<a href="https://developer.nvidia.com/cuda-downloads?target_os=Windows&target_arch=x86_64&target_version=10&target_type=exe_local">Download CUDA</a>

```pip uninstall torch torchvision torchaudio```

```pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu118```

Then, to verify that the installation worked, run the following in a terminal:

```python -c "import torch; print('CUDA available:', torch.cuda.is_available(), 'Number of GPUs:', torch.cuda.device_count(), 'Current device:', torch.cuda.current_device(), 'Device name:', torch.cuda.get_device_name(0) if torch.cuda.device_count() > 0 else 'No GPU found')"```

In [None]:
# Select the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()

if device.type == 'cuda':
    # Report on GPU 0 only (index 0 is hard-coded in every call below).
    print('CUDA Device Name:', torch.cuda.get_device_name(0))
    print('CUDA Version:', torch.version.cuda)
    print('PyTorch Version:', torch.__version__)

    # Byte counts are converted to GiB (1024**3) and rounded to one decimal.
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0) / 1024**3, 1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_reserved(0) / 1024**3, 1), 'GB')

    # Total memory info
    total_memory = torch.cuda.get_device_properties(0).total_memory
    print('Total Memory:', round(total_memory / 1024**3, 1), 'GB')

    # "Free" here is total minus what PyTorch has reserved in this process.
    free_memory = total_memory - torch.cuda.memory_reserved(0)
    print('Free Memory:', round(free_memory / 1024**3, 1), 'GB')

    # Tensor check: allocate on the selected device so the CUDA path is
    # actually exercised (a plain torch.rand(...) would live on the CPU and
    # succeed even if the GPU were unusable).
    x = torch.rand(2, 3, device=device)
    print('Test Tensor:', x)
else:
    print('Using the CPU, no GPU found')


## Load Dataset, simple data exploration
To load the dataset, we will use the `WFDB` (Waveform Database) library. It is part of the PhysioNet project and is a software package for reading, writing, and processing physiological signals, primarily ECG (electrocardiogram) signals.

In [None]:
def get_records(path):
    """Return the sorted, extension-less paths of the ECG records under ``path``.

    Each patient record consists of several files sharing one stem
    (.atr, .dat, .hea, .xws); the ``.atr`` files are used to enumerate
    the records.

    Args:
        path: Directory that holds the record files. A trailing path
            separator is optional. A value that is not an existing
            directory is still treated as a raw prefix and globbed as
            ``f'{path}*.atr'``, as before.

    Returns:
        Sorted list of record paths with the ``.atr`` extension removed,
        excluding the record named ``102-0``.
    """
    if os.path.isdir(path):
        # Works with or without a trailing separator on the directory.
        pattern = os.path.join(path, '*.atr')
    else:
        pattern = f'{path}*.atr'

    stems = sorted(
        os.path.splitext(atr_path)[0] for atr_path in glob.glob(pattern)
    )

    # PhysioNet note (7/06/2018): 102.atr has been edited -- annotation
    # number 1991 (0-indexed) was shifted from sample 590296 to 590262; the
    # original is kept as 102-0.atr. Skip that duplicate. Comparing the
    # basename keeps the filter independent of the OS path separator.
    return [stem for stem in stems if os.path.basename(stem) != '102-0']


In [None]:
def show_patient_info(record_path):
    """Print the metadata of one record, followed by a blank line.

    The record is read with ``wfdb.rdsamp``. Per the dataset website /
    library notes, element 0 of the result holds the ECG samples and
    element 1 the patient data; only element 1 is printed.

    Load failures are reported on stdout rather than raised.
    """
    try:
        patient_fields = wfdb.rdsamp(record_path)[1]
        # Printed as one raw dump. Keys named in the earlier per-field
        # prints: fs, sig_len, n_sig, base_date, base_time, units,
        # sig_name, comments.
        print(patient_fields)
        print("")
    except FileNotFoundError:
        print(f"File not found: {record_path}, check file name + path xdd")
    except Exception as err:
        print(f"couldn't load file, error: {err}")




In [None]:
# Single source of truth for the dataset folder (trailing slash included),
# so the listing and the single-record lookup below cannot drift apart.
DATA_DIR = 'C:/Users/adamb/Downloads/mit-bih-arrhythmia-database-1.0.0/'

# Identify all record paths and print each on its own line.
record_paths = get_records(DATA_DIR)

for record in record_paths:
    print(record)

# Print one patient's metadata (record 100).
record_name = DATA_DIR + '100'  # folder path + record name
show_patient_info(record_name)

In [None]:
# TODO: formal, more in-depth loading + preprocessing. Determine the approach (heartbeat-by-heartbeat or full signal?).