In [2]:
import wfdb
import numpy as np
import pandas as pd
import neurokit2 as nk
from scipy.signal import resample
import os
import logging
from multiprocessing import Pool, cpu_count

# Set up logging to track errors and progress.
# force=True replaces any handlers already attached to the root logger; without
# it basicConfig() is a silent no-op when the hosting environment (or an earlier
# run of this cell) has configured logging, and nothing reaches preprocess.log.
logging.basicConfig(
    filename='preprocess.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    force=True,
)

def process_file(file_info):
    """Preprocess a single ECG record and save its signal and features.

    Reads the WFDB record, takes the first channel, resamples it to 250 Hz,
    min-max scales it to [-1, 1] and extracts features with NeuroKit2. When the
    full NeuroKit2 pipeline raises, a minimal feature row is built from R-peak
    detection alone.

    Args:
        file_info: ``(dataset_path, file, output_dir)`` tuple. ``file`` is the
            name of a ``.dat`` file located in ``dataset_path``; results are
            written to ``output_dir`` as ``<stem>_processed.npy`` and
            ``<stem>_features.csv``.

    Returns:
        str: ``"Successfully processed <file>"`` or
        ``"Error processing <file>: <exception>"``. Failures during processing
        are reported in the return value instead of being raised.
    """
    dataset_path, file, output_dir = file_info
    target_fs = 250  # Hz; common rate every record is resampled to

    # splitext strips only the final extension, so a name such as
    # "rec.v2.dat" maps to record "rec.v2" rather than "rec".
    stem = os.path.splitext(file)[0]
    record_name = os.path.join(dataset_path, stem)

    try:
        # Read ECG record
        record = wfdb.rdrecord(record_name)
        signal = record.p_signal[:, 0]  # Use first channel

        # Ensure signal has data
        if len(signal) == 0:
            raise ValueError("Empty signal detected")

        # Resample to 250 Hz
        n_target = int(len(signal) * target_fs / record.fs)
        if n_target < 1:
            raise ValueError("Signal too short to resample")
        signal_resampled = resample(signal, n_target)

        # Normalize to [-1, 1]. A flat or NaN-containing signal has no usable
        # range; fail with a clear message instead of dividing by zero and
        # saving an all-NaN array.
        lowest = signal_resampled.min()
        span = signal_resampled.max() - lowest
        if not np.isfinite(span) or np.isclose(span, 0.0):
            raise ValueError("Flat or non-finite signal; cannot normalize")
        signal_normalized = (signal_resampled - lowest) / span * 2 - 1

        # Extract ECG features
        try:
            ecg_signals, info = nk.ecg_process(signal_normalized, sampling_rate=target_fs)
            features = nk.ecg_analyze(ecg_signals, sampling_rate=target_fs)
        except Exception as e:
            logging.warning(f"Failed to extract features for {file}: {e}. Using minimal features.")
            # ecg_peaks returns a (signals, info) pair; the R-peak sample
            # indices are stored in the info dict.
            _, peak_info = nk.ecg_peaks(signal_normalized, sampling_rate=target_fs)
            r_peaks = np.asarray(peak_info['ECG_R_Peaks'])
            rr_intervals = np.diff(r_peaks)  # in samples at target_fs
            features = pd.DataFrame({
                # Mean R-R interval; 0 when fewer than two peaks were found.
                'HRV': [rr_intervals.mean() if rr_intervals.size > 0 else 0],
                'R_peaks': [len(r_peaks)],
                'amplitude': [signal_normalized.max() - signal_normalized.min()]
            })

        # Save processed signal and features
        np.save(os.path.join(output_dir, f"{stem}_processed.npy"), signal_normalized)
        features.to_csv(os.path.join(output_dir, f"{stem}_features.csv"), index=False)

        return f"Successfully processed {file}"

    except Exception as e:
        # Report instead of raising so one bad record does not abort the batch.
        return f"Error processing {file}: {e}"

def preprocess_ecg(data_dir, output_dir, max_files=None):
    """Preprocess the ECG records of the known datasets in parallel.

    Collects ``.dat`` files below ``<data_dir>/mitdb`` and
    ``<data_dir>/ptb-xl`` and runs ``process_file`` on each one in a worker
    pool. Sub-folders are searched as well, so datasets that keep their
    records in nested directories (as the PTB-XL distribution does) are not
    silently skipped. Files are visited in sorted order so that ``max_files``
    always selects the same subset.

    Args:
        data_dir: Folder containing the dataset sub-folders.
        output_dir: Folder for the processed outputs; created if missing.
        max_files: Optional non-negative cap on the number of files taken
            from each dataset (useful for quick test runs). ``None`` means
            no limit.

    Returns:
        list[str]: One status message per file, as returned by
        ``process_file``; empty when no ``.dat`` file was found.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Collect all files from both datasets
    file_list = []
    for dataset in ["mitdb", "ptb-xl"]:
        dataset_path = os.path.join(data_dir, dataset)
        if not os.path.isdir(dataset_path):
            logging.warning(f"Dataset path {dataset_path} not found, skipping.")
            continue

        dataset_files = []
        for root, dirs, names in os.walk(dataset_path):
            dirs.sort()  # in-place sort makes the traversal order deterministic
            dataset_files.extend(
                (root, name, output_dir)
                for name in sorted(names)
                if name.endswith(".dat")
            )
        if max_files is not None:
            dataset_files = dataset_files[:max_files]  # Limit for testing

        file_list.extend(dataset_files)

    if not file_list:
        # Nothing to do: avoid spawning worker processes for an empty job.
        logging.warning(f"No .dat files found under {data_dir}; nothing to process.")
        return []

    logging.info(f"Processing {len(file_list)} files in parallel")

    # Use multiprocessing to speed up processing: at most 8 workers, and never
    # more workers than there are files.
    num_workers = min(cpu_count(), 8, len(file_list))
    with Pool(num_workers) as pool:
        results = pool.map(process_file, file_list)

    # Surface failures at ERROR level so they stand out in the log.
    for res in results:
        if res.startswith("Error processing"):
            logging.error(res)
        else:
            logging.info(res)

    logging.info("Preprocessing completed.")
    return results

if __name__ == "__main__":
    # NOTE(review): these are machine-specific absolute Windows paths; they
    # must be adjusted when the notebook runs on another host.
    preprocess_ecg(
        data_dir="C:/Users/akshi/OneDrive/Desktop/HeartGuardAI/data",
        output_dir="C:/Users/akshi/OneDrive/Desktop/HeartGuardAI/data/processed",
    )


ModuleNotFoundError: No module named 'wfdb'

In [2]:
!pip install wfdb numpy pandas neurokit2 scipy torch onnx shap

Collecting wfdb
  Downloading wfdb-4.2.0-py3-none-any.whl.metadata (3.7 kB)
Collecting neurokit2
  Downloading neurokit2-0.2.10-py2.py3-none-any.whl.metadata (37 kB)
Collecting onnx
  Downloading onnx-1.17.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (16 kB)
Collecting pandas
  Downloading pandas-2.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.9/89.9 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collec

In [3]:
pip install pandas==2.2.2


Collecting pandas==2.2.2
  Downloading pandas-2.2.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
Downloading pandas-2.2.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/13.0 MB[0m [31m34.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pandas
  Attempting uninstall: pandas
    Found existing installation: pandas 2.2.3
    Uninstalling pandas-2.2.3:
      Successfully uninstalled pandas-2.2.3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
wfdb 4.2.0 requires pandas>=2.2.3, but you have pandas 2.2.2 which is incompatible.[0m[31m
[0mSuccessfully installed pandas-2.2.2


In [4]:
pip install wfdb==4.0.0


Collecting wfdb==4.0.0
  Downloading wfdb-4.0.0-py3-none-any.whl.metadata (7.3 kB)
Collecting SoundFile<0.12.0,>=0.10.0 (from wfdb==4.0.0)
  Downloading soundfile-0.11.0-py2.py3-none-any.whl.metadata (13 kB)
Collecting pandas<2.0.0,>=1.0.0 (from wfdb==4.0.0)
  Downloading pandas-1.5.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading wfdb-4.0.0-py3-none-any.whl (161 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m161.7/161.7 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pandas-1.5.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m75.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading soundfile-0.11.0-py2.py3-none-any.whl (23 kB)
Installing collected packages: SoundFile, pandas, wfdb
  Attempting uninstall: SoundFile
    Found existing installation: soundfile 0.13.1
    Uninstalling soundfile-0.13.1:
      Succes

In [5]:
!pip show wfdb pandas google-colab

Name: wfdb
Version: 4.0.0
Summary: The WFDB Python package: tools for reading, writing, and processing physiologic signals and annotations.
Home-page: https://github.com/MIT-LCP/wfdb-python/
Author: The Laboratory for Computational Physiology
Author-email: contact@physionet.org
License: MIT
Location: /usr/local/lib/python3.11/dist-packages
Requires: matplotlib, numpy, pandas, requests, scipy, SoundFile
Required-by: 
---
Name: pandas
Version: 1.5.3
Summary: Powerful data structures for data analysis, time series, and statistics
Home-page: https://pandas.pydata.org
Author: The Pandas Development Team
Author-email: pandas-dev@python.org
License: BSD-3-Clause
Location: /usr/local/lib/python3.11/dist-packages
Requires: numpy, python-dateutil, pytz
Required-by: arviz, bigframes, bigquery-magics, bokeh, bqplot, cmdstanpy, cudf-cu12, cufflinks, dask-cuda, dask-cudf-cu12, dask-expr, datascience, db-dtypes, dopamine_rl, fastai, geemap, geopandas, google-colab, gspread-dataframe, holoviews, ibis-

In [6]:
!pip uninstall wfdb -y
!pip install wfdb==3.4.0 pandas==2.2.2 neurokit2==0.2.2 scipy torch onnx shap

Found existing installation: wfdb 4.0.0
Uninstalling wfdb-4.0.0:
  Successfully uninstalled wfdb-4.0.0
Collecting wfdb==3.4.0
  Downloading wfdb-3.4.0-py3-none-any.whl.metadata (3.6 kB)
Collecting pandas==2.2.2
  Using cached pandas-2.2.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
Collecting neurokit2==0.2.2
  Downloading neurokit2-0.2.2-py2.py3-none-any.whl.metadata (37 kB)
Downloading wfdb-3.4.0-py3-none-any.whl (137 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.5/137.5 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hUsing cached pandas-2.2.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.0 MB)
Downloading neurokit2-0.2.2-py2.py3-none-any.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m27.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pandas, wfdb, neurokit2
  Attempting uninstall: pandas
    Found existing installation: pandas 1.5.3
    Un

In [7]:
!pip show wfdb pandas neurokit2

Name: wfdb
Version: 3.4.0
Summary: The WFDB Python Toolbox
Home-page: https://github.com/MIT-LCP/wfdb-python
Author: The Laboratory for Computational Physiology
Author-email: support@physionet.org
License: MIT
Location: /usr/local/lib/python3.11/dist-packages
Requires: certifi, chardet, cycler, idna, joblib, kiwisolver, matplotlib, numpy, pandas, pyparsing, python-dateutil, pytz, requests, scikit-learn, scipy, threadpoolctl, urllib3
Required-by: 
---
Name: pandas
Version: 2.2.2
Summary: Powerful data structures for data analysis, time series, and statistics
Home-page: https://pandas.pydata.org
Author: 
Author-email: The Pandas Development Team <pandas-dev@python.org>
License: BSD 3-Clause License

Copyright (c) 2008-2011, AQR Capital Management, LLC, Lambda Foundry, Inc. and PyData Development Team
All rights reserved.

Copyright (c) 2011-2023, Open source contributors.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the fol

In [1]:
import wfdb
import numpy as np
import pandas as pd
import neurokit2 as nk
from scipy.signal import resample
import os
import logging
from multiprocessing import Pool, cpu_count

# Set up logging to track errors and progress.
# force=True replaces any handlers already attached to the root logger; without
# it basicConfig() is a silent no-op when the hosting environment (or an earlier
# run of this cell) has configured logging, and nothing reaches preprocess.log.
logging.basicConfig(
    filename='preprocess.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    force=True,
)

def process_file(file_info):
    """Preprocess a single ECG record and save its signal and features.

    Reads the WFDB record, takes the first channel, resamples it to 250 Hz,
    min-max scales it to [-1, 1] and extracts features with NeuroKit2. When the
    full NeuroKit2 pipeline raises, a minimal feature row is built from R-peak
    detection alone.

    Args:
        file_info: ``(dataset_path, file, output_dir)`` tuple. ``file`` is the
            name of a ``.dat`` file located in ``dataset_path``; results are
            written to ``output_dir`` as ``<stem>_processed.npy`` and
            ``<stem>_features.csv``.

    Returns:
        str: ``"Successfully processed <file>"`` or
        ``"Error processing <file>: <exception>"``. Failures during processing
        are reported in the return value instead of being raised.
    """
    dataset_path, file, output_dir = file_info
    target_fs = 250  # Hz; common rate every record is resampled to

    # splitext strips only the final extension, so a name such as
    # "rec.v2.dat" maps to record "rec.v2" rather than "rec".
    stem = os.path.splitext(file)[0]
    record_name = os.path.join(dataset_path, stem)

    try:
        # Read ECG record
        record = wfdb.rdrecord(record_name)
        signal = record.p_signal[:, 0]  # Use first channel

        # Ensure signal has data
        if len(signal) == 0:
            raise ValueError("Empty signal detected")

        # Resample to 250 Hz
        n_target = int(len(signal) * target_fs / record.fs)
        if n_target < 1:
            raise ValueError("Signal too short to resample")
        signal_resampled = resample(signal, n_target)

        # Normalize to [-1, 1]. A flat or NaN-containing signal has no usable
        # range; fail with a clear message instead of dividing by zero and
        # saving an all-NaN array.
        lowest = signal_resampled.min()
        span = signal_resampled.max() - lowest
        if not np.isfinite(span) or np.isclose(span, 0.0):
            raise ValueError("Flat or non-finite signal; cannot normalize")
        signal_normalized = (signal_resampled - lowest) / span * 2 - 1

        # Extract ECG features
        try:
            ecg_signals, info = nk.ecg_process(signal_normalized, sampling_rate=target_fs)
            features = nk.ecg_analyze(ecg_signals, sampling_rate=target_fs)
        except Exception as e:
            logging.warning(f"Failed to extract features for {file}: {e}. Using minimal features.")
            # ecg_peaks returns a (signals, info) pair; the R-peak sample
            # indices are stored in the info dict.
            _, peak_info = nk.ecg_peaks(signal_normalized, sampling_rate=target_fs)
            r_peaks = np.asarray(peak_info['ECG_R_Peaks'])
            rr_intervals = np.diff(r_peaks)  # in samples at target_fs
            features = pd.DataFrame({
                # Mean R-R interval; 0 when fewer than two peaks were found.
                'HRV': [rr_intervals.mean() if rr_intervals.size > 0 else 0],
                'R_peaks': [len(r_peaks)],
                'amplitude': [signal_normalized.max() - signal_normalized.min()]
            })

        # Save processed signal and features
        np.save(os.path.join(output_dir, f"{stem}_processed.npy"), signal_normalized)
        features.to_csv(os.path.join(output_dir, f"{stem}_features.csv"), index=False)

        return f"Successfully processed {file}"

    except Exception as e:
        # Report instead of raising so one bad record does not abort the batch.
        return f"Error processing {file}: {e}"

def preprocess_ecg(data_dir, output_dir, max_files=None):
    """Preprocess the ECG records of the known datasets in parallel.

    Collects ``.dat`` files below ``<data_dir>/mitdb`` and
    ``<data_dir>/ptb-xl`` and runs ``process_file`` on each one in a worker
    pool. Sub-folders are searched as well, so datasets that keep their
    records in nested directories (as the PTB-XL distribution does) are not
    silently skipped. Files are visited in sorted order so that ``max_files``
    always selects the same subset.

    Args:
        data_dir: Folder containing the dataset sub-folders.
        output_dir: Folder for the processed outputs; created if missing.
        max_files: Optional non-negative cap on the number of files taken
            from each dataset (useful for quick test runs). ``None`` means
            no limit.

    Returns:
        list[str]: One status message per file, as returned by
        ``process_file``; empty when no ``.dat`` file was found.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Collect all files from both datasets
    file_list = []
    for dataset in ["mitdb", "ptb-xl"]:
        dataset_path = os.path.join(data_dir, dataset)
        if not os.path.isdir(dataset_path):
            logging.warning(f"Dataset path {dataset_path} not found, skipping.")
            continue

        dataset_files = []
        for root, dirs, names in os.walk(dataset_path):
            dirs.sort()  # in-place sort makes the traversal order deterministic
            dataset_files.extend(
                (root, name, output_dir)
                for name in sorted(names)
                if name.endswith(".dat")
            )
        if max_files is not None:
            dataset_files = dataset_files[:max_files]  # Limit for testing

        file_list.extend(dataset_files)

    if not file_list:
        # Nothing to do: avoid spawning worker processes for an empty job.
        logging.warning(f"No .dat files found under {data_dir}; nothing to process.")
        return []

    logging.info(f"Processing {len(file_list)} files in parallel")

    # Use multiprocessing to speed up processing: at most 8 workers, and never
    # more workers than there are files.
    num_workers = min(cpu_count(), 8, len(file_list))
    with Pool(num_workers) as pool:
        results = pool.map(process_file, file_list)

    # Surface failures at ERROR level so they stand out in the log.
    for res in results:
        if res.startswith("Error processing"):
            logging.error(res)
        else:
            logging.info(res)

    logging.info("Preprocessing completed.")
    return results

if __name__ == "__main__":
    # NOTE(review): these are machine-specific absolute Windows paths; they
    # must be adjusted when the notebook runs on another host.
    preprocess_ecg(
        data_dir="C:/Users/akshi/OneDrive/Desktop/HeartGuardAI/data",
        output_dir="C:/Users/akshi/OneDrive/Desktop/HeartGuardAI/data/processed",
    )


ModuleNotFoundError: No module named 'wfdb'

In [3]:
pip install pandas==2.2.2




In [4]:
pip list

Package                            Version
---------------------------------- ------------------
absl-py                            1.4.0
accelerate                         1.3.0
aiohappyeyeballs                   2.4.6
aiohttp                            3.11.13
aiosignal                          1.3.2
alabaster                          1.0.0
albucore                           0.0.23
albumentations                     2.0.4
ale-py                             0.10.2
altair                             5.5.0
annotated-types                    0.7.0
anyio                              3.7.1
argon2-cffi                        23.1.0
argon2-cffi-bindings               21.2.0
array_record                       0.6.0
arviz                              0.20.0
astropy                            7.0.1
astropy-iers-data                  0.2025.2.24.0.34.4
astunparse                         1.6.3
atpublic                           4.1.0
attrs                              25.1.0
audioread            

In [5]:
!pip list | grep wfdb


In [6]:
!pip show wfdb


[0m

In [7]:
!pip install wfdb==3.4.0 --no-cache-dir


Collecting wfdb==3.4.0
  Downloading wfdb-3.4.0-py3-none-any.whl.metadata (3.6 kB)
Downloading wfdb-3.4.0-py3-none-any.whl (137 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.5/137.5 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: wfdb
Successfully installed wfdb-3.4.0


In [8]:
!pip show wfdb


Name: wfdb
Version: 3.4.0
Summary: The WFDB Python Toolbox
Home-page: https://github.com/MIT-LCP/wfdb-python
Author: The Laboratory for Computational Physiology
Author-email: support@physionet.org
License: MIT
Location: /usr/local/lib/python3.11/dist-packages
Requires: certifi, chardet, cycler, idna, joblib, kiwisolver, matplotlib, numpy, pandas, pyparsing, python-dateutil, pytz, requests, scikit-learn, scipy, threadpoolctl, urllib3
Required-by: 


In [9]:
!pip install pandas==2.2.2 neurokit2==0.2.2 scipy torch onnx shap --no-cache-dir --force-reinstall


Collecting pandas==2.2.2
  Downloading pandas-2.2.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
Collecting neurokit2==0.2.2
  Downloading neurokit2-0.2.2-py2.py3-none-any.whl.metadata (37 kB)
Collecting scipy
  Downloading scipy-1.15.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m243.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch
  Downloading torch-2.6.0-cp311-cp311-manylinux1_x86_64.whl.metadata (28 kB)
Collecting onnx
  Downloading onnx-1.17.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (16 kB)
Collecting shap
  Downloading shap-0.46.0-cp311-cp311-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (24 kB)
Collecting numpy>=1.23.2 (from pandas==2.2.2)
  Downloading numpy-2.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
[2K   

In [None]:
import wfdb
import numpy as np
import pandas as pd
import neurokit2 as nk
from scipy.signal import resample
import os
import logging
from multiprocessing import Pool, cpu_count

# Set up logging to track errors and progress.
# force=True replaces any handlers already attached to the root logger; without
# it basicConfig() is a silent no-op when the hosting environment (or an earlier
# run of this cell) has configured logging, and nothing reaches preprocess.log.
logging.basicConfig(
    filename='preprocess.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    force=True,
)

def process_file(file_info):
    """Preprocess a single ECG record and save its signal and features.

    Reads the WFDB record, takes the first channel, resamples it to 250 Hz,
    min-max scales it to [-1, 1] and extracts features with NeuroKit2. When the
    full NeuroKit2 pipeline raises, a minimal feature row is built from R-peak
    detection alone.

    Args:
        file_info: ``(dataset_path, file, output_dir)`` tuple. ``file`` is the
            name of a ``.dat`` file located in ``dataset_path``; results are
            written to ``output_dir`` as ``<stem>_processed.npy`` and
            ``<stem>_features.csv``.

    Returns:
        str: ``"Successfully processed <file>"`` or
        ``"Error processing <file>: <exception>"``. Failures during processing
        are reported in the return value instead of being raised.
    """
    dataset_path, file, output_dir = file_info
    target_fs = 250  # Hz; common rate every record is resampled to

    # splitext strips only the final extension, so a name such as
    # "rec.v2.dat" maps to record "rec.v2" rather than "rec".
    stem = os.path.splitext(file)[0]
    record_name = os.path.join(dataset_path, stem)

    try:
        # Read ECG record
        record = wfdb.rdrecord(record_name)
        signal = record.p_signal[:, 0]  # Use first channel

        # Ensure signal has data
        if len(signal) == 0:
            raise ValueError("Empty signal detected")

        # Resample to 250 Hz
        n_target = int(len(signal) * target_fs / record.fs)
        if n_target < 1:
            raise ValueError("Signal too short to resample")
        signal_resampled = resample(signal, n_target)

        # Normalize to [-1, 1]. A flat or NaN-containing signal has no usable
        # range; fail with a clear message instead of dividing by zero and
        # saving an all-NaN array.
        lowest = signal_resampled.min()
        span = signal_resampled.max() - lowest
        if not np.isfinite(span) or np.isclose(span, 0.0):
            raise ValueError("Flat or non-finite signal; cannot normalize")
        signal_normalized = (signal_resampled - lowest) / span * 2 - 1

        # Extract ECG features
        try:
            ecg_signals, info = nk.ecg_process(signal_normalized, sampling_rate=target_fs)
            features = nk.ecg_analyze(ecg_signals, sampling_rate=target_fs)
        except Exception as e:
            logging.warning(f"Failed to extract features for {file}: {e}. Using minimal features.")
            # ecg_peaks returns a (signals, info) pair; the R-peak sample
            # indices are stored in the info dict.
            _, peak_info = nk.ecg_peaks(signal_normalized, sampling_rate=target_fs)
            r_peaks = np.asarray(peak_info['ECG_R_Peaks'])
            rr_intervals = np.diff(r_peaks)  # in samples at target_fs
            features = pd.DataFrame({
                # Mean R-R interval; 0 when fewer than two peaks were found.
                'HRV': [rr_intervals.mean() if rr_intervals.size > 0 else 0],
                'R_peaks': [len(r_peaks)],
                'amplitude': [signal_normalized.max() - signal_normalized.min()]
            })

        # Save processed signal and features
        np.save(os.path.join(output_dir, f"{stem}_processed.npy"), signal_normalized)
        features.to_csv(os.path.join(output_dir, f"{stem}_features.csv"), index=False)

        return f"Successfully processed {file}"

    except Exception as e:
        # Report instead of raising so one bad record does not abort the batch.
        return f"Error processing {file}: {e}"

def preprocess_ecg(data_dir, output_dir, max_files=None):
    """Preprocess the ECG records of the known datasets in parallel.

    Collects ``.dat`` files below ``<data_dir>/mitdb`` and
    ``<data_dir>/ptb-xl`` and runs ``process_file`` on each one in a worker
    pool. Sub-folders are searched as well, so datasets that keep their
    records in nested directories (as the PTB-XL distribution does) are not
    silently skipped. Files are visited in sorted order so that ``max_files``
    always selects the same subset.

    Args:
        data_dir: Folder containing the dataset sub-folders.
        output_dir: Folder for the processed outputs; created if missing.
        max_files: Optional non-negative cap on the number of files taken
            from each dataset (useful for quick test runs). ``None`` means
            no limit.

    Returns:
        list[str]: One status message per file, as returned by
        ``process_file``; empty when no ``.dat`` file was found.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Collect all files from both datasets
    file_list = []
    for dataset in ["mitdb", "ptb-xl"]:
        dataset_path = os.path.join(data_dir, dataset)
        if not os.path.isdir(dataset_path):
            logging.warning(f"Dataset path {dataset_path} not found, skipping.")
            continue

        dataset_files = []
        for root, dirs, names in os.walk(dataset_path):
            dirs.sort()  # in-place sort makes the traversal order deterministic
            dataset_files.extend(
                (root, name, output_dir)
                for name in sorted(names)
                if name.endswith(".dat")
            )
        if max_files is not None:
            dataset_files = dataset_files[:max_files]  # Limit for testing

        file_list.extend(dataset_files)

    if not file_list:
        # Nothing to do: avoid spawning worker processes for an empty job.
        logging.warning(f"No .dat files found under {data_dir}; nothing to process.")
        return []

    logging.info(f"Processing {len(file_list)} files in parallel")

    # Use multiprocessing to speed up processing: at most 8 workers, and never
    # more workers than there are files.
    num_workers = min(cpu_count(), 8, len(file_list))
    with Pool(num_workers) as pool:
        results = pool.map(process_file, file_list)

    # Surface failures at ERROR level so they stand out in the log.
    for res in results:
        if res.startswith("Error processing"):
            logging.error(res)
        else:
            logging.info(res)

    logging.info("Preprocessing completed.")
    return results

if __name__ == "__main__":
    # Paths inside the cloned GitHub repo; every file is processed because
    # no max_files limit is passed.
    data_dir = '/content/Heart/data'
    output_dir = '/content/Heart/data/processed'
    preprocess_ecg(data_dir=data_dir, output_dir=output_dir)
