In [None]:
# ECG Analysis Pipeline: Memory-Efficient R-peak Detection and Anomaly Detection
# =============================================================================

import os
import numpy as np
import pandas as pd
import wfdb
import matplotlib.pyplot as plt
from scipy.fft import fft, ifft
from scipy.signal import find_peaks, butter, filtfilt
from typing import Tuple, List, Optional, Dict
from joblib import Parallel, delayed
from tqdm import tqdm
import gc

# Configuration: render matplotlib figures inline in the notebook output
%matplotlib inline

# One-time dataset download via wfdb (left commented out to avoid re-downloading).
# The rest of this notebook reads records from the local "source1" and "source2"
# directories that these calls would populate.
# wfdb.dl_database("butqdb", "source1", keep_subdirs=False)
# wfdb.dl_database("nsrdb", "source2", keep_subdirs=False)


In [None]:

# =============================================================================
# 1. MEMORY-EFFICIENT SIGNAL PREPROCESSING UTILITIES
# =============================================================================

def center_signal(signal: np.ndarray) -> np.ndarray:
    """Return a new array equal to ``signal`` shifted so that its mean is zero."""
    baseline = np.mean(signal)
    return signal - baseline

def highpass_filter(signal: np.ndarray, fs: int, cutoff: float = 0.5, order: int = 1) -> np.ndarray:
    """Remove baseline drift with a zero-phase Butterworth high-pass filter.

    Args:
        signal: Samples to filter.
        fs: Sampling frequency in Hz.
        cutoff: Cut-off frequency in Hz.
        order: Butterworth filter order.

    Returns:
        The filtered signal produced by ``filtfilt`` (forward-backward filtering).
    """
    # butter() expects the cut-off as a fraction of the Nyquist frequency.
    nyquist = 0.5 * fs
    numerator, denominator = butter(order, cutoff / nyquist, btype='high')
    return filtfilt(numerator, denominator, signal)

def preprocess_ecg(signal: np.ndarray, fs: int) -> np.ndarray:
    """Zero-centre the signal, then apply a 2nd-order 0.5 Hz high-pass filter.

    Args:
        signal: Raw ECG samples.
        fs: Sampling frequency in Hz.

    Returns:
        The centred, high-pass-filtered signal.
    """
    centered = center_signal(signal)
    return highpass_filter(centered, fs, cutoff=0.5, order=2)

class ECGStreamProcessor:
    """Memory-efficient ECG stream processor using chunked reading.

    A record is read one fixed-size chunk at a time. Every chunk after the
    first is extended backwards by ``overlap_size`` samples, so the per-chunk
    preprocessing has context before the chunk's nominal start. The callback
    receives the start index of the *extended* chunk and is responsible for
    discarding results that fall inside the overlap.
    """

    def __init__(self, fs: int = 1000, chunk_duration_sec: int = 300):
        """
        Initialize processor.

        Args:
            fs: Sampling frequency in Hz
            chunk_duration_sec: Duration of each chunk in seconds (default 5 minutes)

        Raises:
            ValueError: If ``chunk_duration_sec * fs`` is smaller than one sample,
                which would make the chunk loop unable to advance.
        """
        self.fs = fs
        # Sample offsets must be integers: a float ``fs`` would otherwise turn
        # every chunk boundary handed to the record reader into a float.
        self.chunk_size = int(round(chunk_duration_sec * fs))
        if self.chunk_size < 1:
            raise ValueError(
                f"chunk_duration_sec * fs must cover at least one sample "
                f"(got {chunk_duration_sec} s at {fs} Hz)"
            )
        self.overlap_size = int(fs * 1.0)  # 1 second overlap for continuity

    def process_file_chunked(self, record_path: str, processor_func,
                           max_duration_hours: Optional[float] = None) -> List:
        """
        Process ECG file in chunks to avoid memory overflow.

        Args:
            record_path: Path to ECG record
            processor_func: Callable ``(preprocessed_chunk, chunk_start_sample)``
                applied to each chunk; its return value is collected
            max_duration_hours: Maximum duration to process, converted to samples
                with ``self.fs``. A falsy value (None or 0) processes the full file

        Returns:
            List of results from each chunk, in file order
        """
        # Get file info
        header = wfdb.rdheader(record_path)
        total_samples = header.sig_len

        if max_duration_hours:
            max_samples = int(max_duration_hours * 3600 * self.fs)
            total_samples = min(total_samples, max_samples)

        results = []
        processed_samples = 0

        print(f"Processing {total_samples/self.fs/3600:.2f} hours in chunks of {self.chunk_size/self.fs/60:.1f} minutes")

        # Process in chunks
        with tqdm(total=total_samples, desc="Processing chunks") as pbar:
            while processed_samples < total_samples:
                # The read window starts one overlap earlier than the new data
                # (except at the very beginning of the record).
                start_sample = max(0, processed_samples - self.overlap_size)
                end_sample = min(total_samples, processed_samples + self.chunk_size)

                # Only the first channel is analysed, so only that one is loaded.
                record = wfdb.rdrecord(record_path, sampfrom=start_sample,
                                       sampto=end_sample, channels=[0])
                chunk = record.p_signal[:, 0]

                # Preprocess chunk
                chunk_processed = preprocess_ecg(chunk, self.fs)

                # Apply processor function
                results.append(processor_func(chunk_processed, start_sample))

                # Advance by the number of *new* samples this chunk contributed
                pbar.update(end_sample - processed_samples)
                processed_samples = end_sample

                # Drop the chunk buffers before reading the next one
                del chunk, chunk_processed, record
                gc.collect()

        return results


In [None]:

# =============================================================================
# 2. MEMORY-EFFICIENT R-PEAK DETECTION
# =============================================================================

def detect_rpeaks_simple(signal: np.ndarray, fs: int, height_percentile: float = 98,
                        min_distance_ms: float = 200) -> np.ndarray:
    """Detect R-peaks using simple peak detection with dynamic threshold.

    The height threshold is the ``height_percentile``-th percentile of the
    signal itself, so it adapts to the amplitude of whatever chunk is passed in.

    Args:
        signal: Preprocessed ECG samples (1-D).
        fs: Sampling frequency in Hz.
        height_percentile: Percentile of ``signal`` used as the minimum peak height.
        min_distance_ms: Minimum spacing between accepted peaks, in milliseconds.

    Returns:
        Indices of the detected peaks within ``signal`` (empty for an empty signal).
    """
    signal = np.asarray(signal)
    if signal.size == 0:
        # np.percentile has no meaningful answer for an empty input.
        return np.array([], dtype=np.intp)

    height = np.percentile(signal, height_percentile)
    # find_peaks rejects distance < 1; low sampling rates or short refractory
    # windows can truncate to 0, so clamp to the smallest legal spacing.
    min_distance = max(1, int(min_distance_ms * fs / 1000))
    peaks, _ = find_peaks(signal, height=height, distance=min_distance)
    return peaks

class RPeakDetector:
    """Accumulates R-peak positions across the chunks of a long ECG record."""

    def __init__(self, fs: int = 1000):
        self.fs = fs
        self.all_peaks = []

    def process_chunk(self, chunk: np.ndarray, chunk_start: int) -> Dict:
        """Detect peaks in one chunk and record them as global sample indices.

        Args:
            chunk: Preprocessed samples of the chunk.
            chunk_start: Global sample index of ``chunk[0]``.

        Returns:
            Dict with ``'chunk_start'``, the number of peaks found in the chunk
            (``'chunk_peaks'``) and the number actually kept (``'valid_peaks'``).
        """
        local_peaks = detect_rpeaks_simple(chunk, self.fs)

        if chunk_start > 0:
            # Chunks after the first begin with one second of samples that the
            # previous chunk already covered; peaks found there are dropped.
            overlap_samples = int(self.fs * 1.0)
            kept_peaks = local_peaks[local_peaks >= overlap_samples] + chunk_start
        else:
            kept_peaks = local_peaks + chunk_start

        self.all_peaks.extend(kept_peaks)

        summary = {'chunk_start': chunk_start}
        summary['chunk_peaks'] = len(local_peaks)
        summary['valid_peaks'] = len(kept_peaks)
        return summary

    def get_results(self) -> np.ndarray:
        """Return every stored peak index as an ascending array."""
        ordered = list(self.all_peaks)
        ordered.sort()
        return np.array(ordered)

def compute_bpm_windowed_efficient(rpeaks: np.ndarray, fs: int,
                                 total_duration_sec: float, window_sec: int = 15) -> Tuple[np.ndarray, np.ndarray]:
    """Memory-efficient BPM computation over consecutive, non-overlapping windows.

    Each window covers the half-open sample range ``[start, start + window_sec * fs)``,
    so a peak lying exactly on a window boundary is counted once, in the later
    window, rather than in both neighbours.

    Args:
        rpeaks: Global sample indices of the detected R-peaks.
        fs: Sampling frequency in Hz.
        total_duration_sec: Duration of the analysed signal in seconds; only
            complete windows are reported.
        window_sec: Window length in seconds.

    Returns:
        ``(bpm_values, time_minutes)``: beats per minute for each window and the
        start time of each window in minutes. Both are empty when ``rpeaks`` is empty.
    """
    if len(rpeaks) == 0:
        return np.array([]), np.array([])

    # searchsorted needs ascending input; sorting is cheap insurance for callers
    # that pass peaks in detection order.
    sorted_peaks = np.sort(np.asarray(rpeaks))

    n_windows = int(total_duration_sec // window_sec)

    # n_windows + 1 edges delimit n_windows windows. Looking every edge up with
    # the same side makes adjacent windows share no peak.
    edges = np.arange(n_windows + 1) * window_sec * fs
    edge_positions = np.searchsorted(sorted_peaks, edges, side='left')
    peaks_per_window = np.diff(edge_positions)

    bpm_values = peaks_per_window * 60 / window_sec
    time_minutes = np.arange(n_windows) * (window_sec / 60)
    return bpm_values, time_minutes


In [None]:

# =============================================================================
# 3. MEMORY-EFFICIENT SINGLE FILE ECG PROCESSING
# =============================================================================

def process_single_ecg_file_efficient(data_dir: str, record_name: str, plot: bool = True,
                                    max_duration_hours: float = 1.0) -> Dict:
    """Run chunked R-peak detection and windowed BPM estimation on one record.

    The record is read from ``<data_dir>/<record_name>_ECG``. Any exception
    raised while processing is caught, printed, and stored under ``'error'``
    instead of propagating.

    Args:
        data_dir: Directory containing the record.
        record_name: Record identifier without the ``_ECG`` suffix.
        plot: When true and at least one BPM window exists, show a BPM-over-time plot.
        max_duration_hours: Cap on the processed duration; a falsy value
            (``None`` or ``0``) processes the whole record.

    Returns:
        Dict with ``'record_name'``, ``'success'``, ``'fs'``, ``'total_samples'``
        (samples actually processed), ``'detected_rpeaks'``, ``'average_bpm'``,
        ``'bpm_values'``, ``'time_minutes'``, ``'processing_duration_hours'``
        and ``'error'``.
    """
    record_path = os.path.join(data_dir, f"{record_name}_ECG")

    # Start from a "failed" report; the measurement fields stay None unless
    # processing gets far enough to fill them in.
    results = {'record_name': record_name, 'success': False}
    for pending_key in ('fs', 'total_samples', 'detected_rpeaks', 'average_bpm',
                        'bpm_values', 'time_minutes', 'processing_duration_hours',
                        'error'):
        results[pending_key] = None

    try:
        # Sampling rate and length come from the record header
        header = wfdb.rdheader(record_path)
        fs = header.fs
        total_samples = header.sig_len

        processing_samples = total_samples
        if max_duration_hours:
            processing_samples = min(total_samples, int(max_duration_hours * 3600 * fs))

        total_duration_sec = processing_samples / fs
        processing_duration_hours = total_duration_sec / 3600

        print(f"  Total file duration: {total_samples/fs/3600:.2f} hours")
        print(f"  Processing duration: {processing_duration_hours:.2f} hours")

        # Stream the record in 5-minute chunks; the detector accumulates peaks
        processor = ECGStreamProcessor(fs=fs, chunk_duration_sec=300)
        detector = RPeakDetector(fs=fs)
        processor.process_file_chunked(record_path, detector.process_chunk, max_duration_hours)

        rpeaks = detector.get_results()

        bpm_values, time_minutes = compute_bpm_windowed_efficient(rpeaks, fs, total_duration_sec)
        has_bpm = len(bpm_values) > 0
        if has_bpm:
            avg_bpm = np.mean(bpm_values)
        else:
            avg_bpm = 0

        # Publish the measurements before plotting, so they survive a plot failure
        results['fs'] = fs
        results['total_samples'] = processing_samples
        results['detected_rpeaks'] = len(rpeaks)
        results['average_bpm'] = avg_bpm
        results['bpm_values'] = bpm_values
        results['time_minutes'] = time_minutes
        results['processing_duration_hours'] = processing_duration_hours
        results['success'] = True

        if plot and has_bpm:
            plt.figure(figsize=(12, 4))
            plt.plot(time_minutes, bpm_values, "-", linewidth=1, alpha=0.8)
            plt.title(f"BPM Over Time - {record_name} ({processing_duration_hours:.1f}h)")
            plt.xlabel("Time (minutes)")
            plt.ylabel("BPM")
            plt.grid(True, alpha=0.3)
            plt.tight_layout()
            plt.show()
            plt.close()

        # Text summary
        print(f"Record: {record_name}")
        print(f"Sampling rate: {fs} Hz")
        print(f"Processed samples: {processing_samples:,}")
        print(f"Detected R-peaks: {len(rpeaks):,}")
        print(f"Average BPM: {avg_bpm:.1f}")
        print("-" * 50)

    except Exception as e:
        results['error'] = str(e)
        print(f"Error processing {record_name}: {str(e)}")

    return results


In [None]:

# =============================================================================
# 4. MEMORY-EFFICIENT BATCH ECG PROCESSING
# =============================================================================

def process_ecg_batch_efficient(data_dir: str, plot_individual: bool = False,
                              max_duration_hours: float = 1.0) -> pd.DataFrame:
    """Process every ``*_ECG.hea`` record in a directory and summarise the results.

    Args:
        data_dir: Directory to scan for ``<record>_ECG.hea`` header files.
        plot_individual: Forwarded as ``plot`` to the single-file processor.
        max_duration_hours: Per-file duration cap forwarded to the single-file processor.

    Returns:
        One row per successfully processed record, or an empty DataFrame when
        the directory is missing or nothing succeeded.
    """
    if not os.path.exists(data_dir):
        print(f"Directory {data_dir} not found!")
        return pd.DataFrame()

    # Record names are the header file names with the suffix stripped
    header_suffix = '_ECG.hea'
    record_names = sorted(
        entry.replace(header_suffix, '')
        for entry in os.listdir(data_dir)
        if entry.endswith(header_suffix)
    )
    n_records = len(record_names)

    print(f"Processing {len(record_names)} ECG records from {data_dir}")
    print("=" * 60)

    results = []
    for position, record_name in enumerate(record_names, start=1):
        print(f"\nProcessing file {position}/{n_records}: {record_name}")

        results.append(
            process_single_ecg_file_efficient(
                data_dir, record_name, plot=plot_individual,
                max_duration_hours=max_duration_hours
            )
        )

        # Release per-file buffers before moving on
        gc.collect()

    # Only successful runs make it into the summary table
    summary_rows = []
    for r in results:
        if not r.get('success', False):
            continue
        summary_rows.append({
            'Record': r['record_name'],
            'Sampling_Rate_Hz': r['fs'],
            'Total_Samples': r['total_samples'],
            'Detected_R_peaks': r['detected_rpeaks'],
            'Average_BPM': r['average_bpm'],
            'Duration_Hours': r['processing_duration_hours']
        })

    if not summary_rows:
        print("No files were successfully processed.")
        return pd.DataFrame()

    df = pd.DataFrame(summary_rows)
    print("\nBatch Processing Summary:")
    print(df.round(1))
    return df


In [None]:

# =============================================================================
# 5. MEMORY-EFFICIENT ANOMALY DETECTION
# =============================================================================

class AnomalyDetector:
    """Memory-efficient anomaly detector for long ECG signals.

    Three independent per-sample detectors (global amplitude, rolling
    statistics, IQR) are applied to each chunk; a sample is reported if any of
    them flags it. Flagged positions are accumulated as global sample indices.
    """

    def __init__(self, fs: int = 1000):
        self.fs = fs
        self.all_anomalies = []

    def detect_anomalies_amplitude(self, signal: np.ndarray, k: float = 3.7) -> Tuple[np.ndarray, float]:
        """Flag samples that deviate from the chunk mean by more than ``k`` standard deviations.

        Args:
            signal: Samples of one chunk.
            k: Allowed deviation from the mean, in standard deviations.

        Returns:
            ``(anomaly_indices, threshold)`` where ``threshold`` is the upper
            amplitude bound ``mean + k * std`` (the lower bound is ``mean - k * std``).
        """
        mean_val = signal.mean()
        std_val = signal.std()
        threshold = mean_val + k * std_val
        # Measure the deviation around the mean. Comparing |signal| directly to
        # mean + k*std would flag every sample of a signal with a negative offset.
        anomaly_indices = np.where(np.abs(signal - mean_val) > k * std_val)[0]
        return anomaly_indices, threshold

    def detect_anomalies_rolling(self, signal: np.ndarray, window: int = 1000, threshold: float = 15.5) -> np.ndarray:
        """Flag samples far from the local moving average.

        Args:
            signal: Samples of one chunk.
            window: Moving-window length in samples; halved signal length is
                used instead when the signal is shorter than the window.
            threshold: Allowed deviation from the moving mean, expressed in
                moving standard deviations.

        Returns:
            Indices of the flagged samples (empty for an empty signal).
        """
        n_samples = len(signal)
        if n_samples == 0:
            # np.convolve refuses empty operands.
            return np.array([], dtype=np.intp)

        if n_samples < window:
            window = n_samples // 2
        # A one-sample signal would otherwise yield window == 0 and an empty kernel.
        window = max(1, int(window))

        kernel = np.ones(window) / window
        rolling_mean = np.convolve(signal, kernel, mode='same')
        rolling_std = np.sqrt(np.convolve((signal - rolling_mean)**2, kernel, mode='same'))

        # Floor the deviation so perfectly flat stretches do not compare against zero
        rolling_std = np.maximum(rolling_std, 1e-8)
        anomaly_indices = np.where(np.abs(signal - rolling_mean) > threshold * rolling_std)[0]
        return anomaly_indices

    def detect_anomalies_iqr(self, signal: np.ndarray, k: float = 15.0) -> np.ndarray:
        """Flag samples outside ``[Q1 - k*IQR, Q3 + k*IQR]``.

        Args:
            signal: Samples of one chunk.
            k: Fence width in multiples of the inter-quartile range.

        Returns:
            Indices of the flagged samples; empty when the signal is empty or
            its inter-quartile range is zero.
        """
        if len(signal) == 0:
            return np.array([], dtype=np.intp)

        q1, q3 = np.percentile(signal, [25, 75])
        iqr = q3 - q1
        if iqr == 0:  # Handle constant signal
            return np.array([], dtype=np.intp)
        lower_bound = q1 - k * iqr
        upper_bound = q3 + k * iqr
        anomaly_indices = np.where((signal < lower_bound) | (signal > upper_bound))[0]
        return anomaly_indices

    def process_chunk(self, chunk: np.ndarray, chunk_start: int) -> Dict:
        """Process a single chunk for anomaly detection.

        Args:
            chunk: Preprocessed samples of the chunk.
            chunk_start: Global sample index of ``chunk[0]``.

        Returns:
            Dict with ``'chunk_start'``, the number of distinct flagged samples
            in the chunk (``'chunk_anomalies'``), the number kept after removing
            the overlap region (``'valid_anomalies'``) and the amplitude
            detector's ``'amplitude_threshold'``.
        """
        amp_anomalies, threshold = self.detect_anomalies_amplitude(chunk)
        rolling_anomalies = self.detect_anomalies_rolling(chunk)
        iqr_anomalies = self.detect_anomalies_iqr(chunk)

        # Union of the three detectors. Every detector returns an integer index
        # array (possibly empty), so the result stays an index array.
        combined_anomalies = np.unique(
            np.concatenate([amp_anomalies, rolling_anomalies, iqr_anomalies])
        ).astype(np.intp)

        # Adjust indices to global position
        global_anomalies = combined_anomalies + chunk_start

        # Store anomalies (avoiding overlap region)
        overlap_samples = int(self.fs * 1.0)  # 1 second
        if chunk_start > 0:
            # Skip overlap region for non-first chunks
            valid_mask = combined_anomalies >= overlap_samples
            global_anomalies = global_anomalies[valid_mask]

        self.all_anomalies.extend(global_anomalies)

        return {
            'chunk_start': chunk_start,
            'chunk_anomalies': len(combined_anomalies),
            'valid_anomalies': len(global_anomalies),
            'amplitude_threshold': threshold
        }

    def get_results(self) -> np.ndarray:
        """Get all detected anomalies as an ascending array of sample indices."""
        return np.array(sorted(self.all_anomalies), dtype=np.intp)

def evaluate_anomaly_detection_file_efficient(data_dir: str, record_name: str,
                                             max_duration_hours: float = 1.0) -> Dict:
    """Count anomalous samples in one record using chunked processing.

    The record is read from ``<data_dir>/<record_name>_ECG``. Any exception
    raised while processing is caught, printed, and stored under ``'error'``.

    Args:
        data_dir: Directory containing the record.
        record_name: Record identifier without the ``_ECG`` suffix.
        max_duration_hours: Cap on the processed duration; a falsy value
            (``None`` or ``0``) processes the whole record.

    Returns:
        Dict with ``'record_name'``, ``'success'``, ``'detected_anomalies'``,
        ``'anomaly_rate_per_hour'``, ``'processing_duration_hours'`` and ``'error'``.
    """
    record_path = os.path.join(data_dir, f"{record_name}_ECG")

    # Pessimistic defaults, overwritten only when the whole run succeeds
    results = dict(
        record_name=record_name,
        success=False,
        detected_anomalies=0,
        anomaly_rate_per_hour=0,
        processing_duration_hours=0,
        error=None,
    )

    try:
        header = wfdb.rdheader(record_path)
        fs = header.fs
        total_samples = header.sig_len

        processing_samples = total_samples
        if max_duration_hours:
            processing_samples = min(total_samples, int(max_duration_hours * 3600 * fs))

        processing_duration_hours = processing_samples / fs / 3600

        # Stream the record in 5-minute chunks; the detector accumulates hits
        processor = ECGStreamProcessor(fs=fs, chunk_duration_sec=300)
        detector = AnomalyDetector(fs=fs)
        processor.process_file_chunked(record_path, detector.process_chunk, max_duration_hours)

        anomalies = detector.get_results()
        if processing_duration_hours > 0:
            anomaly_rate = len(anomalies) / processing_duration_hours
        else:
            anomaly_rate = 0

        results['success'] = True
        results['detected_anomalies'] = len(anomalies)
        results['anomaly_rate_per_hour'] = anomaly_rate
        results['processing_duration_hours'] = processing_duration_hours

        print(f"Record: {record_name}")
        print(f"Detected anomalies: {len(anomalies):,}")
        print(f"Anomaly rate: {anomaly_rate:.1f} per hour")
        print("-" * 50)

    except Exception as e:
        results['error'] = str(e)
        print(f"Error processing {record_name}: {str(e)}")

    return results


In [None]:

# =============================================================================
# 6. MEMORY-EFFICIENT R-PEAK EVALUATION
# =============================================================================

def evaluate_rpeak_detection(true_peaks: np.ndarray, detected_peaks: np.ndarray,
                           tolerance: float = 1e-4) -> Tuple[int, int, int]:
    """Count how many reference peaks have a matching detection.

    A reference peak ``t`` matches a detection ``d`` when they are identical or
    when ``|t - d| < tolerance * |d|``. This is the original ``|t / d - 1| < tolerance``
    criterion rewritten without a division, so a detection at sample 0 no longer
    produces NaN/inf and an exact match at sample 0 is counted.

    NOTE(review): the tolerance is *relative* to the sample index, so the
    accepted window grows with position in the record. That is kept for
    backward compatibility; confirm whether a fixed window in samples was intended.

    Args:
        true_peaks: Reference peak positions (sample indices).
        detected_peaks: Detected peak positions (sample indices).
        tolerance: Relative tolerance applied to each detected position.

    Returns:
        ``(correct, n_true, n_detected)``: matched reference peaks, number of
        reference peaks, number of detections.
    """
    true_arr = np.asarray(true_peaks, dtype=float).ravel()
    det_arr = np.asarray(detected_peaks, dtype=float).ravel()
    n_true, n_det = int(true_arr.size), int(det_arr.size)

    if n_true == 0 or n_det == 0:
        return 0, n_true, n_det

    allowed = tolerance * np.abs(det_arr)

    # Compare in row blocks so the pairwise matrix never exceeds ~2M entries,
    # instead of materialising the full n_true x n_det array at once.
    rows_per_block = max(1, 2_000_000 // n_det)
    correct = 0
    for row_start in range(0, n_true, rows_per_block):
        block = true_arr[row_start:row_start + rows_per_block, None]
        gap = np.abs(block - det_arr[None, :])
        matched = (gap < allowed[None, :]) | (gap == 0)
        correct += int(np.count_nonzero(matched.any(axis=1)))

    return correct, n_true, n_det

def process_file_safe_efficient(test_dir: str, base_name: str, fs: int = 1000,
                               max_duration_hours: float = 1.0) -> Dict:
    """Memory-efficient R-peak evaluation of a single annotated record.

    The sampling rate stored in the record header is used for the duration cap,
    the chunk length and the detector. ``fs`` is only a fallback for headers that
    do not provide one. Using the ``fs`` default regardless of the record would
    mis-size all three whenever the record is not sampled at that rate.

    Any exception is caught and reported under ``'Error'`` instead of propagating.

    Args:
        test_dir: Directory containing the record and its ``atr`` annotation file.
        base_name: Record name without extension.
        fs: Fallback sampling frequency in Hz, used when the header has none.
        max_duration_hours: Cap on the evaluated duration; a falsy value
            (``None`` or ``0``) evaluates the whole record.

    Returns:
        Dict with ``'File'`` plus either the evaluation fields (``'Correct'``,
        ``'Missed'``, ``'Total_True'``, ``'Total_Detected'``, ``'Accuracy_Percent'``)
        or ``'Error'``.
    """
    file_path = os.path.join(test_dir, base_name)
    out = {'File': base_name}

    try:
        # Get file info first
        header = wfdb.rdheader(file_path)
        total_samples = header.sig_len
        record_fs = getattr(header, 'fs', None) or fs

        if max_duration_hours:
            max_samples = int(max_duration_hours * 3600 * record_fs)
            processing_samples = min(total_samples, max_samples)
        else:
            processing_samples = total_samples

        # Load annotations and keep those inside the evaluated span.
        # NOTE(review): every annotation sample is treated as a reference beat;
        # confirm whether non-beat annotations should be filtered out first.
        annotations = wfdb.rdann(file_path, 'atr')
        true_peaks = annotations.sample
        true_peaks = true_peaks[true_peaks < processing_samples]

        # Process ECG in chunks
        processor = ECGStreamProcessor(fs=record_fs, chunk_duration_sec=300)
        detector = RPeakDetector(fs=record_fs)

        processor.process_file_chunked(
            file_path, detector.process_chunk, max_duration_hours
        )

        detected_peaks = detector.get_results()

        # Evaluate accuracy
        correct, total_true, total_detected = evaluate_rpeak_detection(true_peaks, detected_peaks)
        accuracy = (correct / total_true * 100) if total_true > 0 else 0

        out.update({
            'Correct': correct,
            'Missed': total_true - correct,
            'Total_True': total_true,
            'Total_Detected': total_detected,
            'Accuracy_Percent': accuracy
        })

    except Exception as e:
        out['Error'] = str(e)

    return out

def evaluate_rpeak_batch_efficient(test_dir: str, fs: int = 1000,
                                 max_duration_hours: float = 1.0) -> pd.DataFrame:
    """Evaluate R-peak detection on every ``.dat`` record in a directory.

    Args:
        test_dir: Directory to scan for ``.dat`` files.
        fs: Sampling frequency forwarded to the per-file evaluation.
        max_duration_hours: Per-file duration cap forwarded to the per-file evaluation.

    Returns:
        One row per record that evaluated without error (values rounded to one
        decimal), or an empty DataFrame when the directory is missing or no
        record succeeded.
    """
    if not os.path.isdir(test_dir):
        print(f"Directory {test_dir} not found!")
        return pd.DataFrame()

    # Record names are the .dat file names without their 4-character extension
    base_names = [entry[:-4] for entry in sorted(os.listdir(test_dir)) if entry.endswith('.dat')]

    print(f"Evaluating R-peak detection on {len(base_names)} files from {test_dir}")
    print(f"Processing {max_duration_hours:.1f} hours per file")
    print("=" * 60)

    rows = []
    for name in tqdm(base_names, desc="Evaluating"):
        outcome = process_file_safe_efficient(test_dir, name, fs=fs,
                                              max_duration_hours=max_duration_hours)
        if 'Error' not in outcome:
            rows.append(outcome)
        else:
            print(f"Error processing {name}: {outcome['Error']}")

        # Release per-file buffers before the next record
        gc.collect()

    if not rows:
        print("No files were successfully processed.")
        return pd.DataFrame()

    df = pd.DataFrame(rows).round(1)
    print("R-peak Detection Evaluation Results:")
    print(df)

    # Pooled accuracy over all records, guarded against an empty reference set
    total_true = df['Total_True'].sum()
    if total_true > 0:
        overall = df['Correct'].sum() / total_true * 100
    else:
        overall = 0
    print(f"\nOverall Accuracy: {overall:.2f}%")
    return df


In [None]:

# =============================================================================
# 7. MAIN EXECUTION FUNCTIONS
# =============================================================================

def run_ecg_analysis_pipeline_efficient(max_duration_hours: float = 1.0):
    """
    Run the complete memory-efficient ECG analysis pipeline.

    Steps: (1) process record "100001" from ``source1`` with plotting,
    (2) batch-process every record in ``source1``, (3) evaluate R-peak
    detection on ``source2``, (4) run anomaly detection on the first record
    of the batch summary.

    Args:
        max_duration_hours: Maximum duration to process per file (default 1 hour)

    Returns:
        Dict with the result of each step under ``'single_ecg'``, ``'batch_ecg'``,
        ``'rpeak_evaluation'`` and ``'anomaly_detection'``. The last one is
        ``None`` when the batch step produced no records to sample from.
    """
    print("Memory-Efficient ECG Analysis Pipeline")
    print("=" * 50)
    print(f"Processing up to {max_duration_hours:.1f} hours per file")

    # Step 1: Process single ECG file from source1
    print("\n1. Processing single ECG file from source1...")
    single_result = process_single_ecg_file_efficient("source1", "100001",
                                                    plot=True, max_duration_hours=max_duration_hours)

    # Step 2: Batch process files in source1
    print("\n2. Batch processing ECG files in source1...")
    batch_results = process_ecg_batch_efficient("source1", plot_individual=False,
                                              max_duration_hours=max_duration_hours)

    # Step 3: Evaluate R-peak detection on source2
    print("\n3. Evaluating R-peak detection on source2...")
    rpeak_results = evaluate_rpeak_batch_efficient("source2", max_duration_hours=max_duration_hours)

    # Step 4: Anomaly detection evaluation
    print("\n4. Sample anomaly detection on source1...")
    # Stays None when the step is skipped, so the returned dict always has the key.
    anomaly_result = None
    if not batch_results.empty:
        sample_record = batch_results['Record'].iloc[0]
        anomaly_result = evaluate_anomaly_detection_file_efficient("source1", sample_record,
                                                                  max_duration_hours=max_duration_hours)
    else:
        print("  Skipped: batch processing produced no records.")

    print("\nMemory-efficient pipeline completed!")

    return {
        'single_ecg': single_result,
        'batch_ecg': batch_results,
        'rpeak_evaluation': rpeak_results,
        'anomaly_detection': anomaly_result
    }


In [None]:

# =============================================================================
# 8. USAGE EXAMPLES
# =============================================================================

# Run the complete memory-efficient pipeline (each file capped at 1 hour)
# results = run_ecg_analysis_pipeline_efficient(max_duration_hours=1.0)

# Or run individual components; max_duration_hours=None processes each file in full:
# process_single_ecg_file_efficient("source1", "100001", max_duration_hours=None)
# process_ecg_batch_efficient("source1", max_duration_hours=None)

# Evaluate R-peak detection on every .dat record in source2 (first hour of each)
evaluate_rpeak_batch_efficient("source2", max_duration_hours=1.0)

In [None]:
# Smoke test of the memory-efficient ECG pipeline
# =============================================================================
# Each stage runs inside its own try/except so that a failure in one stage is
# reported without preventing the following stages from running.

# Test with limited duration to avoid memory issues
print("Testing memory-efficient ECG pipeline...")

# Per-file cap on the amount of signal processed, in hours
test_duration_hours = 1  # 1 hour per file

# Test single file processing (record "100001" from source1, with plot)
try:
    print("\nTesting single file processing...")
    result = process_single_ecg_file_efficient("source1", "100001",
                                              plot=True,
                                              max_duration_hours=test_duration_hours)
    print(f"Single file test: {'SUCCESS' if result['success'] else 'FAILED'}")
except Exception as e:
    print(f"Single file test failed: {e}")

# Test the batch code path (only the first listed file is actually processed)
try:
    print(f"\nTesting batch processing with {test_duration_hours} hours per file...")

    # Get list of files
    if os.path.exists("source1"):
        files = [f for f in os.listdir("source1") if f.endswith('_ECG.hea')]
        print(f"Found {len(files)} ECG files")

        # Process first file only for testing (os.listdir order, not sorted)
        if files:
            first_file = files[0].replace('_ECG.hea', '')
            print(f"Processing first file: {first_file}")

            result = process_single_ecg_file_efficient("source1", first_file,
                                                      plot=False,
                                                      max_duration_hours=test_duration_hours)
            print(f"Test completed: {'SUCCESS' if result['success'] else 'FAILED'}")
        else:
            print("No ECG files found in source1")
    else:
        print("source1 directory not found")

except Exception as e:
    print(f"Batch processing test failed: {e}")

# Test R-peak evaluation if source2 exists
try:
    if os.path.exists("source2"):
        print(f"\nTesting R-peak evaluation with {test_duration_hours} hours per file...")
        results = evaluate_rpeak_batch_efficient("source2", max_duration_hours=test_duration_hours)
        # An empty DataFrame means no record was evaluated successfully
        print(f"R-peak evaluation: {'SUCCESS' if not results.empty else 'NO DATA'}")
    else:
        print("source2 directory not found - skipping R-peak evaluation")
except Exception as e:
    print(f"R-peak evaluation test failed: {e}")

print("\nMemory usage test completed!")
print(f"Processed up to {test_duration_hours} hours per file to conserve memory.")