# Extracción de datos de la base de datos de PTB Diagnostic ECG Database

In [None]:
import wfdb
import numpy as np
import matplotlib.pyplot as plt
from scipy.signal import butter, filtfilt, find_peaks

In [None]:
# Directory of the local WFDB dataset; its RECORDS index file is read from here.
folder_name = './data/autonomic-aging-a-dataset'

# Accumulators shared across cells.
files = []  # path of every record visited while walking RECORDS
kinds_of_diagnosis = set()  # only filled by the (commented-out) diagnosis bookkeeping
diagnosis_per_patient, patients_per_diagnosis = {}, {}  # likewise diagnosis-only

In [None]:
def bandpass_filter(signal, lowcut, highcut, fs, order=1):
    """Band-pass ``signal`` with a Butterworth filter applied forwards and backwards.

    Args:
        signal: Sequence of samples to filter.
        lowcut: Lower cut-off frequency, in the same unit as ``fs``.
        highcut: Upper cut-off frequency, in the same unit as ``fs``.
        fs: Sampling frequency of ``signal``.
        order: Order passed to ``scipy.signal.butter`` (default 1).

    Returns:
        The filtered samples as returned by ``scipy.signal.filtfilt``.
    """
    # butter() expects the band edges as fractions of the Nyquist frequency.
    half_fs = 0.5 * fs
    normalized_band = [lowcut / half_fs, highcut / half_fs]
    numerator, denominator = butter(order, normalized_band, btype='band')
    return filtfilt(numerator, denominator, signal)

In [None]:
def calculate_hr(p_record_name):
    """Estimate an instantaneous heart-rate series from a WFDB record.

    The second signal column of the record is band-pass filtered
    (0.5-50 Hz), local maxima at least 600 ms apart are taken as R-peaks,
    and every pair of successive peaks yields one heart-rate sample.

    Args:
        p_record_name: Record path (without extension) handed to
            ``wfdb.rdrecord``.

    Returns:
        A ``(time_hr, hr_values)`` pair. ``time_hr`` holds the midpoint, in
        seconds, of each RR interval and ``hr_values`` the matching heart
        rate in beats per minute; both are NumPy arrays on success. If any
        step raises, the error is printed and ``([], [])`` is returned so
        that a batch run can skip the record.
    """
    try:
        record = wfdb.rdrecord(p_record_name)
        ecg_data = record.p_signal  # 2-D array indexed as [sample, channel]

        # NOTE(review): takes the second channel and assumes it is lead II —
        # confirm against record.sig_name for this dataset.
        lead_II = ecg_data[:, 1]

        # Use the sampling frequency stored in the record header so the time
        # axis and bpm values stay correct for records not sampled at
        # 1000 Hz; fall back to the previously hard-coded 1000 Hz only when
        # the header provides none.
        fs = record.fs or 1000
        filtered_lead_II = bandpass_filter(lead_II, 0.5, 50, fs)

        # R-peak detection: local maxima separated by at least 600 ms.
        # NOTE(review): this spacing caps the detectable rate at 100 bpm.
        peaks, _ = find_peaks(filtered_lead_II, distance=fs * 0.6)

        # RR intervals in seconds, then heart rate in beats per minute.
        rr_intervals = np.diff(peaks) / fs
        hr_values = 60 / rr_intervals

        # Each heart-rate sample is time-stamped at the midpoint between the
        # two R-peaks it was derived from.
        time_peaks = peaks / fs
        time_hr = (time_peaks[:-1] + time_peaks[1:]) / 2
        return time_hr, hr_values
    except Exception as e:
        # Deliberate best-effort: report the failure and let the caller skip
        # this record instead of aborting the whole run.
        print(f"Error processing record {p_record_name}: {e}")
        return [], []

In [None]:
def plot_hr_signal(time_hr, hr_values):
    """Display a line plot of heart rate against time.

    Args:
        time_hr: X-axis values (the axis is labelled in seconds).
        hr_values: Y-axis values (the axis is labelled in beats per minute).
    """
    _, axes = plt.subplots(figsize=(10, 6))
    line_style = {'color': 'b', 'marker': 'o', 'linestyle': '-'}
    axes.plot(time_hr, hr_values, label='Heart Rate (bpm)', **line_style)
    axes.set_title('Heart Rate over Time')
    axes.set_xlabel('Time (s)')
    axes.set_ylabel('Heart Rate (bpm)')
    axes.grid(True)
    axes.legend()
    plt.show()

In [None]:
# Descomentar para visualizar métricas de diagnóstico
# with open(folder_name + '/RECORDS') as f:
#     for line in f:
#         record_name = line.strip()
#         file_name = f'{folder_name}/{record_name}'
#         files.append(file_name)
#         header = wfdb.rdheader(file_name)
#         diagnosis = header.comments[4].split(': ')[1]
#         diagnosis_per_patient[f'{record_name}'] = diagnosis
#         kinds_of_diagnosis.add(diagnosis)
#         if diagnosis in patients_per_diagnosis:
#             patients_per_diagnosis[diagnosis].append(record_name)
#         else:
#             patients_per_diagnosis[diagnosis] = [record_name]
#         time_hr, hr_values = calculate_hr(file_name)
#         plot_hr_signal(time_hr, hr_values)



In [None]:
import pandas as pd
from scipy.signal import find_peaks
from hr_engine import get_features


# Build one feature row per record listed in the dataset's RECORDS index and
# collect the rows into the DataFrame `df`.
times = pd.DataFrame()
hear_rates = pd.DataFrame()
process_only_one = False  # True: stop after the first record that yields features
results = []
with open(folder_name + '/RECORDS') as f:
    for line in f:
        record_name = line.strip()
        if not record_name:
            # A blank (e.g. trailing) line would otherwise be turned into a
            # path with an empty record name and handed to wfdb.
            continue

        file_name = f'{folder_name}/{record_name}'
        files.append(file_name)
        # Only the commented-out diagnosis parsing below reads the header.
        header = wfdb.rdheader(file_name)
        # diagnosis = header.comments[4].split(': ')[1]
        # if diagnosis == 'n/a':
        #     continue
        # diagnosis_per_patient[f'{record_name}'] = diagnosis
        # kinds_of_diagnosis.add(diagnosis)
        # if diagnosis in patients_per_diagnosis:
        #     patients_per_diagnosis[diagnosis].append(record_name)
        # else:
        #     patients_per_diagnosis[diagnosis] = [record_name]
        time_hr, hr_values = calculate_hr(file_name)

        # Drop the first and last 5 samples of both series (presumably to
        # discard filter/peak-detection edge artefacts — confirm).
        time_hr = time_hr[5:-5]
        hr_values = hr_values[5:-5]

        # if diagnosis == 'n/a':
        #     continue

        # diagnosis = 0 if diagnosis == "Healthy control" else 1
        # print(time_hr)

        # Nothing left after trimming (failed or too-short record): skip it.
        if len(hr_values) == 0 or len(time_hr) == 0:
            continue
        results.append(get_features(hr_values=hr_values, time_hr=time_hr, record_name=record_name, diagnosis=0))

        if process_only_one:
            break
# print(kinds_of_diagnosis)
df = pd.DataFrame(results)
df.head()

# print(kinds_of_diagnosis)
        

In [None]:
# Write the feature table to a CSV file in the working directory, without the index column.
df.to_csv(path_or_buf='autonomic-aging-a-dataset-clear-data.csv', index=False)