# Extracción de datos de la base de datos de PTB Diagnostic ECG Database

In [67]:
import wfdb
import numpy as np
import matplotlib.pyplot as plt
from scipy.signal import butter, filtfilt, find_peaks

In [68]:
# Directory holding the local copy of the dataset; the cells below read the
# `RECORDS` index file and the individual records from here.
folder_name = './data/autonomic-aging-a-dataset'
# Paths (folder + record name) of every record visited by the processing loop.
files = []
# Bookkeeping used only by the diagnosis-related code, which is currently
# commented out in the cells below; these stay empty unless it is re-enabled.
diagnosis_per_patient = {}
kinds_of_diagnosis = set()
patients_per_diagnosis = {}

In [69]:
def bandpass_filter(signal, lowcut, highcut, fs, order=1):
    """Band-pass ``signal`` with a Butterworth filter run forward and backward.

    Args:
        signal: Samples to filter.
        lowcut: Lower cut-off frequency, in the same unit as ``fs``.
        highcut: Upper cut-off frequency, in the same unit as ``fs``.
        fs: Sampling frequency of ``signal``.
        order: Order passed to ``scipy.signal.butter`` (default 1).

    Returns:
        The filtered samples as returned by ``scipy.signal.filtfilt``.
    """
    # butter() expects the band edges normalised by the Nyquist frequency.
    half_rate = 0.5 * fs
    band_edges = [lowcut / half_rate, highcut / half_rate]
    numerator, denominator = butter(order, band_edges, btype='band')
    return filtfilt(numerator, denominator, signal)

In [70]:
def calculate_hr(p_record_name):
    """Estimate a beat-to-beat heart-rate series from a WFDB record.

    The second signal column of the record is band-pass filtered (0.5-50 Hz),
    peaks are detected on the filtered trace, and the instantaneous heart rate
    is derived from the intervals between consecutive peaks.

    Args:
        p_record_name: Record path (without extension) passed to
            ``wfdb.rdrecord``.

    Returns:
        A ``(time_hr, hr_values)`` pair: ``time_hr`` holds the midpoint time
        (seconds) between each pair of consecutive peaks and ``hr_values`` the
        matching heart rate in beats per minute. If anything fails while
        reading or processing the record, the error is printed and two empty
        lists are returned instead.
    """
    try:
        record = wfdb.rdrecord(p_record_name)
        ecg_data = record.p_signal  # Physical-unit samples, one column per channel

        # Assumes lead II is stored in the second column — TODO confirm per dataset.
        lead_II = ecg_data[:, 1]

        # Use the sampling frequency declared in the record header rather than
        # a hard-coded 1000 Hz, so records sampled at another rate still yield
        # correct intervals.
        fs = record.fs
        filtered_lead_II = bandpass_filter(lead_II, 0.5, 50, fs)

        # R-peak detection. NOTE: requiring at least 600 ms between peaks means
        # no RR interval can be shorter than 0.6 s, i.e. the reported heart
        # rate can never exceed 100 bpm.
        peaks, _ = find_peaks(filtered_lead_II, distance=fs * 0.6)

        rr_intervals = np.diff(peaks) / fs  # RR intervals in seconds
        hr_values = 60 / rr_intervals  # Heart rate in beats per minute (bpm)

        # Each heart-rate value is placed at the midpoint of the two peaks it
        # was computed from.
        time_peaks = peaks / fs  # Time of R-peaks in seconds
        time_hr = (time_peaks[:-1] + time_peaks[1:]) / 2
        return time_hr, hr_values
    except Exception as e:
        # Deliberately best-effort: a broken or missing record must not stop a
        # batch run; callers detect the failure through the empty result.
        print(f"Error processing record {p_record_name}: {e}")
        return [], []

In [71]:
def plot_hr_signal(time_hr, hr_values):
    """Display heart rate against time as a marked line plot in a new figure.

    Args:
        time_hr: X values, labelled as time in seconds.
        hr_values: Y values, labelled as heart rate in bpm.
    """
    line_style = {
        'label': 'Heart Rate (bpm)',
        'color': 'b',
        'marker': 'o',
        'linestyle': '-',
    }
    plt.figure(figsize=(10, 6))
    plt.plot(time_hr, hr_values, **line_style)
    # Axis decoration; these calls are independent of each other.
    plt.xlabel('Time (s)')
    plt.ylabel('Heart Rate (bpm)')
    plt.title('Heart Rate over Time')
    plt.grid(True)
    plt.legend()
    plt.show()

In [72]:
# Descomentar para visualizar métricas de diagnóstico
# with open(folder_name + '/RECORDS') as f:
#     for line in f:
#         record_name = line.strip()
#         file_name = f'{folder_name}/{record_name}'
#         files.append(file_name)
#         header = wfdb.rdheader(file_name)
#         diagnosis = header.comments[4].split(': ')[1]
#         diagnosis_per_patient[f'{record_name}'] = diagnosis
#         kinds_of_diagnosis.add(diagnosis)
#         if diagnosis in patients_per_diagnosis:
#             patients_per_diagnosis[diagnosis].append(record_name)
#         else:
#             patients_per_diagnosis[diagnosis] = [record_name]
#         time_hr, hr_values = calculate_hr(file_name)
#         plot_hr_signal(time_hr, hr_values)



In [73]:
def calcular_min_hrv(hr):
    """Return the smallest non-zero absolute change between consecutive values.

    Args:
        hr: Sequence of heart-rate values.

    Returns:
        The minimum non-zero ``abs(hr[i] - hr[i - 1])``. The result is capped
        at 1000, which is also what is returned when there is no non-zero
        change at all (empty, single-element or constant input).
    """
    ceiling = 1000
    steps = (abs(current - previous) for previous, current in zip(hr, hr[1:]))
    # The ceiling goes first so that it wins ties, matching a strict "<" scan.
    return min([ceiling, *(step for step in steps if step != 0)])

In [74]:
def calcular_max_hrv(hr):
    """Return the largest absolute change between consecutive values.

    Args:
        hr: Sequence of heart-rate values.

    Returns:
        The maximum ``abs(hr[i] - hr[i - 1])``, or 0 when ``hr`` has fewer
        than two values.
    """
    steps = [abs(current - previous) for previous, current in zip(hr, hr[1:])]
    # Seeding with 0 gives the floor used when there are no changes to compare.
    return max([0, *steps])

In [75]:
def calcular_mean_hrv(hr):
    """Return the mean absolute change between consecutive values.

    Args:
        hr: Sequence of heart-rate values.

    Returns:
        The average of ``abs(hr[i] - hr[i - 1])`` over all consecutive pairs,
        or ``0.0`` when ``hr`` has fewer than two values (no pair exists).
    """
    steps = [abs(current - previous) for previous, current in zip(hr, hr[1:])]
    if not steps:
        # Nothing to average; also avoids dividing by zero on empty input.
        return 0.0
    # Divide by the number of differences (len(hr) - 1), not by len(hr), so
    # the mean is taken over the same values the median/std functions use.
    return sum(steps) / len(steps)

In [76]:
def calcular_median_hrv(hr):
    """Return the median absolute change between consecutive values.

    Args:
        hr: Sequence of heart-rate values.

    Returns:
        ``np.median`` of ``abs(hr[i] - hr[i - 1])`` over all consecutive pairs.
    """
    steps = [abs(current - previous) for previous, current in zip(hr, hr[1:])]
    return np.median(steps)

In [77]:
def calcular_std_hrv(hr):
    """Return the standard deviation of the changes between consecutive values.

    Args:
        hr: Sequence of heart-rate values.

    Returns:
        ``np.std`` (population form, NumPy's default) of
        ``abs(hr[i] - hr[i - 1])`` over all consecutive pairs.
    """
    steps = [abs(current - previous) for previous, current in zip(hr, hr[1:])]
    return np.std(steps)

In [78]:
def count_outliers(hr):
    """Count the values lying outside the 5th-95th percentile band of ``hr``.

    Args:
        hr: Non-empty sequence of heart-rate values.

    Returns:
        The number of values strictly below the 5th percentile or strictly
        above the 95th percentile.
    """
    lower_bound = np.percentile(hr, 5)
    upper_bound = np.percentile(hr, 95)
    return sum(
        1 for sample in hr if sample < lower_bound or sample > upper_bound
    )

In [79]:
import pandas as pd
from scipy.signal import find_peaks
from hr_engine import get_features


# Build one feature row per record listed in the dataset's RECORDS index and
# collect the rows into the DataFrame `df`.

# NOTE(review): `times` and `hear_rates` are created empty and never used in
# this cell; `hear_rates` looks like a typo of `heart_rates` — confirm whether
# any later cell relies on either name before removing them.
times = pd.DataFrame()
hear_rates = pd.DataFrame()
# Debug switch: when True, stop after the first record that produced features.
process_only_one = False
# One entry per successfully processed record, as returned by get_features.
results = []
with open(folder_name + '/RECORDS') as f:
    for line in f:
        
        # Each line of RECORDS is a record name relative to the dataset folder.
        record_name = line.strip()
        file_name = f'{folder_name}/{record_name}'
        files.append(file_name)
        # NOTE(review): `header` is only consumed by the commented-out diagnosis
        # code below. The call sits outside calculate_hr's try/except, so any
        # error it raises propagates out of this loop.
        header = wfdb.rdheader(file_name)
        # diagnosis = header.comments[4].split(': ')[1]
        # if diagnosis == 'n/a':
        #     continue
        # diagnosis_per_patient[f'{record_name}'] = diagnosis
        # kinds_of_diagnosis.add(diagnosis)
        # if diagnosis in patients_per_diagnosis:
        #     patients_per_diagnosis[diagnosis].append(record_name)
        # else:
        #     patients_per_diagnosis[diagnosis] = [record_name]
        # calculate_hr prints the error and returns two empty lists on failure.
        time_hr, hr_values = calculate_hr(file_name)
        
        # remove first 5 and last 5 values of time_hr and hr_values
        # (a series with 10 or fewer values becomes empty and is skipped below)
        time_hr = time_hr[5:-5]
        hr_values = hr_values[5:-5]

        # if diagnosis == 'n/a':
        #     continue
        
        # diagnosis = 0 if diagnosis == "Healthy control" else 1
        # print(time_hr)
        # Skip records that failed to load or are too short after trimming.
        if(len(hr_values) == 0 or len(time_hr) == 0):
            continue
        # get_features comes from the project-local hr_engine module (not shown
        # here); presumably it returns one row of features per record — confirm.
        # The diagnosis label is hard-coded to 0 for every record.
        results.append(get_features(hr_values=hr_values, time_hr=time_hr, record_name=record_name, diagnosis=0))

        if process_only_one:
            break
# print(kinds_of_diagnosis)
df = pd.DataFrame(results)
# Last expression of the cell: in a notebook this displays the first rows.
df.head()

# print(kinds_of_diagnosis)
        

  hr_smoothed = pd.Series(hr_values).rolling(window=window_size, center=True).mean().fillna(method='bfill').fillna(
  hr_smoothed = pd.Series(hr_values).rolling(window=window_size, center=True).mean().fillna(method='bfill').fillna(
  hr_smoothed = pd.Series(hr_values).rolling(window=window_size, center=True).mean().fillna(method='bfill').fillna(
  hr_smoothed = pd.Series(hr_values).rolling(window=window_size, center=True).mean().fillna(method='bfill').fillna(
  hr_smoothed = pd.Series(hr_values).rolling(window=window_size, center=True).mean().fillna(method='bfill').fillna(
  hr_smoothed = pd.Series(hr_values).rolling(window=window_size, center=True).mean().fillna(method='bfill').fillna(
  hr_smoothed = pd.Series(hr_values).rolling(window=window_size, center=True).mean().fillna(method='bfill').fillna(
  hr_smoothed = pd.Series(hr_values).rolling(window=window_size, center=True).mean().fillna(method='bfill').fillna(
  hr_smoothed = pd.Series(hr_values).rolling(window=window_size, center=

Error processing record ./data/autonomic-aging-a-dataset/0400: [Errno 2] No such file or directory: '/home/gino/Documents/Repos/pia-02/data/autonomic-aging-a-dataset/0400.dat'


  hr_smoothed = pd.Series(hr_values).rolling(window=window_size, center=True).mean().fillna(method='bfill').fillna(
  hr_smoothed = pd.Series(hr_values).rolling(window=window_size, center=True).mean().fillna(method='bfill').fillna(
  hr_smoothed = pd.Series(hr_values).rolling(window=window_size, center=True).mean().fillna(method='bfill').fillna(
  hr_smoothed = pd.Series(hr_values).rolling(window=window_size, center=True).mean().fillna(method='bfill').fillna(
  hr_smoothed = pd.Series(hr_values).rolling(window=window_size, center=True).mean().fillna(method='bfill').fillna(
  hr_smoothed = pd.Series(hr_values).rolling(window=window_size, center=True).mean().fillna(method='bfill').fillna(
  hr_smoothed = pd.Series(hr_values).rolling(window=window_size, center=True).mean().fillna(method='bfill').fillna(
  hr_smoothed = pd.Series(hr_values).rolling(window=window_size, center=True).mean().fillna(method='bfill').fillna(
  hr_smoothed = pd.Series(hr_values).rolling(window=window_size, center=

Unnamed: 0,patient,diagnosis,highest_heart_rate,lowest_heart_rate,mean_heart_rate,median_heart_rate,standard_deviation_hr,minimum_hrv,maximum_hrv,mean_hrv,...,hf_power,mean_rr,standard_deviation_rr,minimum_rr,maximum_rr,mean_deviation,tendency_standard_deviation,approximation_entropy,sample_entropy,outliers_percentage
0,1,0,93.457944,55.096419,71.142706,70.921986,5.419047,0.074156,23.767573,4.378212,...,34520.102755,2.625132,0.728406,1.5775,7.461,8.613049e-15,2.795462,1.712355,2.242627,0.098953
1,2,0,96.774194,50.041701,66.990478,66.740823,5.478786,0.055527,42.127947,4.415088,...,113821.522094,2.816591,0.828534,1.5095,7.3005,-1.046556e-14,3.289874,1.81309,2.231176,0.087383
2,3,0,94.936709,89.153046,92.497468,92.449923,0.879615,0.135882,2.286585,0.739461,...,22417.568105,2.7556,0.918674,1.2755,7.16,-2.096936e-15,0.523282,2.161885,2.388231,0.078119
3,4,0,100.0,51.194539,61.963556,61.728395,4.315109,0.051014,45.206571,2.167439,...,30039.953302,4.361192,1.590367,1.802,10.3065,-2.412289e-15,2.540033,1.644848,1.9533,0.100109
4,5,0,99.502488,49.464138,67.771732,66.518847,7.780453,0.057165,40.957202,7.632254,...,57916.212213,3.145093,1.106143,1.481,6.91,-1.414015e-15,3.314687,1.43717,1.704032,0.099502


In [80]:
# Write the feature table to a CSV file in the working directory, omitting the
# DataFrame index column.
df.to_csv('autonomic-aging-a-dataset-clear-data.csv', index=False)