# Extracción de datos de la PTB Diagnostic ECG Database

In [1]:
import wfdb
import numpy as np
import matplotlib.pyplot as plt
from scipy.signal import butter, filtfilt, find_peaks

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [2]:
# Root directory of the local copy of the PTB Diagnostic ECG Database.
folder_name = './data/ptb-diagnostic-ecg-database-1.0.0'

# Accumulators filled while walking the RECORDS index:
files = list()                   # full path of every record visited
diagnosis_per_patient = dict()   # record name -> diagnosis string
patients_per_diagnosis = dict()  # diagnosis string -> list of record names
kinds_of_diagnosis = set()       # distinct diagnosis strings seen

In [3]:
def bandpass_filter(signal, lowcut, highcut, fs, order=1):
    """Apply a zero-phase Butterworth band-pass filter to ``signal``.

    Args:
        signal: Samples to filter (passed straight to ``scipy.signal.filtfilt``).
        lowcut: Lower cut-off frequency, in the same unit as ``fs``.
        highcut: Upper cut-off frequency, in the same unit as ``fs``.
        fs: Sampling frequency of ``signal``.
        order: Order handed to ``scipy.signal.butter`` (default 1).

    Returns:
        The filtered signal as returned by ``filtfilt`` (forward-backward pass).
    """
    nyquist = 0.5 * fs
    # butter() expects the band edges normalised by the Nyquist frequency.
    normalised_band = [edge / nyquist for edge in (lowcut, highcut)]
    numerator, denominator = butter(order, normalised_band, btype='band')
    filtered = filtfilt(numerator, denominator, signal)
    return filtered

In [4]:
def calculate_hr(p_record_name):
    """Estimate the beat-to-beat heart rate of a WFDB record from lead II.

    The record is read with ``wfdb.rdrecord``, lead II is band-pass filtered
    (0.5-50 Hz), R-peaks are located with ``scipy.signal.find_peaks`` and the
    spacing between consecutive peaks is converted to beats per minute.

    Args:
        p_record_name: Path of the record (without extension), as accepted by
            ``wfdb.rdrecord``.

    Returns:
        tuple: ``(time_hr, hr_values)`` where ``hr_values[i]`` is the heart
        rate in bpm derived from the i-th RR interval and ``time_hr[i]`` is the
        midpoint, in seconds, between the two peaks that delimit it. Both
        arrays are empty when fewer than two peaks are detected.
    """
    record = wfdb.rdrecord(p_record_name)
    ecg_data = record.p_signal  # 2-D array indexed as [sample, channel]

    # Step 1: pick lead II. Look it up by channel name so the function does not
    # depend on the channel order; fall back to the second column (the original
    # assumption) when the header does not name an 'ii' channel.
    channel_names = [str(name).lower() for name in (getattr(record, 'sig_name', None) or [])]
    lead_index = channel_names.index('ii') if 'ii' in channel_names else 1
    lead_II = ecg_data[:, lead_index]

    # Step 2: take the sampling frequency from the record header instead of
    # hard-coding it; 1000 Hz is kept as the fallback value.
    fs = getattr(record, 'fs', None) or 1000

    # Step 3: band-pass filter the lead before peak detection.
    filtered_lead_II = bandpass_filter(lead_II, 0.5, 50, fs)

    # Step 4: R-peak detection.
    # Peaks must be at least 600 ms apart.
    # NOTE(review): this spacing caps the detectable heart rate at 100 bpm.
    peaks, _ = find_peaks(filtered_lead_II, distance=fs * 0.6)

    # Step 5: RR intervals (seconds) and the corresponding heart rate (bpm).
    rr_intervals = np.diff(peaks) / fs
    hr_values = 60 / rr_intervals

    # Step 6: time axis, one point per RR interval, placed midway between the
    # two R-peaks that delimit the interval.
    time_peaks = peaks / fs
    time_hr = (time_peaks[:-1] + time_peaks[1:]) / 2
    return time_hr, hr_values

In [5]:
def plot_hr_signal(time_hr, hr_values):
    """Show a line plot of heart rate against time.

    Args:
        time_hr: X values, plotted on an axis labelled in seconds.
        hr_values: Y values, plotted on an axis labelled in beats per minute.
    """
    _, ax = plt.subplots(figsize=(10, 6))
    ax.plot(
        time_hr,
        hr_values,
        color='b',
        marker='o',
        linestyle='-',
        label='Heart Rate (bpm)',
    )
    ax.set_title('Heart Rate over Time')
    ax.set_xlabel('Time (s)')
    ax.set_ylabel('Heart Rate (bpm)')
    ax.grid(True)
    ax.legend()
    plt.show()

In [6]:
# Uncomment to collect the diagnosis metrics and plot the heart rate of every record
# with open(folder_name + '/RECORDS') as f:
#     for line in f:
#         record_name = line.strip()
#         file_name = f'{folder_name}/{record_name}'
#         files.append(file_name)
#         header = wfdb.rdheader(file_name)
#         diagnosis = header.comments[4].split(': ')[1]
#         diagnosis_per_patient[f'{record_name}'] = diagnosis
#         kinds_of_diagnosis.add(diagnosis)
#         if diagnosis in patients_per_diagnosis:
#             patients_per_diagnosis[diagnosis].append(record_name)
#         else:
#             patients_per_diagnosis[diagnosis] = [record_name]
#         time_hr, hr_values = calculate_hr(file_name)
#         plot_hr_signal(time_hr, hr_values)



In [8]:
import pandas as pd
from scipy.signal import find_peaks
from hr_engine import get_features


# Build one row of heart-rate features per record that has a diagnosis.
times = pd.DataFrame()       # NOTE(review): never used in this cell; kept in case another cell reads it
hear_rates = pd.DataFrame()  # NOTE(review): never used in this cell; kept in case another cell reads it
process_only_one = False     # set to True to stop after the first record with a diagnosis
EDGE_BEATS = 5               # heart-rate samples discarded at each end of every record

# Reset the shared accumulators so that re-running this cell does not append
# every record a second time.
files.clear()
diagnosis_per_patient.clear()
kinds_of_diagnosis.clear()
patients_per_diagnosis.clear()

results = []
with open(folder_name + '/RECORDS') as f:
    for line in f:
        record_name = line.strip()
        if not record_name:
            continue  # tolerate blank lines in the index file

        file_name = f'{folder_name}/{record_name}'
        files.append(file_name)
        header = wfdb.rdheader(file_name)

        # The diagnosis is read from the fifth header comment, after its "<key>: " prefix.
        # NOTE(review): presumably the "Reason for admission" line — confirm against the headers.
        # maxsplit=1 keeps the whole value even if it contains ': ' itself.
        diagnosis = header.comments[4].split(': ', 1)[1]
        if diagnosis == 'n/a':
            continue  # records without a diagnosis cannot be labelled

        diagnosis_per_patient[record_name] = diagnosis
        kinds_of_diagnosis.add(diagnosis)
        patients_per_diagnosis.setdefault(diagnosis, []).append(record_name)

        time_hr, hr_values = calculate_hr(file_name)

        # Drop the first and last EDGE_BEATS values of both series.
        time_hr = time_hr[EDGE_BEATS:-EDGE_BEATS]
        hr_values = hr_values[EDGE_BEATS:-EDGE_BEATS]

        # Binary target: 0 for healthy controls, 1 for any other diagnosis.
        label = 0 if diagnosis == "Healthy control" else 1

        results.append(get_features(hr_values=hr_values, time_hr=time_hr, record_name=record_name, diagnosis=label))

        if process_only_one:
            break
print(kinds_of_diagnosis)
df = pd.DataFrame(results)
df.head()

# print(kinds_of_diagnosis)
        

  hr_smoothed = pd.Series(hr_values).rolling(window=window_size, center=True).mean().fillna(method='bfill').fillna(
  hr_smoothed = pd.Series(hr_values).rolling(window=window_size, center=True).mean().fillna(method='bfill').fillna(
  hr_smoothed = pd.Series(hr_values).rolling(window=window_size, center=True).mean().fillna(method='bfill').fillna(
  hr_smoothed = pd.Series(hr_values).rolling(window=window_size, center=True).mean().fillna(method='bfill').fillna(
  hr_smoothed = pd.Series(hr_values).rolling(window=window_size, center=True).mean().fillna(method='bfill').fillna(
  hr_smoothed = pd.Series(hr_values).rolling(window=window_size, center=True).mean().fillna(method='bfill').fillna(
  hr_smoothed = pd.Series(hr_values).rolling(window=window_size, center=True).mean().fillna(method='bfill').fillna(
  hr_smoothed = pd.Series(hr_values).rolling(window=window_size, center=True).mean().fillna(method='bfill').fillna(
  hr_smoothed = pd.Series(hr_values).rolling(window=window_size, center=

{'Myocardial infarction', 'Healthy control', 'Heart failure (NYHA 4)', 'Palpitation', 'Heart failure (NYHA 2)', 'Unstable angina', 'Dysrhythmia', 'Bundle branch block', 'Cardiomyopathy', 'Stable angina', 'Hypertrophy', 'Myocarditis', 'Heart failure (NYHA 3)', 'Valvular heart disease'}


  hr_smoothed = pd.Series(hr_values).rolling(window=window_size, center=True).mean().fillna(method='bfill').fillna(
  hr_smoothed = pd.Series(hr_values).rolling(window=window_size, center=True).mean().fillna(method='bfill').fillna(
  hr_smoothed = pd.Series(hr_values).rolling(window=window_size, center=True).mean().fillna(method='bfill').fillna(
  hr_smoothed = pd.Series(hr_values).rolling(window=window_size, center=True).mean().fillna(method='bfill').fillna(
  hr_smoothed = pd.Series(hr_values).rolling(window=window_size, center=True).mean().fillna(method='bfill').fillna(
  hr_smoothed = pd.Series(hr_values).rolling(window=window_size, center=True).mean().fillna(method='bfill').fillna(
  hr_smoothed = pd.Series(hr_values).rolling(window=window_size, center=True).mean().fillna(method='bfill').fillna(
  hr_smoothed = pd.Series(hr_values).rolling(window=window_size, center=True).mean().fillna(method='bfill').fillna(


Unnamed: 0,diagnosis,minimum_hrv,maximum_hrv,standard_deviation_hrv,mean_hr_slope,tendency_slope,lowest_heart_rate,vlf_power,lf_power,hf_power,approximation_entropy
0,1.0,0.110015,3.831879,1.082505,-0.015019,-0.043778,0.791557,0.016909,0.070872,0.430561,0.181738
1,1.0,0.115902,33.458156,4.575001,-0.002718,-0.015723,0.540054,0.261005,2.562335,8.194554,1.186733
2,1.0,0.101858,32.215969,5.592603,-0.060638,-0.010089,0.497512,0.225568,3.679011,18.931846,1.174023
3,1.0,0.094814,2.659452,0.56683,-0.079587,-0.001058,0.753769,0.378114,1.523669,2.507117,0.88034
4,1.0,0.082173,1.821804,0.348418,-0.005154,-0.00365,0.682594,0.311337,1.204288,1.233432,0.679967


In [9]:
# Write the feature table to a CSV file, without the DataFrame index column.
df.to_csv(
    path_or_buf='ptb-diagnostic-clear-data.csv',
    index=False,
)