In [1]:
# Download the PTB Diagnostic ECG Database archive (about 1.7 GB), unpack it, then delete the zip.
# The steps are chained with && so that unzip and rm only run when the previous step succeeded;
# a failed download no longer cascades into a confusing unzip error.
!wget https://physionet.org/static/published-projects/ptbdb/ptb-diagnostic-ecg-database-1.0.0.zip -O temp.zip && unzip -q temp.zip && rm temp.zip

wget: /opt/conda/lib/libuuid.so.1: no version information available (required by wget)
--2020-01-06 18:05:00--  https://physionet.org/static/published-projects/ptbdb/ptb-diagnostic-ecg-database-1.0.0.zip
Resolving physionet.org (physionet.org)... 18.13.52.205
Connecting to physionet.org (physionet.org)|18.13.52.205|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1790456359 (1.7G) [application/zip]
Saving to: ‘temp.zip’


2020-01-06 18:05:26 (67.0 MB/s) - ‘temp.zip’ saved [1790456359/1790456359]



In [2]:
import glob

import numpy as np
import pandas as pd
import wfdb

In [3]:
# Root folder of the unpacked archive from https://physionet.org/content/ptbdb/1.0.0/
# (keep the trailing slash: the prefix is concatenated into the glob pattern as-is).
p = "ptb-diagnostic-ecg-database-1.0.0/"

# Match every *.hea header file located one directory level below the root.
template = f"{p}*/*.hea"
file_list = glob.glob(template)
file_list[:10]

['ptb-diagnostic-ecg-database-1.0.0/patient069/s0234lre.hea',
 'ptb-diagnostic-ecg-database-1.0.0/patient069/s0233lre.hea',
 'ptb-diagnostic-ecg-database-1.0.0/patient069/s0284lre.hea',
 'ptb-diagnostic-ecg-database-1.0.0/patient069/s0232lre.hea',
 'ptb-diagnostic-ecg-database-1.0.0/patient289/s0550_re.hea',
 'ptb-diagnostic-ecg-database-1.0.0/patient269/s0508_re.hea',
 'ptb-diagnostic-ecg-database-1.0.0/patient129/s0189_re.hea',
 'ptb-diagnostic-ecg-database-1.0.0/patient097/s0394lre.hea',
 'ptb-diagnostic-ecg-database-1.0.0/patient097/s0384lre.hea',
 'ptb-diagnostic-ecg-database-1.0.0/patient097/s0380lre.hea']

In [4]:
meta = []       # one metadata dict per record
data_raw = {}   # "patient/record_id" -> signal array stored as float32

for header_path in file_list:
    # Drop the 4-character ".hea" suffix; the remainder is the record path.
    record_path = header_path[:-4]
    patient, record_id = record_path[len(p):].split("/")

    # rdsamp yields a pair: the sample array and a dict of record fields.
    signals, fields = wfdb.rdsamp(record_path)

    # Store the samples under a combined key, converted to float32.
    data_raw[f"{patient}/{record_id}"] = signals.astype(np.float32)

    # Metadata row: identifiers first, then every field reported by wfdb.
    meta.append({"patient": patient, "record_id": record_id, **fields})

In [5]:
# Peek at the first ten record keys (iterating a dict yields its keys).
list(data_raw)[:10]

['patient069/s0234lre',
 'patient069/s0233lre',
 'patient069/s0284lre',
 'patient069/s0232lre',
 'patient289/s0550_re',
 'patient269/s0508_re',
 'patient129/s0189_re',
 'patient097/s0394lre',
 'patient097/s0384lre',
 'patient097/s0380lre']

In [6]:
# Write every signal array into a single compressed .npz archive;
# each array is stored under its "patient/record_id" key from data_raw.
np.savez_compressed("data_raw.npz", **data_raw)

In [7]:
# Build the metadata table (one row per record), ordered by patient and then record id.
# The original row index from the load order is kept.
df = pd.DataFrame(meta).sort_values(by=["patient", "record_id"])
df.head()

Unnamed: 0,patient,record_id,fs,sig_len,n_sig,base_date,base_time,units,sig_name,comments
356,patient001,s0010_re,1000,38400,15,,,"[mV, mV, mV, mV, mV, mV, mV, mV, mV, mV, mV, m...","[i, ii, iii, avr, avl, avf, v1, v2, v3, v4, v5...","[age: 81, sex: female, ECG date: 01/10/1990, D..."
357,patient001,s0014lre,1000,115200,15,,,"[mV, mV, mV, mV, mV, mV, mV, mV, mV, mV, mV, m...","[i, ii, iii, avr, avl, avf, v1, v2, v3, v4, v5...","[age: 81, sex: female, ECG date: 17/10/1990, D..."
355,patient001,s0016lre,1000,115200,15,,,"[mV, mV, mV, mV, mV, mV, mV, mV, mV, mV, mV, m...","[i, ii, iii, avr, avl, avf, v1, v2, v3, v4, v5...","[age: 81, sex: female, ECG date: 18/10/1990, D..."
308,patient002,s0015lre,1000,115200,15,,,"[mV, mV, mV, mV, mV, mV, mV, mV, mV, mV, mV, m...","[i, ii, iii, avr, avl, avf, v1, v2, v3, v4, v5...","[age: 58, sex: female, ECG date: 17/10/1990, D..."
540,patient003,s0017lre,1000,115200,15,,,"[mV, mV, mV, mV, mV, mV, mV, mV, mV, mV, mV, m...","[i, ii, iii, avr, avl, avf, v1, v2, v3, v4, v5...","[age: 63, sex: male, ECG date: 18/10/1990, Dia..."


In [8]:
# Show the raw comment list of the first record.
# The column is selected by label rather than by position, so this keeps
# pointing at "comments" even if the set or order of metadata fields changes.
df["comments"].iloc[0]

['age: 81',
 'sex: female',
 'ECG date: 01/10/1990',
 'Diagnose:',
 'Reason for admission: Myocardial infarction',
 'Acute infarction (localization): infero-latera',
 'Former infarction (localization): no',
 'Additional diagnoses: Diabetes mellitus',
 'Smoker: no',
 'Number of coronary vessels involved: 1',
 'Infarction date (acute): 29-Sep-90',
 'Previous infarction (1) date: n/a',
 'Previous infarction (2) date: n/a',
 'Hemodynamics:',
 'Catheterization date: 16-Oct-90',
 'Ventriculography: Akinesia inferior wall',
 'Chest X-ray: Heart size upper limit of norm',
 'Peripheral blood Pressure (syst/diast):  140/80 mmHg',
 'Pulmonary artery pressure (at rest) (syst/diast): n/a',
 'Pulmonary artery pressure (at rest) (mean): n/a',
 'Pulmonary capillary wedge pressure (at rest): n/a',
 'Cardiac output (at rest): n/a',
 'Cardiac index (at rest): n/a',
 'Stroke volume index (at rest): n/a',
 'Pulmonary artery pressure (laod) (syst/diast): n/a',
 'Pulmonary artery pressure (laod) (mean): n/a'

The "comments" column holds a list of "key: value" strings; convert it into separate pandas columns.

In [9]:
def convert_to_dct(row):
    """Convert a list of ``"key: value"`` comment strings into a dict.

    Each entry is split on its FIRST colon only, so a value that itself
    contains a colon (for example a clock time such as ``19:30``) is kept
    whole instead of being cut off at the second colon.

    Keys are stripped of surrounding whitespace and have inner spaces replaced
    by underscores. Values are stripped; an empty value or ``"n/a"`` becomes
    ``np.nan``. If a key occurs more than once, the last occurrence wins.

    Parameters
    ----------
    row : iterable of str
        Comment entries, each expected to look like ``"key: value"``.

    Returns
    -------
    dict
        Mapping of normalized key to value string (or ``np.nan``).

    Raises
    ------
    ValueError
        If an entry contains no colon and therefore cannot be split into a
        key and a value.
    """
    result = {}
    for entry in row:
        key, sep, value = entry.partition(":")
        if not sep:
            # Validate before use: without a separator there is no value part.
            raise ValueError(f"comment entry is not a 'key: value' pair: {entry!r}")
        value = value.strip()
        result[key.strip().replace(" ", "_")] = np.nan if value in ("", "n/a") else value
    return result

In [10]:
# Parse each record's comment list into a dict, then spread the dict keys into columns.
comment_dicts = df["comments"].apply(convert_to_dct)
comment_columns = comment_dicts.apply(pd.Series)

# Swap the raw "comments" column for the expanded per-key columns.
df = pd.concat([df.drop(columns=["comments"]), comment_columns], axis=1)

In [11]:
# Preview the table now that the comment entries have been expanded into their own columns.
df.head()

Unnamed: 0,patient,record_id,fs,sig_len,n_sig,base_date,base_time,units,sig_name,age,...,Therapy,Infarction_date,Admission_date,Medication_pre_admission,Start_lysis_therapy_(hh.mm),Lytic_agent,Dosage_(lytic_agent),Additional_medication,In_hospital_medication,Medication_after_discharge
356,patient001,s0010_re,1000,38400,15,,,"[mV, mV, mV, mV, mV, mV, mV, mV, mV, mV, mV, m...","[i, ii, iii, avr, avl, avf, v1, v2, v3, v4, v5...",81,...,,29-Sep-90,29-Sep-90,Isosorbit-Dinitrate Digoxin Glibenclamide,19.0,Gamma-TPA,30 mg,Heparin Isosorbit-Mononitrate ASA Diazepam,ASA Isosorbit-Mononitrate Ca-antagonist Amilor...,ASA Isosorbit-Mononitrate Amiloride+Chlorothia...
357,patient001,s0014lre,1000,115200,15,,,"[mV, mV, mV, mV, mV, mV, mV, mV, mV, mV, mV, m...","[i, ii, iii, avr, avl, avf, v1, v2, v3, v4, v5...",81,...,,29-Sep-90,29-Sep-90,Isosorbit-Dinitrate Digoxin Glibenclamide,19.0,Gamma-TPA,30 mg,Heparin Isosorbit-Mononitrate ASA Diazepam,ASA Isosorbit-Mononitrate Ca-antagonist Amilor...,ASA Isosorbit-Mononitrate Amiloride+Chlorothia...
355,patient001,s0016lre,1000,115200,15,,,"[mV, mV, mV, mV, mV, mV, mV, mV, mV, mV, mV, m...","[i, ii, iii, avr, avl, avf, v1, v2, v3, v4, v5...",81,...,,29-Sep-90,29-Sep-90,Isosorbit-Dinitrate Digoxin Glibenclamide,19.0,Gamma-TPA,30 mg,Heparin Isosorbit-Mononitrate ASA Diazepam,ASA Isosorbit-Mononitrate Ca-antagonist Amilor...,ASA Isosorbit-Mononitrate Amiloride+Chlorothia...
308,patient002,s0015lre,1000,115200,15,,,"[mV, mV, mV, mV, mV, mV, mV, mV, mV, mV, mV, m...","[i, ii, iii, avr, avl, avf, v1, v2, v3, v4, v5...",58,...,,08-Oct-90,08-Oct-90,Ca-antagonist Estriol,8.0,Streptokinase,1.5 Mio IE,Nitrate Heparin Ca-antagonist Atropin Ranitidin,ASA Isosorbit-Mononitrate Ca-antagonist Ca-ant...,ASA Bisoprolol Ofloxazin
540,patient003,s0017lre,1000,115200,15,,,"[mV, mV, mV, mV, mV, mV, mV, mV, mV, mV, mV, m...","[i, ii, iii, avr, avl, avf, v1, v2, v3, v4, v5...",63,...,,02-Oct-90,02-Oct-90,-,,Gamma-TPA,30 mg,Furosemide,ASA Isosorbit-Mononitrate,ASA Isosorbit-Mononitrate


In [12]:
# Export the metadata table to CSV without the row index.
# NOTE(review): "units" and "sig_name" hold Python lists; presumably they are written
# as their string representation — confirm that downstream readers parse them back.
df.to_csv("meta.csv", index=False)