In [27]:
import pandas as pd
import numpy as np
import os
import wfdb
import ast

import tensorflow as tf
from tensorflow import keras
# Report how many GPUs TensorFlow can see.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

# Open a TF1-compat session that logs which device each op is placed on.
session_config = tf.compat.v1.ConfigProto(log_device_placement=True)
sess = tf.compat.v1.Session(config=session_config)

Num GPUs Available:  1
Device mapping:
/job:localhost/replica:0/task:0/device:GPU:0 -> device: 0, name: GeForce GTX 1060, pci bus id: 0000:01:00.0, compute capability: 6.1



# Load in Data

In [2]:
# Directory that holds the PTB-XL csv files.
data_path = 'C:\\Users\\burke\\Documents\\data\\ptb_xl'

# Read the record metadata table and the SCP statement lookup table.
database, scp_statements = (
    pd.read_csv(os.path.join(data_path, csv_name))
    for csv_name in ('ptbxl_database.csv', 'scp_statements.csv')
)

# Get diagnostic labels

In [1]:
def spc_to_diag(spc_codes, subclass=False, statements=None):
    '''
    Return the diagnostic label of the highest-valued SCP code of a record.

    SCP codes are stored as a dictionary mapping each code to a value,
     ex. {'NORM': 100.0, 'LVOLT': 0.0, 'SR': 0.0}
    The statements table (scp_statements.csv) provides a lookup to convert an
    SCP code to its diagnostic label.

    INPUTS:
        spc_codes: string form of the code dictionary, as found in the
            scp_codes column of ptbxl_database (an already-parsed dict is
            accepted as well)
        subclass: returns the diagnostic subclass if True, the diagnostic
            (super)class if False (default)
        statements: lookup DataFrame whose first column holds the codes and
            which has 'diagnostic_class' / 'diagnostic_subclass' columns.
            Defaults to the notebook-level `scp_statements` table.

    RETURNS:
        The label stored in the lookup table for the highest-valued code, or
        NaN when the dictionary is empty or the code is not in the table.

    RAISES:
        ValueError: if the code matches more than one row of the table
            (raised by Series.item()).
    '''

    # Convert string to dictionary (skip parsing if a dict was passed in)
    codes = spc_codes if isinstance(spc_codes, dict) else ast.literal_eval(spc_codes)

    # No codes at all: return NaN so a later dropna() can discard the record
    if not codes:
        return np.nan

    # Take the key that has the highest value (first key wins on ties)
    max_code = max(codes, key=codes.get)

    # Fall back to the notebook-level lookup table when none is supplied
    if statements is None:
        statements = scp_statements

    # Convert the maximum SCP code to the requested diagnostic label
    column = 'diagnostic_subclass' if subclass else 'diagnostic_class'
    matches = statements.loc[statements.iloc[:, 0] == max_code, column]

    # Unknown code: return NaN rather than letting .item() raise
    if matches.empty:
        return np.nan

    return matches.item()

In [4]:
# Map every record's code dictionary to its diagnostic superclass ...
superclasses = database.loc[:, 'scp_codes'].apply(spc_to_diag, subclass=False)
# ... and to its diagnostic subclass.
subclasses = database.loc[:, 'scp_codes'].apply(
    lambda code_string: spc_to_diag(code_string, subclass=True)
)

In [111]:
# Columns of the original metadata table that are still needed downstream.
keep = ['strat_fold', 'filename_lr', 'filename_hr']

# Build the trimmed table without mutating a frame in place. Unlike
# DataFrame.insert, .assign overwrites a column that already exists instead of
# raising, so this cell can be re-run safely. 'class' is a Python keyword,
# hence the dict unpacking.
database = (
    database[keep]
    .assign(**{'class': superclasses, 'subclass': subclasses})
    # Put the labels first, followed by the kept metadata columns.
    [['class', 'subclass'] + keep]
    # Drop records that have a missing value in any column.
    .dropna(axis=0)
)

database.head()

Unnamed: 0,class,subclass,strat_fold,filename_lr,filename_hr
0,NORM,NORM,3,records100/00000/00001_lr,records500/00000/00001_hr
1,NORM,NORM,2,records100/00000/00002_lr,records500/00000/00002_hr
2,NORM,NORM,5,records100/00000/00003_lr,records500/00000/00003_hr
3,NORM,NORM,3,records100/00000/00004_lr,records500/00000/00004_hr
4,NORM,NORM,4,records100/00000/00005_lr,records500/00000/00005_hr


## Dealing with diagnostic labels

# Get ECG Data

In [112]:
def get_ecg_data(filename, data_path):
    '''
    Read one WFDB record and return its signal array as float64.

    INPUTS:
        filename: record path relative to data_path (passed to wfdb.rdrecord
            after joining, so presumably without a file extension - confirm
            against the filename_lr / filename_hr columns)
        data_path: directory the record path is relative to

    RETURNS:
        The record's `p_signal` array cast to np.float64.
    '''
    record_path = os.path.join(data_path, filename)
    signal = wfdb.rdrecord(record_path).p_signal
    return signal.astype(np.float64)

In [113]:
# Directory that holds the PTB-XL records.
data_path = 'C:\\Users\\burke\\Documents\\data\\ptb_xl'
# Relative paths of the low-rate ("_lr") version of every kept record.
filenames = database['filename_lr']

# Load the signal array of each record; the result is a Series of arrays.
ecg_data = filenames.apply(lambda record_name: get_ecg_data(record_name, data_path))
ecg_data

0        [[-0.119, -0.055, 0.064, 0.086, -0.091, 0.004,...
1        [[0.004, 0.138, 0.134, -0.072, -0.065, 0.136, ...
2        [[-0.029, -0.079, -0.049, 0.054, 0.011, -0.064...
3        [[-0.054, -0.138, -0.083, 0.096, 0.015, -0.11,...
4        [[-0.034, -0.574, -0.54, 0.304, 0.253, -0.556,...
                               ...                        
21832    [[-0.052, -0.034, 0.018, 0.043, -0.034, -0.008...
21833    [[-0.05, -0.013, 0.036, 0.031, -0.042, 0.011, ...
21834    [[0.038, 0.024, -0.014, -0.03, 0.026, 0.005, 0...
21835    [[-0.057, -0.057, 0.0, 0.057, -0.028, -0.028, ...
21836    [[-0.049, -0.025, 0.024, 0.037, -0.036, 0.0, 0...
Name: filename_lr, Length: 21181, dtype: object

# Convert to Pickle

In [None]:
data_path = 'C:\\Users\\burke\\Documents\\data\\ptb_xl'

# Persist the loaded signals and the label table next to the raw data.
# (os.path.join, not "joing" - the typo raised AttributeError.)
ecg_data.to_pickle(os.path.join(data_path, 'ecg_data.pkl'))
database.to_pickle(os.path.join(data_path, 'database.pkl'))