## **ECG Diagnosis Code**

This code is based on the code developed here: https://doi.org/10.1038/s41467-020-15432-4

**Define Libraries**

In [1]:
from tensorflow.keras.layers import (
    Input, Conv1D, MaxPooling1D, Dropout, BatchNormalization, Activation, Add, Flatten, Dense)
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import (ModelCheckpoint, TensorBoard, ReduceLROnPlateau,
                                        CSVLogger, EarlyStopping)
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import precision_score, f1_score, recall_score, classification_report
import numpy as np
import h5py
import math
import pandas as pd
from tensorflow.keras.utils import Sequence
import numpy as np
import os

In [2]:
# Current working directory; every data and model path below is built relative to it.
cwd = os.getcwd()

**Load the data**

In [3]:
def ProcessLabels(data_info: pd.DataFrame, train_ids: np.ndarray):
    """Collect the abnormality labels for every tracing that has a row in the exam table.

    Args:
        data_info: Exam table with an ``exam_id`` column and one column per
            abnormality (``1dAVb``, ``AF``, ``LBBB``, ``RBBB``, ``SB``, ``ST``).
        train_ids: Exam ids to look up, one per tracing, in tracing order.

    Returns:
        A tuple ``(labels, good_rows)``. ``good_rows`` lists the positions in
        ``train_ids`` whose id appears in ``data_info``; ``labels`` is an
        ``int64`` array of shape ``(len(good_rows), 6)`` holding the label row
        of each of those ids. If an id occurs several times in ``data_info``
        the first occurrence is used.
    """
    abnormalities = ['1dAVb', 'AF', 'LBBB', 'RBBB', 'SB', 'ST']

    # Index the table once (id -> position of its first row) so each lookup is
    # O(1) instead of a full scan of the exam_id column per tracing.
    first_row_of = {}
    for row, exam_id in enumerate(data_info['exam_id'].to_numpy()):
        first_row_of.setdefault(exam_id, row)

    good_rows = []
    table_rows = []
    for position, exam_id in enumerate(train_ids):
        row = first_row_of.get(exam_id)
        if row is not None:
            good_rows.append(position)
            table_rows.append(row)

    # Gather all matching label rows with a single fancy-index; the explicit
    # integer dtype keeps the indexing valid when nothing matched.
    label_table = data_info[abnormalities].to_numpy()
    labels = label_table[np.asarray(table_rows, dtype=np.intp)]
    labels = labels.reshape(len(good_rows), len(abnormalities))

    return labels.astype('int64'), good_rows

# Load the training data.
# os.path.join keeps the paths valid on every OS; the previous hard-coded
# '\\' separators only resolved on Windows.
path_to_hdf5 = os.path.join(cwd, 'data', 'train', 'exams_part17.hdf5')
dataset_name = 'tracings'
path_to_csv = os.path.join(cwd, 'data', 'train', 'exams.csv')

# Read the tracings and their exam ids. The context manager closes the HDF5
# file even if one of the reads raises.
with h5py.File(path_to_hdf5, "r") as f:
    tracings_train = f[dataset_name][()]
    train_ids = f['exam_id'][()]
# Keep only the tracings whose exam id has a label row in the CSV.
labels_train, good_rows = ProcessLabels(pd.read_csv(path_to_csv), train_ids)
tracings_train = tracings_train[good_rows]

# Load the test data.
path_to_hdf5 = os.path.join(cwd, 'data', 'test', 'ecg_tracings.hdf5')
dataset_name = 'tracings'
path_to_csv = os.path.join(cwd, 'data', 'test', 'gold_standard.csv')

labels_test = pd.read_csv(path_to_csv).values
with h5py.File(path_to_hdf5, "r") as f:
    tracings_test = f[dataset_name][()]

## Current Model

**Define the NN model**

In [4]:
class ResidualUnit(object):
    """Residual block for 1D signals, applied to a ``[x, y]`` pair of tensors.

    ``x`` is the main connection (two convolutions) and ``y`` the skip
    connection; calling the unit returns the updated ``[x, y]`` pair.

    Args:
        n_samples_out: Number of samples along axis 1 after the unit.
        n_filters_out: Number of filters produced by the unit.
        kernel_initializer: Initializer for every convolution kernel.
        dropout_keep_prob: Keep probability; dropout rate is ``1 - dropout_keep_prob``.
        kernel_size: Kernel size of the two main-connection convolutions.
        preactivation: If true, add the skip connection before batch norm and
            activation; otherwise add it between batch norm and activation.
        postactivation_bn: If true, apply the activation before batch norm.
        activation_function: Name of the activation to use.
    """

    def __init__(self, n_samples_out, n_filters_out, kernel_initializer='he_normal',
                 dropout_keep_prob=0.8, kernel_size=17, preactivation=True,
                 postactivation_bn=False, activation_function='relu'):
        # Output geometry
        self.n_samples_out = n_samples_out
        self.n_filters_out = n_filters_out
        # Convolution settings
        self.kernel_size = kernel_size
        self.kernel_initializer = kernel_initializer
        # Regularisation / ordering options
        self.dropout_rate = 1 - dropout_keep_prob
        self.activation_function = activation_function
        self.preactivation = preactivation
        self.postactivation_bn = postactivation_bn

    def _skip_connection(self, y, downsample, n_filters_in):
        """Bring the skip tensor to the output length and filter count."""
        if downsample < 1:
            raise ValueError("Number of samples should always decrease.")
        # Shorten the signal when the unit downsamples
        if downsample > 1:
            y = MaxPooling1D(downsample, strides=downsample, padding='same')(y)
        if n_filters_in == self.n_filters_out:
            return y
        # Filter count changes: project with a 1x1 convolution (one of the two
        # alternatives from the ResNet paper; the other is zero-filling).
        projection = Conv1D(self.n_filters_out, 1, padding='same',
                            use_bias=False, kernel_initializer=self.kernel_initializer)
        return projection(y)

    def _batch_norm_plus_activation(self, x):
        """Apply batch norm and activation in the configured order."""
        if not self.postactivation_bn:
            normalized = BatchNormalization()(x)
            return Activation(self.activation_function)(normalized)
        activated = Activation(self.activation_function)(x)
        return BatchNormalization(center=False, scale=False)(activated)

    def _apply_dropout(self, tensor):
        """Apply dropout when the configured rate is positive."""
        if self.dropout_rate > 0:
            return Dropout(self.dropout_rate)(tensor)
        return tensor

    def __call__(self, inputs):
        """Run the unit on ``[x, y]`` and return the new ``[x, y]``."""
        main, skip = inputs
        samples_in, filters_in = skip.shape[1], skip.shape[2]
        stride = samples_in // self.n_samples_out
        skip = self._skip_connection(skip, stride, filters_in)

        # First convolution of the main connection
        main = Conv1D(self.n_filters_out, self.kernel_size, padding='same',
                      use_bias=False, kernel_initializer=self.kernel_initializer)(main)
        main = self._apply_dropout(self._batch_norm_plus_activation(main))

        # Second convolution; the stride performs the downsampling
        main = Conv1D(self.n_filters_out, self.kernel_size, strides=stride,
                      padding='same', use_bias=False,
                      kernel_initializer=self.kernel_initializer)(main)

        if not self.preactivation:
            main = BatchNormalization()(main)
            main = Add()([main, skip])  # merge skip and main connections
            main = Activation(self.activation_function)(main)
            main = self._apply_dropout(main)
            return [main, main]

        merged = Add()([main, skip])  # merge skip and main connections
        main = self._apply_dropout(self._batch_norm_plus_activation(merged))
        return [main, merged]


def get_model(n_classes, last_layer='sigmoid'):
    """Build the residual network that maps an ECG tracing to class outputs.

    The input is a ``(4096, 12)`` float32 signal. It passes through one
    convolution + batch norm + ReLU, then four residual units, and finally a
    flattened dense layer with ``n_classes`` outputs.

    Args:
        n_classes: Number of units in the output layer.
        last_layer: Activation of the output layer.

    Returns:
        The (uncompiled) Keras ``Model``.
    """
    kernel_size = 16
    kernel_initializer = 'he_normal'
    # (n_samples_out, n_filters_out) for each residual unit, in order
    stages = ((1024, 128), (256, 196), (64, 256), (16, 320))

    signal = Input(shape=(4096, 12), dtype=np.float32, name='signal')
    features = Conv1D(64, kernel_size, padding='same', use_bias=False,
                      kernel_initializer=kernel_initializer)(signal)
    features = BatchNormalization()(features)
    features = Activation('relu')(features)

    # The first unit receives the same tensor on both connections
    skip = features
    for n_samples_out, n_filters_out in stages:
        unit = ResidualUnit(n_samples_out, n_filters_out, kernel_size=kernel_size,
                            kernel_initializer=kernel_initializer)
        features, skip = unit([features, skip])

    flat = Flatten()(features)
    diagn = Dense(n_classes, activation=last_layer,
                  kernel_initializer=kernel_initializer)(flat)
    return Model(signal, diagn)


**Load Parameters**

Loading the parameters for the model that were found in the paper. We will call this our first model

They trained 10 neural networks with different initializations. They chose the model to use based on the median micro-average precision (mAP = 0.951). Since 10 is even, there is no single median run, so they picked the one right above the median.

In [5]:
# os.path.join keeps the path valid on every OS; the previous hard-coded
# '\\' separators only resolved on Windows.
path_to_model = os.path.join(cwd, 'model', 'model.hdf5')

# Training hyper-parameters shared by the cells below
loss = 'binary_crossentropy'
lr = 0.001
batch_size = 64
opt = Adam(lr)

# Load the saved network without its training configuration, then compile it
# with the settings above.
model_1 = load_model(path_to_model, compile=False)
model_1.compile(loss=loss, optimizer=opt)

**Data Formatting**

Here is the class for transforming the data into the proper format

In [6]:
class ECGSequence(Sequence):
    """Serve tracings, and optionally their labels, to Keras in batches.

    Args:
        tracings: Array of tracings indexed as ``tracings[sample, :, :]``.
        labels: Optional array of labels with one row per tracing. When
            omitted, batches contain the tracings only.
        batch_size: Maximum number of samples per batch.

    Raises:
        ValueError: If ``labels`` is given but its length differs from the
            number of tracings.
    """

    @classmethod
    def get_seq(cls, tracings: np.ndarray, labels: np.ndarray=None, batch_size=8):
        """Alternate constructor; same as ``cls(tracings, labels, batch_size)``."""
        return cls(tracings, labels, batch_size)

    def __init__(self, tracings:np.ndarray, labels:np.ndarray=None, batch_size:int=8):
        # Let the Keras base class set up whatever state it needs.
        super().__init__()
        if labels is not None and len(labels) != tracings.shape[0]:
            raise ValueError(
                f"Got {tracings.shape[0]} tracings but {len(labels)} label rows.")
        self.x = tracings
        self.y = labels
        self.batch_size = batch_size
        self.sample_count = tracings.shape[0]

    @property
    def n_classes(self):
        """Number of label columns; only available when labels were given."""
        return self.y.shape[1]

    def __getitem__(self, idx):
        """Return batch ``idx``: tracings alone, or ``(tracings, labels)``."""
        start = idx * self.batch_size
        end = start + self.batch_size
        batch_x = np.array(self.x[start:end, :, :])
        if self.y is None:
            return batch_x
        return batch_x, np.array(self.y[start:end])

    def __len__(self):
        """Number of batches; the last one may be smaller than ``batch_size``."""
        return math.ceil(self.sample_count / self.batch_size)


**Training Function**

We will also train the model with the data accessible for better comparison with the simplified model. We will call this the second model

For the sake of computational resources and time, the second model was trained only once, instead of being trained 10 times and then selected based on the median mAP.

In [12]:
class MyCNN:
    """Train and query a fresh network built by ``get_model``.

    Args:
        loss: Loss passed to ``Model.compile``.
        opt: Optimizer passed to ``Model.compile``. When it exposes
            ``get_config`` its configuration is snapshotted at construction,
            and every ``train`` call compiles the new model with a fresh
            optimizer rebuilt from that snapshot.
        verbose: Verbosity passed to ``Model.fit``.
    """

    def __init__(self, loss, opt, verbose):
        # Optimization settings.
        # NOTE: min_lr is derived from the module-level `lr`, not from `opt`.
        self.callbacks = [
            ReduceLROnPlateau(monitor='val_loss',
                              factor=0.1,
                              patience=7,
                              min_lr=lr / 100),
            EarlyStopping(monitor='val_loss',
                          patience=9,  # Patience should be larger than the one in ReduceLROnPlateau
                          min_delta=0.00001),
        ]

        self.loss = loss
        self.optimizer = opt
        self.verbose = verbose

        # Snapshot the optimizer settings before any training happens, so a
        # learning rate lowered by ReduceLROnPlateau in one run cannot leak
        # into the next one.
        get_config = getattr(opt, 'get_config', None)
        self._optimizer_config = get_config() if callable(get_config) else None

        # Saving the best and last model is currently disabled. To enable it,
        # append to self.callbacks:
        #   ModelCheckpoint('./backup_model_last.hdf5')
        #   ModelCheckpoint('./backup_model_best.hdf5', save_best_only=True)

    def _new_optimizer(self):
        """Return an optimizer with the original settings and no training state."""
        if self._optimizer_config is None:
            # e.g. a string identifier; compile() resolves it anew each time
            return self.optimizer
        # Pass a copy so from_config cannot modify the stored snapshot.
        return type(self.optimizer).from_config(dict(self._optimizer_config))

    def train(self, train_seq, val_seq):
        """Build a new model and fit it on ``train_seq``, validating on ``val_seq``."""
        self.model = get_model(train_seq.n_classes)
        # A new model needs a new optimizer: reusing one instance across models
        # carries over its accumulated state and current learning rate.
        self.model.compile(loss=self.loss, optimizer=self._new_optimizer())
        # Train neural network
        self.model.fit(train_seq,
            epochs=70,
            initial_epoch=0,  # If you are continuing an interrupted session change here
            callbacks=self.callbacks,
            validation_data=val_seq,
            verbose=self.verbose)

    def predict(self, test_seq):
        """Return the predictions of the most recently trained model for ``test_seq``."""
        return self.model.predict(test_seq,  verbose=1)




## Simplified Models

Need to choose what model I want

Going to have to use something like random forest because I need a multi-label classifier, or I can use sklearn.multioutput.MultiOutputClassifier and use any classifier

I think all of the data for all 12 leads is the set of features for each sample

**Define Models**

In [8]:
#TODO: Tune the hyperparameters
class RF_Model:
    """Random forest classifier behind the train/predict interface used in this notebook."""

    def __init__(self, verbose = 1):
        forest = RandomForestClassifier(verbose=verbose)
        self.model = forest

    def train(self, X: np.ndarray, y: np.ndarray):
        """Fit the forest on features ``X`` and targets ``y``."""
        self.model.fit(X, y)

    def predict(self, X):
        """Return the forest's predictions for ``X``."""
        predictions = self.model.predict(X)
        return predictions

class LR_model:
    """Logistic regression wrapped in ``MultiOutputClassifier``, with a train/predict interface."""

    def __init__(self, verbose = 1):
        base_estimator = LogisticRegression(verbose=verbose)
        self.model = MultiOutputClassifier(base_estimator)

    def train(self, X: np.ndarray, y: np.ndarray):
        """Fit the wrapped classifier on features ``X`` and targets ``y``."""
        self.model.fit(X, y)

    def predict(self, X: np.ndarray):
        """Return the wrapped classifier's predictions for ``X``."""
        predictions = self.model.predict(X)
        return predictions

**Data Formatting**

PCA for the simplified models

In [9]:
class PCA_Transform:
    """Standardise flattened samples and project them onto ``r`` principal components.

    Args:
        r: Number of principal components to keep.
    """

    def __init__(self, r:int):
        self.PCA_instance = PCA(n_components=r)

    def _flattenData(self, X:np.ndarray):
        """Return ``X`` as a 2D array with one row per sample.

        All axes after the first are merged into a single feature axis.
        ``np.hstack(X)`` is not suitable here: it concatenates the samples
        along their last axis, so the first axis of the result is no longer
        the sample axis and its width depends on the number of samples.
        """
        X = np.asarray(X)
        return X.reshape(X.shape[0], -1)

    def _processData(self, X: np.ndarray):
        """Fit a new ``StandardScaler`` on the flattened ``X``."""
        X_flat = self._flattenData(X)
        self.preprocess = StandardScaler()
        self.preprocess.fit(X_flat)

    def FitData(self, X: np.ndarray):
        """Fit the scaler and then the PCA on ``X``."""
        # Flatten once and reuse it for both the scaler and the PCA fit.
        X_flat = self._flattenData(X)
        self.preprocess = StandardScaler()
        self.PCA_instance.fit(self.preprocess.fit_transform(X_flat))

    def TransformData(self, X_train: np.ndarray, X_test: np.ndarray):
        """Scale and project both sets with the fitted scaler and PCA.

        ``FitData`` must have been called first.

        Returns:
            The transformed ``(X_train, X_test)`` pair.
        """
        X_train = self.preprocess.transform(self._flattenData(X_train))
        X_test = self.preprocess.transform(self._flattenData(X_test))
        return self.PCA_instance.transform(X_train), self.PCA_instance.transform(X_test)

## K-Fold

K-fold procedure for validation of the models

They use a validation set of 2% so something to think about

They didn't simply round the outputs; instead, there seems to be a threshold above which they consider an abnormality to be present.

They used precision-recall curves for things, but in total found precision, recall, specificity and F1 score

**Metrics Function**

Making a function to be able to call all of the metrics each fold

In [10]:
# One empty accumulator per (metric, model) pair; Find_metrics appends to these.
metrics = {
    metric_name: {model_name: np.array([]) for model_name in ('Model_1', 'Model_2', 'Model_3')}
    for metric_name in ('Precision', 'Recall', 'F1')
}

#TODO: Add specificity and add the ability to pull these for each class, maybe can just make an index and label encoder for the metrics
#with each numpy array being 2d
def Find_metrics(metrics: dict, model_name: str, y_true: np.ndarray, y_pred: np.ndarray,
                 threshold: float = 0.5):
    """Append per-class precision, recall and F1 for one model to ``metrics``.

    Args:
        metrics: Nested dict ``metrics[metric_name][model_name] -> 1D array``
            with the keys ``'Precision'``, ``'Recall'`` and ``'F1'``. It is
            updated in place and also returned.
        model_name: Key of the model whose scores are being recorded. A missing
            key is created on first use.
        y_true: True binary labels.
        y_pred: Predicted labels or scores. Values above ``threshold`` count
            as positive, so 0/1 predictions pass through unchanged with the
            default threshold.
        threshold: Decision threshold applied to ``y_pred``.

    Returns:
        The updated ``metrics`` dict.
    """
    # The sklearn scorers only accept hard labels, so scores such as sigmoid
    # outputs are binarised first.
    y_pred = (np.asarray(y_pred) > threshold).astype(int)

    scorers = {'Precision': precision_score, 'Recall': recall_score, 'F1': f1_score}
    for metric_name, scorer in scorers.items():
        previous = metrics[metric_name].get(model_name, np.array([]))
        # average=None yields one score per class; they are appended to the
        # flat history kept for this model.
        metrics[metric_name][model_name] = np.append(
            previous, scorer(y_true, y_pred, average=None))

    return metrics

In [13]:
kf = KFold(n_splits=3, shuffle=True)

# The networks output one score per class; scores above this value count as a
# positive prediction. The classification metrics need hard 0/1 labels.
decision_threshold = 0.5

# Initialize the models that need to be trained
model_2 = MyCNN(loss, opt, verbose=1)
# Simplified models (not enabled yet):
#   model_3 = RF_Model(verbose=1)
#   model_4 = LR_model(verbose=1)

# PCA initialization for the simplified models (not enabled yet):
#   PCA_transformer = PCA_Transform(r=60)

for train_index, test_index in kf.split(X=tracings_train[:, 1, 1], y=labels_train):

    X_train, X_test = tracings_train[train_index, :, :], tracings_train[test_index, :, :]
    y_train, y_test = labels_train[train_index], labels_train[test_index]

    # Put data in sequence for models 1 and 2 (CNN)
    train_seq = ECGSequence.get_seq(X_train, y_train, batch_size=64)
    test_seq = ECGSequence.get_seq(X_test, y_test, batch_size=64)

    # Transform data with PCA for models 3 and 4 (not enabled yet):
    #   PCA_transformer.FitData(X_train)
    #   PCA_X_train, PCA_X_test = PCA_transformer.TransformData(X_train, X_test)

    # Train models
    # Here the validation seq is just the test seq since we are using a k-fold analysis
    print('\n-------------------Training model 2----------------------')
    model_2.train(train_seq, val_seq=test_seq)
    # Not enabled yet:
    #   print('\n-------------------Training model 3----------------------')
    #   model_3.train(X=PCA_X_train, y=y_train)
    #   print('\n-------------------Training model 4----------------------')
    #   model_4.train(X=PCA_X_train, y=y_train)

    # Test models; scores are binarised so they can be compared with y_test
    print('\n-------------------Testing model 1----------------------')
    model_1_predict = (model_1.predict(test_seq) > decision_threshold).astype(int)
    print('\n-------------------Testing model 2----------------------')
    model_2_predict = (model_2.predict(test_seq) > decision_threshold).astype(int)
    # Not enabled yet (predict on the held-out fold, not the training fold):
    #   print('\n-------------------Testing model 3----------------------')
    #   model_3_predict = model_3.predict(PCA_X_test)
    #   print('\n-------------------Testing model 4----------------------')
    #   model_4_predict = model_4.predict(PCA_X_test)

    # Find metrics
    # Call the function each fold; the per-class scores can be averaged afterwards
    metrics = Find_metrics(metrics, 'Model_1', y_test, model_1_predict)
    metrics = Find_metrics(metrics, 'Model_2', y_test, model_2_predict)
    # Not enabled yet (model 4 needs its own 'Model_4' entry in `metrics`):
    #   metrics = Find_metrics(metrics, 'Model_3', y_test, model_3_predict)
    #   metrics = Find_metrics(metrics, 'Model_4', y_test, model_4_predict)



-------------------Training model 2----------------------
Epoch 1/70

KeyboardInterrupt: 

## Testing

**Train Models**

Train all the models that need to be trained on the entire training set

**Test Models**

Test all the models with the test set