In [1]:
# import all packages needed
import string, math
import numpy as np
import pandas as pd
from matplotlib import pyplot
from base64 import b64decode as decode
from transformers import GPT2Tokenizer, GPT2Model


import torch
import torch.nn as nn
import torch.nn.functional as F

from sklearn.model_selection import train_test_split

## Data Processing / Cleaning

In [2]:
# use class base64 to decode waveform data
def to_array(wf):
    barr = bytearray(decode(wf))
    vals = np.array(barr)
    return vals.view(np.int16)

# read in data
exam_data = pd.read_csv("data/d_exam.csv").drop(columns = ["site_num", "patient_id_edit"])
waveform_data = pd.read_csv("data/d_waveform.csv")
lead_data = pd.read_csv("data/d_lead_data.csv").drop(columns = ["exam_id"])
diagnosis_data = pd.read_csv("data/d_diagnosis.csv").drop(columns = ["user_input"])

# add decoded data as a column to lead data
waveforms = list(lead_data['waveform_data'])
lead_data['decoded_waveform'] = [to_array(i) for i in waveforms]

# merge waveform data and lead data
waveform_lead = lead_data.merge(waveform_data, how = "left", left_on = "waveform_id", right_on = "waveform_id", suffixes = (None, None))

#  sort by exam id and lead id
waveform_lead.sort_values(by = ["waveform_id", "lead_id"], inplace = True)

waveform_lead.loc[:, ['exam_id', 'lead_id', 'decoded_waveform', 'waveform_type']]
waveform_lead

Unnamed: 0,lead_data_id,waveform_id,WavfmType,lead_id,lead_byte_count_total,lead_time_offset,waveform_data,lead_sample_count_total,lead_amplitude,lead_units,...,exam_id,waveform_type,number_of_leads,Waveform_Start_Time,Sample_Type,Sample_Base,Sample_Exponent,High_Pass_Filter,Low_Pass_Filter,AC_Filter
10,9078054,1095618,,I,5000,0,+P/4//j/+P/4//j/+P/5//r/+//8//z//P/7//r/+f/4/...,2500,4.88,MICROVOLTS,...,549871,Rhythm,8,0,CONTINUOUS_SAMPLES,250,0,5,150,NONE
15,9081703,1095618,,II,5000,0,9v/2//b/8//w//D/8P/x//L/8//0//T/9P/z//L/8f/w/...,2500,4.88,MICROVOLTS,...,549871,Rhythm,8,0,CONTINUOUS_SAMPLES,250,0,5,150,NONE
8,9074278,1095618,,V1,5000,0,/v/+//7//v/+////AAAAAAAAAQACAAIAAgACAAIAAgACA...,2500,4.88,MICROVOLTS,...,549871,Rhythm,8,0,CONTINUOUS_SAMPLES,250,0,5,150,NONE
1,9066887,1095618,,V2,5000,0,9v/1//T/9P/0//T/9P/0//T/9f/2//b/9v/2//b/9v/2/...,2500,4.88,MICROVOLTS,...,549871,Rhythm,8,0,CONTINUOUS_SAMPLES,250,0,5,150,NONE
18,9082771,1095618,,V3,5000,0,7v/u/+7/7f/s/+z/7P/t/+7/7v/u/+7/7v/u/+7/7v/u/...,2500,4.88,MICROVOLTS,...,549871,Rhythm,8,0,CONTINUOUS_SAMPLES,250,0,5,150,NONE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150,9187141,1109067,,V4,1200,0,KAApACoAKwAsACwALQAtAC4ALgAuAC4ALgAuAC4ALgAvA...,600,4.88,MICROVOLTS,...,554080,Rhythm,8,0,CONTINUOUS_SAMPLES,500,0,16,150,NONE
152,9190675,1109067,,V5,1200,0,FgAXABkAGQAbABsAGwAbABsAGwAbABwAHQAeAB4AHgAfA...,600,4.88,MICROVOLTS,...,554080,Rhythm,8,0,CONTINUOUS_SAMPLES,500,0,16,150,NONE
155,9177603,1109067,,V5,10000,0,+v/6//r/+v/7//z//f/+//z//P/8//z//v/+//7//v/+/...,5000,4.88,MICROVOLTS,...,554080,Rhythm,8,0,CONTINUOUS_SAMPLES,500,0,16,150,NONE
140,9172851,1109067,,V6,10000,0,7v/u/+7/7v/x//L/8//0//T/9P/0//T/9P/0//T/9P/0/...,5000,4.88,MICROVOLTS,...,554080,Rhythm,8,0,CONTINUOUS_SAMPLES,500,0,16,150,NONE


In [3]:
# concatenate all leads into a single array
waveform_lead_concat = waveform_lead.groupby(["exam_id", "waveform_type"])['decoded_waveform'].apply(lambda x: tuple(x)).reset_index()
waveform_lead_concat
# remove irregular observations, concat tuple into numpy array
waveform_lead_concat = waveform_lead_concat.drop([12,17], axis = 0)
waveform_lead_concat['decoded_waveform'] = waveform_lead_concat['decoded_waveform'].apply(lambda x: np.vstack(x))#.apply(lambda x: np.transpose(x))

waveform_lead_rhythm = waveform_lead_concat[waveform_lead_concat['waveform_type'] == "Rhythm"]
#waveform_lead_median = waveform_lead_concat[waveform_lead_concat['waveform_type'] == "Median"]

for value in waveform_lead_rhythm["decoded_waveform"]:
    print(np.max(value))
    print(np.min(value))

135
-189
257
-191
290
-425
408
-212
289
-193
350
-314
195
-167
202
-100


In [4]:
# Adding the labels/sentences
exams = diagnosis_data["exam_id"].unique()

# Let's look over this tomorrow
diagnosis_data = diagnosis_data[diagnosis_data['Original_Diag'] == 1].dropna()
#searchfor = ['previous', 'unconfirmed', 'compared', 'interpretation', 'significant']
#diagnosis_data = diagnosis_data.loc[diagnosis_data['Full_text'].str.contains('|'.join(searchfor)) != 1]
#

diagnosis_data.sort_values(by=["exam_id", "statement_order"], inplace=True)
diagnoses = []
curr_id = 0
curr_string = ""
for i, row in diagnosis_data.iterrows():
    if row["statement_order"] == 1 and curr_string != "":
        curr_string = curr_string.lower().translate(str.maketrans('', '', string.punctuation))
        val = [curr_id, curr_string[1:]]
        diagnoses.append(val)
        curr_string = ""
        curr_id = row["exam_id"]

    if curr_id == 0:
        curr_id = row["exam_id"]
    
    curr_string += " " + row["Full_text"]

diagnosis_df = pd.DataFrame(diagnoses, columns = ['exam_id', 'diagnosis'])
waveform_lead_rhythm_diag = pd.merge(left=waveform_lead_rhythm, right=diagnosis_df, left_on='exam_id', right_on='exam_id')

#waveform_lead_rhythm_diag
for i in waveform_lead_rhythm_diag["diagnosis"]:
    print(i)

normal sinus rhythm low voltage qrs borderline ecg when compared with ecg of unconfirmed no significant change was found
sinus bradycardia otherwise normal ecg no previous ecgs available
sinus tachycardia otherwise normal ecg no previous ecgs available
normal sinus rhythm normal ecg no previous ecgs available
normal sinus rhythm normal ecg no previous ecgs available
normal sinus rhythm with sinus arrhythmia minimal voltage criteria for lvh may be normal variant borderline ecg no previous ecgs available
atrial fibrillation abnormal ecg no previous ecgs available
 poor data quality interpretation may be adversely affected normal sinus rhythm with sinus arrhythmia normal ecg no previous ecgs available


In [5]:
unique_words = set()
for num, sentence in diagnoses:
    for word in sentence.split():
        unique_words.add(word)
print(unique_words)
unique_words = list(unique_words)
word_map = dict()
for i, word in enumerate(unique_words):
    word_map[word] = i+1
word_map[""] = 0
print(word_map)

{'ecg', 't', 'arrhythmia', 'significant', 'of', 'ischemia', 'quality', 'lvh', 'poor', 'affected', 'compared', 'change', 'for', 'qrs', 'inferior', 'be', 'minimal', 'criteria', 'voltage', 'atrial', 'unconfirmed', 'data', 'otherwise', 'abnormality', 'adversely', 'wave', 'when', 'borderline', 'rhythm', 'low', 'fibrillation', 'consider', 'was', 'no', 'may', 'sinus', 'bradycardia', 'found', 'interpretation', 'previous', 'with', 'ecgs', 'variant', 'abnormal', 'available', 'tachycardia', 'normal'}
{'ecg': 1, 't': 2, 'arrhythmia': 3, 'significant': 4, 'of': 5, 'ischemia': 6, 'quality': 7, 'lvh': 8, 'poor': 9, 'affected': 10, 'compared': 11, 'change': 12, 'for': 13, 'qrs': 14, 'inferior': 15, 'be': 16, 'minimal': 17, 'criteria': 18, 'voltage': 19, 'atrial': 20, 'unconfirmed': 21, 'data': 22, 'otherwise': 23, 'abnormality': 24, 'adversely': 25, 'wave': 26, 'when': 27, 'borderline': 28, 'rhythm': 29, 'low': 30, 'fibrillation': 31, 'consider': 32, 'was': 33, 'no': 34, 'may': 35, 'sinus': 36, 'brady

In [21]:
# split data into training and testing datasets
# y not included for now
def one_hot(x, dict_words):
    x = x.split(" ")
    array = []
    for i in x:
        array.append([0] + [1 if y == i else 0 for y in dict_words] + [0,0])
    for i in range(17-len(x)):
        array.append([1 if i == 30 else 0 for i in range(32)])
    return array

dict_words = list(unique_words)
#waveform_lead_rhythm_diag['diagnosis'] = waveform_lead_rhythm_diag['diagnosis'].apply(lambda x: one_hot(x, dict_words))

len(waveform_lead_rhythm_diag["diagnosis"][5])
train_x, test_x, train_y, test_y = train_test_split(waveform_lead_rhythm_diag['decoded_waveform'], waveform_lead_rhythm_diag['diagnosis'], test_size = 0.1, random_state = 2021)
train_x = torch.tensor(list(train_x)).float()
train_x.shape
train_x = torch.tensor(list(waveform_lead_rhythm_diag['decoded_waveform'])).float()
train_x.shape

torch.Size([8, 8, 2500])

## Model 1 - Conv1D Encoder w/ LSTM Decoder

In [25]:
# HYPERPARAMETERS
J = 8 # max number of filters per class
LR = 1e-3

# define global max pooling
class global_max_pooling_1d(nn.Module):
    def __init__(self):
        super().__init__()
    
    def forward(self, x):
        x, _ = torch.max(x, dim = 2)
        return(x)

# 1D grouped encoder model
encoder_conv = nn.Sequential()
encoder_conv.add_module('initial_norm', nn.BatchNorm1d(8))
encoder_conv.add_module('conv_1', nn.Conv1d(in_channels = 8, out_channels = 8, groups = 8, kernel_size = 5, padding = 2))
for i in range(2, (J+2), 2):
    if (i-2) == 0: 
        prev = 8
    else:
        prev = (i-2)*8
    encoder_conv.add_module('conv_{num}'.format(num = int(i / 2 + 1)), nn.Conv1d(in_channels = prev, out_channels = i*8, groups = 8, kernel_size = 5, padding = 2, stride = 3))
    encoder_conv.add_module('activation_{num}'.format(num = int(i / 2 + 1)), nn.ELU())
    encoder_conv.add_module('batch_norm_{num}'.format(num = int(i / 2 + 1)), nn.BatchNorm1d(i*8))
    
#encoder_conv.add_module('final_conv', nn.Conv1d(in_channels = J * 8, out_channels = 8, groups = 8, kernel_size = 5, padding = 2))
#encoder_conv.add_module('max_pool', nn.MaxPool1d(kernel_size = 5, padding = 2, stride = 1))
encoder_conv.add_module('reshape', nn.MaxPool1d(kernel_size = 5, padding = 2, stride = 1))


# summarize model, verify output is of desired shape
print(train_x[0].shape)
print(encoder_conv(torch.unsqueeze(train_x[0], 0)).shape)

torch.Size([8, 2500])
torch.Size([1, 64, 31])


## Model 2 - LSTM Encoder w/ Huggingface Decoder

In [1]:
# define hyperparameters 
hidden_layers = 512
embedding_dim = 8
num_words = len(unique_words)

class ECG_LSTM(nn.Module):
    def __init__(self, encoder, h_dim, e_dim, word_list_length):
        super(ECG_LSTM, self).__init__()
        self.encoder = encoder
        self.lstm = nn.LSTM(e_dim, h_dim)
        self.linear = nn.Linear(h_dim, word_list_length)
        
    def forward(self, seq):
        seq_embedded = self.encoder(seq)
        final_hidd, _ = self.lstm(seq_embedded)
        dec_seq = self.linear(final_hidd)
        return F.log_softmax(dec_seq)
    
lstm_dec = ECG_LSTM(encoder_conv, hidden_layers, embedding_dim, num_words)
lstm_dec(train_x).shape

NameError: name 'unique_words' is not defined

## Model 3 - Basic Transformer Architecture with Multi-Head Attention

# Transformer testing

In [8]:
from torch.nn import TransformerEncoder, TransformerEncoderLayer

class ECGTransformerEncoder(nn.Module):
    # Takes the ECG discrete signals sequence and maps into a probability distribution of diagnosis
    # For working/verification purposes
    def __init__(self, vector_size, n_inputs, n_heads, hidden_linear_dim, n_layers, dropout):
        super(ECGTransformerEncoder, self).__init__()
        self.model_type = "Transformer"
        self.positional_encoder = PositionalEncoder(64, dropout)
        
        #Since our data is already discrete numbers, might need some tweaking for this
        #self.embedder = SignalEmbedder(n_inputs, vector_size)
                        #64 31              #39        64
        
        
        self.encoder = TransformerEncoder(
            TransformerEncoderLayer(64, n_heads, hidden_linear_dim, dropout),
            n_layers)
        
        self.n_inputs = n_inputs
        self.n_layers = n_layers
        
        # Simple linear decoder
        self.decoder = nn.Sequential(
                        Transpose(64, 31),
                        nn.Linear(31, 17),
                        Transpose(17, 64),
                        nn.Linear(64, 30),
                        nn.LogSoftmax()
                        )
        self.init_weights()
        
    def init_weights(self):
        #self.embedder.weight.data.uniform_(-.1, .1)
        #self.decoder.bias.data.zero_()
        #self.decoder.weight.data.uniform_(-.1, .1)
        pass
        
    def forward(self, x):
        #x = self.embedder(x) # * math.sqrt(self.n_inputs)
        
        #x = x.squeeze(0)
        x = x.view(8,2500)
        x = self.positional_encoder(x)
        x = self.encoder(x)
        x = x.squeeze(0)   
        x = self.decoder(x)
        return x

class Transpose(nn.Module):
    def __init__(self, *args):
        super(Transpose, self).__init__()
        self.shape = args

    def forward(self, x):
        # If the number of the last batch sample in the data set is smaller than the defined batch_batch size, mismatch problems will occur. You can modify it yourself, for example, just pass in the shape behind, and then enter it through x.szie(0).
        return x.view(self.shape)

class SignalEmbedder(nn.Module):
    # Necessary to convert the signal into "word" vectors for transformer processing.
    # Currently a simple group and slice method, but will modify later for multi-channel inputs
    
    def __init__(self, num_slices, size_of_slice):
        super(SignalEmbedder, self).__init__()
        self.num_slices = num_slices
        self.size_of_slice = size_of_slice
        
    def forward(self, x):
        x = x[: self.num_slices * self.size_of_slice]
        x = x.reshape((self.num_slices, self.size_of_slice))
        return x
'''
class OneHotConverter(nn.Module):
    # Converts the sigmoid output into one-hots
    
    def __init__(self, size, sentence_length):
        super(OneHotConverter, self).__init__()
        self.arr_length = size
        self.num_words = sentence_length
        
    def forward(self, x):
        output = []
        for num in x:
            num = num.item()
            num *= self.arr_length
            val = np.zeros(self.arr_length)
            val[int(round(num))] = 1
        
            output.append(val)
        output = torch.as_tensor(output)
        output.requires_grad_()
        return output
'''    

class PositionalEncoder(nn.Module):
    # Necessary to store positional data about the input data
    def __init__(self, d_model, dropout=0.1, max_len=1):
        super(PositionalEncoder, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        
        pos_encoding = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        
        divisor = torch.exp(torch.arange(0, d_model, 2).float() * (- math.log(10000.0) / d_model))
        
        pos_encoding[:, 0::2] = torch.sin(position * divisor)
        pos_encoding[:, 1::2] = torch.cos(position * divisor)
        pos_encoding = pos_encoding.unsqueeze(0).transpose(0, 1)
        
        self.register_buffer("pos_encoding", pos_encoding)

        
    def forward(self, x):
        x = x + self.pos_encoding[:x.size(0), :]
        return self.dropout(x)

In [10]:
# Training pipeline
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Model to set to
model = ECGTransformerEncoder(vector_size=8, n_inputs=2500, n_heads=8, hidden_linear_dim=2048, n_layers=4, dropout=0.3).to(device)

# Training params
loss_function = nn.NLLLoss()

optimizer = torch.optim.Adam(model.parameters(), lr = 1e-3)
torch.autograd.set_detect_anomaly(True)
data = []
for arr in waveform_lead_rhythm_diag["decoded_waveform"]:
    #print(arr)
    data.append(arr)

labels = []
for sentence in waveform_lead_rhythm_diag["diagnosis"]:
    #label = one_hot(sentence, dict_words)
    label = []
    for word in sentence.split():
        label.append(word_map[word])
    
    while len(label) < 17:
        label.append(0)
    labels.append(label)
data = torch.from_numpy(np.array(data)).type(torch.FloatTensor)
#labels = torch.from_numpy(np.array(labels))
print(data.shape)
model.train()

torch.Size([8, 8, 2500])


ECGTransformerEncoder(
  (positional_encoder): PositionalEncoder(
    (dropout): Dropout(p=0.3, inplace=False)
  )
  (encoder): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): _LinearWithBias(in_features=64, out_features=64, bias=True)
        )
        (linear1): Linear(in_features=64, out_features=2048, bias=True)
        (dropout): Dropout(p=0.3, inplace=False)
        (linear2): Linear(in_features=2048, out_features=64, bias=True)
        (norm1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.3, inplace=False)
        (dropout2): Dropout(p=0.3, inplace=False)
      )
      (1): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): _LinearWithBias(in_features=64, out_features=64, bias=True)
        )
        (linear1): Linear(in_features=64, out_f

In [12]:
for i in range(500):
    losses = 0
    for x, y in zip(data, labels):
        #print(x.shape)
        optimizer.zero_grad()
        outputs = model(torch.unsqueeze(x, 0))
        #print(outputs.shape)
        loss = loss_func-tion(outputs, y)
        losses += loss
    losses.backward(retain_graph=True)
    optimizer.step()
    print(losses)
    if losses < .001:
        break

for x, y in zip(train_x, labels):
    print(np.argmax(model(x).detach().numpy(), axis=1))
    print(y.detach().numpy())

RuntimeError: shape '[31, 64]' is invalid for input of size 20000

In [29]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2')

encoded_inputs = tokenizer.encode(labels[0], return_tensors='pt')
#print(labels[0], encoded_inputs)
labels[0]

[12, 23, 46, 32, 16, 47, 24, 33, 18, 14, 39, 33, 6, 5, 34, 19, 22, 28, 1]

## Model 4 - FNET Transformer Architecture

## Model 5 - FNET/Basic Mixup Architecture 