In [1]:
# import all packages needed
import numpy as np
import pandas as pd
from matplotlib import pyplot
from base64 import b64decode as decode
import torch 

from sklearn.model_selection import train_test_split

## Data Processing / Cleaning

In [9]:
def to_array(wf):
    """Decode a base64-encoded waveform into a 1-D array of 16-bit integers.

    Parameters
    ----------
    wf : str or bytes-like
        Base64 text holding the raw waveform bytes.

    Returns
    -------
    numpy.ndarray
        The decoded bytes reinterpreted as ``np.int16`` (native byte order),
        i.e. one sample per two bytes.
    """
    # Decode into a mutable byte buffer and materialise it as a uint8 array.
    raw_bytes = np.array(bytearray(decode(wf)))
    # Reinterpret consecutive byte pairs as int16 samples (no copy).
    samples = raw_bytes.view(np.int16)
    return samples

# Load the raw tables, dropping the listed columns from each one.
exam_data = pd.read_csv("data/d_exam.csv").drop(columns=["site_num", "patient_id_edit"])
waveform_data = pd.read_csv("data/d_waveform.csv")
# exam_id is dropped here, so after the merge below it comes from waveform_data.
lead_data = pd.read_csv("data/d_lead_data.csv").drop(columns=["exam_id"])
diagnosis_data = pd.read_csv("data/d_diagnosis.csv").drop(columns=["user_input"])

# Decode every base64 waveform string and store the result as a new column.
waveforms = list(lead_data["waveform_data"])
lead_data["decoded_waveform"] = [to_array(encoded) for encoded in waveforms]

# Attach the waveform-level columns to each lead row (left join on waveform_id),
# then order the rows by waveform id and lead id. The result is assigned rather
# than sorted in place, so re-running the cell always starts from the merge.
# suffixes=(None, None) disables suffix renaming, so the two tables are expected
# to share no column other than the key -- TODO confirm.
waveform_lead = (
    lead_data
    .merge(waveform_data, how="left", on="waveform_id", suffixes=(None, None))
    .sort_values(by=["waveform_id", "lead_id"])
)

# Pair every lead row with the diagnosis rows of the same exam (inner join).
# An exam with several diagnosis lines yields one output row per line.
waveform_and_diag = pd.merge(
    waveform_lead[["exam_id", "lead_id", "decoded_waveform", "waveform_type"]],
    diagnosis_data[["exam_id", "Full_text", "Original_Diag"]],
    on="exam_id",
)
waveform_and_diag.head()

Unnamed: 0,exam_id,lead_id,decoded_waveform,waveform_type,Full_text,Original_Diag
0,549871,I,"[-8, -8, -8, -8, -8, -8, -8, -7, -6, -5, -4, -...",Rhythm,No previous ECGs available,0
1,549871,I,"[-8, -8, -8, -8, -8, -8, -8, -7, -6, -5, -4, -...",Rhythm,Otherwise normal ECG,0
2,549871,I,"[-8, -8, -8, -8, -8, -8, -8, -7, -6, -5, -4, -...",Rhythm,Sinus bradycardia,0
3,549871,I,"[-8, -8, -8, -8, -8, -8, -8, -7, -6, -5, -4, -...",Rhythm,,0
4,549871,I,"[-8, -8, -8, -8, -8, -8, -8, -7, -6, -5, -4, -...",Rhythm,Sinus bradycardia,1


In [6]:
# Gather the decoded lead waveforms of each (exam_id, waveform_type) pair
# into a single tuple, one row per pair.
waveform_lead_concat = (
    waveform_lead
    .groupby(["exam_id", "waveform_type"])["decoded_waveform"]
    .apply(tuple)
    .reset_index()
)

# Drop the rows labelled 12 and 17 (flagged as irregular), then stack each
# tuple of leads row-wise and transpose so the leads run along the second axis.
waveform_lead_concat = waveform_lead_concat.drop(index=[12, 17])
waveform_lead_concat["decoded_waveform"] = waveform_lead_concat["decoded_waveform"].apply(
    lambda leads: np.vstack(leads).T
)


(300, 8)

In [106]:
# Hold out 10% of the rows for testing. No target (y) is defined yet, so only
# the waveform arrays are split; the partition depends only on the number of
# rows and the seed, so it matches what passing the series twice produced.
train_x, test_x = train_test_split(
    waveform_lead_concat["decoded_waveform"], test_size=0.1, random_state=2021
)
# Use positional access: the split shuffles rows, so label 0 is not guaranteed
# to be in the training set and `train_x[0]` could raise KeyError.
train_x.iloc[0].shape

(300, 8)

## Model 1 - Conv1D Encoder w/ Hugging Face Decoder

## Model 2 - LSTM Encoder w/ Hugging Face Decoder

## Model 3 - Basic Transformer Architecture with Multi-Head Attention

## Model 4 - FNet Transformer Architecture

## Model 5 - FNet/Basic Mixup Architecture