In [26]:
# import all packages needed
import string 
import numpy as np
import pandas as pd
from matplotlib import pyplot
from base64 import b64decode as decode
import math
import torch
import torch.nn as nn 
import torch.nn.functional as F
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

## Data Processing / Cleaning

In [3]:
# use class base64 to decode waveform data
def to_array(wf):
    barr = bytearray(decode(wf))
    vals = np.array(barr)
    return vals.view(np.int16)

# read in data
exam_data = pd.read_csv("data/d_exam.csv").drop(columns = ["site_num", "patient_id_edit"])
waveform_data = pd.read_csv("data/d_waveform.csv")
lead_data = pd.read_csv("data/d_lead_data.csv").drop(columns = ["exam_id"])
diagnosis_data = pd.read_csv("data/d_diagnosis.csv").drop(columns = ["user_input"])

# add decoded data as a column to lead dataz
waveforms = list(lead_data['waveform_data'])
lead_data['decoded_waveform'] = [to_array(i) for i in waveforms]

# merge waveform data and lead data
waveform_lead = lead_data.merge(waveform_data, how = "left", left_on = "waveform_id", right_on = "waveform_id", suffixes = (None, None))

#  sort by exam id and lead id
waveform_lead.sort_values(by = ["waveform_id", "lead_id"], inplace = True)

waveform_lead.loc[:, ['exam_id', 'lead_id', 'decoded_waveform', 'waveform_type']]


# adding the diagnosis and labels
waveform_and_diag = pd.merge(waveform_lead[['exam_id', 'lead_id', 'decoded_waveform', 'waveform_type']], diagnosis_data[["exam_id", "Full_text", "Original_Diag"]], left_on= "exam_id", right_on="exam_id")


In [4]:
# concatenate all leads into a single array
waveform_lead_concat = waveform_lead.groupby(["exam_id", "waveform_type"])['decoded_waveform'].apply(lambda x: tuple(x)).reset_index()

# remove irregular observations, concat tuple into numpy array
waveform_lead_concat = waveform_lead_concat.drop([12,17], axis = 0)
waveform_lead_concat['decoded_waveform'] = waveform_lead_concat['decoded_waveform'].apply(lambda x: MinMaxScaler().fit_transform(np.vstack(x)))
waveform_lead_rhythm = waveform_lead_concat[waveform_lead_concat['waveform_type'] == "Rhythm"]
waveform_lead_median = waveform_lead_concat[waveform_lead_concat['waveform_type'] == "Median"]

waveform_lead_rhythm['decoded_waveform'][1].shape

(8, 2500)

In [5]:
# Adding the labels/sentences
exams = diagnosis_data["exam_id"].unique()

# Let's look over this tomorrow
diagnosis_data = diagnosis_data[diagnosis_data['Original_Diag'] == 1].dropna()
searchfor = ['previous', 'unconfirmed', 'compared', 'interpretation', 'significant']
diagnosis_data = diagnosis_data.loc[diagnosis_data['Full_text'].str.contains('|'.join(searchfor)) != 1]
#

diagnosis_data.sort_values(by=["exam_id", "statement_order"], inplace=True)
diagnoses = []
curr_id = 0
curr_string = ""
for i, row in diagnosis_data.iterrows():
    if row["statement_order"] == 1 and curr_string != "":
        curr_string = curr_string.lower().translate(str.maketrans('', '', string.punctuation))
        val = [curr_id, curr_string[1:]]
        diagnoses.append(val)
        curr_string = ""
        curr_id = row["exam_id"]

    if curr_id == 0:
        curr_id = row["exam_id"]
    
    curr_string += " " + row["Full_text"]

diagnosis_df = pd.DataFrame(diagnoses, columns = ['exam_id', 'diagnosis'])
waveform_lead_rhythm_diag = pd.merge(left=waveform_lead_rhythm, right=diagnosis_df, left_on='exam_id', right_on='exam_id')

#waveform_lead_rhythm_diag
waveform_lead_rhythm_diag

Unnamed: 0,exam_id,waveform_type,decoded_waveform,diagnosis
0,548759,Rhythm,"[[0.42857142857142855, 0.45454545454545453, 0....",normal sinus rhythm low voltage qrs borderline...
1,549871,Rhythm,"[[0.75, 0.7391304347826086, 0.7272727272727272...",sinus bradycardia otherwise normal ecg
2,550602,Rhythm,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",sinus tachycardia otherwise normal ecg
3,551485,Rhythm,"[[0.5, 0.5079365079365079, 0.5161290322580645,...",normal sinus rhythm normal ecg
4,552077,Rhythm,"[[0.5333333333333332, 0.53125, 0.5714285714285...",normal sinus rhythm normal ecg
5,552856,Rhythm,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",normal sinus rhythm with sinus arrhythmia mini...
6,553115,Rhythm,"[[0.0, 0.2, 0.375, 0.375, 0.375, 0.1875, 0.0, ...",atrial fibrillation abnormal ecg normal sinus ...


In [5]:
unique_words = set()
for num, sentence in diagnoses:
    for word in sentence.split():
        unique_words.add(word)
print(unique_words)

{'criteria', 'borderline', 'variant', 'otherwise', 'fibrillation', 'lvh', 'ischemia', 'abnormal', 'abnormality', 'tachycardia', 'normal', 'wave', 'consider', 'inferior', 'atrial', 'voltage', 'for', 'with', 'may', 'ecg', 'low', 'bradycardia', 'qrs', 'be', 'rhythm', 'sinus', 't', 'arrhythmia', 'minimal'}


In [6]:
# split data into training and testing datasets
# y not included for now
def one_hot(x, dict_words, max_length):
    x = x.split(" ")
    array = []
    for i in x:
        array.append(dict_words.index(i))
    while(len(array) < max_length):
        array.append(29)
    return array

dict_words = list(unique_words)
dict_words.append([" "])
print(len(dict_words))
Y = waveform_lead_rhythm_diag['diagnosis'].apply(lambda x: one_hot(x, dict_words, 20))

train_x, test_x, train_y, test_y = train_test_split(waveform_lead_rhythm_diag['decoded_waveform'], Y, test_size = 0.1, random_state = 2021)
train_x = torch.tensor(list(train_x)).float()
train_y = torch.tensor(list(train_y))

test_x = torch.tensor(list(test_x)).float()
test_y = torch.tensor(list(test_y))

train_y

30


tensor([[10, 25, 24, 10, 19, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
         29, 29],
        [14,  4,  7, 19, 10, 25, 24, 17, 25, 27, 10, 19, 29, 29, 29, 29, 29, 29,
         29, 29],
        [10, 25, 24, 20, 15, 22,  1, 19, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
         29, 29],
        [25, 21,  3, 10, 19, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
         29, 29],
        [10, 25, 24, 17, 25, 27, 28, 15,  0, 16,  5, 18, 23, 10,  2,  1, 19, 29,
         29, 29],
        [10, 25, 24, 10, 19, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
         29, 29]])

## Model 1 - Conv1D Encoder w/ LSTM Decoder

In [7]:
# HYPERPARAMETERS
J = 10 # max number of filters per class
LR = 1e-3

# define global max pooling
class global_max_pooling_1d(nn.Module):
    def __init__(self):
        super().__init__()
    
    def forward(self, x):
        x, _ = torch.max(x, dim = 2)
        return(x)

# define resblock for neural nets
class ResBlock1D(nn.Module):
    def __init__(self, num_filters, kernel_size, padding, groups = 1, stride = 1):
        super(ResBlock1D, self).__init__()
        self.act = nn.ReLU()
        self.conv1d_1 = nn.Conv1d(num_filters, num_filters, kernel_size = kernel_size, padding = padding, groups = groups, stride = 1)
        self.conv1d_2 = nn.Conv1d(num_filters, num_filters, kernel_size = kernel_size, padding = padding, groups = groups, stride = 1)
        self.batch_norm_1 = nn.BatchNorm1d(num_filters)
        self.batch_norm_2 = nn.BatchNorm1d(num_filters)

    def forward(self, x):
        res = x
        x = self.batch_norm_1(self.act(self.conv1d_1(x)))
        x = self.batch_norm_2(self.act(self.conv1d_2(x)))
        return x + res

conv_model = nn.Sequential()
init_channels = 8
for i in range(5):
    next_channels = 2 * init_channels
    conv_model.add_module('conv_{num}'.format(num = i), nn.Conv1d(in_channels = init_channels, out_channels = next_channels, kernel_size = 249, padding = 124, stride = 1))
    conv_model.add_module('act_{num}'.format(num = i), nn.ReLU())
    conv_model.add_module('batch_norm_{num}'.format(num = i), nn.BatchNorm1d(next_channels))
    conv_model.add_module('res_{num}'.format(num = i), ResBlock1D(num_filters = next_channels, kernel_size = 249, padding = 124))
    conv_model.add_module('act_res_{num}'.format(num = i), nn.ReLU())
    init_channels = next_channels
conv_model.add_module('conv_fin', nn.Conv1d(in_channels = init_channels, out_channels = 8, kernel_size = 249, padding = 124))
conv_model.add_module('act_fin', nn.ReLU())
conv_model.add_module('batch_fin', nn.BatchNorm1d(8))
print(conv_model)
print(conv_model(train_x).shape)

Sequential(
  (conv_0): Conv1d(8, 16, kernel_size=(249,), stride=(1,), padding=(124,))
  (act_0): ReLU()
  (batch_norm_0): BatchNorm1d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (res_0): ResBlock1D(
    (act): ReLU()
    (conv1d_1): Conv1d(16, 16, kernel_size=(249,), stride=(1,), padding=(124,))
    (conv1d_2): Conv1d(16, 16, kernel_size=(249,), stride=(1,), padding=(124,))
    (batch_norm_1): BatchNorm1d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (batch_norm_2): BatchNorm1d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (act_res_0): ReLU()
  (conv_1): Conv1d(16, 32, kernel_size=(249,), stride=(1,), padding=(124,))
  (act_1): ReLU()
  (batch_norm_1): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (res_1): ResBlock1D(
    (act): ReLU()
    (conv1d_1): Conv1d(32, 32, kernel_size=(249,), stride=(1,), padding=(124,))
    (conv1d_2): Conv1d(32, 32, kernel_size=(249,), stride

In [31]:
# HYPERPARAMETERS
J = 4 # max number of filters per class
LR = 1e-3

# define global max pooling
class global_max_pooling_1d(nn.Module):
    def __init__(self):
        super().__init__()
    
    def forward(self, x):
        x, _ = torch.max(x, dim = 2)
        return(x)

# 1D grouped encoder model
encoder_conv = nn.Sequential()
encoder_conv.add_module('initial_norm', nn.BatchNorm1d(8))
encoder_conv.add_module('conv_1', nn.Conv1d(in_channels = 8, out_channels = 8, kernel_size = 5, padding = 4, stride = 1))
for i in range(2, (J+2), 2):
    if (i-2) == 0: 
        prev = 8
    else:
        prev = (i-2)*8
    encoder_conv.add_module('conv_{num}'.format(num = int(i / 2 + 1)), nn.Conv1d(in_channels = prev, out_channels = i*8, kernel_size = 5, padding = 2, stride = 2))
    encoder_conv.add_module('activation_{num}'.format(num = int(i / 2 + 1)), nn.ELU())
    encoder_conv.add_module('batch_norm_{num}'.format(num = int(i / 2 + 1)), nn.BatchNorm1d(i*8))
    
encoder_conv.add_module('final_conv', nn.Conv1d(in_channels = J * 8, out_channels = 8, kernel_size = 5, padding = 2))
encoder_conv.add_module('max_pool', nn.MaxPool1d(kernel_size = 5, padding = 2, stride = 1))
#encoder_conv.add_module('reshape', nn.MaxPool1d(kernel_size = 5, padding = 2, stride = 1))

# summarize model, verify output is of desired shape
print(encoder_conv)
print(encoder_conv(train_x).shape)

Sequential(
  (initial_norm): BatchNorm1d(8, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv_1): Conv1d(8, 8, kernel_size=(5,), stride=(1,), padding=(4,))
  (conv_2): Conv1d(8, 16, kernel_size=(5,), stride=(2,), padding=(2,))
  (activation_2): ELU(alpha=1.0)
  (batch_norm_2): BatchNorm1d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv_3): Conv1d(16, 32, kernel_size=(5,), stride=(2,), padding=(2,))
  (activation_3): ELU(alpha=1.0)
  (batch_norm_3): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (final_conv): Conv1d(32, 8, kernel_size=(5,), stride=(1,), padding=(2,))
  (max_pool): MaxPool1d(kernel_size=5, stride=1, padding=2, dilation=1, ceil_mode=False)
)
torch.Size([6, 8, 626])


## Model 2 - LSTM Encoder w/ Huggingface Decoder

In [17]:
# define hyperparameters 
hidden_layers = 250
embedding_dim = 8
num_words = len(dict_words)

class LSTM_EncoderDecoder(nn.Module):
    def __init__(self, h_dim, e_dim, word_list_length):
        super(ECG_LSTM, self).__init__()
        self.lstm = nn.LSTM(e_dim, h_dim, num_layers = 4, bidirectional = True)
        
    def forward(self, seq):
        seq_embedded = seq.view(len(seq), -1, embedding_dim)
        final_hidd, _ = self.lstm(seq_embedded)
        dec_seq = self.linear(final_hidd)
        return F.log_softmax(dec_seq, dim = 1)
    

In [80]:
epoch = 1000
loss_fn = nn.NLLLoss()
optimizer = torch.optim.Adam(lstm_mod.parameters(), lr = 1e-3)
torch.autograd.set_detect_anomaly(True)

for i in range(epoch):
    for j, k in zip(train_x, train_y):
        optimizer.zero_grad()
        outputs = lstm_mod(j.unsqueeze(0)).squeeze(0)
        loss = loss_fn(outputs, k)
        loss.backward(retain_graph=True)
        optimizer.step()


KeyboardInterrupt: 

In [81]:
#torch.save(lstm_mod.state_dict(), 'model/lstm.pt')
lstm_mod = ECG_LSTM(encoder_conv, hidden_layers, embedding_dim, num_words)
lstm_mod.load_state_dict(torch.load('model/lstm.pt'))

out = lstm_mod(train_x[5].unsqueeze(0))
print(out.squeeze(0).detach().numpy().shape)
out = np.argmax(out.squeeze(0).detach().numpy(), axis = 1)
print(out)
print(train_y[5])

(20, 30)
[21 11 10 21  5 26 23 25 22  6 29 25 27 26  4 26  4 26 26  7]
tensor([21, 11, 10, 21,  5, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
        29, 29])


## Model 3 - Basic Transformer Architecture with Multi-Head Attention

## Model 4 - FNET Transformer Architecture

In [132]:
class FeedForwardNet(nn.Module):
    def __init__(self, features, expansion, dropout):
        super(FeedForwardNet, self).__init__()
        self.linear_1 = nn.Linear(features, features * expansion)
        self.linear_2 = nn.Linear(features * expansion, features)
        self.dropout_1 = nn.Dropout(dropout)
        #self.dropout_2 = nn.Dropout(dropout)
        self.norm_1 = nn.LayerNorm(features)

    def forward(self, x):
        res = x
        x = F.relu(self.linear_1(x))
        x = self.dropout_1(x)
        x = self.linear_2(x)
        x = self.norm_1(x + res)
        return x
    
class PositionalEncoding(nn.Module):
    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """
        Args:
            x: Tensor, shape [seq_len, batch_size, embedding_dim]
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)
    
class FNETLayer(nn.Module):
    def __init__(self, features, expansion, dropout):
        super(FNETLayer, self).__init__()
        self.feed_forward = FeedForwardNet(features, expansion, dropout)
        self.norm_1 = nn.LayerNorm(features)
    
    def forward(self, x):
        res = x
        x = torch.fft.fftn(x, dim = (-1, -2)).real
        x = self.norm_1(x + res)
        x = self.feed_forward(x)
        return x
    
class FNETEncoder(nn.TransformerEncoder):
    def __init__(self, features, expansion=2, dropout=0.5, num_layers=6):
        encoder_layer = FNETLayer(features, expansion, dropout)
        super().__init__(encoder_layer=encoder_layer, num_layers=num_layers)

    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        return x

class FNETModel(nn.Module):
    def __init__(self, expansion, dropout, d_model, num_layers, decoder):
        super(FNETModel, self).__init__()
        self.decoder = decoder
        self.pos_enb = PositionalEncoding(d_model = d_model)
        self.encoder = FNETEncoder(features = d_model, expansion = expansion, dropout = dropout, num_layers = num_layers)
        
    
    def forward(self, x):
        x = self.pos_enb(x)
        x = self.encoder(x)
        out = self.decoder(x)
        return out    

class Transpose(nn.Module):
    def __init__(self):
        super().__init__()
    def forward(self, x):
        return x.transpose(1, 2)

decoder = nn.Sequential(Transpose(), 
                        nn.Linear(8, 15),
                        nn.ReLU(), 
                        nn.Linear(15, 20), 
                        nn.ReLU(), 
                        Transpose(), 
                        nn.Linear(2500, 2000), 
                        nn.ReLU(), 
                        nn.Linear(2000, 1500), 
                        nn.ReLU(), 
                        nn.Linear(1500, 1000), 
                        nn.ReLU(), 
                        nn.Linear(1000, 500), 
                        nn.ReLU(), 
                        nn.Linear(500, 100), 
                        nn.ReLU(), 
                        nn.Linear(100, 30))

In [135]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
epoch = 300
#ECG_LSTM(hidden_layers, embedding_dim, num_words)
model = FNETModel(expansion = 2, dropout = 0, d_model = train_x.shape[2], num_layers = 6, decoder = decoder)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr = 1e-3)
torch.autograd.set_detect_anomaly(True)

for i in range(epoch):
    losses = 0
    for j, k in zip(train_x, train_y):
        optimizer.zero_grad()
        outputs = model(j.unsqueeze(0)).squeeze(0)
        loss = loss_fn(outputs, k)
        losses += loss
    losses.backward(retain_graph=True)
    optimizer.step()
    print(losses)


tensor(12.1033, grad_fn=<AddBackward0>)
tensor(13.5739, grad_fn=<AddBackward0>)
tensor(21.0495, grad_fn=<AddBackward0>)
tensor(15.1338, grad_fn=<AddBackward0>)
tensor(10.9025, grad_fn=<AddBackward0>)
tensor(9.9206, grad_fn=<AddBackward0>)
tensor(9.7928, grad_fn=<AddBackward0>)
tensor(9.9478, grad_fn=<AddBackward0>)
tensor(9.3364, grad_fn=<AddBackward0>)
tensor(8.5085, grad_fn=<AddBackward0>)
tensor(8.4253, grad_fn=<AddBackward0>)
tensor(8.1964, grad_fn=<AddBackward0>)
tensor(8.8776, grad_fn=<AddBackward0>)
tensor(8.2562, grad_fn=<AddBackward0>)
tensor(7.9390, grad_fn=<AddBackward0>)
tensor(7.6749, grad_fn=<AddBackward0>)
tensor(7.3612, grad_fn=<AddBackward0>)
tensor(7.3261, grad_fn=<AddBackward0>)
tensor(7.1017, grad_fn=<AddBackward0>)
tensor(6.7182, grad_fn=<AddBackward0>)
tensor(6.6983, grad_fn=<AddBackward0>)
tensor(7.1226, grad_fn=<AddBackward0>)
tensor(7.0148, grad_fn=<AddBackward0>)
tensor(6.7546, grad_fn=<AddBackward0>)
tensor(6.6756, grad_fn=<AddBackward0>)
tensor(6.7715, grad_

tensor(0.1394, grad_fn=<AddBackward0>)
tensor(0.0896, grad_fn=<AddBackward0>)
tensor(0.0246, grad_fn=<AddBackward0>)
tensor(0.4736, grad_fn=<AddBackward0>)
tensor(0.1304, grad_fn=<AddBackward0>)
tensor(0.1634, grad_fn=<AddBackward0>)
tensor(0.1823, grad_fn=<AddBackward0>)
tensor(0.7106, grad_fn=<AddBackward0>)
tensor(0.0537, grad_fn=<AddBackward0>)
tensor(0.0415, grad_fn=<AddBackward0>)
tensor(1.8078, grad_fn=<AddBackward0>)
tensor(0.9299, grad_fn=<AddBackward0>)
tensor(0.3593, grad_fn=<AddBackward0>)
tensor(0.2644, grad_fn=<AddBackward0>)
tensor(0.3232, grad_fn=<AddBackward0>)
tensor(0.2660, grad_fn=<AddBackward0>)
tensor(0.8886, grad_fn=<AddBackward0>)
tensor(1.7445, grad_fn=<AddBackward0>)
tensor(0.1726, grad_fn=<AddBackward0>)
tensor(0.4069, grad_fn=<AddBackward0>)
tensor(0.2657, grad_fn=<AddBackward0>)
tensor(3.5218, grad_fn=<AddBackward0>)
tensor(1.5644, grad_fn=<AddBackward0>)
tensor(0.7932, grad_fn=<AddBackward0>)
tensor(0.3712, grad_fn=<AddBackward0>)
tensor(0.9135, grad_fn=<A

In [136]:
torch.save(model.state_dict(), 'model/fnet_2.pt')
model = FNETModel(expansion = 2, dropout = 0, d_model = train_x.shape[2], num_layers = 6, decoder = decoder)
model.load_state_dict(torch.load('model/fnet_2.pt'))
print(model)
out = (model(train_x[3].unsqueeze(0)))
print(out.squeeze(0).detach().numpy().shape)
out = np.argmax(out.squeeze(0).detach().numpy(), axis = 1)
print(out)
print(train_y[3])

FNETModel(
  (decoder): Sequential(
    (0): Transpose()
    (1): Linear(in_features=8, out_features=15, bias=True)
    (2): ELU(alpha=1.0)
    (3): Linear(in_features=15, out_features=20, bias=True)
    (4): ELU(alpha=1.0)
    (5): Transpose()
    (6): Linear(in_features=2500, out_features=2000, bias=True)
    (7): ELU(alpha=1.0)
    (8): Linear(in_features=2000, out_features=1500, bias=True)
    (9): ELU(alpha=1.0)
    (10): Linear(in_features=1500, out_features=1000, bias=True)
    (11): ELU(alpha=1.0)
    (12): Linear(in_features=1000, out_features=500, bias=True)
    (13): ELU(alpha=1.0)
    (14): Linear(in_features=500, out_features=100, bias=True)
    (15): ELU(alpha=1.0)
    (16): Linear(in_features=100, out_features=30, bias=True)
  )
  (pos_enb): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): FNETEncoder(
    (layers): ModuleList(
      (0): FNETLayer(
        (feed_forward): FeedForwardNet(
          (linear_1): Linear(in_features=2500, out

In [44]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2')

#encoded_inputs = tokenizer(list(waveform_lead_rhythm_diag['diagnosis'])[0], return_tensors='pt')

encoded_inputs = tokenizer('Hello my name is Daniel', return_tensors = 'pt')
print(model)

GPT2Model(
  (wte): Embedding(50257, 768)
  (wpe): Embedding(1024, 768)
  (drop): Dropout(p=0.1, inplace=False)
  (h): ModuleList(
    (0): GPT2Block(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): GPT2Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): GPT2MLP(
        (c_fc): Conv1D()
        (c_proj): Conv1D()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (1): GPT2Block(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): GPT2Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): GPT2MLP

## Model 5 - FNET/Basic Mixup Architecture 

In [2]:
from transformers import BertTokenizer, BertForNextSentencePrediction
import torch

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')

prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
next_sentence = "The sky is blue due to the shorter wavelength of blue light."
encoding = tokenizer(prompt, next_sentence, return_tensors='pt')

outputs = model(**encoding, labels=torch.LongTensor([1]))
logits = outputs.logits
#assert logits[0, 0] < logits[0, 1] # next sentence was random
print(logits)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForNextSentencePrediction: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForNextSentencePrediction from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForNextSentencePrediction from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tensor([[-3.0729,  5.9056]], grad_fn=<AddmmBackward>)


In [50]:
epochs = 100

# define tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# preprocess training labels and tokenize
train_labels = list(waveform_lead_rhythm_diag['diagnosis'])
inputs = tokenizer(train_labels, padding = True, pad_token = tokenizer.add_special_tokens({'pad_token': '[PAD]'}), verbose = False, return_tensors="pt")

# adjust model parameters to account for padding token
model.resize_token_embeddings(len(tokenizer))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
optimizer = torch.optim.Adam(model.parameters(), lr = 1e-3)
torch.autograd.set_detect_anomaly(True)

for i in range(epochs):
#model_gpt2DoubleHeadsModel.resize_token_embeddings(len(gpt2_tokenizer))
    optimizer.zero_grad()
    outputs = model(**inputs, labels = inputs["input_ids"])
    loss = outputs.loss
    loss.backwards()
    optimizer.step()
    
    print(loss)
    
logits = model.logits
print(np.argmax(logits[0].detach().numpy(), axis = 1))


Keyword arguments {'pad_token': 1} not recognized.
Keyword arguments {'pad_token': 1} not recognized.
Keyword arguments {'pad_token': 1} not recognized.
Keyword arguments {'pad_token': 1} not recognized.
Keyword arguments {'pad_token': 1} not recognized.
Keyword arguments {'pad_token': 1} not recognized.
Keyword arguments {'pad_token': 1} not recognized.


AttributeError: 'Tensor' object has no attribute 'backwards'

1