In [1]:
import sys
sys.executable

'/Users/b/bio/cardiac/model1/new_env/bin/python'

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np
import pandas as pd
import spacy
import datasets
from datasets import Dataset
import tqdm
import evaluate
from collections import Counter, defaultdict
import pyabf
import ast
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

In [3]:
from google.colab import drive
drive.mount('/content/drive')

ModuleNotFoundError: No module named 'google'

In [29]:
df = pd.read_csv('content/drive/MyDrive/' + 'phases.csv')

In [30]:
len(df) / 5

27.0

In [31]:
def get_phase_data(df, index):
    phase_x = ast.literal_eval(df['X Values'][index])
    phase_y = ast.literal_eval(df['Y Values'][index])
    return list(zip(phase_x, phase_y))
    

In [32]:
def flatten_tuple_list(tuple_list):
    return [item for tup in tuple_list for item in tup]

In [33]:
def round_list_numbers(numbers_list, decimals=4):
    return [round(num, decimals) for num in numbers_list]

In [34]:
def process_sweep_data(data):
    return round_list_numbers(flatten_tuple_list(data))

In [35]:
data = []
for i in range(0, len(df), 5):
    file = df['File Path'][i]
    sweepNum = df["Sweep Number"][i]

    abf = pyabf.ABF('content/drive/MyDrive' + file)
    abf.setSweep(sweepNum)

    totalSweep = list(zip(abf.sweepX, abf.sweepY))
    edges = [get_phase_data(df, i+j)[0] for j in range(4)]
    edges.append(get_phase_data(df, i+4)[-1])

    data.append({
        "input": process_sweep_data(totalSweep),
        "output": process_sweep_data(edges)
    })


In [36]:
len(data)

27

In [37]:
def split_dataset(data, train_ratio=0.8, val_ratio=0.1, test_ratio=0.1):
    assert sum([train_ratio, val_ratio, test_ratio])==1.0, "ratios must sum to 1.0"

    total_length = len(data)
    train_index = int(total_length*train_ratio)
    val_index = int(total_length*val_ratio) + train_index

    return data[:train_index], data[train_index:val_index], data[val_index:]
    

In [76]:
train_data, valid_data, test_data = split_dataset(data)

print(len(train_data))
print(len(valid_data))
print(len(test_data))

21
2
4


In [79]:
print(train_data[0]["output"])
print(test_data[0]["output"])

[0.025, -82.1228, 0.035, 54.0161, 0.1, 37.2009, 0.3, 9.613, 1.9999, -82.5195]
[0.02, -81.1462, 0.038, 41.5649, 0.05, 24.6582, 0.12, -1.8005, 2.0, -81.0242]


In [80]:
def tokenize_example(example, sos_token, eos_token):
    input_tokens = [sos_token] + [str(num) for num in example["input"]] + [eos_token]
    output_tokens = [sos_token] + [str(num) for num in example["output"]] + [eos_token]
    return {"input_tokens": input_tokens, "output_tokens":output_tokens}



In [81]:
sos_token = "<sos>"
eos_token = "<eos>"

fn_kwargs = {
    "sos_token": sos_token,
    "eos_token": eos_token
}

In [82]:
train_data = Dataset.from_list(train_data)
valid_data = Dataset.from_list(valid_data)
test_data = Dataset.from_list(test_data)

In [83]:
train_data = train_data.map(lambda x: tokenize_example(x, **fn_kwargs))
valid_data = valid_data.map(lambda x: tokenize_example(x, **fn_kwargs))
test_data = test_data.map(lambda x: tokenize_example(x, **fn_kwargs))

Map:   0%|          | 0/21 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

In [84]:
train_data[0]['output_tokens']

['<sos>',
 '0.025',
 '-82.1228',
 '0.035',
 '54.0161',
 '0.1',
 '37.2009',
 '0.3',
 '9.613',
 '1.9999',
 '-82.5195',
 '<eos>']

In [85]:
print(test_data[0]["output"])

[0.02, -81.1462, 0.038, 41.5649, 0.05, 24.6582, 0.12, -1.8005, 2.0, -81.0242]


In [86]:
class Vocab:
    def __init__(self, token_to_index, unk_token="<unk>"):
        self.token_to_index = token_to_index
        self.index_to_token = {idx:token for token,idx in token_to_index.items()}
        self.unk_token = unk_token
        self.unk_index = token_to_index[unk_token]

    def __len__(self):
        return len(self.token_to_index)

    def __getitem__(self, token):
        return self.token_to_index.get(token, self.unk_index)

    def token_to_idx(self, token):
        return self.__getitem__(token)

    def idx_to_token(self, index):
        return self.index_to_token.get(index, self.unk_token)

    def set_default_index(self, index):
        if index==self.unk_index:
            return

        current_token = self.index_to_token.get(index, None)

        if token in self.token_to_index and current_token:
            self.token_to_index[token], self.token_to_index[current_token] = index, self.unk_token

        if index in self.index_to_token and self.unk_index in self.index_to_token:
            self.index_to_token[index], self.index_to_token[self.unk_index] = self.index_to_token[self.unk_index], self.index_to_token[index]

        self.unk_index = index

    def get_stoi(self):
        return self.token_to_index

    def get_itos(self):
        return self.index_to_token

    def lookup_indices(self, tokens):
        return [self.token_to_idx(token) for token in tokens]

    def lookup_tokens(self, indices):
        if torch.is_tensor(indices):
            indices = indices.tolist()

        return [self.idx_to_token(index) for index in indices]

In [87]:
def build_vocab_from_iterator(iterator, min_freq=1, specials=None):
    counter = Counter(token for tokens in iterator for token in tokens)

    token_to_index = {token:idx for idx, token in enumerate(specials or [])}

    for token, freq in counter.items():
        if freq >= min_freq and token not in token_to_index:
            token_to_index[token] = len(token_to_index)

    unk_token = specials[0] if specials else "<unk>"
    token_to_index.setdefault(unk_token, len(token_to_index))

    return Vocab(token_to_index, unk_token=unk_token)


tokens = [['<sos>',
  '0.02',
  '-79.40674',
  '0.036',
  '42.51099',
  '0.08',
  '22.33887',
  '0.18',
  '-1.58691',
  '0.25',
  '-77.63672',
  '1.99995',
  '-79.49829',
  '<eos>']]
min_freq = 1
specials = ["<unk>", "<pad>", "<sos>", "<eos>"]
vocab = build_vocab_from_iterator(tokens, min_freq, specials)
print(vocab.get_stoi())
print(vocab.idx_to_token(6))




{'<unk>': 0, '<pad>': 1, '<sos>': 2, '<eos>': 3, '0.02': 4, '-79.40674': 5, '0.036': 6, '42.51099': 7, '0.08': 8, '22.33887': 9, '0.18': 10, '-1.58691': 11, '0.25': 12, '-77.63672': 13, '1.99995': 14, '-79.49829': 15}
0.036


In [88]:
min_freq = 1
unk_token = "<unk>"
pad_token = "<pad>"

specials = [
    unk_token,
    pad_token,
    sos_token,
    eos_token,
]

input_vocab = build_vocab_from_iterator(
    train_data["input_tokens"],
    min_freq=min_freq,
    specials=specials,
)

output_vocab = build_vocab_from_iterator(
    train_data["output_tokens"],
    min_freq=min_freq,
    specials=specials,
)


In [89]:
print(len(input_vocab))
print(len(output_vocab))

24762
133


In [90]:
assert input_vocab[unk_token]==output_vocab[unk_token]
assert input_vocab[pad_token]==output_vocab[pad_token]

unk_index = input_vocab[unk_token]
pad_index = input_vocab[pad_token]

In [91]:
input_vocab.set_default_index(unk_index)
output_vocab.set_default_index(unk_index)

In [92]:
def numericalize_example(example, input_vocab, output_vocab):
    input_ids = input_vocab.lookup_indices(example["input_tokens"])
    output_ids = output_vocab.lookup_indices(example["output_tokens"])
    return {"input_ids": input_ids, "output_ids": output_ids}

In [93]:
fn_kwargs = {"input_vocab": input_vocab, "output_vocab": output_vocab}

train_data = train_data.map(numericalize_example, fn_kwargs=fn_kwargs)
valid_data = valid_data.map(numericalize_example, fn_kwargs=fn_kwargs)
test_data = test_data.map(numericalize_example, fn_kwargs=fn_kwargs)

Map:   0%|          | 0/21 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

In [94]:
data_type = "torch"
format_columns = ["input_ids", "output_ids"]

train_data = train_data.with_format(
    type=data_type, columns=format_columns, output_all_columns=True
)

valid_data = valid_data.with_format(
    type=data_type, columns=format_columns, output_all_columns=True
)

test_data = test_data.with_format(
    type=data_type, columns=format_columns, output_all_columns=True
)

In [95]:
print(test_data[0]["output"])

[0.02, -81.1462, 0.038, 41.5649, 0.05, 24.6582, 0.12, -1.8005, 2.0, -81.0242]


In [96]:
def get_collate_fn(pad_index):
    def collate_fn(batch):
        batch_input_ids = [example["input_ids"] for example in batch]
        batch_output_ids = [example["output_ids"] for example in batch]
        batch_input_ids = nn.utils.rnn.pad_sequence(batch_input_ids, padding_value=pad_index)
        batch_output_ids = nn.utils.rnn.pad_sequence(batch_output_ids, padding_value=pad_index)
        batch = {
            "input_ids": batch_input_ids,
            "output_ids": batch_output_ids
        }
        return batch
    return collate_fn



In [97]:
def get_data_loader(dataset, batch_size, pad_index, shuffle=False):
    collate_fn = get_collate_fn(pad_index)
    data_loader = torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        collate_fn=collate_fn,
        shuffle=shuffle,
    )
    return data_loader

In [98]:
batch_size = 5

train_data_loader = get_data_loader(train_data, batch_size, pad_index, shuffle=True)
valid_data_loader = get_data_loader(valid_data, batch_size, pad_index)
test_data_loader = get_data_loader(test_data, batch_size, pad_index)

In [115]:
class Encoder(nn.Module):
    def __init__(
        self, input_dim, embedding_dim, encoder_hidden_dim, decoder_hidden_dim, dropout
    ):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.GRU(embedding_dim, encoder_hidden_dim, bidirectional=True)
        self.fc = nn.Linear(encoder_hidden_dim * 2, decoder_hidden_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        embedded = self.dropout(self.embedding(src))
        outputs, hidden = self.rnn(embedded)
        hidden = torch.tanh(
            self.fc(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1))
        )
        return outputs, hidden


In [129]:
class Attention(nn.Module):
    def __init__(self, encoder_hidden_dim, decoder_hidden_dim):
        super().__init__()
        self.attn_fc = nn.Linear(
            (encoder_hidden_dim * 2) + decoder_hidden_dim, decoder_hidden_dim
        )
        self.v_fc = nn.Linear(decoder_hidden_dim, 1, bias=False)

    def forward(self, hidden, encoder_outputs):
        # hidden = [batch_size, decoder_hidden_dim]
        # encoder_hidden_dim = [src length, batch size, encoder hidden dim * 2]
        src_length = encoder_outputs.shape[0]
        hidden = hidden.unsqueeze(1).repeat(1, src_length, 1)
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        energy = torch.tanh(self.attn_fc(torch.cat((hidden, encoder_outputs), dim=2)))
        attention = self.v_fc(energy).squeeze(2)
        return torch.softmax(attention, dim=1)



In [171]:
class Decoder(nn.Module):
    def __init__(
        self,
        output_dim,
        embedding_dim,
        encoder_hidden_dim,
        decoder_hidden_dim,
        dropout,
        attention,
    ):
        super().__init__()
        self.output_dim = output_dim
        self.attention = attention
        self.embedding = nn.Embedding(output_dim, embedding_dim)
        self.rnn = nn.GRU((encoder_hidden_dim * 2) + embedding_dim, decoder_hidden_dim)
        self.fc_out = nn.Linear(
            (encoder_hidden_dim * 2) + decoder_hidden_dim + embedding_dim, output_dim
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, encoder_outputs):
        input = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(input))

        a = self.attention(hidden, encoder_outputs)
        a = a.unsqueeze(1)
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        weighted = torch.bmm(a, encoder_outputs)
        weighted = weighted.permute(1, 0, 2)

        rnn_inputs = torch.cat((embedded, weighted), dim=2)
        outputs, hidden = self.rnn(rnn_inputs, hidden.unsqueeze(0))

        outputs = outputs.squeeze(0)
        weighted = weighted.squeeze(0)
        embedded = embedded.squeeze(0)
        predictions = self.fc_out(torch.cat((outputs, weighted, embedded), dim=1))
        return predictions, hidden.squeeze(0), a.squeeze(1)

        

        
        



    


In [172]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio):
        batch_size = src.shape[1]
        trg_length = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        outputs = torch.zeros(trg_length, batch_size, trg_vocab_size).to(self.device)
        encoder_outputs, hidden = self.encoder(src)
        input = trg[0, :]
        for t in range(1, trg_length):
            output, hidden, _ = self.decoder(input, hidden, encoder_outputs)
            outputs[t] = output
            teacher_forcing = random.random() < teacher_forcing_ratio
            top1 = output.argmax(1)
            input = trg[t] if teacher_force else top1
        return outputs

In [173]:
input_dim = len(input_vocab)
output_dim = len(output_vocab)
encoder_embedding_dim = 64
decoder_embedding_dim = 64
encoder_hidden_dim = 128
decoder_hidden_dim = 128
encoder_dropout = 0.5
decoder_dropout = 0.5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

attention = Attention(encoder_hidden_dim, decoder_hidden_dim)

encoder = Encoder(
    input_dim,
    encoder_embedding_dim,
    encoder_hidden_dim,
    decoder_hidden_dim,
    encoder_dropout,
)

decoder = Decoder(
    output_dim,
    decoder_embedding_dim,
    encoder_hidden_dim,
    decoder_hidden_dim,
    decoder_dropout,
    attention
)

model = Seq2Seq(encoder, decoder, device).to(device)

In [174]:
def init_weights(m):
    for name, param in m.named_parameters():
        if "weight" in name:
            nn.init.normal_(param.data, mean=0, std=0.01)
        else:
            nn.init.constant_(param.data, 0)

model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(24762, 256)
    (rnn): GRU(256, 512, bidirectional=True)
    (fc): Linear(in_features=1024, out_features=512, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (attention): Attention(
      (attn_fc): Linear(in_features=1536, out_features=512, bias=True)
      (v_fc): Linear(in_features=512, out_features=1, bias=False)
    )
    (embedding): Embedding(133, 256)
    (rnn): GRU(1280, 512)
    (fc_out): Linear(in_features=1792, out_features=512, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [175]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters())

print(f"The model has {count_parameters(model):,} trainable parameters.")

The model has 13,724,416 trainable parameters.


In [176]:
optimizer = optim.Adam(model.parameters())

In [177]:
criterion = nn.CrossEntropyLoss(ignore_index=pad_index)

In [178]:
def train_fn(
    model, data_loader, optimizer, criterion, clip, teacher_forcing_ratio, device
):
    model.train()
    epoch_loss = 0

    for i, batch in enumerate(data_loader):
        src = batch["input_ids"].to(device)
        trg = batch["output_ids"].to(device)

        output = model(src, trg, teacher_forcing_ratio)
        
        output_dim = output.shape[-1]
        output = output[1:].view(-1, output_dim)
        trg = trg[1:].view(-1)

        loss = criterion(output, trg)
        loss.backward()

        torch.nn.utils.clip_grad_norm(model.parameters(), clip)
        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(data_loader)

In [179]:
def evaluate_fn(model, data_loader, criterion, device):
    model.train()
    epoch_loss = 0
    for i, batch in enumerate(data_loader):
        src = batch["input_ids"].to(device)
        trg = batch["output_ids"].to(device)

        output = model(src, trg, 0)
        output_dim = output.shape[-1]
        output = output[1:].view(-1, output_dim)
        trg = trg[1:].view(-1)

        loss = criterion(output, trg)

        epoch_loss += loss.item()
    return epoch_loss / len(data_loader)

In [180]:
n_epochs = 10
clip = 1.0
teacher_forcing_ratio = 0.5

best_valid_loss = float("inf")

for epoch in tqdm.tqdm(range(n_epochs)):
    train_loss = train_fn(
        model, 
        train_data_loader, 
        optimizer, 
        criterion, 
        clip, 
        teacher_forcing_ratio, 
        device
    )
    valid_loss = evaluate_fn(
        model, 
        valid_data_loader, 
        criterion, 
        device
    )

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict, "s2s-0.2-model.pt")
        
    print(f"\tTrain Loss: {train_loss:7.3f} | Train PPL: {np.exp(train_loss):7.3f}")
    print(f"\tValid Loss: {valid_loss:7.3f} | Valid PPL: {np.exp(valid_loss):7.3f}")


  0%|                                                    | 0/10 [05:07<?, ?it/s]


KeyboardInterrupt: 

In [181]:
def classify_ecg(
    data, model, input_vocab, output_vocab, sos_token, eos_token, device, max_output_length=25
):
    model.eval()
    with torch.no_grad():
        input_tokens = [sos_token] + [str(item) for item in data] + [eos_token]
        ids = input_vocab.lookup_indices(input_tokens)
        tensor = torch.LongTensor(ids).unsqueeze(-1).to(device)

        encoder_outputs, hidden = model.encoder(tensor)
        inputs = [output_vocab.lookup_indices([sos_token])[0]]
        attentions = torch.zeros(max_output_length, 1, len(ids))

        for i in range(max_output_length):
            input_tensor = torch.LongTensor([inputs[-1]]).to(device)
            print("Input tensor:", input_tensor)
            output, hidden, attention = model.decoder(input_tensor, hidden, encoder_outputs)
            print("Input tensor:", input_tensor)
            attentions[i] = attention
            predicted_token = output.argmax(1).item()
            inputs.append(predicted_token)
            if predicted_token == output_vocab[eos_token]:
                break
        output_tokens = output_vocab.lookup_tokens(inputs)
    return output_tokens, input_tokens, attentions[:len(output_tokens) - 1]
            
        




In [182]:
sweep = test_data[0]["output"]
phases = test_data[0]["input"]

sweep[0], phases[0]

(0.02, 0.0)

In [183]:
categorization, sweep_tokens, attention = classify_ecg(
    sweep, model,
    input_vocab,
    output_vocab,
    sos_token,
    eos_token,
    device,
)

Input tensor: tensor([2])
2 133
Input tensor: tensor([2])
Input tensor: tensor([416])
416 133


ValueError: Index 416 out of range for embedding layer with 133 embeddings.

In [184]:
print(input_vocab.lookup_tokens(torch.tensor([370])))
print(input_vocab.lookup_tokens(torch.tensor([2])))

['-41.56489944458008']
['<sos>']
