In [1]:
import csv
import pandas as pd
import numpy as np
import os
import torch
import datetime
from torch.utils.data import Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
! ls data

icu  measurements  preprocessed


In [3]:
PATH = "data/preprocessed"        

class MortalityDataset(Dataset):    
    def __init__(self):
        self.sequences = []
        self.labels = []
        
        sequences = []
        directory = os.fsencode(f"{PATH}")
        fs = os.listdir(directory)
        for idx, file in enumerate(fs):
            filename = os.fsdecode(file)
            if not filename.endswith(".csv"): 
                continue
                
            df = pd.read_csv(f"{PATH}/{filename}")
            y_true = df["y_true"][0]
            del df["y_true"]
            
            data = torch.from_numpy(df.values)
            self.sequences.append(data)
            self.labels.append(y_true)
            
            if idx % 500 == 0:
                print(f"loaded {idx+1} out of {len(fs)}")

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        return self.sequences[idx], self.labels[idx]

In [4]:
m = MortalityDataset()
print(m[0])
print(m[1])

loaded 1 out of 20030
loaded 501 out of 20030
loaded 1001 out of 20030
loaded 1501 out of 20030
loaded 2001 out of 20030
loaded 2501 out of 20030
loaded 3001 out of 20030
loaded 3501 out of 20030
loaded 4001 out of 20030
loaded 4501 out of 20030
loaded 5001 out of 20030
loaded 5501 out of 20030
loaded 6001 out of 20030
loaded 6501 out of 20030
loaded 7001 out of 20030
loaded 7501 out of 20030
loaded 8001 out of 20030
loaded 8501 out of 20030
loaded 9001 out of 20030
loaded 9501 out of 20030
loaded 10001 out of 20030
loaded 10501 out of 20030
loaded 11001 out of 20030
loaded 11501 out of 20030
loaded 12001 out of 20030
loaded 12501 out of 20030
loaded 13001 out of 20030
loaded 13501 out of 20030
loaded 14001 out of 20030
loaded 14501 out of 20030
loaded 15001 out of 20030
loaded 15501 out of 20030
loaded 16001 out of 20030
loaded 16501 out of 20030
loaded 17001 out of 20030
loaded 17501 out of 20030
loaded 18001 out of 20030
loaded 18501 out of 20030
loaded 19001 out of 20030
loaded 195

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [30]:
from torch.nn.utils.rnn import pack_sequence

def create_batch(inp):
    features = [i[0].type(torch.float).to(device) for i in inp]
    labels = [i[1] for i in inp]
    packed_seq = pack_sequence(features, enforce_sorted=False)
    labels = torch.tensor(labels, dtype=torch.float).to(device)
    return packed_seq, labels

In [31]:
from torch.utils.data import DataLoader, random_split

train_data, test_data, validation_data = random_split(m, [int(0.8 * len(m)), int(0.1 * len(m)), int(0.1 * len(m))], torch.Generator())
train_dataloader = DataLoader(train_data, batch_size=32, shuffle=True, collate_fn=create_batch, num_workers=0)
test_dataloader = DataLoader(test_data, batch_size=32, shuffle=True, collate_fn=create_batch, num_workers=0)
validation_dataloader = DataLoader(validation_data, batch_size=len(validation_data), shuffle=True, collate_fn=create_batch, num_workers=0)

In [32]:
from sklearn.dummy import DummyClassifier
from sklearn.metrics import f1_score


constant_classifier = DummyClassifier(strategy="constant", constant=1)
uniform_classifier = DummyClassifier(strategy="uniform")

inputs, labels = None, None
for inp, lab in validation_dataloader:
    inputs, labels = inp, lab
    labels = labels.clone().cpu().detach()

constant_classifier.fit([0] * len(labels), labels)
uniform_classifier.fit([0] * len(labels), labels)
y_constant = constant_classifier.predict([0] * len(labels))
y_uniform = uniform_classifier.predict([0] * len(labels))

print(f"Baseline F1 score constant 1: {f1_score(labels, y_constant)}")
print(f"Baseline F1 score uniform: {f1_score(labels, y_uniform)}")

Baseline F1 score constant 1: 0.19630796938316075
Baseline F1 score uniform: 0.2095857026807474


In [67]:
import torch.nn as nn
import torch.nn.utils.rnn as rnn

class TinyModel(torch.nn.Module):
    def __init__(self):
        super(TinyModel, self).__init__()
        
#       Bidirectional (experimenting)
#         self.rnn = nn.GRU(input_size=149, hidden_size=32, num_layers=1, bidirectional=True)
#         self.fc1 = nn.Linear(in_features=64, out_features=1)

        self.rnn = nn.GRU(input_size=149, hidden_size=32, num_layers=1, bidirectional=False)
        self.fc1 = nn.Linear(in_features=32, out_features=1)
        

    def forward(self, x):
        _, h_n = self.rnn(x)
#         print(h_n.size())
#         h_n = torch.cat((h_n[0], h_n[1]), dim=1)
#         print(h_n.size())
        fc1_out = self.fc1(torch.squeeze(h_n))
        res = torch.sigmoid(fc1_out)
        
        return torch.squeeze(res)

tinymodel = TinyModel()
tinymodel.to(device)

print('The model:')
print(tinymodel)

The model:
TinyModel(
  (rnn): GRU(149, 32)
  (fc1): Linear(in_features=32, out_features=1, bias=True)
)


In [68]:
import torch.optim as optim

criterion = nn.BCELoss()
optimizer = optim.Adam(tinymodel.parameters(), lr=0.001)

In [69]:
for epoch in range(20):
    running_loss = 0.0
    for i, (inputs, labels) in enumerate(train_dataloader):
        tinymodel.train(True)
        optimizer.zero_grad()
        outputs = tinymodel(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        if (i + 1) % 50 == 0:
            print(f'Epoch: [{epoch + 1}, training sample {i + 1:5d}] running loss: {running_loss / 50:.3f}')
            running_loss = 0.0
    
    running_vloss = 0.0
    tinymodel.train(False)
    for i, (vinputs, vlabels) in enumerate(test_dataloader):
        voutputs = tinymodel(vinputs)
        vloss = criterion(voutputs, vlabels)
        running_vloss += vloss
    
    print(f"Test loss: {running_vloss / len(test_dataloader)}")

print('Finished Training')

Epoch: [1, training sample    50] running loss: 0.440
Epoch: [1, training sample   100] running loss: 0.389
Epoch: [1, training sample   150] running loss: 0.371
Epoch: [1, training sample   200] running loss: 0.377
Epoch: [1, training sample   250] running loss: 0.356
Epoch: [1, training sample   300] running loss: 0.389
Epoch: [1, training sample   350] running loss: 0.392
Epoch: [1, training sample   400] running loss: 0.385
Epoch: [1, training sample   450] running loss: 0.330
Epoch: [1, training sample   500] running loss: 0.358
Test loss: 0.3742932081222534
Epoch: [2, training sample    50] running loss: 0.332
Epoch: [2, training sample   100] running loss: 0.366
Epoch: [2, training sample   150] running loss: 0.350
Epoch: [2, training sample   200] running loss: 0.341
Epoch: [2, training sample   250] running loss: 0.369
Epoch: [2, training sample   300] running loss: 0.350
Epoch: [2, training sample   350] running loss: 0.355
Epoch: [2, training sample   400] running loss: 0.37

In [70]:
from sklearn.metrics import f1_score, roc_auc_score

tinymodel.train(False)
for inputs, labels in validation_dataloader:
    outputs = tinymodel(inputs)
    outputs = outputs > 0.5
    labels = labels.detach().cpu().numpy()
    outputs = outputs.detach().cpu().numpy()
    print(f"F1 score: {f1_score(labels, outputs)}")
    print(f"ROC AUC: {roc_auc_score(labels, outputs)}")

F1 score: 0.4883720930232558
ROC AUC: 0.7108536992778763
