In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from pathlib import Path
import time, copy


In [2]:
# Load data

X_train = pd.read_csv(Path('cleaned_data','X_train.csv'), index_col=0)
X_val = pd.read_csv(Path('cleaned_data','X_val.csv'), index_col=0)
X_test = pd.read_csv(Path('cleaned_data','X_test.csv'), index_col=0)

y_train = pd.read_csv(Path('cleaned_data','y_train.csv'), index_col=0)
y_val = pd.read_csv(Path('cleaned_data','y_val.csv'), index_col=0)
y_test = pd.read_csv(Path('cleaned_data','y_test.csv'), index_col=0)

In [3]:
# Define device

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [4]:
# Define custom dataset

class MimicIvDataset(Dataset):
    """MIMIC IV dataset."""

    def __init__(self, csv_file_X, csv_file_y):
        """
        Arguments:
            csv_file (string): Path to the csv file with annotations.
        """
        self.mimic_df_X = pd.read_csv(Path(csv_file_X), index_col=0)
        self.mimic_df_y = pd.read_csv(Path(csv_file_y), index_col=0)

    def __len__(self):
        return len(self.mimic_df_X)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
            
        inputs = torch.tensor(self.mimic_df_X.iloc[idx], dtype=torch.float64)
        labels = torch.tensor(self.mimic_df_y.iloc[idx], dtype=torch.float64)

        return inputs, labels

In [5]:
# Load datasets

train_dataset = MimicIvDataset(csv_file_X="cleaned_data/X_train.csv", csv_file_y="cleaned_data/y_train.csv")
val_dataset = MimicIvDataset(csv_file_X="cleaned_data/X_val.csv", csv_file_y="cleaned_data/y_val.csv")
test_dataset = MimicIvDataset(csv_file_X="cleaned_data/X_test.csv", csv_file_y="cleaned_data/y_test.csv")

In [6]:
# Create dataloaders

batch_size = 1000

dataloaders = {'train': DataLoader(train_dataset, batch_size=batch_size, shuffle=True),
               'val': DataLoader(val_dataset, batch_size=batch_size, shuffle=True),
               'test': DataLoader(test_dataset, batch_size=batch_size, shuffle=True)}

dataset_sizes = {'train': len(train_dataset),
                 'val': len(val_dataset),
                 'test': len(test_dataset)}
print(f'dataset_sizes = {dataset_sizes}')

dataset_sizes = {'train': 344320, 'val': 38258, 'test': 42509}


In [7]:
import torch
import torch.nn as nn

# Define model parameters
input_size = 67
hidden_size1 = 256
hidden_size2 = 256
hidden_size3 = 256
num_classes = 1
dropout_rate = 0.5  # Adjust dropout rate
num_heads = 64  # Number of attention heads
num_transformer_layers = 80  # Number of transformer encoder layers

# External training parameters
learning_rate = 0.001
num_epochs = 50

# Revised model with Transformer instead of MLP
class MimicAdmissionTransformer(nn.Module):
    def __init__(self, input_size, hidden_size1, hidden_size2, num_classes, dropout_rate, num_heads, num_transformer_layers, dtype=torch.float32):
        """
        A Transformer-based model with multiple encoder layers for classification tasks.
        
        Args:
            input_size (int): Size of the input features.
            hidden_size1 (int): Number of units in the first hidden layer.
            hidden_size2 (int): Number of units in the second hidden layer.
            num_classes (int): Number of output classes (1 for binary classification).
            dropout_rate (float): Dropout rate to use between layers (0 for no dropout).
            num_heads (int): Number of attention heads for multi-head self-attention.
            num_transformer_layers (int): Number of transformer encoder layers.
            dtype (torch.dtype): Data type for the model (default is torch.float32).
        """
        super(MimicAdmissionTransformer, self).__init__()
        
        # Embedding layer to match input_size
        self.embedding = nn.Linear(input_size, hidden_size1, dtype=dtype)
        
        # Transformer Encoder
        encoder_layer = nn.TransformerEncoderLayer(d_model=hidden_size1, nhead=num_heads, dim_feedforward=hidden_size2, dropout=dropout_rate, dtype=dtype)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_transformer_layers)

        # Final classification layer
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size1, num_classes, dtype=dtype),
            nn.Sigmoid()  # For binary classification
        )

        # Dropout
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        """
        Forward pass of the network.
        
        Args:
            x (Tensor): Input data tensor of shape (batch_size, input_size).
        
        Returns:
            Tensor: Output prediction tensor of shape (batch_size, num_classes).
        """
        # Embedding the input (batch_size, input_size) -> (batch_size, hidden_size1)
        x = self.embedding(x)
        x = self.dropout(x)

        # Adding a sequence dimension to feed into the transformer (required for the encoder)
        x = x.unsqueeze(1)  # (batch_size, 1, hidden_size1)
        
        # Transformer expects (seq_len, batch_size, d_model) so we permute
        x = x.permute(1, 0, 2)  # (1, batch_size, hidden_size1)
        
        # Pass through the transformer encoder
        x = self.transformer_encoder(x)  # (1, batch_size, hidden_size1)

        # Squeeze sequence dimension and pass to classifier
        x = x.squeeze(0)  # (batch_size, hidden_size1)
        
        # Classification output
        out = self.classifier(x)
        
        return out

# Example instantiation of the model
mimic_admission_classifier = MimicAdmissionTransformer(
    input_size=input_size,
    hidden_size1=hidden_size1,
    hidden_size2=hidden_size2,
    num_classes=num_classes,
    dropout_rate=dropout_rate,
    num_heads=num_heads,
    num_transformer_layers=num_transformer_layers,
    dtype=torch.float64  # Data type can be changed if needed
).to(device)

# Print model summary (optional)
print(mimic_admission_classifier)


MimicAdmissionTransformer(
  (embedding): Linear(in_features=67, out_features=256, bias=True)
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-79): 80 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
        )
        (linear1): Linear(in_features=256, out_features=256, bias=True)
        (dropout): Dropout(p=0.5, inplace=False)
        (linear2): Linear(in_features=256, out_features=256, bias=True)
        (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.5, inplace=False)
        (dropout2): Dropout(p=0.5, inplace=False)
      )
    )
  )
  (classifier): Sequential(
    (0): Linear(in_features=256, out_features=1, bias=True)
    (1): Sigmoid()
  )
  (dropout): Dropout(p=0.5, inplace=False)
)




In [8]:
# From https://pytorch.org/tutorials/beginner/transfer_learning_tutorial.html

def train_model(model, dataloaders, dataset_sizes, criterion, optimizer, scheduler, num_epochs=25):
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict()) # keep the best weights stored separately
    best_acc = 0.0
    best_epoch = 0

    # Each epoch has a training, validation, and test phase
    phases = ['train', 'val', 'test']
    
    # Keep track of how loss and accuracy evolves during training
    training_curves = {}
    for phase in phases:
        training_curves[phase+'_loss'] = []
        training_curves[phase+'_acc'] = []
    
    for epoch in range(num_epochs):
        print(f'\nEpoch {epoch+1}/{num_epochs}')
        print('-' * 10)

        for phase in phases:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            for inputs, labels in dataloaders[phase]:
                # This ensures all of our datapoints are flattened
                # before feeding them to our model
                inputs = inputs.view(inputs.shape[0],-1)
                inputs = inputs.to(device)
                labels = labels.to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    _, predictions = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # backward + update weights only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(predictions == labels.data)
 
            if phase == 'train':
                scheduler.step()

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]
            training_curves[phase+'_loss'].append(epoch_loss)
            training_curves[phase+'_acc'].append(epoch_acc)

            print(f'{phase:5} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}')

            # deep copy the model if it's the best accuracy (based on validation)
            if phase == 'val' and epoch_acc > best_acc:
                best_epoch = epoch
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

    time_elapsed = time.time() - since
    print(f'\nTraining complete in {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s')
    print(f'Best val Acc: {best_acc:4f} at epoch {best_epoch}')

    # load best model weights
    model.load_state_dict(best_model_wts)
    
    return model, training_curves

In [9]:
# Training
# loss and optimizer
criterion = nn.BCELoss() # BCELoss for binary classification
optimizer = torch.optim.Adam(mimic_admission_classifier.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.95)

# Train the model. We also will store the results of training to visualize
mimic_admission_classifier, training_curves = train_model(mimic_admission_classifier, dataloaders, dataset_sizes, criterion, optimizer, scheduler, num_epochs=num_epochs)


Epoch 1/50
----------


  inputs = torch.tensor(self.mimic_df_X.iloc[idx], dtype=torch.float64)
  labels = torch.tensor(self.mimic_df_y.iloc[idx], dtype=torch.float64)
  attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)


train Loss: 0.6969 Acc: 522.0512
val   Loss: 0.6920 Acc: 521.3801
test  Loss: 0.6923 Acc: 518.2884

Epoch 2/50
----------
train Loss: 0.6928 Acc: 522.0749
val   Loss: 0.6924 Acc: 521.2443
test  Loss: 0.6925 Acc: 518.2306

Epoch 3/50
----------
train Loss: 0.6927 Acc: 522.0532
val   Loss: 0.6921 Acc: 521.1862
test  Loss: 0.6924 Acc: 518.3692

Epoch 4/50
----------


KeyboardInterrupt: 