<a href="https://colab.research.google.com/github/Zfeng0207/FIT3199-FYP/blob/dev%2Fdarin/FYP_LSTM_DL_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [101]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, random_split
import pandas as pd
import numpy as np
import gdown
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score
import torch.nn.functional as F

# Import Data from Google Drive

In [102]:
# Google Drive ID of the shared CSV and the direct-download URL built from it
file_id = "1Ic8WjPbYYxqdvDznQdjN8B86yEghwzvV"
url = "https://drive.google.com/uc?id=" + file_id

# Fetch the file into the working directory (progress bar shown), then load it
gdown.download(url, "health_data.csv", quiet=False)
data = pd.read_csv("health_data.csv")

Downloading...
From: https://drive.google.com/uc?id=1Ic8WjPbYYxqdvDznQdjN8B86yEghwzvV
To: /content/health_data.csv
100%|██████████| 9.90M/9.90M [00:00<00:00, 220MB/s]


In [103]:
# Columns that are not fed to the model
columns_to_drop = ["stay_id_x", "stay_id_y", "charttime", "dod", "icd_title"]

# Parse 'charttime' (unparseable values become NaT), order rows by patient and
# then by time, and finally drop the non-relevant columns (absent names are ignored)
data = (
    data
    .assign(charttime=pd.to_datetime(data['charttime'], errors='coerce'))
    .sort_values(by=['subject_id', 'charttime'])
    .drop(columns=columns_to_drop, errors='ignore')
)

# Mean-impute missing values; only numeric columns are touched
numeric_data = data.select_dtypes(include=np.number)
data[numeric_data.columns] = numeric_data.fillna(numeric_data.mean())

# Coerce every remaining column to numeric (values that cannot be parsed become NaN)
data = data.apply(pd.to_numeric, errors='coerce')

# Name of the binary stroke label column
target_column = "Stroke_Y/N"

In [104]:
# Function to create time-series sequences for each patient
def create_sequences(df, n_previous=3, target_column="Stroke_Y/N", id_column="subject_id"):
    """Build fixed-length sliding windows of consecutive rows for each patient.

    Rows are grouped by ``id_column`` and kept in the order they appear in
    ``df``, so ``df`` must already be sorted chronologically within each
    patient. Every window of ``n_previous`` consecutive rows becomes one sample.
    Its label is the ``target_column`` value of the LAST row in the window
    (the most recent record the window contains, not a later one).

    Args:
        df: DataFrame containing ``id_column``, ``target_column`` and the
            feature columns. Non-numeric values are coerced to NaN.
        n_previous: Window length (number of consecutive rows per sample).
        target_column: Name of the label column.
        id_column: Name of the column identifying a patient.

    Returns:
        Tuple ``(X, y)`` of float32 arrays with shapes
        ``(n_samples, n_previous, n_features)`` and ``(n_samples,)``.
        When no patient has at least ``n_previous`` rows, the arrays are empty
        but keep those ranks, so downstream shape-dependent code still works.

    Raises:
        ValueError: If ``n_previous`` is smaller than 1.
    """
    if n_previous < 1:
        raise ValueError("n_previous must be a positive integer")

    # Everything except the patient id and the label is a model input
    feature_columns = [col for col in df.columns if col not in (id_column, target_column)]

    sequences, labels = [], []
    for _, group in df.groupby(id_column):
        if len(group) < n_previous:
            continue  # Skip patients with too few records

        # Ensure only numeric values (anything unparseable becomes NaN)
        numeric = group[feature_columns + [target_column]].apply(pd.to_numeric, errors='coerce')
        X_patient = numeric[feature_columns].to_numpy(dtype=np.float32)
        y_patient = numeric[target_column].to_numpy(dtype=np.float32)

        # Slide a window of length `n_previous` over this patient's rows
        for start in range(len(group) - n_previous + 1):
            end = start + n_previous
            sequences.append(X_patient[start:end])
            labels.append(y_patient[end - 1])  # label of the window's last row

    if not sequences:
        # Keep the documented ranks even when there is nothing to return
        return (
            np.empty((0, n_previous, len(feature_columns)), dtype=np.float32),
            np.empty((0,), dtype=np.float32),
        )

    return np.stack(sequences).astype(np.float32), np.array(labels, dtype=np.float32)

# Generate time-series sequences: sliding windows of 3 consecutive rows per patient
X_seq, y_seq = create_sequences(data, n_previous=3)

# X_seq is expected to be 3D (samples, time_steps, features); the shapes are
# printed after the tensor conversion below.

In [105]:
# Replace any NaNs left in the features or labels with 0.0
X_seq, y_seq = (np.nan_to_num(arr, nan=0.0) for arr in (X_seq, y_seq))

# Convert to PyTorch tensors; labels get a trailing axis so they are (N, 1)
X_tensor = torch.tensor(X_seq, dtype=torch.float32)
y_tensor = torch.tensor(y_seq, dtype=torch.float32)[:, None]

# Debugging Output
print(f"X_tensor shape: {X_tensor.shape}")  # Should be (samples, time_steps, features)
print(f"y_tensor shape: {y_tensor.shape}")  # Should be (samples, 1)

X_tensor shape: torch.Size([65263, 3, 17])
y_tensor shape: torch.Size([65263, 1])


In [106]:
# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

X_tensor, y_tensor = X_tensor.to(device), y_tensor.to(device)

Using device: cpu


In [107]:
# Split into train, validation, and test sets (70% / 15% / 15%)
train_size = int(0.7 * len(X_tensor))
val_size = int(0.15 * len(X_tensor))
test_size = len(X_tensor) - train_size - val_size  # remainder, so the sizes always sum to len(X_tensor)

# Seed the split so the same samples land in each subset on every run
split_generator = torch.Generator().manual_seed(42)
train_data, val_data, test_data = random_split(
    TensorDataset(X_tensor, y_tensor),
    [train_size, val_size, test_size],
    generator=split_generator,
)
# NOTE(review): samples are overlapping windows, so windows from one patient can
# land in different subsets; consider splitting by subject_id to avoid leakage.

In [108]:
# Sanity-check the 70% / 15% / 15% split created in the previous cell.
# Repeating random_split here would replace that split with a different random
# one, so this cell only validates the subsets that already exist.
assert train_size + val_size + test_size == len(X_tensor), "split sizes must cover every sample"
assert (len(train_data), len(val_data), len(test_data)) == (train_size, val_size, test_size)
print(f"Train: {len(train_data)}, Val: {len(val_data)}, Test: {len(test_data)}")


In [109]:
# Mini-batches of 64; only the training data is reshuffled each epoch
train_loader = DataLoader(train_data, shuffle=True, batch_size=64)
val_loader, test_loader = (
    DataLoader(held_out, shuffle=False, batch_size=64)
    for held_out in (val_data, test_data)
)

# LSTM Model

In [110]:
class StrokeLSTM(nn.Module):
    """Stacked LSTM followed by a linear head that outputs one probability per sequence.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of each LSTM layer's hidden state.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout applied between LSTM layers. It is ignored when
            ``num_layers`` is 1, because ``nn.LSTM`` has no between-layer
            connection to apply it to and would emit a warning.
    """

    def __init__(self, input_size, hidden_size=128, num_layers=3, dropout=0.3):
        super(StrokeLSTM, self).__init__()
        # nn.LSTM only uses dropout between stacked layers; disable it for a single layer
        inter_layer_dropout = dropout if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(
            input_size,
            hidden_size,
            num_layers,
            batch_first=True,
            dropout=inter_layer_dropout,
        )
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Map a batch shaped (batch, time_steps, input_size) to probabilities shaped (batch, 1)."""
        lstm_out, _ = self.lstm(x)
        # Only the output at the final time step feeds the classifier head
        x = self.fc(lstm_out[:, -1, :])
        return torch.sigmoid(x)  # Ensure output is in [0,1] range (pair with nn.BCELoss)

# Model Parameters

In [111]:
# Initialize model and move to device
input_size = X_tensor.shape[-1]
model = StrokeLSTM(input_size).to(device)

# Loss function & optimizer
criterion = nn.BCELoss()
optimizer = optim.AdamW(model.parameters(), lr=0.0005)

# Training Model

In [112]:
def train_model(model, train_loader, val_loader, criterion, optimizer, epochs=30):
    """Train ``model`` and print the mean train/validation loss after every epoch.

    Each batch is moved to the device that holds the model's parameters, so the
    loaders may yield tensors from any device.

    Args:
        model: Network to optimise; updated in place.
        train_loader: Iterable of ``(inputs, targets)`` training batches.
        val_loader: Iterable of ``(inputs, targets)`` validation batches.
        criterion: Loss called as ``criterion(predictions, targets)``.
        optimizer: Optimizer built over ``model.parameters()``.
        epochs: Number of full passes over ``train_loader``.

    Returns:
        None. The model is left in evaluation mode after the last epoch. A
        loader that yields no batches is reported as a loss of ``nan`` instead
        of raising ``ZeroDivisionError``.
    """
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else None

    def _to_model_device(*tensors):
        # Leave tensors untouched for a parameter-free model (no device to infer)
        if model_device is None:
            return tensors
        return tuple(t.to(model_device) for t in tensors)

    def _mean_loss(total, n_batches):
        # Average over the batches actually seen; nan when there were none
        return total / n_batches if n_batches else float("nan")

    for epoch in range(epochs):
        model.train()
        train_total, train_batches = 0.0, 0
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = _to_model_device(X_batch, y_batch)
            optimizer.zero_grad()
            loss = criterion(model(X_batch), y_batch)
            loss.backward()
            optimizer.step()
            train_total += loss.item()
            train_batches += 1

        # Validation (no gradient tracking, dropout disabled)
        model.eval()
        val_total, val_batches = 0.0, 0
        with torch.no_grad():
            for X_val, y_val in val_loader:
                X_val, y_val = _to_model_device(X_val, y_val)
                val_total += criterion(model(X_val), y_val).item()
                val_batches += 1

        print(f"Epoch {epoch+1}/{epochs}, Train Loss: {_mean_loss(train_total, train_batches):.4f}, Val Loss: {_mean_loss(val_total, val_batches):.4f}")

In [113]:
# Train for 30 epochs; train_model prints the mean train/validation loss after each epoch
train_model(model, train_loader, val_loader, criterion, optimizer, epochs=30)

Epoch 1/30, Train Loss: 0.3924, Val Loss: 0.3834
Epoch 2/30, Train Loss: 0.3856, Val Loss: 0.3837
Epoch 3/30, Train Loss: 0.3855, Val Loss: 0.3860
Epoch 4/30, Train Loss: 0.3855, Val Loss: 0.3836
Epoch 5/30, Train Loss: 0.3855, Val Loss: 0.3834
Epoch 6/30, Train Loss: 0.3852, Val Loss: 0.3836
Epoch 7/30, Train Loss: 0.3851, Val Loss: 0.3838
Epoch 8/30, Train Loss: 0.3853, Val Loss: 0.3838
Epoch 9/30, Train Loss: 0.3851, Val Loss: 0.3834
Epoch 10/30, Train Loss: 0.3851, Val Loss: 0.3840
Epoch 11/30, Train Loss: 0.3850, Val Loss: 0.3836
Epoch 12/30, Train Loss: 0.3850, Val Loss: 0.3834
Epoch 13/30, Train Loss: 0.3850, Val Loss: 0.3838
Epoch 14/30, Train Loss: 0.3851, Val Loss: 0.3837
Epoch 15/30, Train Loss: 0.3850, Val Loss: 0.3836
Epoch 16/30, Train Loss: 0.3850, Val Loss: 0.3836
Epoch 17/30, Train Loss: 0.3849, Val Loss: 0.3834
Epoch 18/30, Train Loss: 0.3850, Val Loss: 0.3835
Epoch 19/30, Train Loss: 0.3849, Val Loss: 0.3843
Epoch 20/30, Train Loss: 0.3850, Val Loss: 0.3838
Epoch 21/

# Evaluate Model

In [116]:
def evaluate_model(model, test_loader, criterion, threshold=0.5):
    """Evaluate a binary classifier that outputs probabilities and print its metrics.

    Args:
        model: Network whose forward pass returns probabilities in [0, 1].
        test_loader: Iterable of ``(inputs, targets)`` batches.
        criterion: Loss called as ``criterion(predictions, targets)``.
        threshold: Probability at or above which a sample is labelled positive.

    Returns:
        Tuple ``(avg_loss, rmse, accuracy, auc, precision, recall)``.
        ``auc`` is ``nan`` when the targets contain a single class; precision
        and recall are 0 when their denominator is zero.

    Raises:
        ValueError: If ``test_loader`` yields no batches.
    """
    model.eval()

    # Run on whichever device holds the model instead of relying on a global
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else None

    batch_losses, true_chunks, prob_chunks = [], [], []
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            if model_device is not None:
                X_batch, y_batch = X_batch.to(model_device), y_batch.to(model_device)

            y_pred = model(X_batch)
            batch_losses.append(criterion(y_pred, y_batch).item())

            # Store true labels and predicted probabilities
            true_chunks.append(y_batch.cpu().numpy().ravel())
            prob_chunks.append(y_pred.cpu().numpy().ravel())

    if not batch_losses:
        raise ValueError("test_loader yielded no batches; nothing to evaluate")

    # Flatten the per-batch pieces into one array each
    y_true = np.concatenate(true_chunks)
    y_pred_probs = np.concatenate(prob_chunks)

    # Convert probabilities to binary predictions using the threshold
    y_pred_labels = (y_pred_probs >= threshold).astype(int)

    # Calculate Metrics
    rmse = np.sqrt(np.mean((y_true - y_pred_probs) ** 2))
    accuracy = accuracy_score(y_true, y_pred_labels)
    auc = roc_auc_score(y_true, y_pred_probs) if len(np.unique(y_true)) > 1 else float('nan')  # AUC needs at least two classes
    precision = precision_score(y_true, y_pred_labels, zero_division=0)
    recall = recall_score(y_true, y_pred_labels, zero_division=0)
    avg_loss = sum(batch_losses) / len(batch_losses)

    # Print results
    print(f"Test Loss: {avg_loss:.4f}")
    print(f"RMSE: {rmse:.4f}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"AUC: {auc:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")

    return avg_loss, rmse, accuracy, auc, precision, recall

# Run evaluation on the test set; the metrics are printed and the returned tuple
# (avg_loss, rmse, accuracy, auc, precision, recall) is shown as the cell output
evaluate_model(model, test_loader, criterion)

Test Loss: 0.3945
RMSE: 0.3408
Accuracy: 0.8660
AUC: 0.5000
Precision: 0.0000
Recall: 0.0000


(0.39448141369944306,
 np.float32(0.34084183),
 0.8659856996935649,
 np.float64(0.5),
 0.0,
 0.0)