<a href="https://colab.research.google.com/github/Zfeng0207/FIT3199-FYP/blob/dev%2Fzfeng/FYP_LSTM_DL_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [100]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, random_split
import pandas as pd
import numpy as np
import gdown

#Import Data from Google Drive

In [101]:
file_id = "1CsLg1kROZWRtGXBJ1YHK_CW4XbBqSI3P"
url = f"https://drive.google.com/uc?id={file_id}"
gdown.download(url, "health_data.csv", quiet=False)

Downloading...
From: https://drive.google.com/uc?id=1CsLg1kROZWRtGXBJ1YHK_CW4XbBqSI3P
To: /content/health_data.csv
100%|██████████| 5.92M/5.92M [00:00<00:00, 205MB/s]


'health_data.csv'

In [102]:
data = pd.read_csv('health_data.csv')
data

Unnamed: 0,subject_id,stay_id_x,charttime,temperature,heartrate,resprate,o2sat,sbp,dbp,rhythm,...,seq_num,icd_code,icd_version,icd_title,Stroke_Y/N,gender,anchor_age,anchor_year,anchor_year_group,dod
0,14394983,34259230.0,2202-07-10 12:24:00,98.3,74.0,19.0,97.0,153.0,67.0,-1,...,2,262,10,369,0,1,46,2196,0,
1,17730806,30258441.0,2182-07-17 16:58:00,,82.0,16.0,96.0,115.0,102.0,-1,...,1,58,9,256,1,1,68,2176,1,
2,15973805,30481231.0,2145-07-03 17:20:00,98.0,109.0,18.0,98.0,113.0,77.0,-1,...,2,471,9,452,0,0,51,2139,1,
3,16945691,36369419.0,2117-12-18 23:50:00,99.9,74.0,16.0,100.0,151.0,76.0,-1,...,1,388,10,158,0,0,81,2111,1,2118-09-20
4,15632719,33618002.0,2170-04-19 20:58:00,98.4,83.0,,94.0,142.0,81.0,-1,...,1,58,9,256,1,1,55,2170,2,2170-10-06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52285,15853169,33648217.0,2174-04-13 17:56:00,98.1,62.0,16.0,100.0,121.0,62.0,-1,...,2,330,10,482,1,0,52,2170,2,
52286,10913302,37712176.0,2196-11-26 09:23:00,,,,,,,-1,...,1,403,10,203,0,1,25,2185,0,
52287,11553072,31817224.0,2164-12-23 16:45:00,99.8,92.0,16.0,96.0,160.0,80.0,-1,...,1,34,9,34,0,1,46,2156,0,
52288,10295929,35519857.0,2150-12-05 06:15:00,97.9,63.0,20.0,99.0,149.0,97.0,-1,...,3,462,9,219,1,1,61,2141,0,


In [103]:
# Convert 'charttime' to datetime and sort by subject_id, charttime
data['charttime'] = pd.to_datetime(data['charttime'], errors='coerce')
data = data.sort_values(by=['subject_id', 'charttime'])

# Drop non-relevant columns
columns_to_drop = ["stay_id_x", "stay_id_y", "charttime", "dod", "icd_title"]
data = data.drop(columns=columns_to_drop, errors='ignore')

# Handle missing values by filling with column mean
data = data.fillna(data.mean())

# Our Stroke Target Column
target_column = "Stroke_Y/N"

In [104]:
# Function to create time-series sequences for each patient
def create_sequences(df, n_previous=3):
    sequences = []
    labels = []
    patient_groups = df.groupby("subject_id")  # Group by patient

    for _, group in patient_groups:
        group = group.drop(columns=["subject_id"])  # Drop ID for training
        X_patient = group.drop(columns=[target_column]).values
        y_patient = group[target_column].values

        # Create sequences of length `n_previous`
        if len(group) >= n_previous:
            for i in range(len(group) - n_previous + 1):
                seq_X = X_patient[i:i + n_previous]  # Past admissions
                seq_y = y_patient[i + n_previous - 1]    # Predict next admission stroke outcome
                sequences.append(seq_X)
                labels.append(seq_y)

    return np.array(sequences), np.array(labels)

# Generate time-series sequences
X_seq, y_seq = create_sequences(data, n_previous=3)

# Ensure X_seq is 3D (samples, time_steps, features)
if len(X_seq.shape) == 2:
    X_seq = np.expand_dims(X_seq, axis=1)  # Adds time-step dimension

In [105]:
# Convert to PyTorch tensors
X_tensor = torch.tensor(X_seq, dtype=torch.float32)
y_tensor = torch.tensor(y_seq, dtype=torch.float32).unsqueeze(1)  # Shape: (N,1)

In [106]:
print(f"X_tensor shape: {X_tensor.shape}")  # Should be (samples, time_steps, features

X_tensor shape: torch.Size([48178, 3, 15])


In [107]:
# Split into train, validation, and test sets
train_size = int(0.7 * len(X_tensor))
val_size = int(0.15 * len(X_tensor))
test_size = len(X_tensor) - train_size - val_size
train_data, val_data, test_data = random_split(TensorDataset(X_tensor, y_tensor), [train_size, val_size, test_size])

In [108]:
# Train-test split (70% Train, 15% Val, 15% Test)
train_size = int(0.7 * len(X_tensor))
val_size = int(0.15 * len(X_tensor))
test_size = len(X_tensor) - train_size - val_size

train_data, val_data, test_data = torch.utils.data.random_split(
    torch.utils.data.TensorDataset(X_tensor, y_tensor), [train_size, val_size, test_size]
)

In [109]:
train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
val_loader = DataLoader(val_data, batch_size=64, shuffle=False)
test_loader = DataLoader(test_data, batch_size=64, shuffle=False)

#LSTM Model

In [110]:
class StrokeLSTM(nn.Module):
    def __init__(self, input_size, hidden_size=128, num_layers=3, dropout=0.3):
        super(StrokeLSTM, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        x = self.fc(lstm_out[:, -1, :])  # Use last LSTM output
        return self.sigmoid(x)

#Model Parameters

In [111]:
# Model parameters
input_size = X_tensor.shape[-1]  # Correcting input size
model = StrokeLSTM(input_size)

# Loss function & optimizer
criterion = nn.BCELoss()
optimizer = optim.AdamW(model.parameters(), lr=0.0005)

In [112]:
print(f"X_tensor shape: {X_tensor.shape}")


X_tensor shape: torch.Size([48178, 3, 15])


#Testing Model

In [113]:
def train_model(model, train_loader, val_loader, criterion, optimizer, epochs=30):
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            y_pred = model(X_batch)
            loss = criterion(y_pred, y_batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        # Validation
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for X_val, y_val in val_loader:
                y_val_pred = model(X_val)
                val_loss += criterion(y_val_pred, y_val).item()

        print(f"Epoch {epoch+1}/{epochs}, Train Loss: {total_loss/len(train_loader):.4f}, Val Loss: {val_loss/len(val_loader):.4f}")

In [114]:
train_model(model, train_loader, val_loader, criterion, optimizer, epochs=30)

Epoch 1/30, Train Loss: 0.6148, Val Loss: 0.5685
Epoch 2/30, Train Loss: 0.5892, Val Loss: 0.5462
Epoch 3/30, Train Loss: 0.5764, Val Loss: 0.5571
Epoch 4/30, Train Loss: 0.5722, Val Loss: 0.5522
Epoch 5/30, Train Loss: 0.5613, Val Loss: 0.5285
Epoch 6/30, Train Loss: 0.5576, Val Loss: 0.5294
Epoch 7/30, Train Loss: 0.5758, Val Loss: 0.5863
Epoch 8/30, Train Loss: 0.5658, Val Loss: 0.5582
Epoch 9/30, Train Loss: 0.5592, Val Loss: 0.5283
Epoch 10/30, Train Loss: 0.5767, Val Loss: 0.5653
Epoch 11/30, Train Loss: 0.5582, Val Loss: 0.5288
Epoch 12/30, Train Loss: 0.5544, Val Loss: 0.5231
Epoch 13/30, Train Loss: 0.5606, Val Loss: 0.5528
Epoch 14/30, Train Loss: 0.5553, Val Loss: 0.5325
Epoch 15/30, Train Loss: 0.5544, Val Loss: 0.5322
Epoch 16/30, Train Loss: 0.5408, Val Loss: 0.5396
Epoch 17/30, Train Loss: 0.5246, Val Loss: 0.4498
Epoch 18/30, Train Loss: 0.4896, Val Loss: 0.4369
Epoch 19/30, Train Loss: 0.4821, Val Loss: 0.4680
Epoch 20/30, Train Loss: 0.4680, Val Loss: 0.4257
Epoch 21/

#Evaluate Model

In [115]:
def evaluate_model(model, test_loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for X_test, y_test in test_loader:
            y_test_pred = model(X_test)
            predicted = (y_test_pred > 0.5).float()
            correct += (predicted == y_test).sum().item()
            total += y_test.size(0)
    print(f'Test Accuracy: {correct / total:.4f}')

evaluate_model(model, test_loader)

Test Accuracy: 0.7864


In [116]:
y_test_pred

NameError: name 'y_test_pred' is not defined