This notebook implements a data-driven approach for tuning PID controller parameters (Kp, Ki, Kd) using an LSTM neural network for artificial pancreas systems in Type-1 Diabetes care. The workflow includes parsing CGM data from XML, extracting time-series features, training an LSTM model to predict PID gains based on glucose dynamics, and validating the model on real patient data. The final model is saved and can be integrated into a closed-loop insulin delivery system for personalized glucose regulation.

 Import Libraries

In [1]:
import os
import glob
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import LSTM, Dense, Input
import matplotlib.pyplot as plt
from datetime import datetime

Load and Validate XML Data

In [5]:
# Collect every dataset XML file, split by filename suffix:
#   training files end with '-ws-training.xml' and are read from data/train/
#   testing files end with '-ws-testing.xml' and are read from data/test/
train_files = sorted(glob.glob("data/train/*-ws-training.xml"))
test_files = sorted(glob.glob("data/test/*-ws-testing.xml"))

# An empty match almost always means a wrong dataset path; report it here
# rather than failing later when the parsed frames are concatenated.
for split_name, split_files in (("training", train_files), ("testing", test_files)):
    if not split_files:
        print(f"⚠️ No {split_name} XML files found - check the dataset path")

# Check for corrupted XML files:
# try to parse each training and testing file and report whether it is well-formed.
for file in train_files + test_files:
    try:
        ET.parse(file)
        print(f"✅ {file} is valid")
    except ET.ParseError as e:
        print(f"❌ Error in {file}: {e}")

✅ /content/drive/MyDrive/GP PID/dataset/559-ws-training.xml is valid
✅ /content/drive/MyDrive/GP PID/dataset/563-ws-training.xml is valid
✅ /content/drive/MyDrive/GP PID/dataset/570-ws-training.xml is valid
✅ /content/drive/MyDrive/GP PID/dataset/575-ws-training.xml is valid
✅ /content/drive/MyDrive/GP PID/dataset/588-ws-training.xml is valid
✅ /content/drive/MyDrive/GP PID/dataset/591-ws-training.xml is valid
✅ /content/drive/MyDrive/GP PID/dataset/559-ws-testing.xml is valid
✅ /content/drive/MyDrive/GP PID/dataset/563-ws-testing.xml is valid
✅ /content/drive/MyDrive/GP PID/dataset/570-ws-testing.xml is valid
✅ /content/drive/MyDrive/GP PID/dataset/575-ws-testing.xml is valid
✅ /content/drive/MyDrive/GP PID/dataset/588-ws-testing.xml is valid
✅ /content/drive/MyDrive/GP PID/dataset/591-ws-testing.xml is valid


Parse XML to DataFrames

In [6]:
def parse_xml(file_path):
    """Parse one patient XML file into a DataFrame of glucose readings.

    Reads the optional ``weight`` attribute of the root element and every child
    of the root's ``glucose_level`` element, taking each child's ``ts``
    (``DD-MM-YYYY HH:MM:SS``) and ``value`` attributes.

    Args:
        file_path: Path of the XML file to read.

    Returns:
        DataFrame with columns ``timestamp``, ``glucose`` and ``weight``, sorted
        by ``timestamp``. It is empty (but still has those columns) when the file
        has no ``glucose_level`` element or that element has no children.

    Raises:
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
        ValueError: If a ``ts`` or ``value`` attribute cannot be converted.
        TypeError: If a reading lacks its ``ts`` or ``value`` attribute.
    """
    root = ET.parse(file_path).getroot()

    # Patient weight is a root attribute; fall back to 0 when it is absent.
    weight = float(root.get('weight', 0))

    # find() returns None when the section is missing; treat that as "no readings".
    # Compare against None explicitly because a childless Element is falsy.
    glucose_section = root.find('glucose_level')
    events = [] if glucose_section is None else list(glucose_section)

    data = [
        {
            'timestamp': datetime.strptime(event.get('ts'), "%d-%m-%Y %H:%M:%S"),
            'glucose': float(event.get('value')),
            'weight': weight,
        }
        for event in events
    ]

    # Explicit columns keep the schema stable (and sortable) even with zero rows.
    df = pd.DataFrame(data, columns=['timestamp', 'glucose', 'weight'])

    # Display parsing result for validation
    print(f"Parsed {file_path}, {len(df)} records")
    print(df.head(10))  # View first 10 rows

    # Return the readings in chronological order
    return df.sort_values('timestamp')

# Build one combined frame per split: parse each file in order and stack the results
train_df = pd.concat(map(parse_xml, train_files), ignore_index=True)
test_df = pd.concat(map(parse_xml, test_files), ignore_index=True)

Parsed /content/drive/MyDrive/GP PID/dataset/559-ws-training.xml, 10796 records
            timestamp  glucose  weight
0 2021-12-07 01:17:00    101.0    99.0
1 2021-12-07 01:22:00     98.0    99.0
2 2021-12-07 01:27:00    104.0    99.0
3 2021-12-07 01:32:00    112.0    99.0
4 2021-12-07 01:37:00    120.0    99.0
5 2021-12-07 01:42:00    127.0    99.0
6 2021-12-07 01:47:00    135.0    99.0
7 2021-12-07 01:52:00    142.0    99.0
8 2021-12-07 01:57:00    140.0    99.0
9 2021-12-07 02:02:00    145.0    99.0
Parsed /content/drive/MyDrive/GP PID/dataset/563-ws-training.xml, 12124 records
            timestamp  glucose  weight
0 2021-09-13 12:33:00    219.0    99.0
1 2021-09-13 12:38:00    229.0    99.0
2 2021-09-13 12:43:00    224.0    99.0
3 2021-09-13 12:48:00    221.0    99.0
4 2021-09-13 12:53:00    215.0    99.0
5 2021-09-13 12:58:00    209.0    99.0
6 2021-09-13 13:03:00    203.0    99.0
7 2021-09-13 13:08:00    199.0    99.0
8 2021-09-13 13:13:00    196.0    99.0
9 2021-09-13 13:18:00

Preprocess Time Features (Cyclic Encoding)

In [7]:
def preprocess_time_features(df):
    """Replace the ``timestamp`` column with a cyclic encoding of the hour of day.

    The hour (0-23) is mapped onto the unit circle so that hours on either side
    of midnight are close together (e.g. 23:00 is next to 00:00). Only the hour
    component is used; minutes within the hour do not affect the encoding.

    The input frame is left unmodified; a new frame is returned.

    Args:
        df: DataFrame with a datetime-typed ``timestamp`` column.

    Returns:
        A new DataFrame holding the remaining input columns followed by
        ``time_sin`` and ``time_cos``; ``timestamp`` (and any ``hour`` / ``minute``
        columns) are dropped.
    """
    hour = df['timestamp'].dt.hour
    angle = 2 * np.pi * hour / 24

    # assign() works on a copy, so the caller's frame gains no extra columns.
    encoded = df.assign(time_sin=np.sin(angle), time_cos=np.cos(angle))

    # 'hour' / 'minute' are only removed if the caller supplied them already.
    return (
        encoded
        .drop(columns=['timestamp'])
        .drop(columns=['hour', 'minute'], errors='ignore')
    )

# Encode time of day for both splits, then cast every column to float32
# (smaller memory footprint and a uniform dtype for the model input)
train_df = preprocess_time_features(train_df).astype(np.float32)
test_df = preprocess_time_features(test_df).astype(np.float32)

Prepare Training Data for LSTM-PID Model

In [8]:
def prepare_pid_training_data(df, sequence_length=30, setpoint=120):
    """Build per-row feature vectors and heuristic PID-gain targets.

    One sample is produced for each of the first ``len(df) - sequence_length`` rows:

    * features: ``[glucose_error, glucose_change, weight, time_sin, time_cos]``,
      where ``glucose_error = glucose - setpoint`` and ``glucose_change`` is the
      difference to the previous row (0 for the first row);
    * targets: ``[Kp, Ki, Kd]`` with ``Kp = 0.05 * log(1 + |error|)``,
      ``Ki = 0.005 * log(1 + |error|)`` and ``Kd = 0.002 * log(1 + |change|)``.

    Note: no sliding windows are assembled here. ``sequence_length`` only sets how
    many trailing rows yield no sample; that is kept so sample counts do not change.

    Args:
        df: DataFrame with ``glucose``, ``weight``, ``time_sin`` and ``time_cos`` columns.
        sequence_length: Number of rows at the end of ``df`` that are left out.
        setpoint: Target glucose level subtracted from each reading (default 120 mg/dL).

    Returns:
        Tuple ``(X_pid, y_pid)`` of arrays shaped ``(n_samples, 5)`` and
        ``(n_samples, 3)``. Both keep their second dimension when ``n_samples`` is 0,
        so callers can still read ``X_pid.shape[1]``.
    """
    n_samples = max(len(df) - sequence_length, 0)
    if n_samples == 0:
        return np.empty((0, 5)), np.empty((0, 3))

    glucose = df['glucose'].to_numpy()[:n_samples]

    # Deviation from the target glucose level
    glucose_error = glucose - setpoint

    # First difference of glucose; the first row has no predecessor, so it is 0
    glucose_change = np.zeros_like(glucose)
    glucose_change[1:] = np.diff(glucose)

    # Log-scaled heuristic gains: Kp and Ki follow |error|, Kd follows |change|
    log_abs_error = np.log(1 + np.abs(glucose_error))
    Kp = 0.05 * log_abs_error
    Ki = 0.005 * log_abs_error
    Kd = 0.002 * np.log(1 + np.abs(glucose_change))

    X_pid = np.column_stack([
        glucose_error,
        glucose_change,
        df['weight'].to_numpy()[:n_samples],
        df['time_sin'].to_numpy()[:n_samples],
        df['time_cos'].to_numpy()[:n_samples],
    ])
    y_pid = np.column_stack([Kp, Ki, Kd])

    return X_pid, y_pid

# Build (features, PID-gain targets) for the training split, then for the testing split
(X_pid_train, y_pid_train), (X_pid_test, y_pid_test) = (
    prepare_pid_training_data(split_df) for split_df in (train_df, test_df)
)

Define and Compile LSTM Model

In [9]:
# Model input: one sample is a (X_pid_train.shape[1], 1) array.
# NOTE(review): X_pid_train is 2-D (samples, feature columns), so the axis Keras
# treats as "timesteps" here is the list of feature columns, each carrying one
# scalar - not a window of consecutive glucose readings. Confirm this is intended.
pid_input = Input(shape=(X_pid_train.shape[1], 1))

# First LSTM layer: 64 units, tanh activation; return_sequences=True emits an
# output per step so the next LSTM layer receives a sequence
pid_lstm = LSTM(64, activation='tanh', return_sequences=True)(pid_input)

# Second LSTM layer: 32 units, tanh activation; emits only its last-step output
pid_lstm = LSTM(32, activation='tanh')(pid_lstm)

# Output layer: 3 linear units, one per predicted PID gain (Kp, Ki, Kd)
pid_output = Dense(3, activation='linear')(pid_lstm)

# Assemble the functional model from the input tensor to the output tensor
pid_model = Model(inputs=pid_input, outputs=pid_output)

# Compile with the Adam optimizer and mean-squared-error loss (regression on the gains)
pid_model.compile(optimizer='adam', loss='mse')

Reshape Input and Train Model

In [10]:
# The model expects 3-D input, so append a trailing axis of length 1 to each
# 2-D feature matrix: (samples, columns) -> (samples, columns, 1)
X_pid_train_reshaped = X_pid_train[:, :, np.newaxis]
X_pid_test_reshaped = X_pid_test[:, :, np.newaxis]

# Fit for 50 epochs in mini-batches of 32, reporting the loss on the test split
# after every epoch
pid_model.fit(
    X_pid_train_reshaped,
    y_pid_train,
    epochs=50,
    batch_size=32,
    validation_data=(X_pid_test_reshaped, y_pid_test),
)

# Persist the trained model to disk for later reuse
pid_model.save("models/nn_pid_tuning_model.h5")

Epoch 1/50
[1m2164/2164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 10ms/step - loss: 0.0016 - val_loss: 7.1634e-06
Epoch 2/50
[1m2164/2164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 9ms/step - loss: 5.6785e-06 - val_loss: 1.5987e-06
Epoch 3/50
[1m2164/2164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 9ms/step - loss: 3.5327e-06 - val_loss: 9.2126e-07
Epoch 4/50
[1m2164/2164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 9ms/step - loss: 2.4086e-06 - val_loss: 2.7863e-06
Epoch 5/50
[1m2164/2164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 9ms/step - loss: 2.0639e-06 - val_loss: 1.2407e-06
Epoch 6/50
[1m2164/2164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 9ms/step - loss: 1.5051e-06 - val_loss: 8.1904e-07
Epoch 7/50
[1m2164/2164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 9ms/step - loss: 1.4886e-06 - val_loss: 2.7864e-06
Epoch 8/50
[1m2164/2164[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 10ms/s

