From 98900a396b0a955fb4801e3c6e38ca5181b7703f Mon Sep 17 00:00:00 2001
From: OasisArtisan <oalama@andrew.cmu.edu>
Date: Thu, 16 Apr 2026 17:52:27 -0400
Subject: [PATCH 01/24] Add airstack image-delete. Also change URDF check to
 consider env variables not just .env

---
 airstack.sh | 47 +++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 45 insertions(+), 2 deletions(-)

diff --git a/airstack.sh b/airstack.sh
index b4c91df08..d5be609a7 100755
--- a/airstack.sh
+++ b/airstack.sh
@@ -712,8 +712,8 @@ function cmd_up {
     local n=0; for s in isaac-sim ms-airsim simple; do [[ ",$p," == *",$s,"* ]] && n=$((n+1)); done
     (( n > 1 )) && log_error "Only one simulator profile can be active at a time (isaac-sim, ms-airsim, simple)." && exit 1
 
-    # Warn if URDF_FILE doesn't match the active simulator
-    local urdf=$(sed -n 's/^URDF_FILE=//p' "$PROJECT_ROOT/.env" 2>/dev/null | tr -d '"')
+    # Warn if URDF_FILE doesn't match the active simulator (env var overrides .env)
+    local urdf="${URDF_FILE:-$(sed -n 's/^URDF_FILE=//p' "$PROJECT_ROOT/.env" 2>/dev/null | tr -d '"')}"
     if [[ -n "$urdf" ]]; then
         [[ ",$p," == *",ms-airsim,"* && "$urdf" != *.ms-airsim.* ]] && log_warn "URDF_FILE ($urdf) does not match ms-airsim profile. Expected *.ms-airsim.* URDF."
         [[ ",$p," == *",isaac-sim,"* && "$urdf" != *.pegasus.* && "$urdf" != *.isaacsim.* ]] && log_warn "URDF_FILE ($urdf) does not match isaac-sim profile. Expected *.pegasus.* or *.isaacsim.* URDF."
@@ -783,6 +783,47 @@ function cmd_images {
     fi
 }
 
+function cmd_image_delete {
+    # Delete every local image whose repository's last path segment is PROJECT_NAME
+    # (read from .env). Args: -y/--yes skips the prompt. Returns 1 if PROJECT_NAME is
+    # unset or `docker rmi` fails, 0 otherwise (including "nothing to do" and abort).
+    check_docker
+    local env_file="$PROJECT_ROOT/.env"
+    local project_name="" refs arg reply auto_yes=false
+    if [ -f "$env_file" ]; then
+        project_name=$(grep -E "^PROJECT_NAME=" "$env_file" | cut -d'=' -f2 | tr -d '"' | tr -d "'")
+    fi
+    if [ -z "$project_name" ]; then
+        log_error "PROJECT_NAME not found in .env; refusing to delete."
+        return 1
+    fi
+
+    # Compare the last repository segment literally (tag stripped) rather than splicing
+    # PROJECT_NAME into a regex, so names containing '.', '+', etc. cannot over-match.
+    refs=$(docker images --format '{{.Repository}}:{{.Tag}}' \
+        | awk -v name="$project_name" '{ r = $0; sub(/:[^:]*$/, "", r); n = split(r, seg, "/"); if (seg[n] == name) print }' || true)
+
+    if [ -z "$refs" ]; then
+        log_info "No images found matching project: $project_name"
+        return 0
+    fi
+
+    log_info "The following images will be deleted:"
+    printf '%s\n' "$refs" | sed 's/^/  /'
+
+    for arg in "$@"; do
+        [[ "$arg" == "-y" || "$arg" == "--yes" ]] && auto_yes=true
+    done
+    if [[ "$auto_yes" != true ]]; then
+        read -r -p "Delete these images? [y/N] " reply
+        [[ "$reply" =~ ^[Yy]$ ]] || { log_info "Aborted."; return 0; }
+    fi
+
+    # Surface a failed removal (e.g. image in use) instead of reporting success.
+    printf '%s\n' "$refs" | xargs -r docker rmi -f || { log_error "Failed to delete one or more images."; return 1; }
+    log_info "Done."
+}
+
 function cmd_down {
     check_docker
     
@@ -1121,6 +1162,7 @@ function register_builtin_commands {
     COMMANDS["image-push"]="cmd_image_push"
     COMMANDS["image-pull"]="cmd_image_pull"
     COMMANDS["images"]="cmd_images"
+    COMMANDS["image-delete"]="cmd_image_delete"
     COMMANDS["up"]="cmd_up"
     COMMANDS["down"]="cmd_down"
     COMMANDS["clean"]="cmd_clean"
@@ -1138,6 +1180,7 @@ function register_builtin_commands {
     COMMAND_HELP["image-push"]="Push Docker Compose service images to a registry"
     COMMAND_HELP["image-pull"]="Pull Docker Compose service images from a registry"
     COMMAND_HELP["images"]="List Docker images filtered by PROJECT_NAME from .env"
+    COMMAND_HELP["image-delete"]="Delete all Docker images matching PROJECT_NAME (prompts unless -y)"
     COMMAND_HELP["up"]="Start services using Docker Compose"
     COMMAND_HELP["down"]="down services"
     COMMAND_HELP["clean"]="Remove all ROS 2 build artifacts (build/, install/, log/)"

From a07950c699656e53a54fcb905490148f2eac2b5a Mon Sep 17 00:00:00 2001
From: OasisArtisan <oalama@andrew.cmu.edu>
Date: Thu, 16 Apr 2026 17:53:33 -0400
Subject: [PATCH 02/24] Add delay before starting PX4 for airsim. This
 allows PX4 to converge to the correct location at startup. A more elegant
 solution is needed.

Also added tmux plugin to airsim tmux windows
---
 .env                                          |  7 ++--
 simulation/ms-airsim/docker/Dockerfile        |  3 ++
 .../ms-airsim/docker/docker-compose.yaml      |  1 +
 simulation/ms-airsim/docker/entrypoint.sh     | 41 +++++++++++++------
 4 files changed, 36 insertions(+), 16 deletions(-)

diff --git a/.env b/.env
index 23c886d92..584dcfb09 100644
--- a/.env
+++ b/.env
@@ -11,7 +11,8 @@
 PROJECT_NAME="airstack"
 # If you've run ./airstack.sh setup, then this will auto-generate from the git commit hash every time a change is made 
 # to a Dockerfile or docker-compose.yaml file. Otherwise this can also be set explicitly to make a release version.
-VERSION="0.18.0-alpha.4"
+# auto-generated from git commit hash
+VERSION="98900a39"
 # Choose "dev" or "prebuilt". "dev" is for mounted code that must be built live. "prebuilt" is for built ros_ws baked into the image
 DOCKER_IMAGE_BUILD_MODE="dev"  
 # Where to push and pull images from. Can replace with your docker hub username if using docker hub.
@@ -37,8 +38,8 @@ PLAY_SIM_ON_START="false"
 # ===============================================
 
 # ================= MS-AIRSIM =====================
-MS_AIRSIM_ENV_DIR="../environments"
-MS_AIRSIM_BINARY_PATH="/ms-airsim-env/AbandonedPark/LinuxNoEditor/AbandonedPark.sh"
+MS_AIRSIM_ENV_DIR="/home/oalama/Desktop/Airsim/Environments/AirSimNH/LinuxNoEditor"
+MS_AIRSIM_BINARY_PATH="/ms-airsim-env/AirSimNH.sh"
 # =================================================
 
 # ================= ROBOT =====================
diff --git a/simulation/ms-airsim/docker/Dockerfile b/simulation/ms-airsim/docker/Dockerfile
index ff41256f6..dcb2c4537 100644
--- a/simulation/ms-airsim/docker/Dockerfile
+++ b/simulation/ms-airsim/docker/Dockerfile
@@ -37,6 +37,9 @@ RUN useradd -m -s /bin/bash -G sudo ms-airsim && \
 
 RUN echo "source /opt/ros/jazzy/setup.bash" >> /root/.bashrc
 
+# tmux plugin manager (tpm)
+RUN git clone --depth 1 https://github.com/tmux-plugins/tpm /root/.tmux/plugins/tpm
+
 COPY entrypoint.sh /root/entrypoint.sh
 RUN chmod +x /root/entrypoint.sh
 
diff --git a/simulation/ms-airsim/docker/docker-compose.yaml b/simulation/ms-airsim/docker/docker-compose.yaml
index 2058d3651..7db6642a3 100644
--- a/simulation/ms-airsim/docker/docker-compose.yaml
+++ b/simulation/ms-airsim/docker/docker-compose.yaml
@@ -40,3 +40,4 @@ services:
       - ../ros_ws:/root/ros_ws:rw
       - ${MS_AIRSIM_ENV_DIR:-../environments}:/ms-airsim-env:rw
       - /usr/share/vulkan/icd.d/nvidia_icd.json:/usr/share/vulkan/icd.d/nvidia_icd.json:ro
+      - ../../../common/.tmux.conf:/root/.tmux.conf:rw
diff --git a/simulation/ms-airsim/docker/entrypoint.sh b/simulation/ms-airsim/docker/entrypoint.sh
index fa61d6d39..3f3a1f1ce 100755
--- a/simulation/ms-airsim/docker/entrypoint.sh
+++ b/simulation/ms-airsim/docker/entrypoint.sh
@@ -4,19 +4,38 @@ set -e
 NUM_ROBOTS="${NUM_ROBOTS:-1}"
 MS_AIRSIM_BINARY_PATH="${MS_AIRSIM_BINARY_PATH:-/ms-airsim-env/LinuxNoEditor/Blocks.sh}"
 MS_AIRSIM_HEADLESS="${MS_AIRSIM_HEADLESS:-false}"
+# Seconds to let AirSim sensors settle before PX4 starts its EKF.
+# Too short → PX4 snapshots a bad local origin (altitude offset).
+MS_AIRSIM_PX4_START_DELAY="${MS_AIRSIM_PX4_START_DELAY:-3}"
 
 # Generate settings.json from template
 python3 /home/ms-airsim/Documents/AirSim/generate_settings.py
 
 # Start tmux session
-tmux new -d -s ms-airsim
+tmux new -d -s ms-airsim -n airsim
+
+# Drop the keyboard-tips status-right from common/.tmux.conf so the centered
+# window list (airsim, robot_<i>_px4, robot_<i>_bridge) has room to breathe.
+tmux set-option -t ms-airsim status-right ''
 
 # Launch Microsoft AirSim (legacy) UE4 binary (optionally headless via -RenderOffScreen)
 UE4_FLAGS=""
 if [ "$MS_AIRSIM_HEADLESS" = "true" ]; then
     UE4_FLAGS="-RenderOffScreen -nosound"
 fi
-tmux send-keys -t ms-airsim "sudo -u ms-airsim $MS_AIRSIM_BINARY_PATH $UE4_FLAGS" ENTER
+tmux send-keys -t ms-airsim:airsim \
+    "sudo -u ms-airsim $MS_AIRSIM_BINARY_PATH $UE4_FLAGS" ENTER
+
+# Build ROS workspace
+cd /root/ros_ws && colcon build --symlink-install
+
+# Launch bridge nodes — one window per robot, named robot_<i>_bridge
+for i in $(seq 1 "$NUM_ROBOTS"); do
+    window="robot_${i}_bridge"
+    tmux new-window -t ms-airsim -n "$window"
+    tmux send-keys -t "ms-airsim:$window" \
+        "source /root/ros_ws/install/setup.bash && ROS_DOMAIN_ID=$i ros2 run ms_airsim_ros_bridge bridge_node --ros-args -p robot_name:=robot_$i" ENTER
+done
 
 # Wait for AirSim API to be ready
 python3 -c "
@@ -31,22 +50,18 @@ while True:
 print('AirSim ready')
 "
 
-# Launch PX4 SITL instances
+echo "Waiting ${MS_AIRSIM_PX4_START_DELAY}s for AirSim sensors to settle..."
+sleep "$MS_AIRSIM_PX4_START_DELAY"
+
+# Launch PX4 SITL instances — one window per robot, named robot_<i>_px4
 for i in $(seq 1 "$NUM_ROBOTS"); do
     mkdir -p "/root/px4_instance_$i"
-    tmux new-window -t ms-airsim
-    tmux send-keys -t ms-airsim \
+    window="robot_${i}_px4"
+    tmux new-window -t ms-airsim -n "$window"
+    tmux send-keys -t "ms-airsim:$window" \
         "cd /root/px4_instance_$i && PX4_SIM_MODEL=none_iris /root/PX4-Autopilot/build/px4_sitl_default/bin/px4 /root/PX4-Autopilot/ROMFS/px4fmu_common -s /root/PX4-Autopilot/ROMFS/px4fmu_common/init.d-posix/rcS -i $i" ENTER
 done
 
-# Build ROS workspace
-cd /root/ros_ws && colcon build --symlink-install
 
-# Launch bridge nodes (one per robot, each on its own ROS domain)
-for i in $(seq 1 "$NUM_ROBOTS"); do
-    tmux new-window -t ms-airsim
-    tmux send-keys -t ms-airsim \
-        "source /root/ros_ws/install/setup.bash && ROS_DOMAIN_ID=$i ros2 run ms_airsim_ros_bridge bridge_node --ros-args -p robot_name:=robot_$i" ENTER
-done
 
 sleep infinity

From 98428762e03bfff21dbb1111b7e0c021fe848f56 Mon Sep 17 00:00:00 2001
From: OasisArtisan <oalama@andrew.cmu.edu>
Date: Fri, 17 Apr 2026 08:56:52 -0400
Subject: [PATCH 03/24] Initial docker build and package build tests

---
 .airstack/modules/dev.sh                  | 40 +++++------
 .env                                      |  2 +-
 .gitignore                                |  3 +
 simulation/ms-airsim/docker/entrypoint.sh | 62 ++++++++---------
 tests/conftest.py                         | 84 +++++++++++++++++++++++
 tests/docker/Dockerfile                   | 16 +++++
 tests/docker/docker-compose.yaml          | 13 ++++
 tests/pytest.ini                          | 10 +++
 tests/requirements.txt                    |  2 +
 tests/test_build_docker.py                | 29 ++++++++
 tests/test_build_packages.py              | 63 +++++++++++++++++
 11 files changed, 268 insertions(+), 56 deletions(-)
 create mode 100644 tests/conftest.py
 create mode 100644 tests/docker/Dockerfile
 create mode 100644 tests/docker/docker-compose.yaml
 create mode 100644 tests/pytest.ini
 create mode 100644 tests/requirements.txt
 create mode 100644 tests/test_build_docker.py
 create mode 100644 tests/test_build_packages.py

diff --git a/.airstack/modules/dev.sh b/.airstack/modules/dev.sh
index 89599111d..f6f12e4b6 100644
--- a/.airstack/modules/dev.sh
+++ b/.airstack/modules/dev.sh
@@ -3,32 +3,24 @@
 # dev.sh - Development-related commands for AirStack
 # This module provides commands for development tasks
 
-# Function to run tests
+# Function to run tests via the dockerized test runner.
+# Usage: airstack test                        — run all tests
+#        airstack test build_packages         — run one marker
+#        airstack test build_docker build_packages — multiple markers
 function cmd_dev_test {
-    log_info "Running tests..."
-    
-    local test_path="$PROJECT_ROOT/tests"
-    local test_filter=""
-    
-    # Parse arguments
-    for arg in "$@"; do
-        if [[ "$arg" == --path=* ]]; then
-            test_path="${arg#--path=}"
-        elif [[ "$arg" == --filter=* ]]; then
-            test_filter="${arg#--filter=}"
-        fi
-    done
-    
-    if [ -n "$test_filter" ]; then
-        log_info "Running tests matching '$test_filter' in $test_path"
-        # Add your test command here, e.g.:
-        # pytest "$test_path" -k "$test_filter"
-        echo "Test command would run here with filter: $test_filter"
+    check_docker
+    local compose_file="$PROJECT_ROOT/tests/docker/docker-compose.yaml"
+    local markers=("$@")
+
+    export AIRSTACK_PATH="$PROJECT_ROOT"
+    docker compose -f "$compose_file" build --quiet || return
+
+    if [ ${#markers[@]} -eq 0 ]; then
+        docker compose -f "$compose_file" run --rm test pytest
     else
-        log_info "Running all tests in $test_path"
-        # Add your test command here, e.g.:
-        # pytest "$test_path"
-        echo "Test command would run here"
+        # "${markers[*]}" joins on IFS's first char only (a space), so build "a or b" by hand.
+        local marker; printf -v marker ' or %s' "${markers[@]}"
+        docker compose -f "$compose_file" run --rm test pytest -m "${marker# or }"
     fi
 }
 
diff --git a/.env b/.env
index 584dcfb09..28ac1935f 100644
--- a/.env
+++ b/.env
@@ -12,7 +12,7 @@ PROJECT_NAME="airstack"
 # If you've run ./airstack.sh setup, then this will auto-generate from the git commit hash every time a change is made 
 # to a Dockerfile or docker-compose.yaml file. Otherwise this can also be set explicitly to make a release version.
 # auto-generated from git commit hash
-VERSION="98900a39"
+VERSION="a07950c6"
 # Choose "dev" or "prebuilt". "dev" is for mounted code that must be built live. "prebuilt" is for built ros_ws baked into the image
 DOCKER_IMAGE_BUILD_MODE="dev"  
 # Where to push and pull images from. Can replace with your docker hub username if using docker hub.
diff --git a/.gitignore b/.gitignore
index cdb047ca7..2f9282334 100644
--- a/.gitignore
+++ b/.gitignore
@@ -75,3 +75,6 @@ simulation/isaac-sim/launch_scripts/prepare_scene.py
 
 # Generated Microsoft AirSim (legacy) config
 simulation/ms-airsim/config/settings.json
+
+# Test results
+tests/results/
diff --git a/simulation/ms-airsim/docker/entrypoint.sh b/simulation/ms-airsim/docker/entrypoint.sh
index 3f3a1f1ce..20403607d 100755
--- a/simulation/ms-airsim/docker/entrypoint.sh
+++ b/simulation/ms-airsim/docker/entrypoint.sh
@@ -2,6 +2,7 @@
 set -e
 
 NUM_ROBOTS="${NUM_ROBOTS:-1}"
+AUTOLAUNCH="${AUTOLAUNCH:-true}"
 MS_AIRSIM_BINARY_PATH="${MS_AIRSIM_BINARY_PATH:-/ms-airsim-env/LinuxNoEditor/Blocks.sh}"
 MS_AIRSIM_HEADLESS="${MS_AIRSIM_HEADLESS:-false}"
 # Seconds to let AirSim sensors settle before PX4 starts its EKF.
@@ -18,27 +19,27 @@ tmux new -d -s ms-airsim -n airsim
 # window list (airsim, robot_<i>_px4, robot_<i>_bridge) has room to breathe.
 tmux set-option -t ms-airsim status-right ''
 
-# Launch Microsoft AirSim (legacy) UE4 binary (optionally headless via -RenderOffScreen)
-UE4_FLAGS=""
-if [ "$MS_AIRSIM_HEADLESS" = "true" ]; then
-    UE4_FLAGS="-RenderOffScreen -nosound"
-fi
-tmux send-keys -t ms-airsim:airsim \
-    "sudo -u ms-airsim $MS_AIRSIM_BINARY_PATH $UE4_FLAGS" ENTER
-
-# Build ROS workspace
-cd /root/ros_ws && colcon build --symlink-install
+if [ "$AUTOLAUNCH" = "true" ]; then
+    # Build ROS workspace
+    cd /root/ros_ws && colcon build --symlink-install
+    # Launch Microsoft AirSim (legacy) UE4 binary (optionally headless)
+    UE4_FLAGS=""
+    if [ "$MS_AIRSIM_HEADLESS" = "true" ]; then
+        UE4_FLAGS="-RenderOffScreen -nosound"
+    fi
+    tmux send-keys -t ms-airsim:airsim \
+        "sudo -u ms-airsim $MS_AIRSIM_BINARY_PATH $UE4_FLAGS" ENTER
 
-# Launch bridge nodes — one window per robot, named robot_<i>_bridge
-for i in $(seq 1 "$NUM_ROBOTS"); do
-    window="robot_${i}_bridge"
-    tmux new-window -t ms-airsim -n "$window"
-    tmux send-keys -t "ms-airsim:$window" \
-        "source /root/ros_ws/install/setup.bash && ROS_DOMAIN_ID=$i ros2 run ms_airsim_ros_bridge bridge_node --ros-args -p robot_name:=robot_$i" ENTER
-done
+    # Launch bridge nodes — one window per robot, named robot_<i>_bridge
+    for i in $(seq 1 "$NUM_ROBOTS"); do
+        window="robot_${i}_bridge"
+        tmux new-window -t ms-airsim -n "$window"
+        tmux send-keys -t "ms-airsim:$window" \
+            "source /root/ros_ws/install/setup.bash && ROS_DOMAIN_ID=$i ros2 run ms_airsim_ros_bridge bridge_node --ros-args -p robot_name:=robot_$i" ENTER
+    done
 
-# Wait for AirSim API to be ready
-python3 -c "
+    # Wait for AirSim API to be ready
+    python3 -c "
 import airsim, time
 while True:
     try:
@@ -50,18 +51,17 @@ while True:
 print('AirSim ready')
 "
 
-echo "Waiting ${MS_AIRSIM_PX4_START_DELAY}s for AirSim sensors to settle..."
-sleep "$MS_AIRSIM_PX4_START_DELAY"
-
-# Launch PX4 SITL instances — one window per robot, named robot_<i>_px4
-for i in $(seq 1 "$NUM_ROBOTS"); do
-    mkdir -p "/root/px4_instance_$i"
-    window="robot_${i}_px4"
-    tmux new-window -t ms-airsim -n "$window"
-    tmux send-keys -t "ms-airsim:$window" \
-        "cd /root/px4_instance_$i && PX4_SIM_MODEL=none_iris /root/PX4-Autopilot/build/px4_sitl_default/bin/px4 /root/PX4-Autopilot/ROMFS/px4fmu_common -s /root/PX4-Autopilot/ROMFS/px4fmu_common/init.d-posix/rcS -i $i" ENTER
-done
-
+    echo "Waiting ${MS_AIRSIM_PX4_START_DELAY}s for AirSim sensors to settle..."
+    sleep "$MS_AIRSIM_PX4_START_DELAY"
 
+    # Launch PX4 SITL instances — one window per robot, named robot_<i>_px4
+    for i in $(seq 1 "$NUM_ROBOTS"); do
+        mkdir -p "/root/px4_instance_$i"
+        window="robot_${i}_px4"
+        tmux new-window -t ms-airsim -n "$window"
+        tmux send-keys -t "ms-airsim:$window" \
+            "cd /root/px4_instance_$i && PX4_SIM_MODEL=none_iris /root/PX4-Autopilot/build/px4_sitl_default/bin/px4 /root/PX4-Autopilot/ROMFS/px4fmu_common -s /root/PX4-Autopilot/ROMFS/px4fmu_common/init.d-posix/rcS -i $i" ENTER
+    done
+fi
 
 sleep infinity
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 000000000..7624601a2
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,84 @@
+import inspect
+import os
+import subprocess
+import time
+from datetime import datetime
+from pathlib import Path
+
+AIRSTACK_ROOT = os.environ.get("AIRSTACK_ROOT", str(Path(__file__).parent.parent))
+RUN_DIR = None
+LOGS_DIR = None
+
+
+def pytest_configure(config):
+    global RUN_DIR, LOGS_DIR
+    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+    results_root = Path(AIRSTACK_ROOT) / "tests" / "results"
+    RUN_DIR = results_root / timestamp
+    LOGS_DIR = RUN_DIR / "logs"
+    LOGS_DIR.mkdir(parents=True, exist_ok=True)
+    config.option.xmlpath = str(RUN_DIR / "results.xml")
+
+
+# ── helpers ─────────────────────────────────────────────────────────────────
+
+def log_name():
+    frame = inspect.stack()[1]
+    file = Path(frame.filename).stem
+    func = frame.function
+    return f"{file}.{func}"
+
+
+def read_log_tail(log_name, lines=50):
+    log_path = LOGS_DIR / f"{log_name}.log"
+    if log_path.exists():
+        all_lines = log_path.read_text().splitlines()
+        return "\n".join(all_lines[-lines:])
+    return ""
+
+
+def docker_exec(container, cmd, timeout=60, domain_id=None, log_name=None):
+    if domain_id is not None:
+        cmd = f"export ROS_DOMAIN_ID={domain_id} && {cmd}"
+    full_cmd = ["docker", "exec", container, "bash", "-c", cmd]
+    if log_name:
+        with open(LOGS_DIR / f"{log_name}.log", "a") as log:
+            return subprocess.run(full_cmd, stdout=log, stderr=log, text=True, timeout=timeout)
+    return subprocess.run(full_cmd, capture_output=True, text=True, timeout=timeout)
+
+
+def airstack_cmd(*args, env_overrides=None, timeout=1800, log_name=None):
+    env = os.environ.copy()
+    if env_overrides:
+        env.update(env_overrides)
+    cmd = [str(Path(AIRSTACK_ROOT) / "airstack.sh")] + list(args)
+    if log_name:
+        with open(LOGS_DIR / f"{log_name}.log", "a") as log:
+            return subprocess.run(cmd, stdout=log, stderr=log, text=True,
+                                  timeout=timeout, cwd=AIRSTACK_ROOT, env=env)
+    return subprocess.run(cmd, capture_output=True, text=True,
+                          timeout=timeout, cwd=AIRSTACK_ROOT, env=env)
+
+
+def find_container(name_pattern):
+    result = subprocess.run(
+        ["docker", "ps", "--filter", f"name={name_pattern}", "--format", "{{.Names}}"],
+        capture_output=True, text=True,
+    )
+    names = result.stdout.strip().splitlines()
+    return names[0] if names else None
+
+
+def wait_for_container(name_pattern, timeout=120):
+    deadline = time.time() + timeout
+    while time.time() < deadline:
+        name = find_container(name_pattern)
+        if name:
+            result = subprocess.run(
+                ["docker", "inspect", "-f", "{{.State.Running}}", name],
+                capture_output=True, text=True,
+            )
+            if "true" in result.stdout:
+                return name
+        time.sleep(5)
+    raise TimeoutError(f"Container matching '{name_pattern}' not running after {timeout}s")
diff --git a/tests/docker/Dockerfile b/tests/docker/Dockerfile
new file mode 100644
index 000000000..41a6a0d5f
--- /dev/null
+++ b/tests/docker/Dockerfile
@@ -0,0 +1,16 @@
+FROM python:3.12-slim
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends ca-certificates curl gnupg && \
+    install -m 0755 -d /etc/apt/keyrings && \
+    curl -fsSL https://download.docker.com/linux/debian/gpg | gpg --dearmor -o /etc/apt/keyrings/docker.gpg && \
+    echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/debian bookworm stable" \
+      > /etc/apt/sources.list.d/docker.list && \
+    apt-get update && \
+    apt-get install -y --no-install-recommends docker-ce-cli docker-compose-plugin && \
+    rm -rf /var/lib/apt/lists/*
+
+COPY requirements.txt /tmp/
+RUN pip install --no-cache-dir -r /tmp/requirements.txt
+
+WORKDIR /tests
diff --git a/tests/docker/docker-compose.yaml b/tests/docker/docker-compose.yaml
new file mode 100644
index 000000000..1de991ed0
--- /dev/null
+++ b/tests/docker/docker-compose.yaml
@@ -0,0 +1,13 @@
+services:
+  test:
+    build:
+      context: ../
+      dockerfile: docker/Dockerfile
+    volumes:
+      - /var/run/docker.sock:/var/run/docker.sock
+      - ${AIRSTACK_PATH}:${AIRSTACK_PATH}:ro
+      - ${AIRSTACK_PATH}/tests/results:${AIRSTACK_PATH}/tests/results
+    environment:
+      - AIRSTACK_ROOT=${AIRSTACK_PATH}
+    working_dir: ${AIRSTACK_PATH}/tests
+    network_mode: host
diff --git a/tests/pytest.ini b/tests/pytest.ini
new file mode 100644
index 000000000..022358c02
--- /dev/null
+++ b/tests/pytest.ini
@@ -0,0 +1,10 @@
+[pytest]
+markers =
+    build_docker: Docker image build tests
+    build_packages: Colcon workspace build tests
+    liveliness: Container health and ROS2 node presence
+    comms: Cross-container ROS2 communication
+    takeoff: Takeoff/land scenario tests
+testpaths = .
+addopts = -v --durations=0
+cache_dir = /tmp/.pytest_cache
diff --git a/tests/requirements.txt b/tests/requirements.txt
new file mode 100644
index 000000000..a9f5ceeac
--- /dev/null
+++ b/tests/requirements.txt
@@ -0,0 +1,2 @@
+pytest
+pytest-timeout
diff --git a/tests/test_build_docker.py b/tests/test_build_docker.py
new file mode 100644
index 000000000..15a9fbd33
--- /dev/null
+++ b/tests/test_build_docker.py
@@ -0,0 +1,29 @@
+import pytest
+from conftest import airstack_cmd, log_name, read_log_tail
+
+
+@pytest.mark.build_docker
+@pytest.mark.timeout(3600)
+class TestDockerBuilds:
+
+    def test_build_robot_desktop(self):
+        log = log_name()
+        result = airstack_cmd("image-build", "robot-desktop", timeout=3600, log_name=log)
+        assert result.returncode == 0, f"robot-desktop build failed (exit {result.returncode}):\n{read_log_tail(log)}"
+
+    def test_build_gcs(self):
+        log = log_name()
+        result = airstack_cmd("image-build", "gcs", timeout=3600, log_name=log)
+        assert result.returncode == 0, f"gcs build failed (exit {result.returncode}):\n{read_log_tail(log)}"
+
+    def test_build_isaac_sim(self):
+        log = log_name()
+        result = airstack_cmd("image-build", "isaac-sim", timeout=3600, log_name=log)
+        assert result.returncode == 0, f"isaac-sim build failed (exit {result.returncode}):\n{read_log_tail(log)}"
+
+    def test_build_ms_airsim(self):
+        log = log_name()
+        result = airstack_cmd("image-build", "ms-airsim", timeout=3600, log_name=log)
+        assert result.returncode == 0, f"ms-airsim build failed (exit {result.returncode}):\n{read_log_tail(log)}"
+
+    # TODO: Test other profiles that build their own docker containers
\ No newline at end of file
diff --git a/tests/test_build_packages.py b/tests/test_build_packages.py
new file mode 100644
index 000000000..708f16389
--- /dev/null
+++ b/tests/test_build_packages.py
@@ -0,0 +1,63 @@
+import pytest
+from conftest import airstack_cmd, wait_for_container, docker_exec, log_name, read_log_tail
+
+
+@pytest.mark.build_packages
+@pytest.mark.timeout(1200)
+class TestColconBuilds:
+
+    def test_colcon_build_robot(self):
+        log = log_name()
+        try:
+            result = airstack_cmd("up", "robot-desktop",
+                                  env_overrides={"AUTOLAUNCH": "false", "DISPLAY": ""},
+                                  timeout=120, log_name=log)
+            assert result.returncode == 0, f"airstack up failed:\n{read_log_tail(log)}"
+
+            container = wait_for_container("robot.*desktop", timeout=60)
+            assert container, "Robot container not found"
+
+            build = docker_exec(container, "bash -ic bws", timeout=600, log_name=log)
+            assert build.returncode == 0, f"colcon build failed:\n{read_log_tail(log)}"
+        finally:
+            airstack_cmd("down", log_name=log)
+
+    def test_colcon_build_gcs(self):
+        log = log_name()
+        try:
+            result = airstack_cmd("up", "gcs",
+                                  env_overrides={"AUTOLAUNCH": "false", "DISPLAY": ""},
+                                  timeout=120, log_name=log)
+            assert result.returncode == 0, f"airstack up failed:\n{read_log_tail(log)}"
+
+            container = wait_for_container("gcs", timeout=60)
+            assert container, "GCS container not found"
+
+            build = docker_exec(container, "bash -ic bws", timeout=600, log_name=log)
+            assert build.returncode == 0, f"colcon build failed:\n{read_log_tail(log)}"
+        finally:
+            airstack_cmd("down", log_name=log)
+
+    def test_colcon_build_ms_airsim(self):
+        log = log_name()
+        try:
+            result = airstack_cmd(
+                "up", "ms-airsim",
+                env_overrides={"AUTOLAUNCH": "false", "DISPLAY": "",
+                               "COMPOSE_PROFILES": "ms-airsim",
+                               "URDF_FILE": "robot_descriptions/iris/urdf/iris_stereo.ms-airsim.urdf"},
+                timeout=120, log_name=log,
+            )
+            assert result.returncode == 0, f"airstack up failed:\n{read_log_tail(log)}"
+
+            container = wait_for_container("ms-airsim", timeout=60)
+            assert container, "ms-airsim container not found"
+
+            build = docker_exec(
+                container,
+                "cd /root/ros_ws && colcon build --symlink-install",
+                timeout=600, log_name=log,
+            )
+            assert build.returncode == 0, f"colcon build failed:\n{read_log_tail(log)}"
+        finally:
+            airstack_cmd("down", log_name=log)

From fb79a89307990aadb56f23d9c27a6ec81089125d Mon Sep 17 00:00:00 2001
From: OasisArtisan <oalama@andrew.cmu.edu>
Date: Fri, 17 Apr 2026 10:03:35 -0400
Subject: [PATCH 04/24] Compute docker image sizes and add compare_metrics.py
 to output clean markdown marking regressions

---
 tests/compare_metrics.py   | 149 +++++++++++++++++++++++++++++++++++++
 tests/conftest.py          |  54 ++++++++++++++
 tests/test_build_docker.py |  29 ++++----
 3 files changed, 218 insertions(+), 14 deletions(-)
 create mode 100644 tests/compare_metrics.py

diff --git a/tests/compare_metrics.py b/tests/compare_metrics.py
new file mode 100644
index 000000000..09ba2eb7d
--- /dev/null
+++ b/tests/compare_metrics.py
@@ -0,0 +1,149 @@
+#!/usr/bin/env python3
+"""Compare test metrics between two runs.
+
+Reads results.xml (JUnit XML) for test durations and metrics.json for custom
+metrics (image sizes, etc.). Outputs a markdown table and exits 1 on regression.
+
+Usage:
+    python compare_metrics.py --current tests/results/<run>/ --baseline tests/results/<run>/
+    python compare_metrics.py --current tests/results/<run>/ --baseline tests/results/<run>/ --threshold 30
+"""
+import argparse
+import json
+import sys
+import xml.etree.ElementTree as ET
+from pathlib import Path
+
+
+def parse_results_xml(path):
+    """Map "classname.name" of each JUnit <testcase> in `path` to its duration and status."""
+    metrics = {}
+    for tc in ET.parse(path).iter("testcase"):
+        name = f"{tc.get('classname')}.{tc.get('name')}"
+        # An <error> child (e.g. a setup/teardown crash) is a failed test just like <failure>.
+        failed = tc.find("failure") is not None or tc.find("error") is not None
+        metrics[name] = {
+            "duration_s": {
+                "value": float(tc.get("time", 0)),
+                "unit": "s", "direction": "lower_is_better",
+            },
+            "status": "failed" if failed else "passed",
+        }
+    return metrics
+
+
+def parse_metrics_json(path):
+    if not path.exists():
+        return {}
+    return json.loads(path.read_text())
+
+
+def merge_metrics(run_dir):
+    merged = {}
+    results_xml = run_dir / "results.xml"
+    if results_xml.exists():
+        merged.update(parse_results_xml(results_xml))
+    metrics_json = run_dir / "metrics.json"
+    for test_name, test_metrics in parse_metrics_json(metrics_json).items():
+        if test_name not in merged:
+            merged[test_name] = {}
+        merged[test_name].update(test_metrics)
+    return merged
+
+
+def compare(current, baseline, threshold):
+    rows = []
+    has_regression = False
+
+    all_tests = sorted(set(list(current.keys()) + list(baseline.keys())))
+    for test in all_tests:
+        curr = current.get(test, {})
+        base = baseline.get(test, {})
+
+        # Collect all metric keys (skip 'status')
+        metric_keys = sorted(set(
+            [k for k in curr if k != "status"] +
+            [k for k in base if k != "status"]
+        ))
+        for key in metric_keys:
+            c = curr.get(key)
+            b = base.get(key)
+            if not c or not b:
+                rows.append({
+                    "test": test, "metric": key,
+                    "baseline": f"{b['value']}{b.get('unit', '')}" if b else "—",
+                    "current": f"{c['value']}{c.get('unit', '')}" if c else "—",
+                    "change": "new" if c and not b else "removed",
+                    "flag": "",
+                })
+                continue
+
+            cv, bv = c["value"], b["value"]
+            direction = c.get("direction", "lower_is_better")
+
+            if bv != 0:
+                change_pct = ((cv - bv) / bv) * 100
+            else:
+                change_pct = 0
+
+            # Determine if this is a regression
+            regressed = (direction == "lower_is_better" and change_pct > threshold) or \
+                        (direction == "higher_is_better" and change_pct < -threshold)
+            improved = (direction == "lower_is_better" and change_pct < -threshold) or \
+                       (direction == "higher_is_better" and change_pct > threshold)
+
+            if regressed:
+                has_regression = True
+
+            rows.append({
+                "test": test, "metric": key,
+                "baseline": f"{bv:.1f}{b.get('unit', '')}",
+                "current": f"{cv:.1f}{c.get('unit', '')}",
+                "change": f"{change_pct:+.1f}%",
+                "flag": "regression" if regressed else ("improved" if improved else ""),
+            })
+
+    return rows, has_regression
+
+
+def format_markdown(rows, has_regression):
+    lines = [
+        "| Test | Metric | Baseline | Current | Change |",
+        "|------|--------|----------|---------|--------|",
+    ]
+    for r in rows:
+        change = r["change"]
+        if r["flag"] == "regression":
+            change += " :red_circle:"
+        elif r["flag"] == "improved":
+            change += " :green_circle:"
+        lines.append(f"| {r['test']} | {r['metric']} | {r['baseline']} | {r['current']} | {change} |")
+
+    if has_regression:
+        lines += ["", "**Regression detected** — some metrics exceeded the threshold."]
+
+    return "\n".join(lines)
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Compare test metrics between runs")
+    parser.add_argument("--current", required=True, help="Current run directory")
+    parser.add_argument("--baseline", required=True, help="Baseline run directory")
+    parser.add_argument("--threshold", type=float, default=20, help="Regression threshold (%%)")
+    parser.add_argument("--output", help="Write markdown report to file")
+    args = parser.parse_args()
+
+    current = merge_metrics(Path(args.current))
+    baseline = merge_metrics(Path(args.baseline))
+    rows, has_regression = compare(current, baseline, args.threshold)
+    md = format_markdown(rows, has_regression)
+
+    print(md)
+    if args.output:
+        Path(args.output).write_text(md)
+
+    sys.exit(1 if has_regression else 0)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/conftest.py b/tests/conftest.py
index 7624601a2..ac5f91b21 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,4 +1,5 @@
 import inspect
+import json
 import os
 import subprocess
 import time
@@ -82,3 +83,56 @@ def wait_for_container(name_pattern, timeout=120):
                 return name
         time.sleep(5)
     raise TimeoutError(f"Container matching '{name_pattern}' not running after {timeout}s")
+
+
+def docker_image_size_mb(service, env=None):
+    """Return the size in MB (inspect size / 1_000_000, 1 decimal) of the first compose image whose name contains `service`; None if no image matches or inspect fails. `env` is merged over os.environ for the compose call only."""
+    compose_env = os.environ.copy()
+    if env:
+        compose_env.update(env)
+    # Resolve the image name for this service from compose config
+    result = subprocess.run(
+        ["docker", "compose", "-f", str(Path(AIRSTACK_ROOT) / "docker-compose.yaml"),
+         "config", "--images"],
+        capture_output=True, text=True, cwd=AIRSTACK_ROOT, env=compose_env,
+    )
+    image = None
+    for line in result.stdout.strip().splitlines():
+        if service in line:  # NOTE(review): substring match, first hit wins — confirm no other image name contains `service`
+            image = line.strip()
+            break
+    if not image:
+        return None
+    result = subprocess.run(  # get the image size; the value is treated as bytes below
+        ["docker", "image", "inspect", image, "--format", "{{.Size}}"],
+        capture_output=True, text=True,
+    )
+    if result.returncode == 0 and result.stdout.strip():
+        return round(int(result.stdout.strip()) / 1_000_000, 1)
+    return None
+
+
+# ── metrics ────────────────────────────────────────────────────────────────
+
+class MetricsRecorder:
+    def __init__(self, path):
+        self._path = path
+        self._data = json.loads(path.read_text()) if path.exists() else {}
+
+    def record(self, test_name, key, value, unit="", direction="lower_is_better"):
+        if test_name not in self._data:
+            self._data[test_name] = {}
+        self._data[test_name][key] = {
+            "value": value, "unit": unit, "direction": direction,
+        }
+        self._path.write_text(json.dumps(self._data, indent=2))
+
+
+METRICS = None
+
+
+def get_metrics():
+    global METRICS
+    if METRICS is None:
+        METRICS = MetricsRecorder(RUN_DIR / "metrics.json")
+    return METRICS
diff --git a/tests/test_build_docker.py b/tests/test_build_docker.py
index 15a9fbd33..8105d939e 100644
--- a/tests/test_build_docker.py
+++ b/tests/test_build_docker.py
@@ -1,29 +1,30 @@
 import pytest
-from conftest import airstack_cmd, log_name, read_log_tail
+from conftest import airstack_cmd, log_name, read_log_tail, docker_image_size_mb, get_metrics
 
 
 @pytest.mark.build_docker
 @pytest.mark.timeout(3600)
 class TestDockerBuilds:
 
-    def test_build_robot_desktop(self):
+    def _build_and_record(self, service, env=None):
         log = log_name()
-        result = airstack_cmd("image-build", "robot-desktop", timeout=3600, log_name=log)
-        assert result.returncode == 0, f"robot-desktop build failed (exit {result.returncode}):\n{read_log_tail(log)}"
+        result = airstack_cmd("image-build", service, timeout=3600, log_name=log)
+        assert result.returncode == 0, f"{service} build failed (exit {result.returncode}):\n{read_log_tail(log)}"
+
+        size = docker_image_size_mb(service, env=env)
+        if size is not None:
+            get_metrics().record(f"docker.{service}", "image_size_mb", size, unit="MB")
+
+    def test_build_robot_desktop(self):
+        self._build_and_record("robot-desktop")
 
     def test_build_gcs(self):
-        log = log_name()
-        result = airstack_cmd("image-build", "gcs", timeout=3600, log_name=log)
-        assert result.returncode == 0, f"gcs build failed (exit {result.returncode}):\n{read_log_tail(log)}"
+        self._build_and_record("gcs")
 
     def test_build_isaac_sim(self):
-        log = log_name()
-        result = airstack_cmd("image-build", "isaac-sim", timeout=3600, log_name=log)
-        assert result.returncode == 0, f"isaac-sim build failed (exit {result.returncode}):\n{read_log_tail(log)}"
+        self._build_and_record("isaac-sim")
 
     def test_build_ms_airsim(self):
-        log = log_name()
-        result = airstack_cmd("image-build", "ms-airsim", timeout=3600, log_name=log)
-        assert result.returncode == 0, f"ms-airsim build failed (exit {result.returncode}):\n{read_log_tail(log)}"
+        self._build_and_record("ms-airsim", env={"COMPOSE_PROFILES": "ms-airsim"})
 
-    # TODO: Test other profiles that build their own docker containers
\ No newline at end of file
+    # TODO: Test other profiles that build their own docker containers

From 0db708ca94dfc8fc22ee9fd546296f60d20b9011 Mon Sep 17 00:00:00 2001
From: OasisArtisan <oalama@andrew.cmu.edu>
Date: Fri, 17 Apr 2026 14:54:22 -0400
Subject: [PATCH 05/24] Auto fetch airsim scene if user didn't specify path.

---
 .env                                          |  5 +-
 .gitignore                                    |  4 ++
 docs/simulation/ms-airsim/docker.md           | 29 +++++------
 docs/simulation/ms-airsim/index.md            | 22 +++++----
 .../ms-airsim/assets/scenes/fetch_scene.sh    | 48 +++++++++++++++++++
 .../ms-airsim/docker/docker-compose.yaml      |  4 +-
 simulation/ms-airsim/docker/entrypoint.sh     | 15 +++++-
 7 files changed, 97 insertions(+), 30 deletions(-)
 create mode 100755 simulation/ms-airsim/assets/scenes/fetch_scene.sh

diff --git a/.env b/.env
index 28ac1935f..87aec262a 100644
--- a/.env
+++ b/.env
@@ -38,8 +38,9 @@ PLAY_SIM_ON_START="false"
 # ===============================================
 
 # ================= MS-AIRSIM =====================
-MS_AIRSIM_ENV_DIR="/home/oalama/Desktop/Airsim/Environments/AirSimNH/LinuxNoEditor"
-MS_AIRSIM_BINARY_PATH="/ms-airsim-env/AirSimNH.sh"
+# Leave MS_AIRSIM_BINARY_PATH unset to have AirStack auto-fetch the default Blocks scene.
+# MS_AIRSIM_ENV_DIR=./simulation/ms-airsim/assets/scenes
+# MS_AIRSIM_BINARY_PATH="/ms-airsim-env/Blocks/LinuxNoEditor/Blocks.sh"
 # =================================================
 
 # ================= ROBOT =====================
diff --git a/.gitignore b/.gitignore
index 2f9282334..afd16bcf4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -76,5 +76,9 @@ simulation/isaac-sim/launch_scripts/prepare_scene.py
 # Generated Microsoft AirSim (legacy) config
 simulation/ms-airsim/config/settings.json
 
+# Downloaded UE4 scene binaries (fetched via assets/scenes/fetch_scene.sh)
+simulation/ms-airsim/assets/scenes/*
+!simulation/ms-airsim/assets/scenes/fetch_scene.sh
+
 # Test results
 tests/results/
diff --git a/docs/simulation/ms-airsim/docker.md b/docs/simulation/ms-airsim/docker.md
index a881a81d8..c76089c17 100644
--- a/docs/simulation/ms-airsim/docker.md
+++ b/docs/simulation/ms-airsim/docker.md
@@ -15,7 +15,8 @@ simulation/ms-airsim/
 │   ├── settings.json.j2      # Jinja2 template for AirSim settings
 │   ├── settings.json         # Generated settings (git-ignored)
 │   └── generate_settings.py  # Settings generator script
-├── environments/             # Pre-built UE4 binaries (git-ignored)
+├── assets/scenes/            # Pre-built UE4 binaries (git-ignored), e.g. Blocks/, AirSimNH/
+│   └── fetch_scene.sh        # Idempotent download + extract helper (tracked)
 └── ros_ws/                   # ROS 2 bridge workspace
     └── src/
         └── ms_airsim_ros_bridge/  # Depth + camera_info bridge node
@@ -56,19 +57,19 @@ The container runs `entrypoint.sh`, which:
 
 1. Generates `settings.json` from the Jinja2 template using current environment variables
 2. Creates a tmux session named `ms-airsim`
-3. Launches the UE4 binary as the `ms-airsim` user (UE4 refuses to run as root)
-4. Waits for the AirSim API to become available (TCP port 41451)
-5. Spawns one PX4 SITL instance per robot, each in its own tmux window
-6. Builds the ROS 2 bridge workspace (`colcon build`)
-7. Launches one bridge node per robot, each with `ROS_DOMAIN_ID=<robot_index>`
+3. Builds the ROS 2 bridge workspace (`colcon build`)
+4. In the `airsim` window: if `MS_AIRSIM_BINARY_PATH` is unset, runs `fetch_scene.sh blocks` to download + extract the default scene, then launches the UE4 binary as the `ms-airsim` user (UE4 refuses to run as root)
+5. Launches one bridge node per robot, each with `ROS_DOMAIN_ID=<robot_index>`
+6. Waits for the AirSim API to become available (TCP port 41451)
+7. Spawns one PX4 SITL instance per robot, each in its own tmux window
 
 ## Environment Variables
 
 | Variable | Default | Description |
 |----------|---------|-------------|
 | `AUTOLAUNCH` | `true` | Auto-start on container launch |
-| `MS_AIRSIM_BINARY_PATH` | `/ms-airsim-env/Blocks.sh` | Path to UE4 binary inside container |
-| `MS_AIRSIM_ENV_DIR` | `../environments` | Host path to extracted UE4 environment |
+| `MS_AIRSIM_BINARY_PATH` | _(unset → auto-fetch Blocks)_ | Path to UE4 binary inside container. If unset, the entrypoint fetches Blocks into the mounted scenes dir and points at it. |
+| `MS_AIRSIM_ENV_DIR` | `../assets/scenes` | Host path to extracted UE4 scenes |
 | `MS_AIRSIM_HEADLESS` | `false` | Run UE4 without a window (`-RenderOffScreen -nosound`) |
 | `NUM_ROBOTS` | `1` | Number of vehicles and PX4 SITL instances |
 | `SIM_IP` | `172.31.0.200` | Simulator IP on `airstack_network` |
@@ -95,8 +96,8 @@ NUM_ROBOTS=2 airstack up --profile ms-airsim
 # Headless (no GUI, uses UE4's -RenderOffScreen)
 MS_AIRSIM_HEADLESS=true airstack up --profile ms-airsim
 
-# Custom environment binary
-MS_AIRSIM_ENV_DIR=/data/airsim_envs MS_AIRSIM_BINARY_PATH=/ms-airsim-env/CityEnviron.sh airstack up --profile ms-airsim
+# Custom scene binary
+MS_AIRSIM_ENV_DIR=/data/airsim_envs MS_AIRSIM_BINARY_PATH=/ms-airsim-env/CityEnviron/LinuxNoEditor/CityEnviron.sh airstack up --profile ms-airsim
 ```
 
 ## Settings Generation
@@ -193,13 +194,9 @@ Mounts `simulation/ms-airsim/config/` to the AirSim config directory. `settings.
 
 Mounts the bridge source so edits on the host are reflected after a rebuild inside the container.
 
-### UE4 Environment
+### UE4 Scene
 
-```yaml
-- ${MS_AIRSIM_ENV_DIR:-../environments}:/ms-airsim-env:rw
-```
-
-Mounts pre-built UE4 binaries. Set `MS_AIRSIM_ENV_DIR` in `.env` to point to the extracted environment directory.
+The scenes directory (`${MS_AIRSIM_ENV_DIR:-../assets/scenes}` on the host) is mounted read-write at `/ms-airsim-env` and holds the pre-built UE4 binaries. If `MS_AIRSIM_BINARY_PATH` is unset, the entrypoint auto-populates this directory with the default Blocks scene on first launch. Run `assets/scenes/fetch_scene.sh` to pre-fetch or pick a different scene, or set `MS_AIRSIM_ENV_DIR` in `.env` to point to an external scenes directory.
 
 ## Accessing the Container
 
diff --git a/docs/simulation/ms-airsim/index.md b/docs/simulation/ms-airsim/index.md
index 537b9953d..4dd559416 100644
--- a/docs/simulation/ms-airsim/index.md
+++ b/docs/simulation/ms-airsim/index.md
@@ -20,17 +20,23 @@ Microsoft AirSim (legacy) provides an alternative simulation backend for AirStac
 
 ## Quick Start
 
-### 1. Download an environment
+### 1. Scene (auto-fetched on first launch)
 
-Download a pre-built environment from the [AirSim Linux releases](https://github.com/microsoft/AirSim/releases/tag/v1.8.1) and extract it into `simulation/ms-airsim/environments/`:
+If `MS_AIRSIM_BINARY_PATH` is unset, the container's entrypoint auto-downloads the Blocks scene (~200 MB) into `simulation/ms-airsim/assets/scenes/Blocks/` on first launch. The download runs inside the `airsim` tmux window, so progress and any errors are visible there.
+
+To pre-fetch (e.g. before CI) or pick a different scene, run the helper directly:
 
 ```bash
-cd simulation/ms-airsim
-mkdir -p environments && cd environments
-wget https://github.com/microsoft/AirSim/releases/download/v1.8.1/AbandonedPark.zip
-unzip AbandonedPark.zip
+./simulation/ms-airsim/assets/scenes/fetch_scene.sh              # blocks (default)
+./simulation/ms-airsim/assets/scenes/fetch_scene.sh airsimnh     # or: abandonedpark, forest,
+                                                                 # landscapemountains, soccerfield,
+                                                                 # building99, zhangjiajie
 ```
 
+To use a scene that isn't one of the presets, extract it yourself into `simulation/ms-airsim/assets/scenes/` and set `MS_AIRSIM_BINARY_PATH` to its `.sh` path inside the container.
+
+Scenes are pulled from the [AirSim Linux releases](https://github.com/microsoft/AirSim/releases/tag/v1.8.1).
+
 ### 2. Launch Microsoft AirSim (legacy) + Robot
 
 ```bash
@@ -136,8 +142,8 @@ Located at `simulation/ms-airsim/ros_ws/src/ms_airsim_ros_bridge/config/bridge.y
 | Variable | Default | Description |
 |----------|---------|-------------|
 | `SIM_IP` | `172.31.0.200` | Simulation container IP |
-| `MS_AIRSIM_ENV_DIR` | `simulation/ms-airsim/environments` | Host path to extracted AirSim environment |
-| `MS_AIRSIM_BINARY_PATH` | `/ms-airsim-env/LinuxNoEditor/Blocks.sh` | Path to binary inside container |
+| `MS_AIRSIM_ENV_DIR` | `simulation/ms-airsim/assets/scenes` | Host path to extracted AirSim scenes |
+| `MS_AIRSIM_BINARY_PATH` | _(unset → auto-fetch Blocks)_ | Path to binary inside container. If unset, the entrypoint fetches Blocks and points at it. |
 
 ## Published ROS 2 Topics
 
diff --git a/simulation/ms-airsim/assets/scenes/fetch_scene.sh b/simulation/ms-airsim/assets/scenes/fetch_scene.sh
new file mode 100755
index 000000000..ba977a57d
--- /dev/null
+++ b/simulation/ms-airsim/assets/scenes/fetch_scene.sh
@@ -0,0 +1,48 @@
+#!/usr/bin/env bash
+# Fetch and extract a Microsoft AirSim pre-built UE4 scene into
+# simulation/ms-airsim/assets/scenes/<Name>/. Idempotent — skips if
+# the scene has already been extracted.
+#
+# Usage: fetch_scene.sh [scene]
+#   scene (default: blocks) — one of:
+#     blocks, airsimnh, abandonedpark, forest,
+#     landscapemountains, soccerfield, building99, zhangjiajie
+
+set -euo pipefail
+
+SCENE="${1:-blocks}"
+SCENES_DIR="${SCENES_DIR:-$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)}"
+RELEASE_URL="https://github.com/microsoft/AirSim/releases/download/v1.8.1"
+
+case "$SCENE" in
+    blocks)             NAME=Blocks;             ZIP=Blocks.zip ;;
+    airsimnh)           NAME=AirSimNH;           ZIP=Neighborhood.zip ;;
+    abandonedpark)      NAME=AbandonedPark;      ZIP=AbandonedPark.zip ;;
+    forest)             NAME=Forest;             ZIP=Forest.zip ;;
+    landscapemountains) NAME=LandscapeMountains; ZIP=LandscapeMountains.zip ;;
+    soccerfield)        NAME=SoccerField;        ZIP=SoccerField.zip ;;
+    building99)         NAME=Building99;         ZIP=Building99.zip ;;
+    zhangjiajie)        NAME=ZhangJiajie;        ZIP=ZhangJiajie.zip ;;
+    *) echo "unknown scene: $SCENE" >&2; exit 2 ;;
+esac
+
+DEST="$SCENES_DIR/$NAME"
+if compgen -G "$DEST/LinuxNoEditor/*.sh" > /dev/null; then
+    echo "$NAME already present in $DEST"
+    exit 0
+fi
+
+mkdir -p "$DEST"
+TMP="$(mktemp -d)"
+trap 'rm -rf "$TMP"' EXIT
+
+echo "Downloading $ZIP..."
+curl -fL "$RELEASE_URL/$ZIP" -o "$TMP/$ZIP"
+echo "Extracting to $DEST..."
+# Zips wrap everything in a single top-level dir (e.g. LinuxBlocks1.8.1/).
+# Extract to a tmp location, then lift the wrapper's contents into $DEST.
+unzip -q "$TMP/$ZIP" -d "$TMP/extract"
+shopt -s dotglob
+mv "$TMP/extract"/*/* "$DEST/"
+find "$DEST/LinuxNoEditor" -maxdepth 1 -name "*.sh" -exec chmod +x {} \;
+echo "Ready: $DEST"
diff --git a/simulation/ms-airsim/docker/docker-compose.yaml b/simulation/ms-airsim/docker/docker-compose.yaml
index 7db6642a3..069e7df5e 100644
--- a/simulation/ms-airsim/docker/docker-compose.yaml
+++ b/simulation/ms-airsim/docker/docker-compose.yaml
@@ -25,7 +25,7 @@ services:
       - XDG_RUNTIME_DIR=/tmp
       - NUM_ROBOTS=${NUM_ROBOTS:-1}
       - AUTOLAUNCH=${AUTOLAUNCH:-true}
-      - MS_AIRSIM_BINARY_PATH=${MS_AIRSIM_BINARY_PATH:-/ms-airsim-env/Blocks.sh}
+      - MS_AIRSIM_BINARY_PATH=${MS_AIRSIM_BINARY_PATH:-}
       - MS_AIRSIM_HEADLESS=${MS_AIRSIM_HEADLESS:-false}
     deploy:
       resources:
@@ -38,6 +38,6 @@ services:
       - /tmp/.X11-unix:/tmp/.X11-unix
       - ../config:/home/ms-airsim/Documents/AirSim:rw
       - ../ros_ws:/root/ros_ws:rw
-      - ${MS_AIRSIM_ENV_DIR:-../environments}:/ms-airsim-env:rw
+      - ${MS_AIRSIM_ENV_DIR:-../assets/scenes}:/ms-airsim-env:rw
       - /usr/share/vulkan/icd.d/nvidia_icd.json:/usr/share/vulkan/icd.d/nvidia_icd.json:ro
       - ../../../common/.tmux.conf:/root/.tmux.conf:rw
diff --git a/simulation/ms-airsim/docker/entrypoint.sh b/simulation/ms-airsim/docker/entrypoint.sh
index 20403607d..8034434a6 100755
--- a/simulation/ms-airsim/docker/entrypoint.sh
+++ b/simulation/ms-airsim/docker/entrypoint.sh
@@ -3,7 +3,6 @@ set -e
 
 NUM_ROBOTS="${NUM_ROBOTS:-1}"
 AUTOLAUNCH="${AUTOLAUNCH:-true}"
-MS_AIRSIM_BINARY_PATH="${MS_AIRSIM_BINARY_PATH:-/ms-airsim-env/LinuxNoEditor/Blocks.sh}"
 MS_AIRSIM_HEADLESS="${MS_AIRSIM_HEADLESS:-false}"
 # Seconds to let AirSim sensors settle before PX4 starts its EKF.
 # Too short → PX4 snapshots a bad local origin (altitude offset).
@@ -19,6 +18,18 @@ tmux new -d -s ms-airsim -n airsim
 # window list (airsim, robot_<i>_px4, robot_<i>_bridge) has room to breathe.
 tmux set-option -t ms-airsim status-right ''
 
+# Scene resolution. If MS_AIRSIM_BINARY_PATH is unset, the airsim tmux window
+# auto-fetches Blocks so the download is visible. If set, the file must exist.
+FETCH_PREFIX=""
+if [ -z "$MS_AIRSIM_BINARY_PATH" ]; then
+    MS_AIRSIM_BINARY_PATH="/ms-airsim-env/Blocks/LinuxNoEditor/Blocks.sh"
+    FETCH_PREFIX="SCENES_DIR=/ms-airsim-env bash /ms-airsim-env/fetch_scene.sh blocks && chown -R ms-airsim:ms-airsim /ms-airsim-env/Blocks && chmod -R a+rwX /ms-airsim-env/Blocks && "
+elif [ ! -f "$MS_AIRSIM_BINARY_PATH" ]; then
+    echo "ERROR: MS_AIRSIM_BINARY_PATH=$MS_AIRSIM_BINARY_PATH does not exist." >&2
+    echo "Extract the scene into the mounted volume, or unset MS_AIRSIM_BINARY_PATH to auto-fetch Blocks." >&2
+    exit 1
+fi
+
 if [ "$AUTOLAUNCH" = "true" ]; then
     # Build ROS workspace
     cd /root/ros_ws && colcon build --symlink-install
@@ -28,7 +39,7 @@ if [ "$AUTOLAUNCH" = "true" ]; then
         UE4_FLAGS="-RenderOffScreen -nosound"
     fi
     tmux send-keys -t ms-airsim:airsim \
-        "sudo -u ms-airsim $MS_AIRSIM_BINARY_PATH $UE4_FLAGS" ENTER
+        "${FETCH_PREFIX}sudo -u ms-airsim $MS_AIRSIM_BINARY_PATH $UE4_FLAGS" ENTER
 
     # Launch bridge nodes — one window per robot, named robot_<i>_bridge
     for i in $(seq 1 "$NUM_ROBOTS"); do

From 1ec194594cdd10c1a4cd0bc78350f6169127eccc Mon Sep 17 00:00:00 2001
From: OasisArtisan <oalama@andrew.cmu.edu>
Date: Fri, 17 Apr 2026 17:39:06 -0400
Subject: [PATCH 06/24] Upgrade pegasus example from two to arbitrary number of
 drones  equally spaced.

---
 ...example_multi_px4_pegasus_launch_script.py | 199 ++++++++++++++
 .../example_two_px4_pegasus_launch_script.py  | 257 ------------------
 2 files changed, 199 insertions(+), 257 deletions(-)
 create mode 100644 simulation/isaac-sim/launch_scripts/example_multi_px4_pegasus_launch_script.py
 delete mode 100755 simulation/isaac-sim/launch_scripts/example_two_px4_pegasus_launch_script.py

diff --git a/simulation/isaac-sim/launch_scripts/example_multi_px4_pegasus_launch_script.py b/simulation/isaac-sim/launch_scripts/example_multi_px4_pegasus_launch_script.py
new file mode 100644
index 000000000..bda5363e7
--- /dev/null
+++ b/simulation/isaac-sim/launch_scripts/example_multi_px4_pegasus_launch_script.py
@@ -0,0 +1,199 @@
+#!/usr/bin/env python
+"""
+Multi-drone PX4 Pegasus launcher, parametrized by NUM_ROBOTS.
+
+Env:
+ - NUM_ROBOTS (default 1): how many drones to spawn
+ - ENABLE_LIDAR (default false): attach an Ouster lidar to each drone
+ - PLAY_SIM_ON_START (default true): autoplay timeline
+"""
+
+import carb
+from isaacsim import SimulationApp
+
+# Must be created before any omni imports
+simulation_app = SimulationApp({"headless": False})
+
+import os
+import sys
+import time
+
+import omni.kit.app
+import omni.timeline
+import omni.usd
+
+from omni.isaac.core.world import World
+
+# Pegasus imports
+from pegasus.simulator.params import SIMULATION_ENVIRONMENTS
+from pegasus.simulator.logic.interface.pegasus_interface import PegasusInterface
+from pegasus.simulator.ogn.api.spawn_multirotor import spawn_px4_multirotor_node
+from pegasus.simulator.ogn.api.spawn_zed_camera import add_zed_stereo_camera_subgraph
+from pegasus.simulator.ogn.api.spawn_ouster_lidar import add_ouster_lidar_subgraph
+
+sys.path.insert(0, os.path.normpath(os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "utils")))
+from scene_prep import scale_stage_prim, add_colliders, add_dome_light, save_scene_as_contained_usd
+
+
+# --------------------- CONFIGURATION ---------------------
+ENV_URL = SIMULATION_ENVIRONMENTS["Default Environment"]
+STAGE_SCALE = 1.0
+SAVE_SCENE_TO = None
+DRONE_USD = "~/.local/share/ov/data/documents/Kit/shared/exts/pegasus.simulator/pegasus/simulator/assets/Robots/Iris/iris.usd"
+
+NUM_ROBOTS = int(os.environ.get("NUM_ROBOTS", "1"))
+ENABLE_LIDAR = os.environ.get("ENABLE_LIDAR", "false").lower() == "true"
+# ---------------------------------------------------------
+
+
+# Enable required extensions
+ext_manager = omni.kit.app.get_app().get_extension_manager()
+for ext in [
+    "omni.graph.core",
+    "omni.graph.action",
+    "omni.graph.action_nodes",
+    "isaacsim.core.nodes",
+    "omni.graph.ui",
+    "omni.graph.visualization.nodes",
+    "omni.graph.scriptnode",
+    "omni.graph.window.action",
+    "omni.graph.window.generic",
+    "omni.graph.ui_nodes",
+    "pegasus.simulator",
+]:
+    if not ext_manager.is_extension_enabled(ext):
+        ext_manager.set_extension_enabled_immediate(ext, True)
+
+
+def wait_for_stage(stage, timeout_s: float = 10.0):
+    """Pump the Kit app loop until /World has content (scene fully loaded)."""
+    for _ in range(int(timeout_s / 0.1)):
+        omni.kit.app.get_app().update()
+        world_prim = stage.GetPrimAtPath("/World")
+        if world_prim.IsValid():
+            non_physics = [c for c in world_prim.GetChildren() if c.GetName() != "PhysicsScene"]
+            if non_physics:
+                return True
+        time.sleep(0.1)
+    return False
+
+
+def spawn_drone(index: int):
+    """Spawn drone `index` (1-based; used as both vehicle_id and domain_id) with a ZED camera and, if ENABLE_LIDAR, an Ouster lidar."""
+    robot_name = f"robot_{index}"
+    drone_prim = f"/World/drone{index}/base_link"
+    # Spread drones along X, 2.0 apart and centered on the origin (e.g. NUM_ROBOTS=3 → -2, 0, 2).
+    init_x = 2.0 * (index - 1) - 2.0 * (NUM_ROBOTS - 1) / 2.0
+
+    graph_handle = spawn_px4_multirotor_node(
+        pegasus_node_name=f"PX4Multirotor_{index}",
+        drone_prim=drone_prim,
+        robot_name=robot_name,
+        vehicle_id=index,
+        domain_id=index,
+        usd_file=DRONE_USD,
+        init_pos=[init_x, 0.0, 0.07],
+        init_orient=[0.0, 0.0, 0.0, 1.0],
+    )
+
+    add_zed_stereo_camera_subgraph(
+        parent_graph_handle=graph_handle,
+        drone_prim=drone_prim,
+        robot_name=robot_name,
+        camera_name="ZEDCamera",
+        camera_offset=[0.2, 0.0, -0.05],
+        camera_rotation_offset=[0.0, 0.0, 0.0],
+    )
+
+    if ENABLE_LIDAR:
+        add_ouster_lidar_subgraph(
+            parent_graph_handle=graph_handle,
+            drone_prim=drone_prim,
+            robot_name=robot_name,
+            lidar_name="OS1_REV6_128_10hz___512_resolution",
+            lidar_offset=[0.0, 0.0, 0.025],
+            lidar_rotation_offset=[0.0, 0.0, 0.0],
+            lidar_min_range=0.75,
+        )
+
+
+class PegasusApp:
+    """Sets up the Pegasus world and scene, spawns NUM_ROBOTS drones, then runs the simulation loop."""
+    def __init__(self):
+        """Load ENV_URL, prepare /World/stage, optionally export the scene, and spawn the drones."""
+        self.timeline = omni.timeline.get_timeline_interface()
+        self.pg = PegasusInterface()
+        self.pg._world = World(**self.pg._world_settings)
+        self.world = self.pg.world
+
+        # Keep the timeline stopped throughout setup so OmniGraph doesn't tick early.
+        self.timeline.stop()
+
+        self.pg.load_environment(ENV_URL)
+
+        stage = omni.usd.get_context().get_stage()
+        if stage is None:
+            raise RuntimeError("Stage failed to load")
+
+        if not wait_for_stage(stage):
+            carb.log_warn("Stage load timed out — continuing anyway.")
+
+        stage_prim = stage.GetPrimAtPath("/World/stage")
+        if stage_prim.IsValid():
+            scale_stage_prim(stage, "/World/stage", STAGE_SCALE)
+            add_colliders(stage_prim)
+            for _ in range(10):  # let the app process the transform and collision changes
+                omni.kit.app.get_app().update()
+        else:
+            carb.log_warn("/World/stage not found — skipping scale and collision.")
+
+        add_dome_light(stage)
+
+        if SAVE_SCENE_TO:
+            import asyncio  # local import: this module has no top-level asyncio import
+            import tempfile
+            tmp_usd = os.path.join(tempfile.gettempdir(), "prepared_scene.usd")
+            export = omni.usd.get_context().export_as_stage_async(tmp_usd)
+            success, error = asyncio.get_event_loop().run_until_complete(export)
+            if success:
+                os.makedirs(SAVE_SCENE_TO, exist_ok=True)
+                save_scene_as_contained_usd(tmp_usd, SAVE_SCENE_TO)
+                os.remove(tmp_usd)
+            else:
+                carb.log_error(f"Scene export failed: {error}")
+
+        print(f"[example_multi] Spawning {NUM_ROBOTS} drone(s), lidar={'on' if ENABLE_LIDAR else 'off'}")
+        for i in range(1, NUM_ROBOTS + 1):
+            spawn_drone(i)
+
+        self.play_on_start = os.environ.get("PLAY_SIM_ON_START", "true").lower() == "true"
+
+    def run(self):
+        """Play or stop the timeline per PLAY_SIM_ON_START, then step the world until the app closes."""
+        if self.play_on_start:
+            self.timeline.play()
+        else:
+            self.timeline.stop()
+        app = omni.kit.app.get_app()
+        while simulation_app.is_running():
+            world = World.instance()
+            if world is not None and hasattr(world, '_scene'):
+                world.step(render=True)
+                if world is not self.world:  # World.instance() returned a different object; re-sync our references
+                    self.world = world
+                    self.pg._world = world
+            else:
+                app.update()
+
+        carb.log_warn("Closing simulation.")
+        self.timeline.stop()
+        simulation_app.close()
+
+
+def main():
+    pg_app = PegasusApp()
+    pg_app.run()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/simulation/isaac-sim/launch_scripts/example_two_px4_pegasus_launch_script.py b/simulation/isaac-sim/launch_scripts/example_two_px4_pegasus_launch_script.py
deleted file mode 100755
index ac42b755c..000000000
--- a/simulation/isaac-sim/launch_scripts/example_two_px4_pegasus_launch_script.py
+++ /dev/null
@@ -1,257 +0,0 @@
-#!/usr/bin/env python
-"""
-Example two-drone PX4 launcher with scene preparation.
-
-Demonstrates:
- - Loading a Pegasus world with an environment
- - Scaling the environment prim and adding collision geometry
- - Adding a dome light
- - Spawning two PX4 multirotors each with a ZED camera and Ouster lidar
- - Optionally saving the prepared scene as a self-contained USD
-"""
-
-import carb
-from isaacsim import SimulationApp
-
-# Must be created before any omni imports
-simulation_app = SimulationApp({"headless": False})
-
-import os
-import sys
-import time
-
-import omni.kit.app
-import omni.timeline
-import omni.usd
-
-from omni.isaac.core.world import World
-
-# Pegasus imports
-from pegasus.simulator.params import SIMULATION_ENVIRONMENTS
-from pegasus.simulator.logic.interface.pegasus_interface import PegasusInterface
-from pegasus.simulator.ogn.api.spawn_multirotor import spawn_px4_multirotor_node
-from pegasus.simulator.ogn.api.spawn_zed_camera import add_zed_stereo_camera_subgraph
-from pegasus.simulator.ogn.api.spawn_ouster_lidar import add_ouster_lidar_subgraph
-
-sys.path.insert(0, os.path.normpath(os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "utils")))
-from scene_prep import scale_stage_prim, add_colliders, add_dome_light, save_scene_as_contained_usd
-
-
-# --------------------- CONFIGURATION ---------------------
-# Environment to load. Swap this URL/key for any other scene.
-ENV_URL = SIMULATION_ENVIRONMENTS["Curved Gridroom"]
-
-# Scale applied to /World/stage. 0.01 converts cm→m for Nucleus assets.
-# Set to 1.0 if the environment is already in meters.
-STAGE_SCALE = 1.0
-
-# Set to a directory path to export a self-contained USD after scene prep.
-# Set to None to skip saving.
-SAVE_SCENE_TO = None  # e.g. os.path.expanduser("~/AirStack/my_scene/")
-
-DRONE_USD = "~/.local/share/ov/data/documents/Kit/shared/exts/pegasus.simulator/pegasus/simulator/assets/Robots/Iris/iris.usd"
-# ---------------------------------------------------------
-
-
-# Enable required extensions
-ext_manager = omni.kit.app.get_app().get_extension_manager()
-for ext in [
-    "omni.graph.core",
-    "omni.graph.action",
-    "omni.graph.action_nodes",
-    "isaacsim.core.nodes",
-    "omni.graph.ui",
-    "omni.graph.visualization.nodes",
-    "omni.graph.scriptnode",
-    "omni.graph.window.action",
-    "omni.graph.window.generic",
-    "omni.graph.ui_nodes",
-    "pegasus.simulator",
-]:
-    if not ext_manager.is_extension_enabled(ext):
-        ext_manager.set_extension_enabled_immediate(ext, True)
-
-
-def wait_for_stage(stage, timeout_s: float = 10.0):
-    """Pump the Kit app loop until /World has content (scene fully loaded)."""
-    for _ in range(int(timeout_s / 0.1)):
-        omni.kit.app.get_app().update()
-        world_prim = stage.GetPrimAtPath("/World")
-        if world_prim.IsValid():
-            non_physics = [c for c in world_prim.GetChildren() if c.GetName() != "PhysicsScene"]
-            if non_physics:
-                return True
-        time.sleep(0.1)
-    return False
-
-
-class PegasusApp:
-
-    def __init__(self):
-        self.timeline = omni.timeline.get_timeline_interface()
-
-        # Start Pegasus interface + world
-        self.pg = PegasusInterface()
-        self.pg._world = World(**self.pg._world_settings)
-        self.world = self.pg.world
-
-        # Keep the timeline stopped throughout setup so that OmniGraph's
-        # OnPlaybackTick never fires.
-        self.timeline.stop()
-
-        # Load environment
-        self.pg.load_environment(ENV_URL)
-
-        stage = omni.usd.get_context().get_stage()
-        if stage is None:
-            raise RuntimeError("Stage failed to load")
-
-        # Wait for the environment to finish loading before modifying it
-        if not wait_for_stage(stage):
-            carb.log_warn("Stage load timed out — continuing anyway.")
-
-        # ----- Scene preparation -----
-
-        # Scale /World/stage if the asset uses non-metric units (e.g. cm).
-        # Remove or set STAGE_SCALE=1.0 if the environment is already in meters.
-        stage_prim = stage.GetPrimAtPath("/World/stage")
-        if stage_prim.IsValid():
-            scale_stage_prim(stage, "/World/stage", STAGE_SCALE)
-
-            # Apply CollisionAPI to every mesh so physics works correctly
-            add_colliders(stage_prim)
-
-            # Let the app process the transform and collision changes
-            for _ in range(10):
-                omni.kit.app.get_app().update()
-        else:
-            carb.log_warn("/World/stage not found — skipping scale and collision.")
-
-        # Add a dome light for uniform scene illumination.
-        # Pass intensity/exposure kwargs to override defaults defined in scene_prep.
-        add_dome_light(stage)
-
-        # Optionally save the prepared scene as a self-contained USD package.
-        # The Collector copies all Nucleus-hosted textures and MDLs locally.
-        if SAVE_SCENE_TO:
-            import tempfile
-            tmp_usd = os.path.join(tempfile.gettempdir(), "prepared_scene.usd")
-            success, error = asyncio.get_event_loop().run_until_complete(
-                omni.usd.get_context().export_as_stage_async(tmp_usd)
-            )
-            if success:
-                os.makedirs(SAVE_SCENE_TO, exist_ok=True)
-                save_scene_as_contained_usd(tmp_usd, SAVE_SCENE_TO)
-                os.remove(tmp_usd)
-            else:
-                carb.log_error(f"Scene export failed: {error}")
-
-        # ----- Spawn drone OmniGraphs -----
-        # This only creates the graph topology. The actual drones + PX4
-        # backends are created by compute_base on the first Play tick.
-
-        ####################################################################################################
-        # Spawn vehicle 1
-        ####################################################################################################
-        graph_handle1 = spawn_px4_multirotor_node(
-            pegasus_node_name="PX4Multirotor_1",
-            drone_prim="/World/drone1/base_link",
-            robot_name="robot_1",
-            vehicle_id=1,   # MAVLink port = 14540 + vehicle_id
-            domain_id=1,    # ROS 2 domain ID — match vehicle_id by convention
-            usd_file=DRONE_USD,
-            init_pos=[2.0, 0.0, 0.07],
-            init_orient=[0.0, 0.0, 0.0, 1.0],
-        )
-
-        # Add a ZED stereo camera subgraph to drone 1
-        add_zed_stereo_camera_subgraph(
-            parent_graph_handle=graph_handle1,
-            drone_prim="/World/drone1/base_link",
-            robot_name="robot_1",
-            camera_name="ZEDCamera",
-            camera_offset=[0.2, 0.0, -0.05],        # X, Y, Z offset from base_link
-            camera_rotation_offset=[0.0, 0.0, 0.0],  # roll, pitch, yaw in degrees
-        )
-
-        # Add an Ouster lidar subgraph to drone 1
-        add_ouster_lidar_subgraph(
-            parent_graph_handle=graph_handle1,
-            drone_prim="/World/drone1/base_link",
-            robot_name="robot_1",
-            lidar_name="OS1_REV6_128_10hz___512_resolution",
-            lidar_offset=[0.0, 0.0, 0.025],          # X, Y, Z offset from base_link
-            lidar_rotation_offset=[0.0, 0.0, 0.0],
-            lidar_min_range=0.75,                     # avoid propeller hits
-        )
-
-        ####################################################################################################
-        # Spawn vehicle 2
-        ####################################################################################################
-        graph_handle2 = spawn_px4_multirotor_node(
-            pegasus_node_name="PX4Multirotor_2",
-            drone_prim="/World/drone2/base_link",
-            robot_name="robot_2",
-            vehicle_id=2,   # MAVLink port = 14540 + vehicle_id
-            domain_id=2,    # ROS 2 domain ID — match vehicle_id by convention
-            usd_file=DRONE_USD,
-            init_pos=[0.0, 0.0, 0.07],
-            init_orient=[0.0, 0.0, 0.0, 1.0],
-        )
-
-        # Add a ZED stereo camera subgraph to drone 2
-        add_zed_stereo_camera_subgraph(
-            parent_graph_handle=graph_handle2,
-            drone_prim="/World/drone2/base_link",
-            robot_name="robot_2",
-            camera_name="ZEDCamera",
-            camera_offset=[0.2, 0.0, -0.05],
-            camera_rotation_offset=[0.0, 0.0, 0.0],
-        )
-
-        # Add an Ouster lidar subgraph to drone 2
-        add_ouster_lidar_subgraph(
-            parent_graph_handle=graph_handle2,
-            drone_prim="/World/drone2/base_link",
-            robot_name="robot_2",
-            lidar_name="OS1_REV6_128_10hz___512_resolution",
-            lidar_offset=[0.0, 0.0, 0.025],
-            lidar_rotation_offset=[0.0, 0.0, 0.0],
-            lidar_min_range=0.75,
-        )
-
-
-        self.play_on_start = os.environ.get("PLAY_SIM_ON_START", "true").lower() == "true"
-
-    def run(self):
-
-        if self.play_on_start:
-            self.timeline.play()
-        else:
-            self.timeline.stop()
-
-        app = omni.kit.app.get_app()
-        while simulation_app.is_running():
-            # File → Save re-opens the stage, which invalidates the World.
-            # Fall back to app.update() until the extension re-creates it.
-            world = World.instance()
-            if world is not None and hasattr(world, '_scene'):
-                world.step(render=True)
-                if world is not self.world:
-                    self.world = world
-                    self.pg._world = world
-            else:
-                app.update()
-
-        carb.log_warn("Closing simulation.")
-        self.timeline.stop()
-        simulation_app.close()
-
-
-def main():
-    pg_app = PegasusApp()
-    pg_app.run()
-
-
-if __name__ == "__main__":
-    main()

From 514b589aef54411d956e303477a4db719384e557 Mon Sep 17 00:00:00 2001
From: OasisArtisan <oalama@andrew.cmu.edu>
Date: Sat, 18 Apr 2026 21:28:52 -0400
Subject: [PATCH 07/24] Initial liveliness checks.

Compare metrics are still unchecked, and modelling of cross-test dependencies is not done yet.
---
 .airstack/modules/dev.sh                      |  19 +-
 common/.tmux.conf                             |   6 +-
 robot/docker/robot-base-docker-compose.yaml   |   1 +
 .../sensors_bringup/launch/sensors.launch.xml |   5 +-
 simulation/isaac-sim/docker/.bashrc           |   4 +-
 .../isaac-sim/docker/docker-compose.yaml      |   3 +
 ...example_multi_px4_pegasus_launch_script.py |  11 +-
 tests/compare_metrics.py                      |  57 ++-
 tests/conftest.py                             | 424 ++++++++++++++++--
 tests/docker/docker-compose.yaml              |   1 +
 tests/pytest.ini                              |   4 +
 tests/test_build_docker.py                    |   7 +-
 tests/test_build_packages.py                  |  35 +-
 tests/test_liveliness.py                      | 291 ++++++++++++
 14 files changed, 773 insertions(+), 95 deletions(-)
 create mode 100644 tests/test_liveliness.py

diff --git a/.airstack/modules/dev.sh b/.airstack/modules/dev.sh
index f6f12e4b6..d2fa24a6c 100644
--- a/.airstack/modules/dev.sh
+++ b/.airstack/modules/dev.sh
@@ -3,25 +3,16 @@
 # dev.sh - Development-related commands for AirStack
 # This module provides commands for development tasks
 
-# Function to run tests via the dockerized test runner.
-# Usage: airstack test                        — run all tests
-#        airstack test build_packages         — run one marker
-#        airstack test build_docker build_packages — multiple markers
+# Run tests via the dockerized pytest runner. All args forward to pytest.
 function cmd_dev_test {
     check_docker
     local compose_file="$PROJECT_ROOT/tests/docker/docker-compose.yaml"
-    local markers=("$@")
-
     export AIRSTACK_PATH="$PROJECT_ROOT"
+    # Grant X access so sim containers spawned by tests in GUI mode
+    # (`pytest --gui`) can reach the host's X server. No-op otherwise.
+    xhost + || log_warn "xhost failed (is DISPLAY set? xhost installed?)"
     docker compose -f "$compose_file" build --quiet
-
-    if [ ${#markers[@]} -eq 0 ]; then
-        docker compose -f "$compose_file" run --rm test pytest
-    else
-        local marker
-        marker=$(IFS=" or "; echo "${markers[*]}")
-        docker compose -f "$compose_file" run --rm test pytest -m "$marker"
-    fi
+    docker compose -f "$compose_file" run --rm test pytest "$@"
 }
 
 # Function to build documentation
diff --git a/common/.tmux.conf b/common/.tmux.conf
index e1499eaac..a9b44c521 100644
--- a/common/.tmux.conf
+++ b/common/.tmux.conf
@@ -16,8 +16,10 @@ set -g @plugin 'noscript/tmux-mighty-scroll'
 
 # ── Split Window ──────────────────────────────────────────────────────────────
 
-# Automatically split the window when creating a new session named "bringup" and focus on the first window
-set-hook -g after-new-session 'if -F "#{==:#{session_name},bringup}" "split-window -v -l 20% -t bringup:0"'
+# Auto-split the bringup session's first window so users have a manual shell
+# alongside the autonomy launch. The split pane is titled "shell" so automated
+# liveliness checks can tell it apart from a pane that has crashed back to bash.
+set-hook -g after-new-session 'if -F "#{==:#{session_name},bringup}" "split-window -v -l 20% -t bringup:0 ; select-pane -T shell"'
 
 # ── Status bar ────────────────────────────────────────────────────────────────
 set -g status on
diff --git a/robot/docker/robot-base-docker-compose.yaml b/robot/docker/robot-base-docker-compose.yaml
index 79103f6bd..d2b26124d 100644
--- a/robot/docker/robot-base-docker-compose.yaml
+++ b/robot/docker/robot-base-docker-compose.yaml
@@ -9,6 +9,7 @@ services:
     environment:
       - DISPLAY=${DISPLAY}
       - QT_X11_NO_MITSHM=1
+      - QT_QPA_PLATFORM
       # Record bags
       - RECORD_BAGS=${RECORD_BAGS}
       # docker compose interpolation to env variables
diff --git a/robot/ros_ws/src/sensors/sensors_bringup/launch/sensors.launch.xml b/robot/ros_ws/src/sensors/sensors_bringup/launch/sensors.launch.xml
index dfad888e6..8529d1c1f 100644
--- a/robot/ros_ws/src/sensors/sensors_bringup/launch/sensors.launch.xml
+++ b/robot/ros_ws/src/sensors/sensors_bringup/launch/sensors.launch.xml
@@ -5,7 +5,8 @@
         <!-- nothing right now -->
     </group>
     <!-- should we put this under the sensors namespace above? -->
-    <node
+    <!-- <node
         pkg="gimbal_stabilizer" exec="gimbal_stabilizer_node" name="gimbal_stabilizer" output="screen">
-    </node>    
+    </node> -->
+
 </launch>
\ No newline at end of file
diff --git a/simulation/isaac-sim/docker/.bashrc b/simulation/isaac-sim/docker/.bashrc
index 256a52218..256b3587d 100644
--- a/simulation/isaac-sim/docker/.bashrc
+++ b/simulation/isaac-sim/docker/.bashrc
@@ -135,9 +135,9 @@ export ROS_AUTOMATIC_DISCOVERY_RANGE=SUBNET
 # This strips out the System ROS (Python 3.12) paths to prevent conflicts
 export ISAAC_SIM_PYTHONPATH=$(echo $PYTHONPATH | tr ':' '\n' | grep -v "lib/python3.12/site-packages" | paste -sd ':' -):/isaac-sim/exts/isaacsim.ros2.bridge/jazzy/rclpy
 
-# Helper function to run Isaac Sim python scripts with the correct environment
+# Helper function to run Isaac Sim python scripts with the correct environment.
 run_isaac_python() {
-    PYTHONPATH="$ISAAC_SIM_PYTHONPATH" /isaac-sim/python.sh "$@"
+    PYTHONPATH="$ISAAC_SIM_PYTHONPATH" exec /isaac-sim/python.sh "$@"
 }
 export -f run_isaac_python
 
diff --git a/simulation/isaac-sim/docker/docker-compose.yaml b/simulation/isaac-sim/docker/docker-compose.yaml
index 60e17925b..fccd03931 100644
--- a/simulation/isaac-sim/docker/docker-compose.yaml
+++ b/simulation/isaac-sim/docker/docker-compose.yaml
@@ -37,6 +37,9 @@ services:
       - QT_X11_NO_MITSHM=1
       # Isaac Sim stuff
       - PLAY_SIM_ON_START=${PLAY_SIM_ON_START}
+      - NUM_ROBOTS=${NUM_ROBOTS:-1}
+      - ENABLE_LIDAR=${ENABLE_LIDAR:-false}
+      - ISAAC_SIM_HEADLESS=${ISAAC_SIM_HEADLESS:-false}
     deploy:
       # let it use the GPU
       resources:
diff --git a/simulation/isaac-sim/launch_scripts/example_multi_px4_pegasus_launch_script.py b/simulation/isaac-sim/launch_scripts/example_multi_px4_pegasus_launch_script.py
index bda5363e7..b3dc2ac38 100644
--- a/simulation/isaac-sim/launch_scripts/example_multi_px4_pegasus_launch_script.py
+++ b/simulation/isaac-sim/launch_scripts/example_multi_px4_pegasus_launch_script.py
@@ -8,15 +8,16 @@
  - PLAY_SIM_ON_START (default true): autoplay timeline
 """
 
+import os
+import sys
+import time
+
 import carb
 from isaacsim import SimulationApp
 
 # Must be created before any omni imports
-simulation_app = SimulationApp({"headless": False})
-
-import os
-import sys
-import time
+_headless = os.environ.get("ISAAC_SIM_HEADLESS", "false").lower() == "true"
+simulation_app = SimulationApp({"headless": _headless})
 
 import omni.kit.app
 import omni.timeline
diff --git a/tests/compare_metrics.py b/tests/compare_metrics.py
index 09ba2eb7d..a601a49a0 100644
--- a/tests/compare_metrics.py
+++ b/tests/compare_metrics.py
@@ -51,6 +51,37 @@ def merge_metrics(run_dir):
     return merged
 
 
+def _is_scored(entry):
+    """True if this metric entry can be numerically compared.
+
+    Skip time-series (list-valued, key 'samples'), non-numeric sentinels ('timeout'),
+    and any dict that lacks a 'value' field.
+    """
+    if not isinstance(entry, dict):
+        return False
+    if "samples" in entry:
+        return False
+    if "value" not in entry:
+        return False
+    v = entry["value"]
+    return isinstance(v, (int, float))
+
+
+def _fmt(entry):
+    """Format a metric entry for display. Handles sentinels, lists, and numbers."""
+    if not isinstance(entry, dict):
+        return str(entry)
+    if "samples" in entry:
+        return f"[{len(entry['samples'])} samples]"
+    if "value" not in entry:
+        return "—"
+    v = entry["value"]
+    unit = entry.get("unit", "")
+    if isinstance(v, (int, float)):
+        return f"{v:.1f}{unit}"
+    return f"{v}{unit}" if unit else str(v)
+
+
 def compare(current, baseline, threshold):
     rows = []
     has_regression = False
@@ -71,13 +102,31 @@ def compare(current, baseline, threshold):
             if not c or not b:
                 rows.append({
                     "test": test, "metric": key,
-                    "baseline": f"{b['value']}{b.get('unit', '')}" if b else "—",
-                    "current": f"{c['value']}{c.get('unit', '')}" if c else "—",
+                    "baseline": _fmt(b) if b else "—",
+                    "current": _fmt(c) if c else "—",
                     "change": "new" if c and not b else "removed",
                     "flag": "",
                 })
                 continue
 
+            # Non-scorable: time series, sentinels ("timeout"), strings. Show but don't score.
+            if not _is_scored(c) or not _is_scored(b):
+                flag = ""
+                # Special case: if current is "timeout" but baseline was numeric, it's a regression
+                cv = c.get("value") if isinstance(c, dict) else None
+                bv = b.get("value") if isinstance(b, dict) else None
+                if isinstance(cv, str) and cv == "timeout" and isinstance(bv, (int, float)):
+                    flag = "regression"
+                    has_regression = True
+                rows.append({
+                    "test": test, "metric": key,
+                    "baseline": _fmt(b),
+                    "current": _fmt(c),
+                    "change": "—",
+                    "flag": flag,
+                })
+                continue
+
             cv, bv = c["value"], b["value"]
             direction = c.get("direction", "lower_is_better")
 
@@ -97,8 +146,8 @@ def compare(current, baseline, threshold):
 
             rows.append({
                 "test": test, "metric": key,
-                "baseline": f"{bv:.1f}{b.get('unit', '')}",
-                "current": f"{cv:.1f}{c.get('unit', '')}",
+                "baseline": _fmt(b),
+                "current": _fmt(c),
                 "change": f"{change_pct:+.1f}%",
                 "flag": "regression" if regressed else ("improved" if improved else ""),
             })
diff --git a/tests/conftest.py b/tests/conftest.py
index ac5f91b21..9ec4c40ab 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,14 +1,47 @@
-import inspect
 import json
+import logging
 import os
+import re
+import shlex
 import subprocess
 import time
+from contextlib import contextmanager
 from datetime import datetime
 from pathlib import Path
 
+import pytest
+
 AIRSTACK_ROOT = os.environ.get("AIRSTACK_ROOT", str(Path(__file__).parent.parent))
 RUN_DIR = None
 LOGS_DIR = None
+ROS_DISTRO_SETUP = "/opt/ros/jazzy/setup.bash"
+
+# Track the currently-running pytest item so current_log() and current_test_id()
+# can pick up the parametrize id without tests having to pass `request` around.
+_CURRENT_ITEM = None
+
+logger = logging.getLogger("airstack")
+logger.setLevel(logging.INFO)
+_LOG_FORMAT = logging.Formatter("[%(asctime)s] %(levelname)s %(message)s", "%H:%M:%S")
+_test_log_handler = None
+
+
+# ── pytest config / hooks ──────────────────────────────────────────────────
+
+def pytest_addoption(parser):
+    parser.addoption("--sim", default="msairsim,isaacsim",
+                     help="Comma-separated sim targets: msairsim, isaacsim")
+    parser.addoption("--num-robots", default="1,3",
+                     help="Comma-separated robot counts, e.g. 1,3")
+    parser.addoption("--stress-iterations", type=int, default=3,
+                     help="Number of up/down iterations per (sim, num_robots) config")
+    parser.addoption("--stable-duration", type=int, default=120,
+                     help="Seconds test_stable polls for")
+    parser.addoption("--stable-interval", type=int, default=10,
+                     help="Seconds between polls in test_stable")
+    parser.addoption("--gui", action="store_true", default=False,
+                     help="Show sim GUI windows for visual sanity checks. "
+                          "Default: headless (no X, good for CI).")
 
 
 def pytest_configure(config):
@@ -21,16 +54,81 @@ def pytest_configure(config):
     config.option.xmlpath = str(RUN_DIR / "results.xml")
 
 
-# ── helpers ─────────────────────────────────────────────────────────────────
+def pytest_runtest_setup(item):
+    global _CURRENT_ITEM, _test_log_handler
+    _CURRENT_ITEM = item
+    log_path = LOGS_DIR / f"{current_log()}.log"
+    _test_log_handler = logging.FileHandler(log_path)
+    _test_log_handler.setFormatter(_LOG_FORMAT)
+    logger.addHandler(_test_log_handler)
+
+
+def pytest_runtest_teardown(item):
+    global _CURRENT_ITEM, _test_log_handler
+    if _test_log_handler is not None:
+        logger.removeHandler(_test_log_handler)
+        _test_log_handler.close()
+        _test_log_handler = None
+    _CURRENT_ITEM = None
+
 
-def log_name():
-    frame = inspect.stack()[1]
-    file = Path(frame.filename).stem
-    func = frame.function
-    return f"{file}.{func}"
+@contextmanager
+def logger_to(log_name):
+    """Temporarily route `logger` to a different file. Suspends any handlers
+    already attached so narration isn't duplicated across files."""
+    existing = list(logger.handlers)
+    for h in existing:
+        logger.removeHandler(h)
+    fh = logging.FileHandler(LOGS_DIR / f"{log_name}.log")
+    fh.setFormatter(_LOG_FORMAT)
+    logger.addHandler(fh)
+    try:
+        yield
+    finally:
+        logger.removeHandler(fh)
+        fh.close()
+        for h in existing:
+            logger.addHandler(h)
 
 
-def read_log_tail(log_name, lines=50):
+def pytest_generate_tests(metafunc):
+    """Parametrize airstack_env over (sim, num_robots, iteration) from CLI flags.
+
+    Only triggers for tests that request the airstack_env fixture — other tests
+    are untouched.
+    """
+    if "airstack_env" not in metafunc.fixturenames:
+        return
+    sims = [s.strip() for s in metafunc.config.getoption("--sim").split(",") if s.strip()]
+    nums = [int(x) for x in metafunc.config.getoption("--num-robots").split(",") if x.strip()]
+    iterations = metafunc.config.getoption("--stress-iterations")
+    params = [(s, n, i) for s in sims for n in nums for i in range(iterations)]
+    ids = [f"{s}-{n}-iter{i}" for s, n, i in params]
+    metafunc.parametrize("airstack_env", params, ids=ids, indirect=True, scope="class")
+
+
+# ── logging / subprocess helpers ───────────────────────────────────────────
+
+def current_log():
+    """Log name for the currently-running pytest item, or None outside a test.
+
+    Subprocess helpers default to this so every call fired from a test auto-logs
+    to the right file without plumbing log_name through every layer.
+    """
+    if _CURRENT_ITEM is None:
+        return None
+    # "test_liveliness.py::TestLiveliness::test_foo[id]" →
+    # "test_liveliness.TestLiveliness.test_foo[id]"
+    return (_CURRENT_ITEM.nodeid
+            .replace("/", ".")
+            .replace(".py::", ".")
+            .replace("::", "."))
+
+
+def read_log_tail(log_name=None, lines=50):
+    log_name = log_name or current_log()
+    if not log_name:
+        return ""
     log_path = LOGS_DIR / f"{log_name}.log"
     if log_path.exists():
         all_lines = log_path.read_text().splitlines()
@@ -38,14 +136,29 @@ def read_log_tail(log_name, lines=50):
     return ""
 
 
-def docker_exec(container, cmd, timeout=60, domain_id=None, log_name=None):
-    if domain_id is not None:
-        cmd = f"export ROS_DOMAIN_ID={domain_id} && {cmd}"
+def _run_teed(cmd_list, timeout, log_name=None, env=None, cwd=None):
+    """Run a subprocess, teeing stdout+stderr live to the log file and
+    capturing them for parsing.
+
+    pipefail makes the returned exit code the command's own, not tee's.
+    """
+    log_name = log_name or current_log()
+    if not log_name:
+        return subprocess.run(cmd_list, capture_output=True, text=True,
+                              timeout=timeout, env=env, cwd=cwd)
+    log_path = LOGS_DIR / f"{log_name}.log"
+    quoted = " ".join(shlex.quote(a) for a in cmd_list)
+    with open(log_path, "a") as f:
+        f.write(f"\n$ {quoted}\n")
+    shell_cmd = f"set -o pipefail; {quoted} 2>&1 | tee -a {shlex.quote(str(log_path))}"
+    return subprocess.run(["bash", "-c", shell_cmd],
+                          capture_output=True, text=True,
+                          timeout=timeout, env=env, cwd=cwd)
+
+
+def docker_exec(container, cmd, timeout=60, log_name=None):
     full_cmd = ["docker", "exec", container, "bash", "-c", cmd]
-    if log_name:
-        with open(LOGS_DIR / f"{log_name}.log", "a") as log:
-            return subprocess.run(full_cmd, stdout=log, stderr=log, text=True, timeout=timeout)
-    return subprocess.run(full_cmd, capture_output=True, text=True, timeout=timeout)
+    return _run_teed(full_cmd, timeout=timeout, log_name=log_name)
 
 
 def airstack_cmd(*args, env_overrides=None, timeout=1800, log_name=None):
@@ -53,34 +166,68 @@ def airstack_cmd(*args, env_overrides=None, timeout=1800, log_name=None):
     if env_overrides:
         env.update(env_overrides)
     cmd = [str(Path(AIRSTACK_ROOT) / "airstack.sh")] + list(args)
-    if log_name:
-        with open(LOGS_DIR / f"{log_name}.log", "a") as log:
-            return subprocess.run(cmd, stdout=log, stderr=log, text=True,
-                                  timeout=timeout, cwd=AIRSTACK_ROOT, env=env)
-    return subprocess.run(cmd, capture_output=True, text=True,
-                          timeout=timeout, cwd=AIRSTACK_ROOT, env=env)
+    return _run_teed(cmd, timeout=timeout, log_name=log_name,
+                     env=env, cwd=AIRSTACK_ROOT)
 
 
-def find_container(name_pattern):
-    result = subprocess.run(
+def ros2_env(setup_bash, domain_id):
+    """Shell prefix that makes `ros2` available on the requested domain."""
+    return (
+        f"source {ROS_DISTRO_SETUP} && source {setup_bash} "
+        f"&& export ROS_DOMAIN_ID={domain_id}"
+    )
+
+
+def ros2_exec(container, ros2_cmd, domain_id=0, setup_bash=None, timeout=15, log_name=None):
+    """Run `ros2 ...` inside a container with the right workspace sourced."""
+    setup = setup_bash or "/root/AirStack/robot/ros_ws/install/setup.bash"
+    inner = f"{ros2_env(setup, domain_id)} && {ros2_cmd}"
+    return docker_exec(container, inner, timeout=timeout, log_name=log_name)
+
+
+_HZ_RE = re.compile(r"average rate:\s+([\d.]+)")
+
+
+def _parse_hz(text):
+    m = _HZ_RE.search(text or "")
+    return float(m.group(1)) if m else None
+
+
+# ── container helpers ──────────────────────────────────────────────────────
+
+def find_all_containers(name_pattern):
+    result = _run_teed(
         ["docker", "ps", "--filter", f"name={name_pattern}", "--format", "{{.Names}}"],
-        capture_output=True, text=True,
+        timeout=10,
     )
-    names = result.stdout.strip().splitlines()
+    return [n for n in result.stdout.strip().splitlines() if n]
+
+
+def find_container(name_pattern):
+    names = find_all_containers(name_pattern)
     return names[0] if names else None
 
 
+def get_robot_containers(pattern="robot.*desktop"):
+    """Return a sorted list of currently-running robot container names."""
+    return sorted(find_all_containers(pattern))
+
+
+def container_running(name):
+    """True if the named container is currently Running."""
+    result = _run_teed(
+        ["docker", "inspect", "-f", "{{.State.Running}}", name],
+        timeout=10,
+    )
+    return "true" in result.stdout
+
+
 def wait_for_container(name_pattern, timeout=120):
     deadline = time.time() + timeout
     while time.time() < deadline:
         name = find_container(name_pattern)
-        if name:
-            result = subprocess.run(
-                ["docker", "inspect", "-f", "{{.State.Running}}", name],
-                capture_output=True, text=True,
-            )
-            if "true" in result.stdout:
-                return name
+        if name and container_running(name):
+            return name
         time.sleep(5)
     raise TimeoutError(f"Container matching '{name_pattern}' not running after {timeout}s")
 
@@ -89,23 +236,17 @@ def docker_image_size_mb(service, env=None):
     compose_env = os.environ.copy()
     if env:
         compose_env.update(env)
-    # Resolve the image name for this service from compose config
-    result = subprocess.run(
+    result = _run_teed(
         ["docker", "compose", "-f", str(Path(AIRSTACK_ROOT) / "docker-compose.yaml"),
          "config", "--images"],
-        capture_output=True, text=True, cwd=AIRSTACK_ROOT, env=compose_env,
+        timeout=30, env=compose_env, cwd=AIRSTACK_ROOT,
     )
-    image = None
-    for line in result.stdout.strip().splitlines():
-        if service in line:
-            image = line.strip()
-            break
+    image = next((l.strip() for l in result.stdout.strip().splitlines() if service in l), None)
     if not image:
         return None
-    # Get the image size
-    result = subprocess.run(
+    result = _run_teed(
         ["docker", "image", "inspect", image, "--format", "{{.Size}}"],
-        capture_output=True, text=True,
+        timeout=10,
     )
     if result.returncode == 0 and result.stdout.strip():
         return round(int(result.stdout.strip()) / 1_000_000, 1)
@@ -127,6 +268,13 @@ def record(self, test_name, key, value, unit="", direction="lower_is_better"):
         }
         self._path.write_text(json.dumps(self._data, indent=2))
 
+    def record_list(self, test_name, key, values):
+        """Store a raw list (time series) — not scored by compare_metrics."""
+        if test_name not in self._data:
+            self._data[test_name] = {}
+        self._data[test_name][key] = {"samples": values}
+        self._path.write_text(json.dumps(self._data, indent=2))
+
 
 METRICS = None
 
@@ -136,3 +284,193 @@ def get_metrics():
     if METRICS is None:
         METRICS = MetricsRecorder(RUN_DIR / "metrics.json")
     return METRICS
+
+
+def current_test_id():
+    """Full pytest test id for this test invocation — used as the metrics.json key."""
+    if _CURRENT_ITEM is None:
+        return "unknown"
+    return _CURRENT_ITEM.nodeid
+
+
+# ── shared sim test infrastructure (liveliness, comms, takeoff all reuse) ──
+
+SIM_CONFIG = {
+    "msairsim": {
+        "profile": "ms-airsim",
+        "sim_container": "ms-airsim",
+        "sim_setup_bash": "/root/ros_ws/install/setup.bash",
+        "robot_setup_bash": "/root/AirStack/robot/ros_ws/install/setup.bash",
+        "extra_env": {
+            "URDF_FILE": "robot_descriptions/iris/urdf/iris_stereo.ms-airsim.urdf",
+            # Clear any user-set paths in .env so entrypoint auto-fetches Blocks.
+            # Shell env wins over --env-file in docker compose substitution.
+            "MS_AIRSIM_ENV_DIR": "",
+            "MS_AIRSIM_BINARY_PATH": "",
+        },
+    },
+    "isaacsim": {
+        "profile": "isaac-sim",
+        "sim_container": "isaac-sim",
+        "sim_setup_bash": "/opt/ros/jazzy/setup.bash",
+        "robot_setup_bash": "/root/AirStack/robot/ros_ws/install/setup.bash",
+        "extra_env": {
+            "ISAAC_SIM_USE_STANDALONE": "true",
+            "ISAAC_SIM_SCRIPT_NAME": "example_multi_px4_pegasus_launch_script.py",
+            "PLAY_SIM_ON_START": "true",
+        },
+    },
+}
+
+
+def wait_for_first_message(container, topic, domain_id, setup_bash, timeout=60):
+    """Wait up to `timeout` seconds for one message on `topic`. Returns seconds
+    elapsed on success, None on timeout. Each attempt sources the workspace
+    and runs `ros2 topic echo --once`; if the workspace isn't built yet or the
+    topic has no publisher, the attempt fails fast and we retry.
+    """
+    start = time.time()
+    deadline = start + timeout
+    logger.info("Probing %s on domain %d in %s (timeout=%ds)",
+                topic, domain_id, container, timeout)
+    attempt = 0
+    while time.time() < deadline:
+        attempt += 1
+        per_attempt = min(max(1, int(deadline - time.time())), 10)
+        try:
+            result = ros2_exec(
+                container,
+                f"timeout {per_attempt} ros2 topic echo --once {topic}",
+                domain_id=domain_id, setup_bash=setup_bash, timeout=per_attempt + 5,
+            )
+        except subprocess.TimeoutExpired:
+            logger.warning("Attempt %d subprocess timeout for %s, retrying", attempt, topic)
+            time.sleep(2)
+            continue
+        # ros2 prints "---" on its own line after a real message.
+        if result.stdout.rstrip().endswith("---"):
+            elapsed = round(time.time() - start, 2)
+            logger.info("Got first message on %s after %.2fs (attempt %d)",
+                        topic, elapsed, attempt)
+            return elapsed
+        logger.warning("Attempt %d failed for %s, retrying", attempt, topic)
+        time.sleep(2)
+    logger.error("Timed out waiting for first message on %s after %ds",
+                 topic, timeout)
+    return None
+
+
+def sample_hz(container, topic, domain_id, setup_bash, duration=5, window=10):
+    """Sample publish rate on `topic` for `duration` seconds. Returns float or None."""
+    result = ros2_exec(
+        container,
+        f"timeout {duration} ros2 topic hz --window {window} {topic} 2>&1",
+        domain_id=domain_id, setup_bash=setup_bash, timeout=duration + 15,
+    )
+    return _parse_hz(result.stdout + result.stderr)
+
+
+def parallel_sample_hz(container, topic_domain_pairs, setup_bash, duration=5, window=10):
+    """Sample Hz for multiple topics concurrently; return {topic: hz_or_None}.
+
+    One `docker exec` that backgrounds each `ros2 topic hz` probe, waits for all,
+    then cats each probe's temp file.
+    """
+    probes = []
+    temp_files = {}
+    for i, (topic, domain) in enumerate(topic_domain_pairs):
+        fname = f"/tmp/hz_{i}.out"
+        temp_files[topic] = fname
+        probes.append(
+            f"(ROS_DOMAIN_ID={domain} timeout {duration} "
+            f"ros2 topic hz --window {window} {topic} > {fname} 2>&1) &"
+        )
+    # Newlines, not `&& ... &`: bash precedence makes `A && B && C & D &` only
+    # apply the && chain to C, so later backgrounded probes would miss the
+    # sourced PATH. One statement per line sidesteps this entirely.
+    lines = [f"source {ROS_DISTRO_SETUP}", f"source {setup_bash}"] + probes + ["wait"]
+    for fname in temp_files.values():
+        lines.append(f"echo '===FILE {fname}==='")
+        lines.append(f"cat {fname} 2>/dev/null || true")
+    script = "\n".join(lines)
+    result = _run_teed(
+        ["docker", "exec", container, "bash", "-c", script],
+        timeout=duration + 30,
+    )
+    rates = {}
+    if result.returncode == 0 or result.stdout:
+        chunks = result.stdout.split("===FILE ")
+        for chunk in chunks[1:]:
+            header, _, content = chunk.partition("===")
+            fname = header.strip()
+            topic = next((t for t, f in temp_files.items() if f == fname), None)
+            if topic:
+                rates[topic] = _parse_hz(content)
+    for topic, _ in topic_domain_pairs:
+        rates.setdefault(topic, None)
+    return rates
+
+
+@pytest.fixture
+def airstack_env(request):
+    """Parametrized fixture: runs `airstack up`, yields env dict, tears down.
+
+    Parametrized by `pytest_generate_tests` over (sim, num_robots, iteration)
+    tuples derived from CLI flags.
+
+    Deliberately does NOT wait for containers or ROS2 nodes to be ready — tests
+    own their wait loops + timeout metrics so failures attribute to the right layer.
+    """
+    sim, num_robots, iteration = request.param
+    cfg = SIM_CONFIG[sim]
+    log = f"airstack_env[{_CURRENT_ITEM.callspec.id}]"
+
+    headless = not request.config.getoption("--gui")
+    env_overrides = {
+        "AUTOLAUNCH": "true",
+        "NUM_ROBOTS": str(num_robots),
+        "COMPOSE_PROFILES": f"desktop,{cfg['profile']}",
+        "MS_AIRSIM_HEADLESS": "true" if headless else "false",
+        "ISAAC_SIM_HEADLESS": "true" if headless else "false",
+    }
+    if headless:
+        # Forces rviz/Qt apps to render offscreen instead of spawning windows.
+        env_overrides["QT_QPA_PLATFORM"] = "offscreen"
+    env_overrides.update(cfg.get("extra_env", {}))
+
+    with logger_to(log):
+        logger.info("Bringing up stack: sim=%s num_robots=%d iter=%d headless=%s",
+                    sim, num_robots, iteration, headless)
+        t0 = time.time()
+        up_result = airstack_cmd("up",
+                                 env_overrides=env_overrides, timeout=180, log_name=log)
+        up_cmd_duration_s = round(time.time() - t0, 2)
+        logger.info("airstack up returned %d in %.2fs",
+                    up_result.returncode, up_cmd_duration_s)
+        assert up_result.returncode == 0, \
+            f"airstack up failed:\n{read_log_tail(log)}"
+
+    env = {
+        "sim": sim,
+        "num_robots": num_robots,
+        "iteration": iteration,
+        "sim_container": cfg["sim_container"],
+        "robot_pattern": "robot.*desktop",
+        "up_started_at": t0,
+        "cfg": cfg,
+    }
+
+    tid = current_test_id()
+    m = get_metrics()
+    m.record(tid, "airstack_up_duration_s", up_cmd_duration_s, unit="s")
+
+    try:
+        yield env
+    finally:
+        with logger_to(log):
+            logger.info("Tearing down stack")
+            t3 = time.time()
+            airstack_cmd("down", timeout=120, log_name=log)
+            down_duration_s = round(time.time() - t3, 2)
+            logger.info("Teardown finished in %.2fs", down_duration_s)
+        m.record(tid, "airstack_down_duration_s", down_duration_s, unit="s")
\ No newline at end of file
diff --git a/tests/docker/docker-compose.yaml b/tests/docker/docker-compose.yaml
index 1de991ed0..795891d19 100644
--- a/tests/docker/docker-compose.yaml
+++ b/tests/docker/docker-compose.yaml
@@ -9,5 +9,6 @@ services:
       - ${AIRSTACK_PATH}/tests/results:${AIRSTACK_PATH}/tests/results
     environment:
       - AIRSTACK_ROOT=${AIRSTACK_PATH}
+      - DISPLAY=${DISPLAY:-}
     working_dir: ${AIRSTACK_PATH}/tests
     network_mode: host
diff --git a/tests/pytest.ini b/tests/pytest.ini
index 022358c02..2650a4181 100644
--- a/tests/pytest.ini
+++ b/tests/pytest.ini
@@ -8,3 +8,7 @@ markers =
 testpaths = .
 addopts = -v --durations=0
 cache_dir = /tmp/.pytest_cache
+log_cli = true
+log_cli_level = INFO
+log_cli_format = [%(asctime)s] %(levelname)s %(message)s
+log_cli_date_format = %H:%M:%S
diff --git a/tests/test_build_docker.py b/tests/test_build_docker.py
index 8105d939e..556e885e5 100644
--- a/tests/test_build_docker.py
+++ b/tests/test_build_docker.py
@@ -1,5 +1,5 @@
 import pytest
-from conftest import airstack_cmd, log_name, read_log_tail, docker_image_size_mb, get_metrics
+from conftest import airstack_cmd, read_log_tail, docker_image_size_mb, get_metrics
 
 
 @pytest.mark.build_docker
@@ -7,9 +7,8 @@
 class TestDockerBuilds:
 
     def _build_and_record(self, service, env=None):
-        log = log_name()
-        result = airstack_cmd("image-build", service, timeout=3600, log_name=log)
-        assert result.returncode == 0, f"{service} build failed (exit {result.returncode}):\n{read_log_tail(log)}"
+        result = airstack_cmd("image-build", service, timeout=3600)
+        assert result.returncode == 0, f"{service} build failed (exit {result.returncode}):\n{read_log_tail()}"
 
         size = docker_image_size_mb(service, env=env)
         if size is not None:
diff --git a/tests/test_build_packages.py b/tests/test_build_packages.py
index 708f16389..2f594d261 100644
--- a/tests/test_build_packages.py
+++ b/tests/test_build_packages.py
@@ -1,5 +1,5 @@
 import pytest
-from conftest import airstack_cmd, wait_for_container, docker_exec, log_name, read_log_tail
+from conftest import airstack_cmd, wait_for_container, docker_exec, read_log_tail
 
 
 @pytest.mark.build_packages
@@ -7,48 +7,45 @@
 class TestColconBuilds:
 
     def test_colcon_build_robot(self):
-        log = log_name()
         try:
             result = airstack_cmd("up", "robot-desktop",
                                   env_overrides={"AUTOLAUNCH": "false", "DISPLAY": ""},
-                                  timeout=120, log_name=log)
-            assert result.returncode == 0, f"airstack up failed:\n{read_log_tail(log)}"
+                                  timeout=120)
+            assert result.returncode == 0, f"airstack up failed:\n{read_log_tail()}"
 
             container = wait_for_container("robot.*desktop", timeout=60)
             assert container, "Robot container not found"
 
-            build = docker_exec(container, "bash -ic bws", timeout=600, log_name=log)
-            assert build.returncode == 0, f"colcon build failed:\n{read_log_tail(log)}"
+            build = docker_exec(container, "bash -ic bws", timeout=600)
+            assert build.returncode == 0, f"colcon build failed:\n{read_log_tail()}"
         finally:
-            airstack_cmd("down", log_name=log)
+            airstack_cmd("down")
 
     def test_colcon_build_gcs(self):
-        log = log_name()
         try:
             result = airstack_cmd("up", "gcs",
                                   env_overrides={"AUTOLAUNCH": "false", "DISPLAY": ""},
-                                  timeout=120, log_name=log)
-            assert result.returncode == 0, f"airstack up failed:\n{read_log_tail(log)}"
+                                  timeout=120)
+            assert result.returncode == 0, f"airstack up failed:\n{read_log_tail()}"
 
             container = wait_for_container("gcs", timeout=60)
             assert container, "GCS container not found"
 
-            build = docker_exec(container, "bash -ic bws", timeout=600, log_name=log)
-            assert build.returncode == 0, f"colcon build failed:\n{read_log_tail(log)}"
+            build = docker_exec(container, "bash -ic bws", timeout=600)
+            assert build.returncode == 0, f"colcon build failed:\n{read_log_tail()}"
         finally:
-            airstack_cmd("down", log_name=log)
+            airstack_cmd("down")
 
     def test_colcon_build_ms_airsim(self):
-        log = log_name()
         try:
             result = airstack_cmd(
                 "up", "ms-airsim",
                 env_overrides={"AUTOLAUNCH": "false", "DISPLAY": "",
                                "COMPOSE_PROFILES": "ms-airsim",
                                "URDF_FILE": "robot_descriptions/iris/urdf/iris_stereo.ms-airsim.urdf"},
-                timeout=120, log_name=log,
+                timeout=120,
             )
-            assert result.returncode == 0, f"airstack up failed:\n{read_log_tail(log)}"
+            assert result.returncode == 0, f"airstack up failed:\n{read_log_tail()}"
 
             container = wait_for_container("ms-airsim", timeout=60)
             assert container, "ms-airsim container not found"
@@ -56,8 +53,8 @@ def test_colcon_build_ms_airsim(self):
             build = docker_exec(
                 container,
                 "cd /root/ros_ws && colcon build --symlink-install",
-                timeout=600, log_name=log,
+                timeout=600,
             )
-            assert build.returncode == 0, f"colcon build failed:\n{read_log_tail(log)}"
+            assert build.returncode == 0, f"colcon build failed:\n{read_log_tail()}"
         finally:
-            airstack_cmd("down", log_name=log)
+            airstack_cmd("down")
diff --git a/tests/test_liveliness.py b/tests/test_liveliness.py
new file mode 100644
index 000000000..76024d4dd
--- /dev/null
+++ b/tests/test_liveliness.py
@@ -0,0 +1,291 @@
+"""Liveliness stress tests — parametrized over (sim × num_robots × iteration).
+
+Tests verify the stack is up, tmux processes are alive, sim is producing data,
+expected ROS2 nodes exist, and the system remains stable under a poll window.
+
+See plan (author-local path, outside this repository): /home/oalama/.claude/plans/piped-crafting-meteor-liveliness.md
+"""
+import time
+
+import pytest
+
+from conftest import (
+    container_running,
+    current_test_id,
+    docker_exec,
+    find_all_containers,
+    get_metrics,
+    get_robot_containers,
+    logger,
+    parallel_sample_hz,
+    ros2_exec,
+    wait_for_first_message,
+)
+
+
+# ── liveliness-specific topic list ─────────────────────────────────────────
+
+SENSOR_TOPIC_TEMPLATES = [
+    "/robot_{N}/sensors/front_stereo/left/image_rect",
+    "/robot_{N}/sensors/front_stereo/right/image_rect",
+    "/robot_{N}/sensors/front_stereo/left/depth_ground_truth",
+    "/robot_{N}/sensors/front_stereo/right/depth_ground_truth",
+]
+
+SENTINEL_NODE_TEMPLATES = [
+    "/robot_{N}/interface/mavros/mavros",
+    "/robot_{N}/robot_state_publisher",
+    "/robot_{N}/trajectory_controller/trajectory_control_node",
+]
+
+
+def sim_side_topics(num_robots):
+    """Return (topic, domain_id) tuples for all sim-side topics at a given robot count.
+
+    Both ms-airsim and Pegasus publish /clock per-robot-domain (ROS 2 has no
+    cross-domain discovery), so there's one /clock on each robot's domain.
+    """
+    topics = []
+    for n in range(1, num_robots + 1):
+        topics.append(("/clock", n))
+        for tmpl in SENSOR_TOPIC_TEMPLATES:
+            topics.append((tmpl.format(N=n), n))
+    return topics
+
+
+# ── probe helpers (shared between one-shot tests and test_stable) ──────────
+
+def _parse_panes(raw):
+    """Return (crashed, active_count). Input lines: 'session:window|pane_pid|title|kids'.
+
+    Crashed: pane with 0 descendants whose title isn't 'shell' (shell-tagged panes
+    are intentionally idle bash). Active: pane with at least one descendant.
+    """
+    crashed = []
+    active = 0
+    for line in raw.splitlines():
+        line = line.strip()
+        if not line:
+            continue
+        parts = line.split("|")
+        if len(parts) != 4:
+            continue
+        _, _, title, kids = parts
+        kid_count = int(kids.strip() or 0)
+        if kid_count == 0 and title != "shell":
+            crashed.append(line)
+        elif kid_count > 0:
+            active += 1
+    return crashed, active
+
+
+def _check_tmux_panes(env):
+    """Return (ok, msg). Zero-descendant pane (not a 'shell'-tagged idle bash) = crashed."""
+    # Format: session:window|pane_pid|pane_title|descendant_count. NOTE(review): without pipefail, rc is the loop's, not tmux's — confirm.
+    cmd = (
+        "tmux list-panes -a -F '#{session_name}:#{window_name}|#{pane_pid}|#{pane_title}' "
+        "| while IFS='|' read -r w pid t; do "
+        "kids=$(pgrep -P \"$pid\" | wc -l); "
+        "printf '%s|%s|%s|%s\\n' \"$w\" \"$pid\" \"$t\" \"$kids\"; "
+        "done"
+    )
+    counts = {}
+    sim_container = env["sim_container"]
+    logger.info("Listing tmux panes in %s", sim_container)
+    result = docker_exec(sim_container, cmd, timeout=10)
+    if result.returncode != 0:
+        return False, f"tmux list-panes failed in {sim_container}"
+    sim_crashed, sim_active = _parse_panes(result.stdout)
+    counts[sim_container] = sim_active
+    if sim_crashed:
+        logger.warning("Sim panes crashed in %s: %s", sim_container, sim_crashed)
+        return False, f"sim panes crashed: {sim_crashed}"
+
+    for rc in get_robot_containers(env["robot_pattern"]):
+        logger.info("Listing tmux panes in %s", rc)
+        r = docker_exec(rc, cmd, timeout=10)
+        if r.returncode != 0:
+            return False, f"tmux list-panes failed in {rc}"
+        rcrashed, ractive = _parse_panes(r.stdout)
+        counts[rc] = ractive
+        if rcrashed:
+            logger.warning("Robot %s panes crashed: %s", rc, rcrashed)
+            return False, f"robot {rc} panes crashed: {rcrashed}"
+    summary = ", ".join(f"{c}={n}" for c, n in counts.items())
+    logger.info("All tmux panes active (%s)", summary)
+    return True, f"all tmux panes active ({summary})"
+
+
+def _check_sentinel_nodes(env):
+    """Return (ok, msg). Expected sentinels per robot domain."""
+    cfg = env["cfg"]
+    robot_containers = get_robot_containers(env["robot_pattern"])
+    if len(robot_containers) < env["num_robots"]:
+        return False, f"only {len(robot_containers)}/{env['num_robots']} robot containers visible"
+    all_missing = {}
+    for n in range(1, env["num_robots"] + 1):
+        logger.info("Checking sentinel nodes for robot_%d on domain %d in %s",
+                    n, n, robot_containers[n - 1])
+        result = ros2_exec(
+            robot_containers[n - 1],
+            "ros2 node list 2>/dev/null",
+            domain_id=n, setup_bash=cfg["robot_setup_bash"], timeout=20,
+        )
+        if result.returncode != 0:
+            return False, f"ros2 node list failed for robot_{n}"
+        nodes = set(result.stdout.splitlines())
+        expected = {t.format(N=n) for t in SENTINEL_NODE_TEMPLATES}
+        missing = expected - nodes
+        if missing:
+            logger.warning("robot_%d missing nodes: %s", n, sorted(missing))
+            all_missing[f"robot_{n}"] = sorted(missing)
+    if all_missing:
+        return False, f"missing sentinel nodes: {all_missing}"
+    total = env["num_robots"] * len(SENTINEL_NODE_TEMPLATES)
+    logger.info("All %d sentinel nodes present", total)
+    return True, f"all {total} sentinel nodes present"
+
+
+def _check_sim_publishing(env):
+    """Parallel Hz sample of all sim-side topics. Returns (ok, msg, rates_dict)."""
+    cfg = env["cfg"]
+    topics = sim_side_topics(env["num_robots"])
+    logger.info("Sampling Hz for %d sim-side topics", len(topics))
+    rates = parallel_sample_hz(
+        env["sim_container"], topics,
+        setup_bash=cfg["sim_setup_bash"], duration=10, window=5,
+    )
+    stalled = [t for t, hz in rates.items() if hz is None or hz == 0.0]
+    if stalled:
+        logger.warning("Stalled topics: %s", stalled)
+        return False, f"stalled topics: {stalled}", rates
+    logger.info("%d topics healthy", len(rates))
+    return True, f"{len(rates)} topics healthy", rates
+
+
+# ── tests ──────────────────────────────────────────────────────────────────
+
+@pytest.mark.liveliness
+@pytest.mark.timeout(1800)
+class TestLiveliness:
+
+    def test_robot_containers_running(self, airstack_env):
+        """Wait up to 120s for N robot containers to be Running."""
+        num_robots = airstack_env["num_robots"]
+        pattern = airstack_env["robot_pattern"]
+
+        deadline = time.time() + 120
+        while time.time() < deadline:
+            containers = get_robot_containers(pattern)
+            if len(containers) >= num_robots and all(container_running(c) for c in containers):
+                return
+            time.sleep(3)
+        pytest.fail(f"only {len(get_robot_containers(pattern))}/{num_robots} robot "
+                    f"containers Running after 120s")
+
+    def test_sim_container_running(self, airstack_env):
+        sc = airstack_env["sim_container"]
+        deadline = time.time() + 120
+        while time.time() < deadline:
+            if container_running(sc):
+                return
+            time.sleep(3)
+        pytest.fail(f"{sc} not Running after 120s")
+
+    def test_gcs_container_running(self, airstack_env):
+        deadline = time.time() + 120
+        while time.time() < deadline:
+            names = find_all_containers("gcs")
+            if names and all(container_running(n) for n in names):
+                return
+            time.sleep(3)
+        pytest.fail("gcs container not Running after 120s")
+
+    def test_sim_ready_time(self, airstack_env):
+        """Wait for first /clock message from the sim container. 600s hard timeout.
+
+        /clock is published per-robot-domain (see sim_side_topics); we listen on
+        robot_1's domain. sim_ready_duration_s is measured from `up_started_at`.
+        """
+        cfg = airstack_env["cfg"]
+        m = get_metrics()
+        tid = current_test_id()
+        start = airstack_env["up_started_at"]
+
+        if wait_for_first_message(
+            airstack_env["sim_container"], "/clock",
+            domain_id=1, setup_bash=cfg["sim_setup_bash"], timeout=600,
+        ) is None:
+            m.record(tid, "sim_ready_duration_s", "timeout", unit="s")
+            pytest.fail("sim never published /clock within 600s")
+        m.record(tid, "sim_ready_duration_s", round(time.time() - start, 2), unit="s")
+
+    def test_tmux_panes_have_expected_processes(self, airstack_env):
+        ok, msg = _check_tmux_panes(airstack_env)
+        assert ok, msg
+
+    def test_sim_publishing(self, airstack_env):
+        """Parallel Hz sample of all sim-side topics. Fail on any stalled topic."""
+        ok, msg, _ = _check_sim_publishing(airstack_env)
+        assert ok, msg
+
+    def test_sentinel_nodes_present(self, airstack_env):
+        """Wait up to 300s for the expected sentinel nodes per robot."""
+        deadline = time.time() + 300
+        ok, msg = False, ""
+        while time.time() < deadline:
+            ok, msg = _check_sentinel_nodes(airstack_env)
+            if ok:
+                return
+            time.sleep(5)
+        pytest.fail(f"sentinel nodes not ready after 300s: {msg}")
+
+    def test_stable(self, airstack_env, request):
+        """Poll every --stable-interval for up to --stable-duration. Early exit on failure."""
+        duration = request.config.getoption("--stable-duration")
+        interval = request.config.getoption("--stable-interval")
+        m = get_metrics()
+        tid = current_test_id()
+
+        series = {}
+        elapsed = 0
+
+        while elapsed < duration:
+            time.sleep(interval)
+            elapsed += interval
+
+            ok_t, msg_t = _check_tmux_panes(airstack_env)
+            ok_n, msg_n = _check_sentinel_nodes(airstack_env)
+            ok_p, msg_p, rates = _check_sim_publishing(airstack_env)
+
+            for topic, hz in rates.items():
+                key = topic.lstrip("/").replace("/", ".")
+                series.setdefault(key, []).append({"t": elapsed, "hz": hz or 0.0})
+
+            if not (ok_t and ok_n and ok_p):
+                self._record_stable_aggregates(m, tid, series)
+                pytest.fail(
+                    f"instability at t={elapsed}s: tmux={msg_t} | "
+                    f"nodes={msg_n} | publishing={msg_p}"
+                )
+
+        self._record_stable_aggregates(m, tid, series)
+
+    @staticmethod
+    def _record_stable_aggregates(m, tid, series):
+        """Record aggregate stats + full time series per topic."""
+        for key, samples in series.items():
+            hz_values = [s["hz"] for s in samples]
+            if not hz_values:
+                continue
+            m.record_list(tid, f"{key}.hz_samples", samples)
+            m.record(tid, f"{key}.hz_first", hz_values[0], unit="Hz",
+                     direction="higher_is_better")
+            m.record(tid, f"{key}.hz_last", hz_values[-1], unit="Hz",
+                     direction="higher_is_better")
+            m.record(tid, f"{key}.hz_mean",
+                     round(sum(hz_values) / len(hz_values), 2),
+                     unit="Hz", direction="higher_is_better")
+            m.record(tid, f"{key}.hz_range",
+                     round(max(hz_values) - min(hz_values), 2),
+                     unit="Hz", direction="lower_is_better")

From 22e7fb81c339f056fac83d3f01da7a8448ebd5fe Mon Sep 17 00:00:00 2001
From: OasisArtisan <oalama@andrew.cmu.edu>
Date: Mon, 20 Apr 2026 11:07:33 -0400
Subject: [PATCH 08/24] Liveliness with properly grouped output table

---
 tests/compare_metrics.py | 445 +++++++++++++++++++++++++++++++--------
 tests/conftest.py        |  96 ++++-----
 tests/requirements.txt   |   2 +
 tests/test_liveliness.py | 134 ++++++------
 4 files changed, 475 insertions(+), 202 deletions(-)

diff --git a/tests/compare_metrics.py b/tests/compare_metrics.py
index a601a49a0..103a78c8f 100644
--- a/tests/compare_metrics.py
+++ b/tests/compare_metrics.py
@@ -10,10 +10,117 @@
 """
 import argparse
 import json
+import re
+import statistics
 import sys
 import xml.etree.ElementTree as ET
+from collections import defaultdict
 from pathlib import Path
 
+from tabulate import tabulate
+
+FLAG_SUFFIX = {"regression": " :red_circle:", "improved": " :green_circle:"}
+
+ITER_RE = re.compile(r"-iter(\d+)(?=\])")
+ROBOT_RE = re.compile(r"\brobot_\d+\b")
+HZ_AGGS = ("mean", "start_mean", "end_mean", "min", "max")
+HZ_METRIC_RE = re.compile(rf"^(.+)\.hz_({'|'.join(HZ_AGGS + ('samples',))})$")
+
+
+def _split_test_name(name):
+    """`test_liveliness.TestLiveliness.test_foo[id]` →
+    (module="test_liveliness", display="test_foo[id]"). Drops the Class segment
+    for display since there's one class per module."""
+    parts = name.split(".", 2)
+    if len(parts) == 3:
+        return parts[0], parts[2]
+    return parts[0], name
+
+
+def _aggregate_samples(series_list):
+    """Align per-iteration sample lists by `t` and return per-step mean/std.
+
+    Input: [[{"t": 10, "hz": 19.27}, ...], [{"t": 10, "hz": 45.67}, ...], ...]
+    Output: [{"t": 10, "hz_mean": 32.47, "hz_std": 13.2, "n": 2}, ...]
+    """
+    by_t = defaultdict(list)
+    for series in series_list:
+        for s in series:
+            t, hz = s.get("t"), s.get("hz")
+            if t is None or hz is None:
+                continue
+            by_t[t].append(hz)
+    out = []
+    for t in sorted(by_t):
+        vals = by_t[t]
+        out.append({
+            "t": t,
+            "hz_mean": round(statistics.mean(vals), 2),
+            "hz_std": round(statistics.pstdev(vals), 2) if len(vals) > 1 else 0.0,
+            "n": len(vals),
+        })
+    return out
+
+
+def _collapse_robots(merged):
+    """Merge per-robot metric keys into robot-agnostic ones (homogeneous robots).
+    `robot_1.sensors.foo.hz_samples` + `robot_2.sensors.foo.hz_samples` →
+    `robot.sensors.foo.hz_samples` with sample lists concatenated; any other
+    per-robot value keeps only the first one seen. Mutates `merged` in place."""
+    for metrics in merged.values():
+        merged_samples = {}  # new_key -> combined samples list
+        for key, val in list(metrics.items()):
+            if not ROBOT_RE.search(key):
+                continue
+            new_key = ROBOT_RE.sub("robot", key)
+            if new_key == key:
+                continue
+            metrics.pop(key)
+            if isinstance(val, dict) and "samples" in val:
+                merged_samples.setdefault(new_key, []).extend(val["samples"])
+            else:
+                metrics.setdefault(new_key, val)
+        for new_key, samples in merged_samples.items():
+            metrics[new_key] = {"samples": samples}
+
+
+def _expand_hz_samples(merged):
+    """For each `{topic}.hz_samples` time series, synthesize scalar aggregates
+    (hz_mean, hz_min, hz_max, hz_start_mean, hz_end_mean) as peer metrics without
+    overwriting keys that already exist. Mutates `merged` in place."""
+    suffix = ".hz_samples"
+    hz_up = {"unit": "Hz", "direction": "higher_is_better"}
+    for metrics in merged.values():
+        for key, val in list(metrics.items()):
+            if not (isinstance(val, dict) and "samples" in val and key.endswith(suffix)):
+                continue
+            samples = val["samples"]
+            if not samples:
+                continue
+            # Post-robot-merge, samples from different robots may interleave.
+            # Sort by t so start_mean/end_mean slicing lands on clean time halves.
+            samples = sorted(samples, key=lambda s: s["t"])
+            topic = key.removesuffix(suffix)
+            hz = [s["hz"] for s in samples]
+            ts = [s["t"] for s in samples]
+            half = len(samples) // 2 or 1
+            aggs = {
+                "hz_mean": {"value": round(statistics.mean(hz), 2), **hz_up},
+                "hz_min": {"value": min(hz), **hz_up},
+                "hz_max": {"value": max(hz), **hz_up},
+                "hz_start_mean": {
+                    "value": round(statistics.mean(hz[:half]), 2),
+                    "t_start": ts[0], "t_end": ts[half - 1], **hz_up,
+                },
+            }
+            if len(hz) > half:
+                aggs["hz_end_mean"] = {
+                    "value": round(statistics.mean(hz[half:]), 2),
+                    "t_start": ts[half], "t_end": ts[-1], **hz_up,
+                }
+            for agg_name, entry in aggs.items():
+                metrics.setdefault(f"{topic}.{agg_name}", entry)
+
 
 def parse_results_xml(path):
     tree = ET.parse(path)
@@ -48,7 +155,79 @@ def merge_metrics(run_dir):
         if test_name not in merged:
             merged[test_name] = {}
         merged[test_name].update(test_metrics)
-    return merged
+    _collapse_robots(merged)
+    _expand_hz_samples(merged)
+    return _collapse_iterations(merged)
+
+
+def _collapse_iterations(merged):
+    """Strip -iterN from test keys and aggregate metrics across iterations.
+
+    - Numeric scalars → `value` = mean, `stddev`, `n` (success count),
+      `total` (iter count), `failures` (sentinel count), `missing` (neither).
+      If no iteration produced a number, value = the first sentinel (e.g. "timeout").
+    - Time-series `samples` → aligned per `t` across iterations; output list
+      has `hz_mean`, `hz_std`, `n` per step.
+    """
+    numeric = {}  # (base, key) -> {"values": [...], "sentinels": [...], "meta": {...}}
+    series = {}   # (base, key) -> [samples_list, ...]
+    out = {}
+    iters_per_test = {}
+
+    for name, metrics in merged.items():
+        m = ITER_RE.search(name)
+        iter_n = int(m.group(1)) if m else 0
+        base = ITER_RE.sub("", name)
+        iters_per_test.setdefault(base, set()).add(iter_n)
+        bucket = out.setdefault(base, {})
+        for key, val in metrics.items():
+            if key == "status":
+                if val == "failed" or bucket.get("status") == "failed":
+                    bucket["status"] = "failed"
+                else:
+                    bucket["status"] = val
+                continue
+            if isinstance(val, dict) and "samples" in val:
+                series.setdefault((base, key), []).append(val["samples"])
+                continue
+            if isinstance(val, dict) and "value" in val:
+                acc = numeric.setdefault((base, key), {
+                    "values": [], "sentinels": [], "meta": {
+                        "unit": val.get("unit", ""),
+                        "direction": val.get("direction", "lower_is_better"),
+                    }})
+                for k, v2 in val.items():
+                    if k not in ("value", "unit", "direction", "samples"):
+                        acc["meta"].setdefault(k, v2)
+                v = val["value"]
+                (acc["values"] if isinstance(v, (int, float)) else acc["sentinels"]).append(v)
+                continue
+            bucket.setdefault(key, val)
+
+    for (base, key), acc in numeric.items():
+        total = len(iters_per_test[base])
+        nums, sentinels = acc["values"], acc["sentinels"]
+        entry = dict(acc["meta"])
+        entry["total"] = total
+        entry["failures"] = len(sentinels)
+        entry["missing"] = total - len(nums) - len(sentinels)
+        if nums:
+            entry["value"] = round(statistics.mean(nums), 3)
+            entry["stddev"] = round(statistics.pstdev(nums), 3) if len(nums) > 1 else 0.0
+            entry["n"] = len(nums)
+        elif sentinels:
+            entry["value"] = sentinels[0]
+            entry["n"] = 0
+        else:
+            continue
+        out[base][key] = entry
+
+    for (base, key), series_list in series.items():
+        out[base][key] = {
+            "samples": _aggregate_samples(series_list),
+            "n": len(series_list),
+        }
+    return out
 
 
 def _is_scored(entry):
@@ -68,110 +247,204 @@ def _is_scored(entry):
 
 
 def _fmt(entry):
-    """Format a metric entry for display. Handles sentinels, lists, and numbers."""
+    """Format a metric entry for display. For aggregated numeric metrics,
+    shows `mean ± stddev (n=success/total, F fail, M miss)`. Flags (failures,
+    missing) are only appended when non-zero."""
     if not isinstance(entry, dict):
         return str(entry)
     if "samples" in entry:
-        return f"[{len(entry['samples'])} samples]"
+        label = f"[{len(entry['samples'])} steps"
+        if entry.get("n", 1) > 1:
+            label += f" × n={entry['n']}"
+        return label + "]"
     if "value" not in entry:
         return "—"
     v = entry["value"]
     unit = entry.get("unit", "")
+    n = entry.get("n", 0)
+    total = entry.get("total", n)
+    failures = entry.get("failures", 0)
+    missing = entry.get("missing", 0)
+    stddev = entry.get("stddev")
+
+    def _context():
+        parts = [f"n={n}/{total}"] if total > 1 else []
+        if failures:
+            parts.append(f"{failures} fail")
+        if missing:
+            parts.append(f"{missing} miss")
+        return f" ({', '.join(parts)})" if parts else ""
+
     if isinstance(v, (int, float)):
-        return f"{v:.1f}{unit}"
-    return f"{v}{unit}" if unit else str(v)
+        base = f"{v:.4g}{unit}"
+        if total > 1 and stddev is not None:
+            base = f"{v:.4g}{unit} ± {stddev:.2g}"
+        return base + _context()
+    body = f"{v}{unit}" if unit else str(v)
+    return body + _context()
+
+
+def _score_pair(c, b, threshold):
+    """Compare two metric entries. Returns (baseline_fmt, current_fmt, change_str,
+    flag). flag in {"", "regression", "improved"}. Handles missing/sentinel/
+    time-series entries as well."""
+    if not c or not b:
+        return (_fmt(b) if b else "—",
+                _fmt(c) if c else "—",
+                "new" if c and not b else "removed", "")
+
+    # Non-scorable: time series or sentinel strings.
+    if not _is_scored(c) or not _is_scored(b):
+        flag = ""
+        cv = c.get("value") if isinstance(c, dict) else None
+        bv = b.get("value") if isinstance(b, dict) else None
+        # Timeout after previously-numeric baseline = regression.
+        if isinstance(cv, str) and cv == "timeout" and isinstance(bv, (int, float)):
+            flag = "regression"
+        return _fmt(b), _fmt(c), "—", flag
+
+    cv, bv = c["value"], b["value"]
+    direction = c.get("direction", "lower_is_better")
+    change_pct = ((cv - bv) / bv) * 100 if bv != 0 else 0
+    regressed = (direction == "lower_is_better" and change_pct > threshold) or \
+                (direction == "higher_is_better" and change_pct < -threshold)
+    improved = (direction == "lower_is_better" and change_pct < -threshold) or \
+               (direction == "higher_is_better" and change_pct > threshold)
+    flag = "regression" if regressed else ("improved" if improved else "")
+    return _fmt(b), _fmt(c), f"{change_pct:+.1f}%", flag
+
+
+def _hz_cell(c, b, threshold):
+    """Compact `base → curr (Δ%)` cell for the pivoted sensor table; becomes
+    `base → curr (t=A-Bs, Δ%)` when the current entry carries a t_start/t_end
+    window (start_mean/end_mean). Returns (cell_text, flag)."""
+    bfmt, cfmt, change, flag = _score_pair(c, b, threshold)
+    if not c or not b:
+        return (cfmt if c else bfmt, flag)
+    if not _is_scored(c) or not _is_scored(b):
+        return (f"{bfmt} → {cfmt}", flag)
+    b_short = f"{b['value']:.4g}"
+    c_short = f"{c['value']:.4g}"
+    annotations = [change]
+    t_start, t_end = c.get("t_start"), c.get("t_end")
+    if t_start is not None and t_end is not None:
+        annotations.insert(0, f"t={t_start}-{t_end}s")
+    return (f"{b_short} → {c_short} ({', '.join(annotations)})", flag)
 
 
 def compare(current, baseline, threshold):
-    rows = []
-    has_regression = False
-
-    all_tests = sorted(set(list(current.keys()) + list(baseline.keys())))
-    for test in all_tests:
+    """Split metrics into main table rows and sensor-rate pivot rows.
+
+    Returns (main_rows, hz_rows, hz_iter_counts, has_regression). Test execution
+    order (as written to results.xml / metrics.json) is preserved for grouping.
+    `hz_iter_counts[module] = (baseline_n, current_n)` records the iteration
+    count so format_markdown can annotate the sensor-rate section header."""
+    main_rows = []
+    hz_data = {}
+    hz_iter_counts = {}
+
+    ordered_tests = list(current) + [t for t in baseline if t not in current]
+    for test in ordered_tests:
+        module, display = _split_test_name(test)
         curr = current.get(test, {})
         base = baseline.get(test, {})
-
-        # Collect all metric keys (skip 'status')
-        metric_keys = sorted(set(
-            [k for k in curr if k != "status"] +
-            [k for k in base if k != "status"]
-        ))
+        metric_keys = [k for k in curr if k != "status"] + \
+                      [k for k in base if k != "status" and k not in curr]
         for key in metric_keys:
-            c = curr.get(key)
-            b = base.get(key)
-            if not c or not b:
-                rows.append({
-                    "test": test, "metric": key,
-                    "baseline": _fmt(b) if b else "—",
-                    "current": _fmt(c) if c else "—",
-                    "change": "new" if c and not b else "removed",
-                    "flag": "",
-                })
-                continue
-
-            # Non-scorable: time series, sentinels ("timeout"), strings. Show but don't score.
-            if not _is_scored(c) or not _is_scored(b):
-                flag = ""
-                # Special case: if current is "timeout" but baseline was numeric, it's a regression
-                cv = c.get("value") if isinstance(c, dict) else None
-                bv = b.get("value") if isinstance(b, dict) else None
-                if isinstance(cv, str) and cv == "timeout" and isinstance(bv, (int, float)):
-                    flag = "regression"
-                    has_regression = True
-                rows.append({
-                    "test": test, "metric": key,
-                    "baseline": _fmt(b),
-                    "current": _fmt(c),
-                    "change": "—",
-                    "flag": flag,
-                })
+            c, b = curr.get(key), base.get(key)
+            m = HZ_METRIC_RE.match(key)
+            if m:
+                topic, agg = m.group(1), m.group(2)
+                if agg == "samples":
+                    continue
+                hz_data.setdefault((test, module, display, topic), {})[agg] = \
+                    _hz_cell(c, b, threshold)
+                if module not in hz_iter_counts:
+                    b_n = b.get("total") if isinstance(b, dict) else None
+                    c_n = c.get("total") if isinstance(c, dict) else None
+                    hz_iter_counts[module] = (b_n, c_n)
                 continue
-
-            cv, bv = c["value"], b["value"]
-            direction = c.get("direction", "lower_is_better")
-
-            if bv != 0:
-                change_pct = ((cv - bv) / bv) * 100
-            else:
-                change_pct = 0
-
-            # Determine if this is a regression
-            regressed = (direction == "lower_is_better" and change_pct > threshold) or \
-                        (direction == "higher_is_better" and change_pct < -threshold)
-            improved = (direction == "lower_is_better" and change_pct < -threshold) or \
-                       (direction == "higher_is_better" and change_pct > threshold)
-
-            if regressed:
-                has_regression = True
-
-            rows.append({
-                "test": test, "metric": key,
-                "baseline": _fmt(b),
-                "current": _fmt(c),
-                "change": f"{change_pct:+.1f}%",
-                "flag": "regression" if regressed else ("improved" if improved else ""),
+            bfmt, cfmt, change, flag = _score_pair(c, b, threshold)
+            main_rows.append({
+                "module": module, "test": display, "metric": key,
+                "baseline": bfmt, "current": cfmt,
+                "change": change, "flag": flag,
             })
 
-    return rows, has_regression
-
-
-def format_markdown(rows, has_regression):
-    lines = [
-        "| Test | Metric | Baseline | Current | Change |",
-        "|------|--------|----------|---------|--------|",
+    hz_rows = [
+        {"module": module, "test": display, "topic": topic, "aggs": aggs}
+        for (_, module, display, topic), aggs in hz_data.items()
     ]
-    for r in rows:
-        change = r["change"]
-        if r["flag"] == "regression":
-            change += " :red_circle:"
-        elif r["flag"] == "improved":
-            change += " :green_circle:"
-        lines.append(f"| {r['test']} | {r['metric']} | {r['baseline']} | {r['current']} | {change} |")
+    has_regression = (
+        any(r["flag"] == "regression" for r in main_rows)
+        or any(flag == "regression" for r in hz_rows for _, flag in r["aggs"].values())
+    )
+    return main_rows, hz_rows, hz_iter_counts, has_regression
+
+
+def _hz_section_heading(baseline_n, current_n):
+    if baseline_n and current_n and baseline_n == current_n:
+        suffix = f"n={baseline_n} iterations; "
+    elif baseline_n or current_n:
+        suffix = f"baseline n={baseline_n}, current n={current_n}; "
+    else:
+        suffix = ""
+    return f"### Sim publishing rates ({suffix}baseline → current, per-topic)"
+
+
+def format_markdown(main_rows, hz_rows, hz_iter_counts, has_regression):
+    def hz_cell(pair):
+        if not pair:
+            return "—"
+        text, flag = pair
+        return text + FLAG_SUFFIX.get(flag, "")
+
+    modules = []
+    main_by_module = {}
+    hz_by_module = {}
+    for r in main_rows:
+        mod = r["module"]
+        if mod not in main_by_module:
+            main_by_module[mod] = []
+            if mod not in modules:
+                modules.append(mod)
+        main_by_module[mod].append(r)
+    for r in hz_rows:
+        mod = r["module"]
+        if mod not in hz_by_module:
+            hz_by_module[mod] = []
+            if mod not in modules:
+                modules.append(mod)
+        hz_by_module[mod].append(r)
+
+    sections = []
+    for mod in modules:
+        sub = [f"## {mod}"]
+        main = main_by_module.get(mod, [])
+        hz = hz_by_module.get(mod, [])
+        if main:
+            table = tabulate(
+                [[r["test"], r["metric"], r["baseline"], r["current"],
+                  r["change"] + FLAG_SUFFIX.get(r["flag"], "")] for r in main],
+                headers=["Test", "Metric", "Baseline", "Current", "Change"],
+                tablefmt="github",
+            )
+            sub.append("### Metrics\n\n" + table)
+        if hz:
+            table = tabulate(
+                [[r["test"], r["topic"]] + [hz_cell(r["aggs"].get(agg)) for agg in HZ_AGGS]
+                 for r in hz],
+                headers=["Test", "Topic", *HZ_AGGS],
+                tablefmt="github",
+            )
+            b_n, c_n = hz_iter_counts.get(mod, (None, None))
+            sub.append(_hz_section_heading(b_n, c_n) + "\n\n" + table)
+        sections.append("\n\n".join(sub))
 
     if has_regression:
-        lines += ["", "**Regression detected** — some metrics exceeded the threshold."]
+        sections.append("**Regression detected** — some metrics exceeded the threshold.")
 
-    return "\n".join(lines)
+    return "\n\n".join(sections)
 
 
 def main():
@@ -184,8 +457,8 @@ def main():
 
     current = merge_metrics(Path(args.current))
     baseline = merge_metrics(Path(args.baseline))
-    rows, has_regression = compare(current, baseline, args.threshold)
-    md = format_markdown(rows, has_regression)
+    main_rows, hz_rows, hz_iter_counts, has_regression = compare(current, baseline, args.threshold)
+    md = format_markdown(main_rows, hz_rows, hz_iter_counts, has_regression)
 
     print(md)
     if args.output:
diff --git a/tests/conftest.py b/tests/conftest.py
index 9ec4c40ab..20382efcf 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -11,6 +11,33 @@
 
 import pytest
 
+SIM_CONFIG = {
+    "msairsim": {
+        "profile": "ms-airsim",
+        "sim_container": "ms-airsim",
+        "sim_setup_bash": "/root/ros_ws/install/setup.bash",
+        "robot_setup_bash": "/root/AirStack/robot/ros_ws/install/setup.bash",
+        "extra_env": {
+            "URDF_FILE": "robot_descriptions/iris/urdf/iris_stereo.ms-airsim.urdf",
+            # Clear any user-set paths in .env so entrypoint auto-fetches Blocks.
+            # Shell env wins over --env-file in docker compose substitution.
+            "MS_AIRSIM_ENV_DIR": "",
+            "MS_AIRSIM_BINARY_PATH": "",
+        },
+    },
+    "isaacsim": {
+        "profile": "isaac-sim",
+        "sim_container": "isaac-sim",
+        "sim_setup_bash": "/opt/ros/jazzy/setup.bash",
+        "robot_setup_bash": "/root/AirStack/robot/ros_ws/install/setup.bash",
+        "extra_env": {
+            "ISAAC_SIM_USE_STANDALONE": "true",
+            "ISAAC_SIM_SCRIPT_NAME": "example_multi_px4_pegasus_launch_script.py",
+            "PLAY_SIM_ON_START": "true",
+        },
+    },
+}
+
 AIRSTACK_ROOT = os.environ.get("AIRSTACK_ROOT", str(Path(__file__).parent.parent))
 RUN_DIR = None
 LOGS_DIR = None
@@ -19,6 +46,8 @@
 # Track the currently-running pytest item so current_log() and current_test_id()
 # can pick up the parametrize id without tests having to pass `request` around.
 _CURRENT_ITEM = None
+METRICS = None
+
 
 logger = logging.getLogger("airstack")
 logger.setLevel(logging.INFO)
@@ -109,20 +138,21 @@ def pytest_generate_tests(metafunc):
 
 # ── logging / subprocess helpers ───────────────────────────────────────────
 
+def _nodeid_dotted(nodeid, with_path_sep=False):
+    """pytest nodeid → `module.Class.test_name[params]` form. When
+    `with_path_sep=True`, also flattens `/` in path prefixes (for log filenames)."""
+    out = nodeid.replace(".py::", ".").replace("::", ".")
+    return out.replace("/", ".") if with_path_sep else out
+
+
 def current_log():
     """Log name for the currently-running pytest item, or None outside a test.
 
     Subprocess helpers default to this so every call fired from a test auto-logs
-    to the right file without plumbing log_name through every layer.
-    """
+    to the right file without plumbing log_name through every layer."""
     if _CURRENT_ITEM is None:
         return None
-    # "test_liveliness.py::TestLiveliness::test_foo[id]" →
-    # "test_liveliness.TestLiveliness.test_foo[id]"
-    return (_CURRENT_ITEM.nodeid
-            .replace("/", ".")
-            .replace(".py::", ".")
-            .replace("::", "."))
+    return _nodeid_dotted(_CURRENT_ITEM.nodeid, with_path_sep=True)
 
 
 def read_log_tail(log_name=None, lines=50):
@@ -138,10 +168,7 @@ def read_log_tail(log_name=None, lines=50):
 
 def _run_teed(cmd_list, timeout, log_name=None, env=None, cwd=None):
     """Run a subprocess, teeing stdout+stderr live to the log file and
-    capturing them for parsing.
-
-    
-    """
+    capturing them for parsing."""
     log_name = log_name or current_log()
     if not log_name:
         return subprocess.run(cmd_list, capture_output=True, text=True,
@@ -260,12 +287,12 @@ def __init__(self, path):
         self._path = path
         self._data = json.loads(path.read_text()) if path.exists() else {}
 
-    def record(self, test_name, key, value, unit="", direction="lower_is_better"):
+    def record(self, test_name, key, value, unit="", direction="lower_is_better", **extra):
         if test_name not in self._data:
             self._data[test_name] = {}
-        self._data[test_name][key] = {
-            "value": value, "unit": unit, "direction": direction,
-        }
+        entry = {"value": value, "unit": unit, "direction": direction}
+        entry.update(extra)
+        self._data[test_name][key] = entry
         self._path.write_text(json.dumps(self._data, indent=2))
 
     def record_list(self, test_name, key, values):
@@ -275,10 +302,6 @@ def record_list(self, test_name, key, values):
         self._data[test_name][key] = {"samples": values}
         self._path.write_text(json.dumps(self._data, indent=2))
 
-
-METRICS = None
-
-
 def get_metrics():
     global METRICS
     if METRICS is None:
@@ -287,42 +310,15 @@ def get_metrics():
 
 
 def current_test_id():
-    """Full pytest test id for this test invocation — used as the metrics.json key."""
+    """Test id used as the metrics.json key. Matches JUnit XML's classname.name
+    format so compare_metrics.py can merge results.xml and metrics.json entries."""
     if _CURRENT_ITEM is None:
         return "unknown"
-    return _CURRENT_ITEM.nodeid
+    return _nodeid_dotted(_CURRENT_ITEM.nodeid)
 
 
 # ── shared sim test infrastructure (liveliness, comms, takeoff all reuse) ──
 
-SIM_CONFIG = {
-    "msairsim": {
-        "profile": "ms-airsim",
-        "sim_container": "ms-airsim",
-        "sim_setup_bash": "/root/ros_ws/install/setup.bash",
-        "robot_setup_bash": "/root/AirStack/robot/ros_ws/install/setup.bash",
-        "extra_env": {
-            "URDF_FILE": "robot_descriptions/iris/urdf/iris_stereo.ms-airsim.urdf",
-            # Clear any user-set paths in .env so entrypoint auto-fetches Blocks.
-            # Shell env wins over --env-file in docker compose substitution.
-            "MS_AIRSIM_ENV_DIR": "",
-            "MS_AIRSIM_BINARY_PATH": "",
-        },
-    },
-    "isaacsim": {
-        "profile": "isaac-sim",
-        "sim_container": "isaac-sim",
-        "sim_setup_bash": "/opt/ros/jazzy/setup.bash",
-        "robot_setup_bash": "/root/AirStack/robot/ros_ws/install/setup.bash",
-        "extra_env": {
-            "ISAAC_SIM_USE_STANDALONE": "true",
-            "ISAAC_SIM_SCRIPT_NAME": "example_multi_px4_pegasus_launch_script.py",
-            "PLAY_SIM_ON_START": "true",
-        },
-    },
-}
-
-
 def wait_for_first_message(container, topic, domain_id, setup_bash, timeout=60):
     """Wait up to `timeout` seconds for one message on `topic`. Returns seconds
     elapsed on success, None on timeout. Each attempt sources the workspace
diff --git a/tests/requirements.txt b/tests/requirements.txt
index a9f5ceeac..a5e676282 100644
--- a/tests/requirements.txt
+++ b/tests/requirements.txt
@@ -1,2 +1,4 @@
 pytest
 pytest-timeout
+pytest-dependency
+tabulate
diff --git a/tests/test_liveliness.py b/tests/test_liveliness.py
index 76024d4dd..0da6a06ed 100644
--- a/tests/test_liveliness.py
+++ b/tests/test_liveliness.py
@@ -165,42 +165,53 @@ def _check_sim_publishing(env):
 
 # ── tests ──────────────────────────────────────────────────────────────────
 
+def _poll_until(predicate, timeout, interval, fail_msg):
+    """Sleep-poll `predicate` up to `timeout` seconds. On deadline, fail via
+    `pytest.fail` with `fail_msg` (str) or `fail_msg()` (callable), so the
+    message can reflect predicate-collected state."""
+    deadline = time.time() + timeout
+    while time.time() < deadline:
+        if predicate():
+            return
+        time.sleep(interval)
+    pytest.fail(fail_msg() if callable(fail_msg) else fail_msg)
+
+
 @pytest.mark.liveliness
 @pytest.mark.timeout(1800)
 class TestLiveliness:
 
+    @pytest.mark.dependency(name="containers")
     def test_robot_containers_running(self, airstack_env):
         """Wait up to 120s for N robot containers to be Running."""
         num_robots = airstack_env["num_robots"]
         pattern = airstack_env["robot_pattern"]
 
-        deadline = time.time() + 120
-        while time.time() < deadline:
+        def ready():
             containers = get_robot_containers(pattern)
-            if len(containers) >= num_robots and all(container_running(c) for c in containers):
-                return
-            time.sleep(3)
-        pytest.fail(f"only {len(get_robot_containers(pattern))}/{num_robots} robot "
-                    f"containers Running after 120s")
+            return (len(containers) >= num_robots
+                    and all(container_running(c) for c in containers))
+
+        _poll_until(ready, timeout=120, interval=3,
+                    fail_msg=lambda: f"only {len(get_robot_containers(pattern))}/"
+                                     f"{num_robots} robot containers Running after 120s")
 
+    @pytest.mark.dependency(name="sim_container", depends=["containers"])
     def test_sim_container_running(self, airstack_env):
         sc = airstack_env["sim_container"]
-        deadline = time.time() + 120
-        while time.time() < deadline:
-            if container_running(sc):
-                return
-            time.sleep(3)
-        pytest.fail(f"{sc} not Running after 120s")
+        _poll_until(lambda: container_running(sc),
+                    timeout=120, interval=3,
+                    fail_msg=f"{sc} not Running after 120s")
 
+    @pytest.mark.dependency(depends=["containers"])
     def test_gcs_container_running(self, airstack_env):
-        deadline = time.time() + 120
-        while time.time() < deadline:
+        def ready():
             names = find_all_containers("gcs")
-            if names and all(container_running(n) for n in names):
-                return
-            time.sleep(3)
-        pytest.fail("gcs container not Running after 120s")
+            return bool(names) and all(container_running(n) for n in names)
+        _poll_until(ready, timeout=120, interval=3,
+                    fail_msg="gcs container not Running after 120s")
 
+    @pytest.mark.dependency(name="sim_ready", depends=["sim_container"])
     def test_sim_ready_time(self, airstack_env):
         """Wait for first /clock message from the sim container. 600s hard timeout.
 
@@ -220,28 +231,36 @@ def test_sim_ready_time(self, airstack_env):
             pytest.fail("sim never published /clock within 600s")
         m.record(tid, "sim_ready_duration_s", round(time.time() - start, 2), unit="s")
 
+    @pytest.mark.dependency(name="tmux", depends=["containers"])
     def test_tmux_panes_have_expected_processes(self, airstack_env):
         ok, msg = _check_tmux_panes(airstack_env)
         assert ok, msg
 
+    @pytest.mark.dependency(name="sim_publishing", depends=["sim_ready"])
     def test_sim_publishing(self, airstack_env):
         """Parallel Hz sample of all sim-side topics. Fail on any stalled topic."""
         ok, msg, _ = _check_sim_publishing(airstack_env)
         assert ok, msg
 
+    @pytest.mark.dependency(name="nodes", depends=["containers"])
     def test_sentinel_nodes_present(self, airstack_env):
         """Wait up to 300s for the expected sentinel nodes per robot."""
-        deadline = time.time() + 300
-        ok, msg = False, ""
-        while time.time() < deadline:
+        last_msg = [""]
+
+        def ready():
             ok, msg = _check_sentinel_nodes(airstack_env)
-            if ok:
-                return
-            time.sleep(5)
-        pytest.fail(f"sentinel nodes not ready after 300s: {msg}")
+            last_msg[0] = msg
+            return ok
+
+        _poll_until(ready, timeout=300, interval=5,
+                    fail_msg=lambda: f"sentinel nodes not ready after 300s: {last_msg[0]}")
 
+    @pytest.mark.dependency(depends=["sim_ready", "sim_publishing", "nodes", "tmux"])
     def test_stable(self, airstack_env, request):
-        """Poll every --stable-interval for up to --stable-duration. Early exit on failure."""
+        """Poll every --stable-interval for up to --stable-duration. Early exit on failure.
+
+        Only the raw hz time series is recorded per topic; compare_metrics.py
+        derives mean/min/max/start_mean/end_mean from it."""
         duration = request.config.getoption("--stable-duration")
         interval = request.config.getoption("--stable-interval")
         m = get_metrics()
@@ -250,42 +269,25 @@ def test_stable(self, airstack_env, request):
         series = {}
         elapsed = 0
 
-        while elapsed < duration:
-            time.sleep(interval)
-            elapsed += interval
-
-            ok_t, msg_t = _check_tmux_panes(airstack_env)
-            ok_n, msg_n = _check_sentinel_nodes(airstack_env)
-            ok_p, msg_p, rates = _check_sim_publishing(airstack_env)
-
-            for topic, hz in rates.items():
-                key = topic.lstrip("/").replace("/", ".")
-                series.setdefault(key, []).append({"t": elapsed, "hz": hz or 0.0})
-
-            if not (ok_t and ok_n and ok_p):
-                self._record_stable_aggregates(m, tid, series)
-                pytest.fail(
-                    f"instability at t={elapsed}s: tmux={msg_t} | "
-                    f"nodes={msg_n} | publishing={msg_p}"
-                )
-
-        self._record_stable_aggregates(m, tid, series)
-
-    @staticmethod
-    def _record_stable_aggregates(m, tid, series):
-        """Record aggregate stats + full time series per topic."""
-        for key, samples in series.items():
-            hz_values = [s["hz"] for s in samples]
-            if not hz_values:
-                continue
-            m.record_list(tid, f"{key}.hz_samples", samples)
-            m.record(tid, f"{key}.hz_first", hz_values[0], unit="Hz",
-                     direction="higher_is_better")
-            m.record(tid, f"{key}.hz_last", hz_values[-1], unit="Hz",
-                     direction="higher_is_better")
-            m.record(tid, f"{key}.hz_mean",
-                     round(sum(hz_values) / len(hz_values), 2),
-                     unit="Hz", direction="higher_is_better")
-            m.record(tid, f"{key}.hz_range",
-                     round(max(hz_values) - min(hz_values), 2),
-                     unit="Hz", direction="lower_is_better")
+        try:
+            while elapsed < duration:
+                time.sleep(interval)
+                elapsed += interval
+
+                ok_t, msg_t = _check_tmux_panes(airstack_env)
+                ok_n, msg_n = _check_sentinel_nodes(airstack_env)
+                ok_p, msg_p, rates = _check_sim_publishing(airstack_env)
+
+                for topic, hz in rates.items():
+                    key = topic.lstrip("/").replace("/", ".")
+                    series.setdefault(key, []).append({"t": elapsed, "hz": hz or 0.0})
+
+                if not (ok_t and ok_n and ok_p):
+                    pytest.fail(
+                        f"instability at t={elapsed}s: tmux={msg_t} | "
+                        f"nodes={msg_n} | publishing={msg_p}"
+                    )
+        finally:
+            for key, samples in series.items():
+                if samples:
+                    m.record_list(tid, f"{key}.hz_samples", samples)

From 21a8b09c74e622ee91daa983a2daf65bd43e938a Mon Sep 17 00:00:00 2001
From: OasisArtisan <oalama@andrew.cmu.edu>
Date: Mon, 20 Apr 2026 12:44:59 -0400
Subject: [PATCH 09/24] Add compute usage logging to liveliness

---
 tests/compare_metrics.py | 252 +++++++++++++++++++++++++--------------
 tests/conftest.py        |  74 ++++++++++++
 tests/requirements.txt   |   1 +
 tests/test_liveliness.py |  35 +++++-
 4 files changed, 272 insertions(+), 90 deletions(-)

diff --git a/tests/compare_metrics.py b/tests/compare_metrics.py
index 103a78c8f..55a05ce05 100644
--- a/tests/compare_metrics.py
+++ b/tests/compare_metrics.py
@@ -23,8 +23,27 @@
 
 ITER_RE = re.compile(r"-iter(\d+)(?=\])")
 ROBOT_RE = re.compile(r"\brobot_\d+\b")
-HZ_AGGS = ("mean", "start_mean", "end_mean", "min", "max")
-HZ_METRIC_RE = re.compile(rf"^(.+)\.hz_({'|'.join(HZ_AGGS + ('samples',))})$")
+AGGS = ("mean", "start_mean", "end_mean", "min", "max")
+
+# Time-series metrics recorded as `{prefix}.{type}_samples`. Each entry here
+# declares the display unit + regression direction for the derived aggregates.
+SAMPLE_TYPES = {
+    "hz": {"unit": "Hz", "direction": "higher_is_better"},
+    "cpu_pct": {"unit": "%", "direction": "lower_is_better"},
+    "mem_mb": {"unit": "MB", "direction": "lower_is_better"},
+    "disk_io_mb": {"unit": "MB", "direction": "lower_is_better"},
+    "net_io_mb": {"unit": "MB", "direction": "lower_is_better"},
+    "gpu_pct": {"unit": "%", "direction": "lower_is_better"},
+    "vram_mb": {"unit": "MB", "direction": "lower_is_better"},
+    "gpu_temp_c": {"unit": "°C", "direction": "lower_is_better"},
+    "gpu_power_w": {"unit": "W", "direction": "lower_is_better"},
+}
+COMPUTE_TYPES = tuple(k for k in SAMPLE_TYPES if k != "hz")
+
+HZ_METRIC_RE = re.compile(rf"^(.+)\.hz_({'|'.join(AGGS + ('samples',))})$")
+COMPUTE_METRIC_RE = re.compile(
+    rf"^(.+)\.({'|'.join(COMPUTE_TYPES)})_({'|'.join(AGGS + ('samples',))})$"
+)
 
 
 def _split_test_name(name):
@@ -40,39 +59,53 @@ def _split_test_name(name):
 def _aggregate_samples(series_list):
     """Align per-iteration sample lists by `t` and return per-step mean/std.
 
-    Input: [[{"t": 10, "hz": 19.27}, ...], [{"t": 10, "hz": 45.67}, ...], ...]
-    Output: [{"t": 10, "hz_mean": 32.5, "hz_std": 13.2, "n": 2}, ...]
+    Input: [[{"t": 10, "value": 19.27}, ...], [{"t": 10, "value": 45.67}, ...], ...]
+    Output: [{"t": 10, "mean": 32.5, "std": 13.2, "n": 2}, ...]
     """
     by_t = defaultdict(list)
     for series in series_list:
         for s in series:
-            t, hz = s.get("t"), s.get("hz")
-            if t is None or hz is None:
+            t, v = s.get("t"), s.get("value", s.get("hz"))
+            if t is None or v is None:
                 continue
-            by_t[t].append(hz)
+            by_t[t].append(v)
     out = []
     for t in sorted(by_t):
         vals = by_t[t]
         out.append({
             "t": t,
-            "hz_mean": round(statistics.mean(vals), 2),
-            "hz_std": round(statistics.pstdev(vals), 2) if len(vals) > 1 else 0.0,
+            "mean": round(statistics.mean(vals), 2),
+            "std": round(statistics.pstdev(vals), 2) if len(vals) > 1 else 0.0,
             "n": len(vals),
         })
     return out
 
 
+def _collapse_robot_topic_names(key):
+    """`robot_1.sensors.foo` → `robot.sensors.foo`. No-op for keys without a
+    topic-style `robot_N` segment."""
+    return ROBOT_RE.sub("robot", key)
+
+
+def _collapse_robot_container_names(key):
+    """`airstack-robot-desktop-1.cpu_pct` → `airstack-robot-desktop.cpu_pct`.
+    Strips a docker-compose `-N` suffix from the first dotted segment only,
+    which is where a container name lives in our metric keys."""
+    first, dot, rest = key.partition(".")
+    first = re.sub(r"-\d+$", "", first)
+    return f"{first}{dot}{rest}"
+
+
 def _collapse_robots(merged):
     """Merge per-robot metric keys into robot-agnostic ones (homogeneous robots).
-    `robot_1.sensors.foo.hz_samples` + `robot_2.sensors.foo.hz_samples` →
-    `robot.sensors.foo.hz_samples` with sample lists concatenated. Mutates
-    `merged` in place."""
+    Both topic-style (`robot_N`) and container-style (`-N` replica suffix)
+    naming schemes collapse to the same base. Sample lists are concatenated on
+    collision."""
     for metrics in merged.values():
-        merged_samples = {}  # new_key -> combined samples list
+        merged_samples = {}
         for key, val in list(metrics.items()):
-            if not ROBOT_RE.search(key):
-                continue
-            new_key = ROBOT_RE.sub("robot", key)
+            new_key = _collapse_robot_container_names(
+                _collapse_robot_topic_names(key))
             if new_key == key:
                 continue
             metrics.pop(key)
@@ -84,42 +117,46 @@ def _collapse_robots(merged):
             metrics[new_key] = {"samples": samples}
 
 
-def _expand_hz_samples(merged):
-    """For each `{topic}.hz_samples` time series, synthesize scalar aggregates
-    (hz_mean, hz_min, hz_max, hz_start_mean, hz_end_mean) as peer metrics.
-    Mutates `merged` in place."""
-    suffix = ".hz_samples"
-    hz_up = {"unit": "Hz", "direction": "higher_is_better"}
+def _expand_time_series(merged):
+    """For each `{prefix}.{type}_samples` time series (type ∈ SAMPLE_TYPES),
+    synthesize scalar aggregates ({type}_{mean,min,max,start_mean,end_mean})
+    as peer metrics. Mutates `merged` in place."""
     for metrics in merged.values():
         for key, val in list(metrics.items()):
-            if not (isinstance(val, dict) and "samples" in val and key.endswith(suffix)):
+            if not (isinstance(val, dict) and "samples" in val
+                    and key.endswith("_samples")):
+                continue
+            stem = key.removesuffix("_samples")
+            sample_type = next(
+                (t for t in SAMPLE_TYPES if stem.endswith(f".{t}")), None)
+            if sample_type is None:
                 continue
             samples = val["samples"]
             if not samples:
                 continue
-            # Post-robot-merge, samples from different robots may interleave.
-            # Sort by t so start_mean/end_mean slicing lands on clean time halves.
+            # Post-collapse, samples from different robots/containers may
+            # interleave. Sort by t so start_mean/end_mean land on clean halves.
             samples = sorted(samples, key=lambda s: s["t"])
-            topic = key.removesuffix(suffix)
-            hz = [s["hz"] for s in samples]
+            meta = SAMPLE_TYPES[sample_type]
+            vals = [s.get("value", s.get("hz")) for s in samples]
             ts = [s["t"] for s in samples]
             half = len(samples) // 2 or 1
             aggs = {
-                "hz_mean": {"value": round(statistics.mean(hz), 2), **hz_up},
-                "hz_min": {"value": min(hz), **hz_up},
-                "hz_max": {"value": max(hz), **hz_up},
-                "hz_start_mean": {
-                    "value": round(statistics.mean(hz[:half]), 2),
-                    "t_start": ts[0], "t_end": ts[half - 1], **hz_up,
+                "mean": {"value": round(statistics.mean(vals), 2), **meta},
+                "min": {"value": min(vals), **meta},
+                "max": {"value": max(vals), **meta},
+                "start_mean": {
+                    "value": round(statistics.mean(vals[:half]), 2),
+                    "t_start": ts[0], "t_end": ts[half - 1], **meta,
                 },
             }
-            if len(hz) > half:
-                aggs["hz_end_mean"] = {
-                    "value": round(statistics.mean(hz[half:]), 2),
-                    "t_start": ts[half], "t_end": ts[-1], **hz_up,
+            if len(vals) > half:
+                aggs["end_mean"] = {
+                    "value": round(statistics.mean(vals[half:]), 2),
+                    "t_start": ts[half], "t_end": ts[-1], **meta,
                 }
             for agg_name, entry in aggs.items():
-                metrics.setdefault(f"{topic}.{agg_name}", entry)
+                metrics.setdefault(f"{stem}_{agg_name}", entry)
 
 
 def parse_results_xml(path):
@@ -156,7 +193,7 @@ def merge_metrics(run_dir):
             merged[test_name] = {}
         merged[test_name].update(test_metrics)
     _collapse_robots(merged)
-    _expand_hz_samples(merged)
+    _expand_time_series(merged)
     return _collapse_iterations(merged)
 
 
@@ -333,15 +370,19 @@ def _hz_cell(c, b, threshold):
 
 
 def compare(current, baseline, threshold):
-    """Split metrics into main table rows and sensor-rate pivot rows.
-
-    Returns (main_rows, hz_rows, hz_iter_counts, has_regression). Test execution
-    order (as written to results.xml / metrics.json) is preserved for grouping.
-    `hz_iter_counts[module] = (baseline_n, current_n)` records the iteration
-    count so format_markdown can annotate the sensor-rate section header."""
+    """Split metrics into three groups: flat rows, hz pivot rows, compute pivot
+    rows. Test execution order is preserved for grouping. `iter_counts[module]`
+    records the iteration count for annotating pivot-table headers."""
     main_rows = []
-    hz_data = {}
-    hz_iter_counts = {}
+    hz_data = {}        # (test, module, display, topic) → {agg: (text, flag)}
+    compute_data = {}   # (test, module, display, entity, metric_type) → {agg: (text, flag)}
+    iter_counts = {}    # module → (baseline_n, current_n)
+
+    def note_iters(module, c, b):
+        if module not in iter_counts:
+            b_n = b.get("total") if isinstance(b, dict) else None
+            c_n = c.get("total") if isinstance(c, dict) else None
+            iter_counts[module] = (b_n, c_n)
 
     ordered_tests = list(current) + [t for t in baseline if t not in current]
     for test in ordered_tests:
@@ -352,18 +393,28 @@ def compare(current, baseline, threshold):
                       [k for k in base if k != "status" and k not in curr]
         for key in metric_keys:
             c, b = curr.get(key), base.get(key)
-            m = HZ_METRIC_RE.match(key)
-            if m:
-                topic, agg = m.group(1), m.group(2)
+
+            hz_m = HZ_METRIC_RE.match(key)
+            if hz_m:
+                topic, agg = hz_m.group(1), hz_m.group(2)
                 if agg == "samples":
                     continue
                 hz_data.setdefault((test, module, display, topic), {})[agg] = \
                     _hz_cell(c, b, threshold)
-                if module not in hz_iter_counts:
-                    b_n = b.get("total") if isinstance(b, dict) else None
-                    c_n = c.get("total") if isinstance(c, dict) else None
-                    hz_iter_counts[module] = (b_n, c_n)
+                note_iters(module, c, b)
                 continue
+
+            compute_m = COMPUTE_METRIC_RE.match(key)
+            if compute_m:
+                entity, metric_type, agg = compute_m.group(1), compute_m.group(2), compute_m.group(3)
+                if agg == "samples":
+                    continue
+                compute_data.setdefault(
+                    (test, module, display, entity, metric_type), {}
+                )[agg] = _hz_cell(c, b, threshold)
+                note_iters(module, c, b)
+                continue
+
             bfmt, cfmt, change, flag = _score_pair(c, b, threshold)
             main_rows.append({
                 "module": module, "test": display, "metric": key,
@@ -375,53 +426,65 @@ def compare(current, baseline, threshold):
         {"module": module, "test": display, "topic": topic, "aggs": aggs}
         for (_, module, display, topic), aggs in hz_data.items()
     ]
+    compute_rows = [
+        {"module": module, "test": display, "entity": entity,
+         "metric_type": metric_type, "aggs": aggs}
+        for (_, module, display, entity, metric_type), aggs in compute_data.items()
+    ]
     has_regression = (
         any(r["flag"] == "regression" for r in main_rows)
         or any(flag == "regression" for r in hz_rows for _, flag in r["aggs"].values())
+        or any(flag == "regression" for r in compute_rows for _, flag in r["aggs"].values())
     )
-    return main_rows, hz_rows, hz_iter_counts, has_regression
+    return main_rows, hz_rows, compute_rows, iter_counts, has_regression
 
 
-def _hz_section_heading(baseline_n, current_n):
+def _iter_annotation(baseline_n, current_n):
     if baseline_n and current_n and baseline_n == current_n:
-        suffix = f"n={baseline_n} iterations; "
-    elif baseline_n or current_n:
-        suffix = f"baseline n={baseline_n}, current n={current_n}; "
-    else:
-        suffix = ""
-    return f"### Sim publishing rates ({suffix}baseline → current, per-topic)"
+        return f"n={baseline_n} iterations; "
+    if baseline_n or current_n:
+        return f"baseline n={baseline_n}, current n={current_n}; "
+    return ""
 
 
-def format_markdown(main_rows, hz_rows, hz_iter_counts, has_regression):
-    def hz_cell(pair):
-        if not pair:
-            return "—"
-        text, flag = pair
-        return text + FLAG_SUFFIX.get(flag, "")
+def _pivot_cell(pair):
+    if not pair:
+        return "—"
+    text, flag = pair
+    return text + FLAG_SUFFIX.get(flag, "")
+
 
+def _group_by_module(rows):
     modules = []
-    main_by_module = {}
-    hz_by_module = {}
-    for r in main_rows:
+    grouped = {}
+    for r in rows:
         mod = r["module"]
-        if mod not in main_by_module:
-            main_by_module[mod] = []
+        if mod not in grouped:
+            grouped[mod] = []
             if mod not in modules:
                 modules.append(mod)
-        main_by_module[mod].append(r)
-    for r in hz_rows:
-        mod = r["module"]
-        if mod not in hz_by_module:
-            hz_by_module[mod] = []
-            if mod not in modules:
-                modules.append(mod)
-        hz_by_module[mod].append(r)
+        grouped[mod].append(r)
+    return modules, grouped
+
+
+def format_markdown(main_rows, hz_rows, compute_rows, iter_counts, has_regression):
+    main_mods, main_by_module = _group_by_module(main_rows)
+    hz_mods, hz_by_module = _group_by_module(hz_rows)
+    compute_mods, compute_by_module = _group_by_module(compute_rows)
+    modules = []
+    for m in main_mods + hz_mods + compute_mods:
+        if m not in modules:
+            modules.append(m)
 
     sections = []
     for mod in modules:
         sub = [f"## {mod}"]
         main = main_by_module.get(mod, [])
         hz = hz_by_module.get(mod, [])
+        compute = compute_by_module.get(mod, [])
+        b_n, c_n = iter_counts.get(mod, (None, None))
+        annotation = _iter_annotation(b_n, c_n)
+
         if main:
             table = tabulate(
                 [[r["test"], r["metric"], r["baseline"], r["current"],
@@ -432,13 +495,27 @@ def hz_cell(pair):
             sub.append("### Metrics\n\n" + table)
         if hz:
             table = tabulate(
-                [[r["test"], r["topic"]] + [hz_cell(r["aggs"].get(agg)) for agg in HZ_AGGS]
+                [[r["test"], r["topic"]] + [_pivot_cell(r["aggs"].get(agg)) for agg in AGGS]
                  for r in hz],
-                headers=["Test", "Topic", *HZ_AGGS],
+                headers=["Test", "Topic", *AGGS],
                 tablefmt="github",
             )
-            b_n, c_n = hz_iter_counts.get(mod, (None, None))
-            sub.append(_hz_section_heading(b_n, c_n) + "\n\n" + table)
+            sub.append(
+                f"### Sim publishing rates ({annotation}baseline → current, per-topic)\n\n"
+                + table
+            )
+        if compute:
+            table = tabulate(
+                [[r["test"], r["entity"], r["metric_type"]]
+                 + [_pivot_cell(r["aggs"].get(agg)) for agg in AGGS]
+                 for r in compute],
+                headers=["Test", "Entity", "Metric", *AGGS],
+                tablefmt="github",
+            )
+            sub.append(
+                f"### Compute usage ({annotation}baseline → current, "
+                f"per-container and global)\n\n" + table
+            )
         sections.append("\n\n".join(sub))
 
     if has_regression:
@@ -457,8 +534,9 @@ def main():
 
     current = merge_metrics(Path(args.current))
     baseline = merge_metrics(Path(args.baseline))
-    main_rows, hz_rows, hz_iter_counts, has_regression = compare(current, baseline, args.threshold)
-    md = format_markdown(main_rows, hz_rows, hz_iter_counts, has_regression)
+    main_rows, hz_rows, compute_rows, iter_counts, has_regression = compare(
+        current, baseline, args.threshold)
+    md = format_markdown(main_rows, hz_rows, compute_rows, iter_counts, has_regression)
 
     print(md)
     if args.output:
diff --git a/tests/conftest.py b/tests/conftest.py
index 20382efcf..42ccf35e2 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -259,6 +259,80 @@ def wait_for_container(name_pattern, timeout=120):
     raise TimeoutError(f"Container matching '{name_pattern}' not running after {timeout}s")
 
 
+# ── compute-usage sampling ─────────────────────────────────────────────────
+
+_BYTES_RE = re.compile(r"([\d.]+)\s*([kKMGT]?i?B)$")  # docker prints decimal kilo as lowercase "kB"
+_BYTES_TO_MB = {
+    "B": 1 / (1024 * 1024),
+    "KiB": 1 / 1024, "KB": 1 / 1000, "kB": 1 / 1000,
+    "MiB": 1, "MB": 1,
+    "GiB": 1024, "GB": 1000,
+    "TiB": 1024 * 1024, "TB": 1_000_000,
+}
+
+
+def _parse_docker_bytes(s):
+    """Parse a docker-stats size string (e.g. '123.4MiB', '1.2kB', '0B') to MB; 0.0 if unparseable."""
+    m = _BYTES_RE.match((s or "").strip())
+    if not m:
+        return 0.0
+    return float(m.group(1)) * _BYTES_TO_MB.get(m.group(2), 1)
+
+
+def sample_compute_usage(sim_container):
+    """Snapshot of compute resources: per-container CPU/mem/disk-IO/net-IO plus
+    global host CPU/mem and GPU util/VRAM/temp/power. Returns {key: value},
+    keys shaped `{entity}.{metric}` where entity is the full container name or
+    'host'. Per-robot replicas (e.g. airstack-robot-desktop-1/2/3) are kept
+    distinct so raw metrics.json preserves per-robot data; compare_metrics
+    pools them at report time. Silently omits metrics that fail to sample."""
+    import psutil
+
+    out = {}
+
+    stats = _run_teed(
+        ["docker", "stats", "--no-stream", "--format", "{{json .}}"],
+        timeout=20,
+    )
+    for line in stats.stdout.strip().splitlines():
+        try:
+            d = json.loads(line)
+        except json.JSONDecodeError:
+            continue
+        name = d.get("Name", "")
+        if not name or name.startswith("docker-test-run"):
+            continue
+        out[f"{name}.cpu_pct"] = float(d.get("CPUPerc", "0%").rstrip("%") or 0)
+        mem_raw = d.get("MemUsage", "").split("/")[0].strip()
+        out[f"{name}.mem_mb"] = _parse_docker_bytes(mem_raw)
+        for io_field, metric in (("BlockIO", "disk_io_mb"), ("NetIO", "net_io_mb")):
+            parts = (d.get(io_field, "") or "").split("/")
+            total = sum(_parse_docker_bytes(p.strip()) for p in parts)
+            out[f"{name}.{metric}"] = total
+
+    out["host.cpu_pct"] = psutil.cpu_percent(interval=0.5)
+    out["host.mem_mb"] = psutil.virtual_memory().used / (1024 * 1024)
+
+    gpu = _run_teed(
+        ["docker", "exec", sim_container, "nvidia-smi",
+         "--query-gpu=utilization.gpu,memory.used,temperature.gpu,power.draw",
+         "--format=csv,noheader,nounits"],
+        timeout=10,
+    )
+    if gpu.returncode == 0 and gpu.stdout.strip():
+        fields = [f.strip() for f in gpu.stdout.strip().splitlines()[0].split(",")]
+        if len(fields) >= 4:
+            try:
+                out["host.gpu_pct"] = float(fields[0])
+                out["host.vram_mb"] = float(fields[1])
+                out["host.gpu_temp_c"] = float(fields[2])
+                out["host.gpu_power_w"] = float(fields[3])
+            except ValueError:
+                pass
+
+    return out
+
+
 def docker_image_size_mb(service, env=None):
     compose_env = os.environ.copy()
     if env:
diff --git a/tests/requirements.txt b/tests/requirements.txt
index a5e676282..382731834 100644
--- a/tests/requirements.txt
+++ b/tests/requirements.txt
@@ -2,3 +2,4 @@ pytest
 pytest-timeout
 pytest-dependency
 tabulate
+psutil
diff --git a/tests/test_liveliness.py b/tests/test_liveliness.py
index 0da6a06ed..731de7581 100644
--- a/tests/test_liveliness.py
+++ b/tests/test_liveliness.py
@@ -19,6 +19,7 @@
     logger,
     parallel_sample_hz,
     ros2_exec,
+    sample_compute_usage,
     wait_for_first_message,
 )
 
@@ -163,6 +164,22 @@ def _check_sim_publishing(env):
     return True, f"{len(rates)} topics healthy", rates
 
 
+def _check_compute_usage(env):
+    """Snapshot compute resources. Returns (ok, msg, samples_dict). ok=True as
+    long as sampling produced any numeric values; this test is diagnostic, not
+    gating — regressions surface via compare_metrics."""
+    logger.info("Sampling compute usage")
+    try:
+        samples = sample_compute_usage(env["sim_container"])
+    except Exception as e:
+        logger.warning("Compute sampling raised: %s", e)
+        return False, f"compute sampling failed: {e}", {}
+    if not samples:
+        return False, "no compute samples returned", {}
+    logger.info("Sampled %d compute metrics", len(samples))
+    return True, f"{len(samples)} compute metrics sampled", samples
+
+
 # ── tests ──────────────────────────────────────────────────────────────────
 
 def _poll_until(predicate, timeout, interval, fail_msg):
@@ -242,6 +259,14 @@ def test_sim_publishing(self, airstack_env):
         ok, msg, _ = _check_sim_publishing(airstack_env)
         assert ok, msg
 
+    @pytest.mark.dependency(name="compute", depends=["sim_ready"])
+    def test_compute_usage(self, airstack_env):
+        """Snapshot per-container CPU/mem/IO + host CPU/mem + GPU util/VRAM/
+        temp/power. Passes as long as sampling returned values — time-series
+        recording happens in test_stable."""
+        ok, msg, _ = _check_compute_usage(airstack_env)
+        assert ok, msg
+
     @pytest.mark.dependency(name="nodes", depends=["containers"])
     def test_sentinel_nodes_present(self, airstack_env):
         """Wait up to 300s for the expected sentinel nodes per robot."""
@@ -277,10 +302,14 @@ def test_stable(self, airstack_env, request):
                 ok_t, msg_t = _check_tmux_panes(airstack_env)
                 ok_n, msg_n = _check_sentinel_nodes(airstack_env)
                 ok_p, msg_p, rates = _check_sim_publishing(airstack_env)
+                _, _, compute = _check_compute_usage(airstack_env)
 
                 for topic, hz in rates.items():
-                    key = topic.lstrip("/").replace("/", ".")
-                    series.setdefault(key, []).append({"t": elapsed, "hz": hz or 0.0})
+                    key = topic.lstrip("/").replace("/", ".") + ".hz"
+                    series.setdefault(key, []).append({"t": elapsed, "value": hz or 0.0})
+
+                for key, value in compute.items():
+                    series.setdefault(key, []).append({"t": elapsed, "value": value})
 
                 if not (ok_t and ok_n and ok_p):
                     pytest.fail(
@@ -290,4 +319,4 @@ def test_stable(self, airstack_env, request):
         finally:
             for key, samples in series.items():
                 if samples:
-                    m.record_list(tid, f"{key}.hz_samples", samples)
+                    m.record_list(tid, f"{key}_samples", samples)

From 307aac86e3938ade522849c6d071b1bf0d6470d4 Mon Sep 17 00:00:00 2001
From: OasisArtisan <oalama@andrew.cmu.edu>
Date: Mon, 20 Apr 2026 20:57:42 -0400
Subject: [PATCH 10/24] Parse metrics to support parsing single results file
 into markdown.

---
 tests/conftest.py                             |   6 +-
 .../{compare_metrics.py => parse_metrics.py}  | 228 ++++++++++--------
 tests/test_liveliness.py                      |   4 +-
 3 files changed, 128 insertions(+), 110 deletions(-)
 rename tests/{compare_metrics.py => parse_metrics.py} (74%)

diff --git a/tests/conftest.py b/tests/conftest.py
index 42ccf35e2..f94f73f43 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -284,7 +284,7 @@ def sample_compute_usage(sim_container):
     global host CPU/mem and GPU util/VRAM/temp/power. Returns {key: value},
     keys shaped `{entity}.{metric}` where entity is the full container name or
     'host'. Per-robot replicas (e.g. airstack-robot-desktop-1/2/3) are kept
-    distinct so raw metrics.json preserves per-robot data; compare_metrics
+    distinct so raw metrics.json preserves per-robot data; parse_metrics
     pools them at report time. Silently omits metrics that fail to sample."""
     import psutil
 
@@ -370,7 +370,7 @@ def record(self, test_name, key, value, unit="", direction="lower_is_better", **
         self._path.write_text(json.dumps(self._data, indent=2))
 
     def record_list(self, test_name, key, values):
-        """Store a raw list (time series) — not scored by compare_metrics."""
+        """Store a raw list (time series) — not scored by parse_metrics."""
         if test_name not in self._data:
             self._data[test_name] = {}
         self._data[test_name][key] = {"samples": values}
@@ -385,7 +385,7 @@ def get_metrics():
 
 def current_test_id():
     """Test id used as the metrics.json key. Matches JUnit XML's classname.name
-    format so compare_metrics.py can merge results.xml and metrics.json entries."""
+    format so parse_metrics.py can merge results.xml and metrics.json entries."""
     if _CURRENT_ITEM is None:
         return "unknown"
     return _nodeid_dotted(_CURRENT_ITEM.nodeid)
diff --git a/tests/compare_metrics.py b/tests/parse_metrics.py
similarity index 74%
rename from tests/compare_metrics.py
rename to tests/parse_metrics.py
index 55a05ce05..2ca4fc1db 100644
--- a/tests/compare_metrics.py
+++ b/tests/parse_metrics.py
@@ -1,12 +1,14 @@
 #!/usr/bin/env python3
-"""Compare test metrics between two runs.
+"""Parse test metrics. Renders a markdown report for one run, or a diff
+between two runs when --baseline is supplied.
 
 Reads results.xml (JUnit XML) for test durations and metrics.json for custom
-metrics (image sizes, etc.). Outputs a markdown table and exits 1 on regression.
+metrics. In diff mode, exits 1 on regression; in single mode, always exits 0.
 
 Usage:
-    python compare_metrics.py --current tests/results/<run>/ --baseline tests/results/<run>/
-    python compare_metrics.py --current tests/results/<run>/ --baseline tests/results/<run>/ --threshold 30
+    python parse_metrics.py --current tests/results/<run>/
+    python parse_metrics.py --current tests/results/<run>/ --baseline tests/results/<run>/
+    python parse_metrics.py --current tests/results/<run>/ --baseline tests/results/<run>/ --threshold 30
 """
 import argparse
 import json
@@ -321,25 +323,18 @@ def _context():
     return body + _context()
 
 
-def _score_pair(c, b, threshold):
-    """Compare two metric entries. Returns (baseline_fmt, current_fmt, change_str,
-    flag). flag in {"", "regression", "improved"}. Handles missing/sentinel/
-    time-series entries as well."""
+def _score(c, b, threshold):
+    """Compute change% and regression flag for a metric pair. Returns
+    (change_str, flag). flag ∈ {"", "regression", "improved"}. When either
+    entry is missing/sentinel/time-series, returns a stub with an empty flag
+    (except: `timeout` current after numeric baseline → regression)."""
     if not c or not b:
-        return (_fmt(b) if b else "—",
-                _fmt(c) if c else "—",
-                "new" if c and not b else "removed", "")
-
-    # Non-scorable: time series or sentinel strings.
+        return ("new" if c and not b else "removed"), ""
     if not _is_scored(c) or not _is_scored(b):
-        flag = ""
         cv = c.get("value") if isinstance(c, dict) else None
         bv = b.get("value") if isinstance(b, dict) else None
-        # Timeout after previously-numeric baseline = regression.
-        if isinstance(cv, str) and cv == "timeout" and isinstance(bv, (int, float)):
-            flag = "regression"
-        return _fmt(b), _fmt(c), "—", flag
-
+        flag = "regression" if (cv == "timeout" and isinstance(bv, (int, float))) else ""
+        return "—", flag
     cv, bv = c["value"], b["value"]
     direction = c.get("direction", "lower_is_better")
     change_pct = ((cv - bv) / bv) * 100 if bv != 0 else 0
@@ -348,35 +343,45 @@ def _score_pair(c, b, threshold):
     improved = (direction == "lower_is_better" and change_pct < -threshold) or \
                (direction == "higher_is_better" and change_pct > threshold)
     flag = "regression" if regressed else ("improved" if improved else "")
-    return _fmt(b), _fmt(c), f"{change_pct:+.1f}%", flag
-
+    return f"{change_pct:+.1f}%", flag
+
+
+def _pivot_cell(c, b, threshold, diff_mode):
+    """Render a pivot-table cell. Diff mode: `b_short → c_short (Δ%[, t=...])`
+    + regression flag suffix. Single mode: just the current value. Returns
+    (text, flag)."""
+    def t_window(entry):
+        ts, te = (entry or {}).get("t_start"), (entry or {}).get("t_end")
+        return f"t={ts}-{te}s" if ts is not None and te is not None else ""
+
+    if not diff_mode or not (c and b):
+        entry = c or b
+        if not entry:
+            return "—", ""
+        if not _is_scored(entry):
+            return _fmt(entry), ""
+        text = f"{entry['value']:.4g}{entry.get('unit', '')}"
+        t = t_window(entry)
+        return (f"{text} ({t})" if t else text), ""
 
-def _hz_cell(c, b, threshold):
-    """Compact `base → curr (Δ%)` cell for the pivoted sensor table. Appends
-    `t=A-Bs` when the metric carries a t_start/t_end window (start_mean/end_mean).
-    Returns (cell_text, flag)."""
-    bfmt, cfmt, change, flag = _score_pair(c, b, threshold)
-    if not c or not b:
-        return (cfmt if c else bfmt, flag)
     if not _is_scored(c) or not _is_scored(b):
-        return (f"{bfmt} → {cfmt}", flag)
-    b_short = f"{b['value']:.4g}"
-    c_short = f"{c['value']:.4g}"
-    annotations = [change]
-    t_start, t_end = c.get("t_start"), c.get("t_end")
-    if t_start is not None and t_end is not None:
-        annotations.insert(0, f"t={t_start}-{t_end}s")
-    return (f"{b_short} → {c_short} ({', '.join(annotations)})", flag)
-
-
-def compare(current, baseline, threshold):
-    """Split metrics into three groups: flat rows, hz pivot rows, compute pivot
-    rows. Test execution order is preserved for grouping. `iter_counts[module]`
-    records the iteration count for annotating pivot-table headers."""
+        return f"{_fmt(b)} → {_fmt(c)}", ""
+
+    change, flag = _score(c, b, threshold)
+    t = t_window(c)
+    annotations = ([t] if t else []) + [change]
+    return f"{b['value']:.4g} → {c['value']:.4g} ({', '.join(annotations)})", flag
+
+
+def build_rows(current, baseline):
+    """Route metrics into three groups: flat rows, hz pivot rows, compute pivot
+    rows. Rows carry raw metric entries; rendering/scoring happens in
+    format_markdown. `baseline` may be an empty dict for single-input mode.
+    Test execution order from `current` is preserved for grouping."""
     main_rows = []
-    hz_data = {}        # (test, module, display, topic) → {agg: (text, flag)}
-    compute_data = {}   # (test, module, display, entity, metric_type) → {agg: (text, flag)}
-    iter_counts = {}    # module → (baseline_n, current_n)
+    hz_data = {}       # (test, module, display, topic) → {agg: (c, b)}
+    compute_data = {}  # (test, module, display, entity, metric_type) → {agg: (c, b)}
+    iter_counts = {}   # module → (baseline_n, current_n)
 
     def note_iters(module, c, b):
         if module not in iter_counts:
@@ -399,8 +404,7 @@ def note_iters(module, c, b):
                 topic, agg = hz_m.group(1), hz_m.group(2)
                 if agg == "samples":
                     continue
-                hz_data.setdefault((test, module, display, topic), {})[agg] = \
-                    _hz_cell(c, b, threshold)
+                hz_data.setdefault((test, module, display, topic), {})[agg] = (c, b)
                 note_iters(module, c, b)
                 continue
 
@@ -411,15 +415,13 @@ def note_iters(module, c, b):
                     continue
                 compute_data.setdefault(
                     (test, module, display, entity, metric_type), {}
-                )[agg] = _hz_cell(c, b, threshold)
+                )[agg] = (c, b)
                 note_iters(module, c, b)
                 continue
 
-            bfmt, cfmt, change, flag = _score_pair(c, b, threshold)
             main_rows.append({
                 "module": module, "test": display, "metric": key,
-                "baseline": bfmt, "current": cfmt,
-                "change": change, "flag": flag,
+                "current_entry": c, "baseline_entry": b,
             })
 
     hz_rows = [
@@ -431,15 +433,12 @@ def note_iters(module, c, b):
          "metric_type": metric_type, "aggs": aggs}
         for (_, module, display, entity, metric_type), aggs in compute_data.items()
     ]
-    has_regression = (
-        any(r["flag"] == "regression" for r in main_rows)
-        or any(flag == "regression" for r in hz_rows for _, flag in r["aggs"].values())
-        or any(flag == "regression" for r in compute_rows for _, flag in r["aggs"].values())
-    )
-    return main_rows, hz_rows, compute_rows, iter_counts, has_regression
+    return main_rows, hz_rows, compute_rows, iter_counts
 
 
-def _iter_annotation(baseline_n, current_n):
+def _iter_annotation(baseline_n, current_n, diff_mode):
+    if not diff_mode:
+        return f"n={current_n} iterations; " if current_n else ""
     if baseline_n and current_n and baseline_n == current_n:
         return f"n={baseline_n} iterations; "
     if baseline_n or current_n:
@@ -447,13 +446,6 @@ def _iter_annotation(baseline_n, current_n):
     return ""
 
 
-def _pivot_cell(pair):
-    if not pair:
-        return "—"
-    text, flag = pair
-    return text + FLAG_SUFFIX.get(flag, "")
-
-
 def _group_by_module(rows):
     modules = []
     grouped = {}
@@ -467,7 +459,39 @@ def _group_by_module(rows):
     return modules, grouped
 
 
-def format_markdown(main_rows, hz_rows, compute_rows, iter_counts, has_regression):
+def format_markdown(main_rows, hz_rows, compute_rows, iter_counts, threshold, diff_mode):
+    regressions = [False]
+
+    def pivot_cell(pair):
+        if not pair:
+            return "—"
+        text, flag = _pivot_cell(*pair, threshold=threshold, diff_mode=diff_mode)
+        if flag == "regression":
+            regressions[0] = True
+        return text + FLAG_SUFFIX.get(flag, "")
+
+    def render_main(rows):
+        if diff_mode:
+            out = []
+            for r in rows:
+                c, b = r["current_entry"], r["baseline_entry"]
+                change, flag = _score(c, b, threshold)
+                if flag == "regression":
+                    regressions[0] = True
+                out.append([
+                    r["test"], r["metric"],
+                    _fmt(b) if b else "—",
+                    _fmt(c) if c else "—",
+                    change + FLAG_SUFFIX.get(flag, ""),
+                ])
+            return out, ["Test", "Metric", "Baseline", "Current", "Change"]
+        return ([[r["test"], r["metric"], _fmt(r["current_entry"])] for r in rows],
+                ["Test", "Metric", "Value"])
+
+    def render_pivot(rows, leading):
+        return [leading(r) + [pivot_cell(r["aggs"].get(agg)) for agg in AGGS]
+                for r in rows]
+
     main_mods, main_by_module = _group_by_module(main_rows)
     hz_mods, hz_by_module = _group_by_module(hz_rows)
     compute_mods, compute_by_module = _group_by_module(compute_rows)
@@ -476,73 +500,67 @@ def format_markdown(main_rows, hz_rows, compute_rows, iter_counts, has_regressio
         if m not in modules:
             modules.append(m)
 
+    hz_suffix = "baseline → current, per-topic" if diff_mode else "per-topic"
+    compute_suffix = ("baseline → current, per-container and global" if diff_mode
+                      else "per-container and global")
+
     sections = []
     for mod in modules:
         sub = [f"## {mod}"]
-        main = main_by_module.get(mod, [])
-        hz = hz_by_module.get(mod, [])
-        compute = compute_by_module.get(mod, [])
         b_n, c_n = iter_counts.get(mod, (None, None))
-        annotation = _iter_annotation(b_n, c_n)
+        annotation = _iter_annotation(b_n, c_n, diff_mode)
 
+        main = main_by_module.get(mod, [])
         if main:
-            table = tabulate(
-                [[r["test"], r["metric"], r["baseline"], r["current"],
-                  r["change"] + FLAG_SUFFIX.get(r["flag"], "")] for r in main],
-                headers=["Test", "Metric", "Baseline", "Current", "Change"],
-                tablefmt="github",
-            )
-            sub.append("### Metrics\n\n" + table)
+            rows, headers = render_main(main)
+            sub.append("### Metrics\n\n" + tabulate(rows, headers=headers, tablefmt="github"))
+
+        hz = hz_by_module.get(mod, [])
         if hz:
-            table = tabulate(
-                [[r["test"], r["topic"]] + [_pivot_cell(r["aggs"].get(agg)) for agg in AGGS]
-                 for r in hz],
-                headers=["Test", "Topic", *AGGS],
-                tablefmt="github",
-            )
             sub.append(
-                f"### Sim publishing rates ({annotation}baseline → current, per-topic)\n\n"
-                + table
-            )
+                f"### Sim publishing rates ({annotation}{hz_suffix})\n\n"
+                + tabulate(render_pivot(hz, lambda r: [r["test"], r["topic"]]),
+                           headers=["Test", "Topic", *AGGS], tablefmt="github"))
+
+        compute = compute_by_module.get(mod, [])
         if compute:
-            table = tabulate(
-                [[r["test"], r["entity"], r["metric_type"]]
-                 + [_pivot_cell(r["aggs"].get(agg)) for agg in AGGS]
-                 for r in compute],
-                headers=["Test", "Entity", "Metric", *AGGS],
-                tablefmt="github",
-            )
             sub.append(
-                f"### Compute usage ({annotation}baseline → current, "
-                f"per-container and global)\n\n" + table
-            )
+                f"### Compute usage ({annotation}{compute_suffix})\n\n"
+                + tabulate(render_pivot(
+                    compute, lambda r: [r["test"], r["entity"], r["metric_type"]]),
+                    headers=["Test", "Entity", "Metric", *AGGS], tablefmt="github"))
+
         sections.append("\n\n".join(sub))
 
-    if has_regression:
+    has_regression = regressions[0]
+    if diff_mode and has_regression:
         sections.append("**Regression detected** — some metrics exceeded the threshold.")
 
-    return "\n\n".join(sections)
+    return "\n\n".join(sections), has_regression
 
 
 def main():
-    parser = argparse.ArgumentParser(description="Compare test metrics between runs")
+    parser = argparse.ArgumentParser(
+        description="Render a markdown report for a test run, or a diff if --baseline is supplied.")
     parser.add_argument("--current", required=True, help="Current run directory")
-    parser.add_argument("--baseline", required=True, help="Baseline run directory")
+    parser.add_argument("--baseline", help="Baseline run directory (enables diff mode)")
     parser.add_argument("--threshold", type=float, default=20, help="Regression threshold (%%)")
     parser.add_argument("--output", help="Write markdown report to file")
     args = parser.parse_args()
 
     current = merge_metrics(Path(args.current))
-    baseline = merge_metrics(Path(args.baseline))
-    main_rows, hz_rows, compute_rows, iter_counts, has_regression = compare(
-        current, baseline, args.threshold)
-    md = format_markdown(main_rows, hz_rows, compute_rows, iter_counts, has_regression)
+    baseline = merge_metrics(Path(args.baseline)) if args.baseline else {}
+    diff_mode = bool(args.baseline)
+
+    main_rows, hz_rows, compute_rows, iter_counts = build_rows(current, baseline)
+    md, has_regression = format_markdown(
+        main_rows, hz_rows, compute_rows, iter_counts, args.threshold, diff_mode)
 
     print(md)
     if args.output:
         Path(args.output).write_text(md)
 
-    sys.exit(1 if has_regression else 0)
+    sys.exit(1 if diff_mode and has_regression else 0)
 
 
 if __name__ == "__main__":
diff --git a/tests/test_liveliness.py b/tests/test_liveliness.py
index 731de7581..fb2a54a2b 100644
--- a/tests/test_liveliness.py
+++ b/tests/test_liveliness.py
@@ -167,7 +167,7 @@ def _check_sim_publishing(env):
 def _check_compute_usage(env):
     """Snapshot compute resources. Returns (ok, msg, samples_dict). ok=True as
     long as sampling produced any numeric values; this test is diagnostic, not
-    gating — regressions surface via compare_metrics."""
+    gating — regressions surface via parse_metrics."""
     logger.info("Sampling compute usage")
     try:
         samples = sample_compute_usage(env["sim_container"])
@@ -284,7 +284,7 @@ def ready():
     def test_stable(self, airstack_env, request):
         """Poll every --stable-interval for up to --stable-duration. Early exit on failure.
 
-        Only the raw hz time series is recorded per topic; compare_metrics.py
+        Only the raw hz time series is recorded per topic; parse_metrics.py
         derives mean/min/max/start_mean/end_mean from it."""
         duration = request.config.getoption("--stable-duration")
         interval = request.config.getoption("--stable-interval")

From 18b12120834593c6eb21b78f3857bb2413321fd9 Mon Sep 17 00:00:00 2001
From: Andrew Jong <ajong@andrew.cmu.edu>
Date: Tue, 21 Apr 2026 11:59:17 -0400
Subject: [PATCH 11/24] Add docs, add services and scripts for github ci/cd

---
 .github/runners/airstack-runner.service       |  26 ++
 .github/runners/register-runner.sh            |  96 +++++++
 .github/workflows/integration-tests.yml       | 221 +++++++++++++++
 .../intermediate/testing/system_testing.md    | 267 ++++++++++++++++++
 mkdocs.yml                                    |   6 +-
 5 files changed, 613 insertions(+), 3 deletions(-)
 create mode 100644 .github/runners/airstack-runner.service
 create mode 100644 .github/runners/register-runner.sh
 create mode 100644 .github/workflows/integration-tests.yml

diff --git a/.github/runners/airstack-runner.service b/.github/runners/airstack-runner.service
new file mode 100644
index 000000000..89cfe4d40
--- /dev/null
+++ b/.github/runners/airstack-runner.service
@@ -0,0 +1,26 @@
+[Unit]
+Description=AirStack GitHub Actions Runner (ephemeral)
+Documentation=https://docs.github.com/en/actions/hosting-your-own-runners
+After=network-online.target docker.service
+Wants=network-online.target
+
+[Service]
+User=runner
+Group=runner
+WorkingDirectory=/opt/actions-runner
+
+# Place runtime configuration here (REPO_URL, RUNNER_LABELS, etc.).
+# Each line: KEY=value  (no quotes needed, no export keyword)
+EnvironmentFile=/etc/github-runner-env
+
+ExecStart=/opt/actions-runner/register-runner.sh
+
+# Restart unconditionally so the loop survives transient API errors or reboots.
+Restart=always
+RestartSec=5
+
+# Give the runner enough time to finish a long job before systemd kills it.
+TimeoutStopSec=120
+
+[Install]
+WantedBy=multi-user.target
diff --git a/.github/runners/register-runner.sh b/.github/runners/register-runner.sh
new file mode 100644
index 000000000..1b5e0f3ac
--- /dev/null
+++ b/.github/runners/register-runner.sh
@@ -0,0 +1,96 @@
+#!/usr/bin/env bash
+# AirStack ephemeral GitHub Actions runner loop.
+#
+# Registers a fresh runner, executes exactly one job, then loops to re-register.
+# The --ephemeral flag tells the GitHub API to remove the runner after one job,
+# preventing stale registrations and cross-job state pollution.
+#
+# Setup (one-time, on the OpenStack VM):
+#   1. Create a non-root runner user:
+#        sudo useradd -m -s /bin/bash runner
+#        sudo usermod -aG docker runner
+#
+#   2. Download and unpack the GitHub Actions runner into RUNNER_DIR:
+#        sudo mkdir -p /opt/actions-runner
+#        cd /opt/actions-runner
+#        # Get the latest runner URL from:
+#        # https://github.com/actions/runner/releases
+#        sudo curl -Lo actions-runner.tar.gz <URL>
+#        sudo tar xzf actions-runner.tar.gz
+#        sudo chown -R runner:runner /opt/actions-runner
+#
+#   3. Store a GitHub PAT (repo scope for private repos, public_repo for public):
+#        echo "ghp_YOUR_TOKEN_HERE" | sudo tee /etc/github-runner-pat
+#        sudo chmod 600 /etc/github-runner-pat
+#        sudo chown runner:runner /etc/github-runner-pat
+#
+#   4. Copy this script into the runner directory and make it executable:
+#        sudo cp register-runner.sh /opt/actions-runner/register-runner.sh
+#        sudo chown runner:runner /opt/actions-runner/register-runner.sh
+#        sudo chmod +x /opt/actions-runner/register-runner.sh
+#
+#   5. Install the systemd unit (see airstack-runner.service) and enable it:
+#        sudo cp airstack-runner.service /etc/systemd/system/
+#        sudo systemctl daemon-reload
+#        sudo systemctl enable --now airstack-runner.service
+#
+# Configuration: set these in /etc/github-runner-env (loaded by the systemd unit)
+# or export them before running this script manually.
+
+set -euo pipefail
+
+REPO_URL="${REPO_URL:-https://github.com/YOUR_ORG/AirStack}"
+# Derived from REPO_URL for the registration token API call, e.g. "YOUR_ORG/AirStack"
+REPO_PATH="${REPO_PATH:-$(echo "$REPO_URL" | sed 's|https://github.com/||')}"
+RUNNER_DIR="${RUNNER_DIR:-/opt/actions-runner}"
+PAT_FILE="${PAT_FILE:-/etc/github-runner-pat}"
+RUNNER_LABELS="${RUNNER_LABELS:-self-hosted,airstack,gpu}"
+RUNNER_GROUP="${RUNNER_GROUP:-Default}"
+
+if [ ! -f "$PAT_FILE" ]; then
+  echo "ERROR: PAT file not found at $PAT_FILE" >&2
+  exit 1
+fi
+
+echo "Starting ephemeral runner loop for $REPO_URL"
+
+while true; do
+  echo "[$(date -u +%FT%TZ)] Requesting registration token..."
+  # "|| TOKEN=" stops set -e/pipefail from exiting on a curl/jq failure so the retry branch below runs.
+  TOKEN=$(curl -sf -X POST \
+    -H "Authorization: token $(cat "$PAT_FILE")" \
+    -H "Accept: application/vnd.github+json" \
+    "https://api.github.com/repos/${REPO_PATH}/actions/runners/registration-token" \
+    | jq -r .token) || TOKEN=""
+
+  if [ -z "$TOKEN" ] || [ "$TOKEN" = "null" ]; then
+    echo "ERROR: Failed to obtain registration token. Check PAT and repo path." >&2
+    sleep 30
+    continue
+  fi
+
+  echo "[$(date -u +%FT%TZ)] Configuring runner (ephemeral)..."
+
+  # --ephemeral: runner de-registers itself after completing one job.
+  # --replace: allows re-registration with the same name after a restart.
+  # Runner name encodes hostname + PID so parallel instances are unique.
+  "$RUNNER_DIR/config.sh" \
+    --url "$REPO_URL" \
+    --token "$TOKEN" \
+    --name "openstack-$(hostname -s)-$$" \
+    --labels "$RUNNER_LABELS" \
+    --runnergroup "$RUNNER_GROUP" \
+    --ephemeral \
+    --unattended \
+    --replace
+
+  echo "[$(date -u +%FT%TZ)] Runner configured. Waiting for a job..."
+
+  # run.sh blocks until the job completes, then returns (ephemeral runner exits cleanly).
+  "$RUNNER_DIR/run.sh" || true
+
+  echo "[$(date -u +%FT%TZ)] Job finished. Re-registering..."
+
+  # Brief pause to avoid hammering the API if config.sh / run.sh fail immediately.
+  sleep 2
+done
diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml
new file mode 100644
index 000000000..892e00b06
--- /dev/null
+++ b/.github/workflows/integration-tests.yml
@@ -0,0 +1,221 @@
+name: Integration Tests
+
+on:
+  pull_request:
+    branches: [main, develop]
+  workflow_dispatch:
+    inputs:
+      marks:
+        description: "pytest marks expression (e.g. liveliness, build_docker, 'liveliness or build_docker')"
+        default: liveliness
+        required: false
+      sim:
+        description: "Sim targets, comma-separated: msairsim, isaacsim"
+        default: msairsim
+        required: false
+      num_robots:
+        description: "Robot counts, comma-separated (e.g. 1,3)"
+        default: "1"
+        required: false
+      stress_iterations:
+        description: "Iterations per (sim, num_robots) config"
+        default: "1"
+        required: false
+      stable_duration:
+        description: "Seconds for test_stable polling window"
+        default: "120"
+        required: false
+      baseline_run_id:
+        description: "Run ID to use as baseline for metric comparison (blank = latest successful run on main)"
+        default: ""
+        required: false
+
+jobs:
+  run-tests:
+    name: Run Tests
+    runs-on: [self-hosted, airstack, gpu]
+    # Only run on PRs from the same repo (not forks) to prevent arbitrary code
+    # execution on the self-hosted runner from untrusted contributors.
+    if: >
+      github.event_name == 'workflow_dispatch' ||
+      github.event.pull_request.head.repo.full_name == github.repository
+    timeout-minutes: 120
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Install test dependencies
+        run: pip3 install -r tests/requirements.txt
+
+      - name: Ensure airstack.sh is executable
+        run: chmod +x airstack.sh
+
+      - name: Run tests
+        env:
+          AIRSTACK_ROOT: ${{ github.workspace }}
+          DISPLAY: ""
+        run: |
+          if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
+            MARKS="${{ inputs.marks }}"
+            SIM="${{ inputs.sim }}"
+            NUM_ROBOTS="${{ inputs.num_robots }}"
+            ITERATIONS="${{ inputs.stress_iterations }}"
+            STABLE="${{ inputs.stable_duration }}"
+          else
+            MARKS="build_docker or build_packages"
+            SIM="msairsim"
+            NUM_ROBOTS="1"
+            ITERATIONS="1"
+            STABLE="120"
+          fi
+
+          pytest tests/ \
+            -m "$MARKS" \
+            --sim "$SIM" \
+            --num-robots "$NUM_ROBOTS" \
+            --stress-iterations "$ITERATIONS" \
+            --stable-duration "$STABLE" \
+            -v
+
+      - name: Upload test results
+        uses: actions/upload-artifact@v4
+        if: always()
+        with:
+          name: test-results-${{ github.sha }}-${{ github.run_id }}
+          path: tests/results/
+          retention-days: 90
+
+  report:
+    name: Metrics Report
+    runs-on: ubuntu-latest
+    needs: run-tests
+    if: always()
+    permissions:
+      pull-requests: write
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+
+      - name: Install report dependencies
+        run: pip install tabulate
+
+      - name: Download current test results
+        uses: actions/download-artifact@v4
+        with:
+          name: test-results-${{ github.sha }}-${{ github.run_id }}
+          path: current-results/
+
+      # PR mode: fetch latest artifact from the base branch (e.g. develop or main)
+      - name: Download baseline results (PR)
+        if: github.event_name == 'pull_request'
+        uses: dawidd6/action-download-artifact@v3
+        continue-on-error: true
+        with:
+          workflow: integration-tests.yml
+          branch: ${{ github.base_ref }}
+          name_is_regexp: true
+          name: "test-results-.*"
+          path: baseline-results/
+          if_no_artifact_found: warn
+
+      # Manual dispatch with explicit baseline run ID (dawidd6 action: accepts run_id and name_is_regexp)
+      - name: Download baseline results (manual, explicit run ID)
+        if: >
+          github.event_name == 'workflow_dispatch' &&
+          inputs.baseline_run_id != ''
+        uses: dawidd6/action-download-artifact@v3
+        continue-on-error: true
+        with:
+          run_id: ${{ inputs.baseline_run_id }}
+          name_is_regexp: true
+          name: "test-results-.*"
+          path: baseline-results/
+
+      # Manual dispatch without explicit baseline: fetch latest from main
+      - name: Download baseline results (manual, latest main)
+        if: >
+          github.event_name == 'workflow_dispatch' &&
+          inputs.baseline_run_id == ''
+        uses: dawidd6/action-download-artifact@v3
+        continue-on-error: true
+        with:
+          workflow: integration-tests.yml
+          branch: main
+          name_is_regexp: true
+          name: "test-results-.*"
+          path: baseline-results/
+          if_no_artifact_found: warn
+
+      - name: Locate result directories
+        id: dirs
+        run: |
+          CURRENT=$(ls current-results/ 2>/dev/null | sort -r | head -1)
+          echo "current=current-results/$CURRENT" >> "$GITHUB_OUTPUT"
+
+          BASELINE=$(ls baseline-results/ 2>/dev/null | sort -r | head -1)
+          if [ -n "$BASELINE" ]; then
+            echo "baseline=baseline-results/$BASELINE" >> "$GITHUB_OUTPUT"
+          else
+            echo "baseline=" >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: Generate metrics report
+        id: report
+        continue-on-error: true
+        run: |
+          CURRENT="${{ steps.dirs.outputs.current }}"
+          BASELINE="${{ steps.dirs.outputs.baseline }}"
+
+          if [ -n "$BASELINE" ]; then
+            python tests/parse_metrics.py \
+              --current "$CURRENT" \
+              --baseline "$BASELINE" \
+              --output report.md
+          else
+            python tests/parse_metrics.py \
+              --current "$CURRENT" \
+              --output report.md
+          fi
+
+      - name: Post PR comment
+        if: github.event_name == 'pull_request'
+        uses: actions/github-script@v7
+        with:
+          script: |
+            const fs = require('fs');
+            let body;
+            try {
+              body = fs.readFileSync('report.md', 'utf8');
+            } catch {
+              body = '_No metrics report generated._';
+            }
+            const header = `## Test Metrics — \`${{ github.sha }}\`\n\n`;
+            await github.rest.issues.createComment({
+              issue_number: context.issue.number,
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              body: header + body,
+            });
+
+      - name: Write job summary
+        if: always()
+        run: |
+          if [ -f report.md ]; then
+            echo "## Test Metrics — \`${{ github.sha }}\`" >> "$GITHUB_STEP_SUMMARY"
+            echo "" >> "$GITHUB_STEP_SUMMARY"
+            cat report.md >> "$GITHUB_STEP_SUMMARY"
+          else
+            echo "_No metrics report generated._" >> "$GITHUB_STEP_SUMMARY"
+          fi
+
+      - name: Fail on regression
+        if: steps.report.outcome == 'failure'
+        run: |
+          echo "::error::Metric regression detected — see the report above for details."
+          exit 1
diff --git a/docs/development/intermediate/testing/system_testing.md b/docs/development/intermediate/testing/system_testing.md
index e69de29bb..07b2b0404 100644
--- a/docs/development/intermediate/testing/system_testing.md
+++ b/docs/development/intermediate/testing/system_testing.md
@@ -0,0 +1,267 @@
+# System Testing
+
+AirStack's system tests bring up the full Docker-based stack — simulator, robot containers, and GCS — and verify end-to-end behavior: container health, ROS 2 node presence, sensor publishing rates, and compute resource usage. Tests are written in Python with pytest and live under `tests/` at the repo root.
+
+---
+
+## Test Suite Structure
+
+| Module | Mark | What it tests | Hardware required |
+|--------|------|---------------|-------------------|
+| [`test_build_docker.py`](../../../../tests/test_build_docker.py) | `build_docker` | Docker image builds (robot-desktop, gcs, isaac-sim, ms-airsim); records image sizes | Docker daemon |
+| [`test_build_packages.py`](../../../../tests/test_build_packages.py) | `build_packages` | `colcon build` inside each container (robot, GCS, ms-airsim ROS workspace) | Docker daemon |
+| [`test_liveliness.py`](../../../../tests/test_liveliness.py) | `liveliness` | Full stack up: container health, tmux process liveness, sentinel ROS 2 nodes, sim topic publishing rates, compute usage, sustained stability | Docker daemon, GPU, sim license |
+
+Marks can be combined with pytest logic: `-m "build_docker or build_packages"`, `-m liveliness`.
+
+---
+
+## Test Infrastructure
+
+All shared fixtures, helpers, and configuration live in [`tests/conftest.py`](../../../../tests/conftest.py).
+
+### `airstack_env` fixture
+
+Parametrized over `(sim, num_robots, iteration)` tuples derived from CLI flags. For each combination it:
+
+1. Calls `airstack up` with the appropriate `COMPOSE_PROFILES`, `NUM_ROBOTS`, and headless flags
+2. Records `airstack_up_duration_s` to `metrics.json`
+3. Yields an `env` dict used by every `TestLiveliness` test
+4. Tears down with `airstack down` and records `airstack_down_duration_s`
+
+### `MetricsRecorder`
+
+Writes custom metrics to `tests/results/<timestamp>/metrics.json` after each `record()` call. Keys follow the pattern `test_node_id → metric_key → {value, unit, direction}`. Time-series data (Hz samples, compute snapshots) are stored as `{key}_samples` lists and expanded into scalar aggregates (mean, min, max, start_mean, end_mean) by `parse_metrics.py`.
+
+### Output files
+
+Every test run produces a timestamped directory:
+
+```
+tests/results/
+└── 2025-04-21_14-30-00/
+    ├── results.xml        # JUnit XML — test durations and pass/fail status
+    ├── metrics.json       # Custom metrics (image sizes, Hz, compute, timing)
+    └── logs/
+        ├── test_build_docker.TestDockerBuilds.test_build_robot_desktop.log
+        ├── test_liveliness.TestLiveliness.test_stable[msairsim-1-iter0].log
+        └── ...            # One log file per test execution
+```
+
+---
+
+## Running Locally
+
+### Prerequisites
+
+- Docker daemon running with the `runner` user (or your user) in the `docker` group
+- NVIDIA drivers + `nvidia-container-toolkit` for liveliness tests
+- `pip install -r tests/requirements.txt`
+
+### Direct (recommended for development)
+
+```bash
+# From the repo root:
+export AIRSTACK_ROOT=$(pwd)
+
+# Build tests only (fast, no GPU needed)
+pytest tests/ -m "build_docker or build_packages" -v
+
+# Full liveliness run — ms-airsim, 1 robot, 1 iteration, 60s stability window
+pytest tests/ -m liveliness \
+  --sim msairsim \
+  --num-robots 1 \
+  --stress-iterations 1 \
+  --stable-duration 60 \
+  -v
+
+# Show GUI windows (for local visual inspection)
+pytest tests/ -m liveliness --gui -v
+```
+
+### Docker-compose wrapper
+
+The `tests/docker/` directory provides a containerized test runner that has Docker CLI and all Python dependencies pre-installed.
+
+```bash
+export AIRSTACK_PATH=$(pwd)
+docker compose -f tests/docker/docker-compose.yaml run --rm test \
+  pytest -m "build_docker or build_packages" -v
+```
+
+### CLI option reference
+
+| Option | Default | Description |
+|--------|---------|-------------|
+| `--sim` | `msairsim,isaacsim` | Comma-separated sim targets |
+| `--num-robots` | `1,3` | Comma-separated robot counts |
+| `--stress-iterations` | `3` | Up/down cycles per (sim, num_robots) config |
+| `--stable-duration` | `120` | Seconds `test_stable` polls for |
+| `--stable-interval` | `10` | Seconds between polls in `test_stable` |
+| `--gui` | off | Show simulator GUI (disables headless mode) |
+
+---
+
+## Metrics Reporting (`parse_metrics.py`)
+
+[`tests/parse_metrics.py`](../../../../tests/parse_metrics.py) reads `results.xml` and `metrics.json` from a run directory and produces a markdown report. It has two modes:
+
+### Single-run report
+
+```bash
+python tests/parse_metrics.py \
+  --current tests/results/2025-04-21_14-30-00/
+```
+
+Prints a markdown table of all recorded metrics. Always exits 0.
+
+### Diff / regression check
+
+```bash
+python tests/parse_metrics.py \
+  --current  tests/results/2025-04-21_14-30-00/ \
+  --baseline tests/results/2025-04-20_09-00-00/ \
+  --threshold 20 \
+  --output   report.md    # both optional; --threshold is the regression cutoff in change% (default 20)
+```
+
+Prints a side-by-side comparison. Exits **1** if any metric regresses beyond the threshold; exits 0 otherwise.
+
+The report has three sections per test module:
+
+- **Metrics** — flat table of scalar metrics (test name, metric key, value/baseline, change%)
+- **Sim publishing rates** — pivot table of topic Hz aggregates (mean, start_mean, end_mean, min, max)
+- **Compute usage** — pivot table of CPU/memory/GPU metrics per container
+
+Regressions are flagged with :red_circle:, improvements with :green_circle:.
+
+---
+
+## CI/CD Integration
+
+### Workflow: `integration-tests.yml`
+
+[`.github/workflows/integration-tests.yml`](../../../../.github/workflows/integration-tests.yml) runs on:
+
+- **Pull requests** to `main` or `develop` — automatically runs `build_docker or build_packages` tests (no GPU-intensive liveliness run on every PR)
+- **Manual dispatch** (`workflow_dispatch`) — fully configurable for liveliness runs and metric comparisons
+
+#### Manual dispatch inputs
+
+| Input | Default | Description |
+|-------|---------|-------------|
+| `marks` | `liveliness` | pytest marks expression |
+| `sim` | `msairsim` | Sim targets |
+| `num_robots` | `1` | Robot counts |
+| `stress_iterations` | `1` | Iterations per config |
+| `stable_duration` | `120` | Stability polling seconds |
+| `baseline_run_id` | _(blank)_ | Run ID for comparison; blank = latest `main` run |
+
+#### Jobs
+
+**`run-tests`** runs on the self-hosted GPU runner (`[self-hosted, airstack, gpu]`). It installs dependencies, runs pytest, and uploads `tests/results/` as an artifact named `test-results-<sha>-<run_id>` with 90-day retention.
+
+**`report`** runs on `ubuntu-latest` after `run-tests` (even if it failed). It:
+
+1. Downloads the current artifact
+2. Downloads a baseline artifact (from the base branch for PRs, from `main` for manual runs, or from the specified `baseline_run_id`)
+3. Runs `parse_metrics.py` in diff mode if a baseline is found, otherwise in single-run mode
+4. Posts the markdown report as a PR comment (PR runs) and writes it to the job summary (all runs)
+5. Fails with `::error::` if `parse_metrics.py` exits 1 (regression detected)
+
+#### Required third-party action
+
+The workflow uses [`dawidd6/action-download-artifact@v3`](https://github.com/dawidd6/action-download-artifact) to download artifacts from other workflow runs by branch name. This is a community action and must be trusted in your repository's Actions settings if you use a restricted allowed-actions policy.
+
+---
+
+## Self-Hosted Runner Setup
+
+AirStack's tests require a GPU and Docker, so they run on a self-hosted OpenStack VM. The setup uses the **ephemeral runner** pattern: each runner process registers, executes exactly one job, and then de-registers. This prevents cross-job environment contamination and stale runner accumulation.
+
+### 1. Create the runner user
+
+```bash
+sudo useradd -m -s /bin/bash runner
+sudo usermod -aG docker runner   # allows Docker commands without sudo
+```
+
+### 2. Install the GitHub Actions runner binary
+
+Download the latest runner tarball from [github.com/actions/runner/releases](https://github.com/actions/runner/releases) and unpack it:
+
+```bash
+sudo mkdir -p /opt/actions-runner
+cd /opt/actions-runner
+# Replace the URL with the current release for linux-x64:
+sudo curl -Lo actions-runner.tar.gz https://github.com/actions/runner/releases/download/vX.Y.Z/actions-runner-linux-x64-X.Y.Z.tar.gz
+sudo tar xzf actions-runner.tar.gz -C /opt/actions-runner
+sudo chown -R runner:runner /opt/actions-runner
+```
+
+### 3. Store the GitHub PAT
+
+Create a fine-grained or classic PAT with **`repo`** scope (for private repos) or **`public_repo`** scope (for public repos). Store it securely:
+
+```bash
+echo "ghp_YOUR_TOKEN_HERE" | sudo tee /etc/github-runner-pat
+sudo chmod 600 /etc/github-runner-pat
+sudo chown runner:runner /etc/github-runner-pat
+```
+
+### 4. Configure runner environment
+
+Create `/etc/github-runner-env` (loaded by the systemd unit):
+
+```ini
+REPO_URL=https://github.com/YOUR_ORG/AirStack
+REPO_PATH=YOUR_ORG/AirStack
+RUNNER_LABELS=self-hosted,airstack,gpu
+RUNNER_GROUP=Default
+RUNNER_DIR=/opt/actions-runner
+PAT_FILE=/etc/github-runner-pat
+```
+
+```bash
+sudo chmod 600 /etc/github-runner-env
+sudo chown runner:runner /etc/github-runner-env
+```
+
+### 5. Install the registration script
+
+```bash
+sudo cp .github/runners/register-runner.sh /opt/actions-runner/register-runner.sh
+sudo chown runner:runner /opt/actions-runner/register-runner.sh
+sudo chmod +x /opt/actions-runner/register-runner.sh
+```
+
+### 6. Install and enable the systemd service
+
+```bash
+sudo cp .github/runners/airstack-runner.service /etc/systemd/system/
+sudo systemctl daemon-reload
+sudo systemctl enable --now airstack-runner.service
+```
+
+Check status:
+
+```bash
+sudo systemctl status airstack-runner.service
+sudo journalctl -u airstack-runner.service -f
+```
+
+### 7. Verify runner registration
+
+After the service starts it will loop waiting for a job. Confirm it appears in **GitHub → Repository → Settings → Actions → Runners** with the labels `self-hosted`, `airstack`, `gpu` and status **Idle**.
+
+Trigger a `workflow_dispatch` run and watch the runner pick it up, complete the job, and re-register.
+
+### Security considerations
+
+| Concern | Mitigation |
+|---------|------------|
+| Fork PRs executing arbitrary code on the runner | Workflow has `if: github.event.pull_request.head.repo.full_name == github.repository` — fork PRs are skipped entirely |
+| Cross-job state pollution | `--ephemeral` flag: the runner de-registers after each job; the `register-runner.sh` loop then registers a fresh runner for the next job |
+| Runner running as root | Dedicated non-root `runner` user; never set `RUNNER_ALLOW_RUNASROOT=1` |
+| Docker socket gives root-equivalent access | Accepted risk for lab use; the fork PR guard above limits who can reach the runner |
+| Long-lived PAT stored on disk | Scope the PAT to the minimum required; rotate it periodically; `chmod 600` and owned by `runner` only |
diff --git a/mkdocs.yml b/mkdocs.yml
index b6ebaea65..c14223f78 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -67,10 +67,10 @@ nav:
           - Testing:
               - docs/development/intermediate/testing/index.md
               - docs/development/intermediate/testing/testing_frameworks.md
-              - docs/development/intermediate/testing/unit_testing.md
-              - docs/development/intermediate/testing/integration_testing.md
-              - docs/development/intermediate/testing/system_testing.md
               - docs/development/intermediate/testing/ci_cd.md
+              - docs/development/intermediate/testing/system_testing.md
+              - docs/development/intermediate/testing/integration_testing.md
+              - docs/development/intermediate/testing/unit_testing.md
           - Frame Conventions: docs/development/intermediate/frame_conventions.md
           - Contributing: 
               - docs/development/intermediate/contributing.md

From dbf425f8ba7a48d79e4d06af78693e2a2d06e87e Mon Sep 17 00:00:00 2001
From: OasisArtisan <oalama@andrew.cmu.edu>
Date: Tue, 21 Apr 2026 22:18:26 -0400
Subject: [PATCH 12/24] Tag test docker + add measuring realtime factor to
 liveliness

---
 .airstack/modules/dev.sh         |  5 +-
 tests/docker/docker-compose.yaml |  3 ++
 tests/parse_metrics.py           |  1 +
 tests/test_liveliness.py         | 84 ++++++++++++++++++++++++++++++++
 4 files changed, 91 insertions(+), 2 deletions(-)

diff --git a/.airstack/modules/dev.sh b/.airstack/modules/dev.sh
index d2fa24a6c..1308464b6 100644
--- a/.airstack/modules/dev.sh
+++ b/.airstack/modules/dev.sh
@@ -7,12 +7,13 @@
 function cmd_dev_test {
     check_docker
     local compose_file="$PROJECT_ROOT/tests/docker/docker-compose.yaml"
+    local env_file="$PROJECT_ROOT/.env"
     export AIRSTACK_PATH="$PROJECT_ROOT"
     # Grant X access so sim containers spawned by tests in GUI mode
     # (`pytest --gui`) can reach the host's X server. No-op otherwise.
     xhost + || log_warn "xhost failed (is DISPLAY set? xhost installed?)"
-    docker compose -f "$compose_file" build --quiet
-    docker compose -f "$compose_file" run --rm test pytest "$@"
+    docker compose --env-file "$env_file" -f "$compose_file" build --quiet
+    docker compose --env-file "$env_file" -f "$compose_file" run --rm test pytest "$@"
 }
 
 # Function to build documentation
diff --git a/tests/docker/docker-compose.yaml b/tests/docker/docker-compose.yaml
index 795891d19..4e392d284 100644
--- a/tests/docker/docker-compose.yaml
+++ b/tests/docker/docker-compose.yaml
@@ -1,8 +1,11 @@
 services:
   test:
+    image: &tests_image ${PROJECT_DOCKER_REGISTRY}/${PROJECT_NAME}:v${VERSION}_tests
     build:
       context: ../
       dockerfile: docker/Dockerfile
+      tags:
+        - *tests_image
     volumes:
       - /var/run/docker.sock:/var/run/docker.sock
       - ${AIRSTACK_PATH}:${AIRSTACK_PATH}:ro
diff --git a/tests/parse_metrics.py b/tests/parse_metrics.py
index 2ca4fc1db..27b71a3fe 100644
--- a/tests/parse_metrics.py
+++ b/tests/parse_metrics.py
@@ -39,6 +39,7 @@
     "vram_mb": {"unit": "MB", "direction": "lower_is_better"},
     "gpu_temp_c": {"unit": "°C", "direction": "lower_is_better"},
     "gpu_power_w": {"unit": "W", "direction": "lower_is_better"},
+    "realtime_factor": {"unit": "", "direction": "higher_is_better"},
 }
 COMPUTE_TYPES = tuple(k for k in SAMPLE_TYPES if k != "hz")
 
diff --git a/tests/test_liveliness.py b/tests/test_liveliness.py
index fb2a54a2b..a3e42f502 100644
--- a/tests/test_liveliness.py
+++ b/tests/test_liveliness.py
@@ -180,6 +180,77 @@ def _check_compute_usage(env):
     return True, f"{len(samples)} compute metrics sampled", samples
 
 
+def _read_clock_once(sim_container, setup_bash):
+    """Read one /clock message. Returns (sim_t_seconds, wall_t_seconds) or
+    (None, None) on failure. Wall time is recorded right after parsing so the
+    variable subscription-setup latency of `ros2 topic echo --once` is bounded
+    on the near side — i.e. wall_t is taken just after the sim sample was
+    observed, not before the subscription opened."""
+    result = ros2_exec(
+        sim_container,
+        "timeout 5 ros2 topic echo --once /clock",
+        domain_id=1, setup_bash=setup_bash, timeout=10,
+    )
+    if result.returncode != 0:
+        logger.warning("ros2 topic echo /clock failed (rc=%d): stderr=%s",
+                       result.returncode, result.stderr.strip()[:300])
+        return None, None
+    sec = nsec = None  # filled from the first "sec:" / "nanosec:" lines in the echo output
+    for line in result.stdout.splitlines():
+        s = line.strip()
+        if s.startswith("sec:") and sec is None:
+            try:
+                sec = int(s.split(":", 1)[1].strip())
+            except ValueError:
+                pass  # unparsable value: keep scanning later lines
+        elif s.startswith("nanosec:") and nsec is None:
+            try:
+                nsec = int(s.split(":", 1)[1].strip())
+            except ValueError:
+                pass  # unparsable value: keep scanning later lines
+        if sec is not None and nsec is not None:
+            return sec + nsec * 1e-9, time.time()  # wall clock sampled once both fields are parsed
+    logger.warning("could not parse /clock sec/nanosec. stdout head=%r",
+                   result.stdout[:300])
+    return None, None
+
+
+def _check_realtime_factor(env, sample_interval=20.0):
+    """Measure sim realtime factor = Δ sim_time / Δ wall_time between two
+    /clock reads. Returns (ok, msg, rtf_or_None). ok=False if a /clock read
+    fails, the wall-time delta is non-positive, or rtf < 0.1 (near-stalled).
+
+    Default 20s window keeps subscription-setup jitter (~1s) <5% of the signal.
+    Not called from test_stable (overruns the 10s poll cadence)."""
+    cfg = env["cfg"]
+    sim_container = env["sim_container"]
+    setup_bash = cfg["sim_setup_bash"]
+
+    logger.info("RTF: reading initial /clock from %s", sim_container)
+    sim_t1, wall_t1 = _read_clock_once(sim_container, setup_bash)
+    if sim_t1 is None:
+        return False, "failed to read initial /clock", None
+    logger.info("RTF: initial sim_t=%.3f, sleeping %.1fs", sim_t1, sample_interval)
+
+    time.sleep(sample_interval)
+
+    sim_t2, wall_t2 = _read_clock_once(sim_container, setup_bash)
+    if sim_t2 is None:
+        return False, "failed to read final /clock", None
+
+    wall_delta = wall_t2 - wall_t1
+    sim_delta = sim_t2 - sim_t1
+    logger.info("RTF: final sim_t=%.3f (sim Δ=%.3fs, wall Δ=%.3fs)",
+                sim_t2, sim_delta, wall_delta)
+    if wall_delta <= 0:
+        return False, "non-positive wall time delta", None
+    rtf = sim_delta / wall_delta
+    logger.info("RTF: %.3f", rtf)
+    if rtf < 0.1:
+        return False, f"RTF={rtf:.3f} (sim near-stalled)", rtf
+    return True, f"RTF={rtf:.3f}", rtf
+
+
 # ── tests ──────────────────────────────────────────────────────────────────
 
 def _poll_until(predicate, timeout, interval, fail_msg):
@@ -267,6 +338,19 @@ def test_compute_usage(self, airstack_env):
         ok, msg, _ = _check_compute_usage(airstack_env)
         assert ok, msg
 
+    @pytest.mark.dependency(name="rtf", depends=["sim_ready"])
+    def test_realtime_factor(self, airstack_env):
+        """Measure RTF = Δ sim_time / Δ wall_time from /clock. Fails if the
+        check reports not-ok (unreadable /clock, or RTF < 0.1 near-stall);
+        other low values are recorded but not gating."""
+        ok, msg, rtf = _check_realtime_factor(airstack_env)
+        if rtf is not None:
+            get_metrics().record(
+                current_test_id(), "sim.realtime_factor", round(rtf, 3),
+                unit="", direction="higher_is_better",
+            )
+        assert ok, msg
+
     @pytest.mark.dependency(name="nodes", depends=["containers"])
     def test_sentinel_nodes_present(self, airstack_env):
         """Wait up to 300s for the expected sentinel nodes per robot."""

From 95b52abd981f3e12f4712ec260963e5607dfd2f9 Mon Sep 17 00:00:00 2001
From: OasisArtisan <oalama@andrew.cmu.edu>
Date: Wed, 22 Apr 2026 00:29:58 -0400
Subject: [PATCH 13/24] Initial working autonomy (Takeoff, hover, land) tests.

Still need to make the metrics more informative and add informative logging messages.

Also have not tested calculating state estimation errors, as that will need ground truth from airsim and isaacsim
---
 tests/conftest.py      |  44 ++++
 tests/pytest.ini       |   3 +-
 tests/requirements.txt |   1 +
 tests/test_autonomy.py | 528 +++++++++++++++++++++++++++++++++++++++++
 4 files changed, 574 insertions(+), 2 deletions(-)
 create mode 100644 tests/test_autonomy.py

diff --git a/tests/conftest.py b/tests/conftest.py
index f94f73f43..cc1a788c7 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -71,6 +71,9 @@ def pytest_addoption(parser):
     parser.addoption("--gui", action="store_true", default=False,
                      help="Show sim GUI windows for visual sanity checks. "
                           "Default: headless (no X, good for CI).")
+    parser.addoption("--takeoff-velocities", default="0.5,1,2",
+                     help="Comma-separated takeoff/land velocities (m/s) to "
+                          "sweep in test_autonomy. Default: 0.5,1,2")
 
 
 def pytest_configure(config):
@@ -101,6 +104,14 @@ def pytest_runtest_teardown(item):
     _CURRENT_ITEM = None
 
 
+@pytest.hookimpl(hookwrapper=True)
+def pytest_runtest_makereport(item, call):
+    """Attach phase reports to the item so fixtures can inspect pass/fail."""
+    outcome = yield
+    rep = outcome.get_result()
+    setattr(item, f"_rep_{rep.when}", rep)
+
+
 @contextmanager
 def logger_to(log_name):
     """Temporarily route `logger` to a different file. Suspends any handlers
@@ -136,6 +147,37 @@ def pytest_generate_tests(metafunc):
     metafunc.parametrize("airstack_env", params, ids=ids, indirect=True, scope="class")
 
 
+# Sort autonomy tests by (airstack_env, velocity, phase) so the stack comes
+# up once per env and the drone goes ground→air→ground per velocity.
+_AUTONOMY_PHASE_ORDER = {
+    "test_px4_ready": 0,
+    "test_takeoff":   1,
+    "test_hover":     2,
+    "test_landing":   3,
+}
+
+
+def pytest_collection_modifyitems(items):
+    def phase(item):
+        if getattr(item.module, "__name__", "") != "test_autonomy":
+            return None
+        name = item.originalname or item.name.split("[", 1)[0]
+        return _AUTONOMY_PHASE_ORDER.get(name)
+
+    def sort_key(item):
+        cs = getattr(item, "callspec", None)
+        env = cs.params.get("airstack_env", ()) if cs else ()
+        vel = float(cs.params.get("velocity", 0.0)) if cs else 0.0
+        return (env, vel, phase(item))
+
+    slots = [(i, it) for i, it in enumerate(items) if phase(it) is not None]
+    if not slots:
+        return
+    sorted_items = sorted((it for _, it in slots), key=sort_key)
+    for (i, _), new_item in zip(slots, sorted_items):
+        items[i] = new_item
+
+
 # ── logging / subprocess helpers ───────────────────────────────────────────
 
 def _nodeid_dotted(nodeid, with_path_sep=False):
@@ -509,6 +551,8 @@ def airstack_env(request):
     env_overrides.update(cfg.get("extra_env", {}))
 
     with logger_to(log):
+        logger.info("Shutting down any previously running stack")
+        airstack_cmd("down", timeout=120, log_name=log)
         logger.info("Bringing up stack: sim=%s num_robots=%d iter=%d headless=%s",
                     sim, num_robots, iteration, headless)
         t0 = time.time()
diff --git a/tests/pytest.ini b/tests/pytest.ini
index 2650a4181..f55a8e6e5 100644
--- a/tests/pytest.ini
+++ b/tests/pytest.ini
@@ -3,8 +3,7 @@ markers =
     build_docker: Docker image build tests
     build_packages: Colcon workspace build tests
     liveliness: Container health and ROS2 node presence
-    comms: Cross-container ROS2 communication
-    takeoff: Takeoff/land scenario tests
+    autonomy: Autonomy action tests (takeoff / hover / land)
 testpaths = .
 addopts = -v --durations=0
 cache_dir = /tmp/.pytest_cache
diff --git a/tests/requirements.txt b/tests/requirements.txt
index 382731834..bc2a16d4d 100644
--- a/tests/requirements.txt
+++ b/tests/requirements.txt
@@ -3,3 +3,4 @@ pytest-timeout
 pytest-dependency
 tabulate
 psutil
+pandas
diff --git a/tests/test_autonomy.py b/tests/test_autonomy.py
new file mode 100644
index 000000000..89d036fc4
--- /dev/null
+++ b/tests/test_autonomy.py
@@ -0,0 +1,528 @@
+"""Autonomy tests — 4-phase chain per velocity.
+
+Per (sim, num_robots, iter, velocity): ready → takeoff → hover → land.
+Drone returns to ground at end of each velocity so the next velocity
+starts fresh. The `pytest_collection_modifyitems` hook in conftest.py reorders
+the autonomy tests so the full 4-phase chain runs per velocity before pytest
+advances to the next velocity.
+"""
+import bisect
+import math
+import statistics
+import subprocess
+import time
+from concurrent.futures import ThreadPoolExecutor
+from io import StringIO
+
+import pandas as pd
+import pytest
+
+from conftest import (
+    ROS_DISTRO_SETUP,
+    current_test_id,
+    get_metrics,
+    get_robot_containers,
+    logger,
+    ros2_exec,
+)
+
+# ── configuration ──────────────────────────────────────────────────────────
+
+TARGET_ALTITUDE_M = 10.0
+HOVER_DURATION_S = 10.0
+PX4_READY_TIMEOUT_S = 300.0
+PX4_POLL_INTERVAL_S = 2.0
+MOTION_ABOVE_START_M = 0.3  # z threshold for "drone started moving" (relative to z[0])
+
+# Full column schemas of `ros2 topic echo --csv` output, in declaration order.
+# Covariance arrays expand to 36 comma-separated values each. Downstream code
+# reads only the ~9 fields it cares about by name (e.g. "pose.pose.position.z")
+# — other columns are parsed but unused.
+ODOM_SCHEMA = (
+    ["header.stamp.sec", "header.stamp.nanosec",
+     "header.frame_id", "child_frame_id",
+     "pose.pose.position.x", "pose.pose.position.y", "pose.pose.position.z",
+     "pose.pose.orientation.x", "pose.pose.orientation.y",
+     "pose.pose.orientation.z", "pose.pose.orientation.w"]
+    + [f"pose.covariance[{i}]" for i in range(36)]
+    + ["twist.twist.linear.x", "twist.twist.linear.y", "twist.twist.linear.z",
+       "twist.twist.angular.x", "twist.twist.angular.y", "twist.twist.angular.z"]
+    + [f"twist.covariance[{i}]" for i in range(36)]
+)
+POSE_SCHEMA = [
+    "header.stamp.sec", "header.stamp.nanosec", "header.frame_id",
+    "pose.position.x", "pose.position.y", "pose.position.z",
+    "pose.orientation.x", "pose.orientation.y",
+    "pose.orientation.z", "pose.orientation.w",
+]
+
+METRIC_UNITS = {
+    "ready_duration_sys_s": "s",
+    "takeoff_duration_sim_s": "s",
+    "land_duration_sim_s": "s",
+    "velocity_rmse_m_sim_s": "m/s",
+    "attitude_stddev_rad": "rad",
+    # Everything else: "m".
+}
+
+
+def _phase_timeout(velocity):
+    """Takeoff/land timeout scaled so 0.5 m/s runs don't time out spuriously."""
+    return max(30.0, TARGET_ALTITUDE_M / velocity + 15.0)
+
+
+# ── pytest hooks ───────────────────────────────────────────────────────────
+
+def pytest_generate_tests(metafunc):
+    """Parametrize tests that request `velocity` from --takeoff-velocities.
+
+    Phase-order reordering (so the 4-test chain runs per-velocity, not
+    parametrize-first) is done by `pytest_collection_modifyitems` in
+    conftest.py — that hook isn't discovered from test modules.
+    """
+    if "velocity" in metafunc.fixturenames:
+        raw = metafunc.config.getoption("--takeoff-velocities")
+        vels = [float(v) for v in raw.split(",") if v.strip()]
+        metafunc.parametrize("velocity", vels, ids=[f"v{v}" for v in vels])
+
+
+# ── subprocess / CSV helpers ───────────────────────────────────────────────
+
+def _start_csv_stream(container, topic, domain, setup_bash,
+                      duration_s, out_path):
+    """Background `ros2 topic echo --csv` streaming to out_path.
+
+    Each message prints as a single CSV line with all primitives flattened in
+    declaration order. `_parse_csv` labels every column from a full schema and
+    callers then read fields by name. `--no-arr`/`--no-str` are deliberately
+    NOT used: they replace fields with placeholder strings (e.g. `<string
+    length: <0>>`) instead of dropping them, which would break index mapping.
+
+    Returns (popen, file_handle, err_file_handle). Caller must close both
+    file handles after the process terminates (see `_finish_captures`).
+    """
+    cmd = (
+        f"source {ROS_DISTRO_SETUP} && source {setup_bash} && "
+        f"export ROS_DOMAIN_ID={domain} && "
+        f"timeout {int(duration_s)} ros2 topic echo --csv {topic}"
+    )
+    f = open(out_path, "w")
+    ef = open(out_path + ".err", "w")  # stderr is written next to the CSV as <out_path>.err
+    try:
+        proc = subprocess.Popen(
+            ["docker", "exec", container, "bash", "-c", cmd],
+            stdout=f, stderr=ef,
+        )
+    except BaseException:  # Popen failed: close both handles, then re-raise
+        f.close()
+        ef.close()
+        raise
+    return proc, f, ef
+
+
+def _parse_csv(path, schema):
+    """Read ros2 `--csv` output. `schema` names every column in the flattened
+    CSV in declaration order. Non-CSV lines (stray `WARNING:` prints ros2 emits
+    to stdout) are filtered before pandas parses."""
+    with open(path) as f:
+        good = [line for line in f if line.count(",") >= len(schema) - 1]
+    if not good:
+        return []
+    df = pd.read_csv(StringIO("".join(good)), header=None, names=schema)
+    return df.to_dict("records")
+
+
+def _stamp(row, prefix="header.stamp"):
+    """Sim-time seconds from a parsed row."""
+    return row[f"{prefix}.sec"] + row[f"{prefix}.nanosec"] * 1e-9
+
+
+# ── action result parsing ──────────────────────────────────────────────────
+
+def _action_ok(stdout):
+    """True when ros2 action send_goal --feedback reports success: true (YAML bool)."""
+    return "success: true" in stdout
+
+
+def _action_message(stdout):
+    for line in stdout.splitlines():
+        s = line.strip()
+        if s.startswith("message:"):
+            return s[len("message:"):].strip().strip("'\"")
+    return "\n".join(stdout.strip().splitlines()[-5:])
+
+
+# ── metric computation ────────────────────────────────────────────────────
+
+def _roll_pitch(qx, qy, qz, qw):
+    """Quaternion → (roll, pitch) in radians. Yaw is unused at hover."""
+    roll = math.atan2(2.0 * (qw * qx + qy * qz), 1.0 - 2.0 * (qx * qx + qy * qy))
+    sinp = max(-1.0, min(1.0, 2.0 * (qw * qy - qz * qx)))
+    pitch = math.asin(sinp)
+    return roll, pitch
+
+
+def _valid_range(start, end):
+    """True iff both indices are set and end follows start."""
+    return start is not None and end is not None and end > start
+
+
+def _velocity_rmse(ts, zs, i0, i1, v_cmd):
+    """RMSE of dz/dt vs commanded velocity across the [i0, i1] sample range."""
+    sq_errs = []
+    for i in range(i0 + 1, i1 + 1):
+        dt = ts[i] - ts[i - 1]
+        if dt > 1e-6:
+            sq_errs.append(((zs[i] - zs[i - 1]) / dt - v_cmd) ** 2)
+    if not sq_errs:
+        return None
+    return math.sqrt(sum(sq_errs) / len(sq_errs))
+
+
+def _tracking_metrics_takeoff(odom, target, velocity):
+    zs = [r["pose.pose.position.z"] for r in odom]
+    ts = [_stamp(r) for r in odom]
+    peak = max(zs)
+    out = {
+        "peak_altitude_m": round(peak, 3),
+        "altitude_error_m": round(abs(peak - target), 3),
+        "overshoot_m": round(max(0.0, peak - target), 3),
+    }
+    # Motion threshold is relative to starting altitude so drones that spawn
+    # slightly above ground (landing gear, URDF origin offset) don't register
+    # the first sample as "already moving".
+    z0 = zs[0]
+    first_motion = next((i for i, z in enumerate(zs)
+                         if z > z0 + MOTION_ABOVE_START_M), None)
+    first_at_target = next((i for i, z in enumerate(zs) if z >= target * 0.95), None)
+    if _valid_range(first_motion, first_at_target):
+        out["takeoff_duration_sim_s"] = round(ts[first_at_target] - ts[first_motion], 3)
+        rmse = _velocity_rmse(ts, zs, first_motion, first_at_target, velocity)
+        if rmse is not None:
+            out["velocity_rmse_m_sim_s"] = round(rmse, 3)
+    return out
+
+
+def _tracking_metrics_hover(odom, target):
+    xs = [r["pose.pose.position.x"] for r in odom]
+    ys = [r["pose.pose.position.y"] for r in odom]
+    zs = [r["pose.pose.position.z"] for r in odom]
+    out = {
+        "hover_altitude_stddev_m": round(statistics.pstdev(zs) if len(zs) > 1 else 0.0, 3),
+        "hover_altitude_mean_error_m": round(abs(statistics.mean(zs) - target), 3),
+    }
+    x0, y0 = xs[0], ys[0]
+    drift = max(math.sqrt((x - x0) ** 2 + (y - y0) ** 2) for x, y in zip(xs, ys))
+    out["horizontal_drift_max_m"] = round(drift, 3)
+    if len(odom) > 1:
+        rolls, pitches = zip(*(_roll_pitch(
+            r["pose.pose.orientation.x"], r["pose.pose.orientation.y"],
+            r["pose.pose.orientation.z"], r["pose.pose.orientation.w"])
+            for r in odom))
+        out["attitude_stddev_rad"] = round(
+            statistics.pstdev(rolls) + statistics.pstdev(pitches), 3)
+    return out
+
+
+def _tracking_metrics_landing(odom, velocity):
+    zs = [r["pose.pose.position.z"] for r in odom]
+    ts = [_stamp(r) for r in odom]
+    out = {"final_altitude_m": round(zs[-1], 3)}
+    peak = max(zs)
+    first_descent = next((i for i, z in enumerate(zs) if z < peak * 0.8), None)
+    first_at_ground = next((i for i, z in enumerate(zs) if z < 0.5), None)
+    if _valid_range(first_descent, first_at_ground):
+        out["land_duration_sim_s"] = round(ts[first_at_ground] - ts[first_descent], 3)
+        rmse = _velocity_rmse(ts, zs, first_descent, first_at_ground, -velocity)
+        if rmse is not None:
+            out["velocity_rmse_m_sim_s"] = round(rmse, 3)
+    return out
+
+
+def _gt_metrics(odom, gt):
+    """Odom vs ground-truth state-estimation error. Empty dict when GT missing."""
+    if not gt:
+        return {}
+    gt_sorted = sorted(gt, key=_stamp)
+    gt_stamps = [_stamp(r) for r in gt_sorted]
+    errs, z_biases = [], []
+    for row in odom:
+        t = _stamp(row)
+        i = bisect.bisect_left(gt_stamps, t)
+        candidates = []
+        if i > 0:
+            candidates.append(gt_sorted[i - 1])
+        if i < len(gt_sorted):
+            candidates.append(gt_sorted[i])
+        if not candidates:
+            continue
+        best = min(candidates, key=lambda r: abs(_stamp(r) - t))
+        ox, oy, oz = (row["pose.pose.position.x"],
+                      row["pose.pose.position.y"],
+                      row["pose.pose.position.z"])
+        gx, gy, gz = (best["pose.position.x"],
+                      best["pose.position.y"],
+                      best["pose.position.z"])
+        errs.append(math.sqrt((ox - gx) ** 2 + (oy - gy) ** 2 + (oz - gz) ** 2))
+        z_biases.append(oz - gz)
+    if not errs:
+        return {}
+    return {
+        "odometry_error_mean_m": round(statistics.mean(errs), 3),
+        "odometry_error_max_m": round(max(errs), 3),
+        "odometry_altitude_bias_m": round(statistics.mean(z_biases), 3),
+    }
+
+
+def _record(robot_n, metrics_dict):
+    """Record per-robot scalar metrics; units come from METRIC_UNITS (default: m)."""
+    m = get_metrics()
+    tid = current_test_id()
+    for key, value in metrics_dict.items():
+        if value is None:  # None entries are skipped rather than recorded
+            continue
+        unit = METRIC_UNITS.get(key, "m")
+        m.record(tid, f"robot_{robot_n}.{key}", value,
+                 unit=unit, direction="lower_is_better")  # same direction for every key
+
+
+# ── capture bundle helper ──────────────────────────────────────────────────
+
+def _start_captures(robot_container, setup_bash, domain, duration_s, tag):
+    """Start odom + ground-truth CSV streams for one robot. Returns a handle
+    that `_finish_captures` later consumes to stop the streams and parse both
+    CSVs. The handle also carries `duration_s`, the cap passed to each stream's
+    in-container `timeout`; `_finish_captures` does not read it at present."""
+    odom_path = f"/tmp/auto_r{domain}_{tag}_odom.csv"  # `domain` doubles as the robot index
+    gt_path = f"/tmp/auto_r{domain}_{tag}_gt.csv"
+    odom_proc, odom_fh, odom_ef = _start_csv_stream(
+        robot_container, f"/robot_{domain}/interface/mavros/local_position/odom",
+        domain, setup_bash, duration_s, odom_path)
+    gt_proc, gt_fh, gt_ef = _start_csv_stream(
+        robot_container, f"/robot_{domain}/ground_truth/pose",
+        domain, setup_bash, duration_s, gt_path)
+    return {
+        "duration_s": duration_s,
+        "odom": (odom_proc, odom_fh, odom_ef, odom_path),
+        "gt": (gt_proc, gt_fh, gt_ef, gt_path),
+    }
+
+
+def _finish_captures(streams):
+    """Stop capture subprocesses and return parsed (odom, gt) samples.
+    Callers invoke this right after the action completes, so we actively
+    terminate the captures instead of waiting for their internal `timeout N`
+    to elapse — otherwise fast takeoffs would block until the full capture
+    window expires. gt will be empty if no ground-truth publisher exists."""
+    (odom_proc, odom_fh, odom_ef, odom_path) = streams["odom"]
+    (gt_proc, gt_fh, gt_ef, gt_path) = streams["gt"]
+    try:
+        for proc in (odom_proc, gt_proc):
+            proc.terminate()
+            try:
+                proc.wait(timeout=5)
+            except subprocess.TimeoutExpired:
+                proc.kill()
+                proc.wait(timeout=5)
+    finally:
+        odom_fh.close()
+        gt_fh.close()
+        odom_ef.close()
+        gt_ef.close()
+    odom = _parse_csv(odom_path, ODOM_SCHEMA)
+    gt = _parse_csv(gt_path, POSE_SCHEMA)
+    if not odom:
+        logger.warning("odom capture empty. stdout head=%r stderr head=%r",
+                       open(odom_path).read(500),
+                       open(odom_path + ".err").read(500))
+    if not gt:
+        logger.warning("ground truth not available — skipping state-estimation error metrics.")
+    return odom, gt
+
+
+# ── per-robot workers (run in parallel for multi-robot) ───────────────────
+
+def _run_parallel(num_robots, fn):
+    """Run `fn(n)` for n=1..num_robots concurrently. If any worker raises, the
+    exception surfaces after all workers finish (so partial multi-robot
+    failures still show all results). Single-robot runs skip the executor."""
+    if num_robots == 1:
+        fn(1)
+        return
+    with ThreadPoolExecutor(max_workers=num_robots) as ex:
+        list(ex.map(fn, range(1, num_robots + 1)))
+
+
+def _takeoff_one_robot(n, robot_container, cfg, velocity):
+    timeout = _phase_timeout(velocity)
+    target = TARGET_ALTITUDE_M
+    streams = _start_captures(robot_container, cfg["robot_setup_bash"],
+                              n, timeout + 5, f"v{velocity}_takeoff")
+    goal = f"{{target_altitude_m: {target}, velocity_m_s: {velocity}}}"
+    result = ros2_exec(
+        robot_container,
+        f'ros2 action send_goal --feedback /robot_{n}/tasks/takeoff '
+        f'task_msgs/action/TakeoffTask "{goal}"',
+        domain_id=n, setup_bash=cfg["robot_setup_bash"],
+        timeout=int(timeout + 10),
+    )
+    odom, gt = _finish_captures(streams)
+    if not _action_ok(result.stdout):
+        pytest.fail(f"robot_{n} takeoff failed: {_action_message(result.stdout)}")
+    if not odom:
+        pytest.fail(f"robot_{n} takeoff: no odom samples captured")
+    metrics = _tracking_metrics_takeoff(odom, target, velocity)
+    metrics.update(_gt_metrics(odom, gt))
+    _record(n, metrics)
+    peak = metrics["peak_altitude_m"]
+    assert peak >= target * 0.9, (
+        f"robot_{n} peak altitude {peak:.2f}m < target*0.9={target * 0.9:.2f}m")
+
+
+def _hover_one_robot(n, robot_container, cfg, velocity):
+    target = TARGET_ALTITUDE_M
+    streams = _start_captures(robot_container, cfg["robot_setup_bash"],
+                              n, HOVER_DURATION_S + 2, f"v{velocity}_hover")
+    # Passive phase: no blocking action, so we sleep to let the capture
+    # collect samples before _finish_captures terminates it.
+    time.sleep(HOVER_DURATION_S)
+    odom, gt = _finish_captures(streams)
+    if not odom:
+        pytest.fail(f"robot_{n} hover: no odom samples captured")
+    metrics = _tracking_metrics_hover(odom, target)
+    metrics.update(_gt_metrics(odom, gt))
+    _record(n, metrics)
+    mean_err = metrics["hover_altitude_mean_error_m"]
+    assert mean_err < 0.5, (
+        f"robot_{n} hover altitude mean error {mean_err:.2f}m exceeds ±0.5m tolerance")
+
+
+def _landing_one_robot(n, robot_container, cfg, velocity):
+    timeout = _phase_timeout(velocity)
+    streams = _start_captures(robot_container, cfg["robot_setup_bash"],
+                              n, timeout + 5, f"v{velocity}_land")
+    goal = f"{{velocity_m_s: {velocity}}}"
+    result = ros2_exec(
+        robot_container,
+        f'ros2 action send_goal --feedback /robot_{n}/tasks/land '
+        f'task_msgs/action/LandTask "{goal}"',
+        domain_id=n, setup_bash=cfg["robot_setup_bash"],
+        timeout=int(timeout + 10),
+    )
+    odom, gt = _finish_captures(streams)
+    if not _action_ok(result.stdout):
+        pytest.fail(f"robot_{n} land failed: {_action_message(result.stdout)}")
+    if not odom:
+        pytest.fail(f"robot_{n} land: no odom samples captured")
+    metrics = _tracking_metrics_landing(odom, velocity)
+    metrics.update(_gt_metrics(odom, gt))
+    _record(n, metrics)
+    final = metrics["final_altitude_m"]
+    assert final < 0.5, f"robot_{n} final altitude {final:.2f}m > 0.5m"
+
+
+# ── tests ──────────────────────────────────────────────────────────────────
+
+@pytest.mark.autonomy
+@pytest.mark.timeout(1800)
+class TestAutonomy:
+
+    @pytest.fixture(scope="session")
+    def _failed_envs(self):
+        return set()
+
+    @pytest.fixture(scope="session")
+    def _ready_envs(self):
+        return set()
+
+    @pytest.fixture(autouse=True)
+    def _chain_guard(self, request, airstack_env, _failed_envs):
+        env_id = (airstack_env["sim"], airstack_env["num_robots"],
+                  airstack_env["iteration"])
+        if env_id in _failed_envs:
+            pytest.skip(f"earlier autonomy test failed in {env_id}")
+        yield
+        rep = getattr(request.node, "_rep_call", None)
+        if rep is not None and rep.failed:
+            _failed_envs.add(env_id)
+
+    @pytest.mark.dependency(name="autonomy_ready")
+    def test_px4_ready(self, airstack_env, velocity, _ready_envs):
+        """Wait until every robot's MAVROS reports connected=True. Skipped on
+        velocities after the first in the same airstack_env — the drone is
+        already proven alive and post-landing MAV_STATE fluctuations shouldn't
+        re-gate the chain. The takeoff action handles arming/mode itself, so
+        STANDBY is not a required precondition."""
+        env_id = (airstack_env["sim"], airstack_env["num_robots"],
+                  airstack_env["iteration"])
+        if env_id in _ready_envs:
+            logger.info("px4_ready already confirmed for %s; skipping", env_id)
+            return
+
+        cfg = airstack_env["cfg"]
+        robot_container = get_robot_containers(airstack_env["robot_pattern"])[0]
+        num_robots = airstack_env["num_robots"]
+
+        started = time.time()
+        ready_at = {}
+        last_seen = {}
+        pending = list(range(1, num_robots + 1))
+        deadline = started + PX4_READY_TIMEOUT_S
+
+        while pending and time.time() < deadline:
+            for n in list(pending):
+                result = ros2_exec(
+                    robot_container,
+                    f"timeout 5 ros2 topic echo --once --csv "
+                    f"--field connected /robot_{n}/interface/mavros/state",
+                    domain_id=n, setup_bash=cfg["robot_setup_bash"], timeout=10,
+                )
+                for line in result.stdout.splitlines():
+                    s = line.strip()
+                    if s in ("True", "False"):
+                        last_seen[n] = s
+                        if s == "True":
+                            ready_at[n] = round(time.time() - started, 2)
+                            pending.remove(n)
+                        break
+            if pending:
+                logger.info("waiting for MAVROS connected=True; pending=%s "
+                            "last_seen=%s elapsed=%.0fs",
+                            pending, last_seen, time.time() - started)
+                time.sleep(PX4_POLL_INTERVAL_S)
+
+        if pending:
+            last = {n: last_seen.get(n, "no-sample") for n in sorted(pending)}
+            pytest.fail(f"robots {sorted(pending)} never reported connected=True "
+                        f"within {PX4_READY_TIMEOUT_S:.0f}s. Last seen: {last}")
+
+        for n, dur in ready_at.items():
+            _record(n, {"ready_duration_sys_s": dur})
+        _ready_envs.add(env_id)
+
+    @pytest.mark.dependency(name="autonomy_takeoff", depends=["autonomy_ready"])
+    def test_takeoff(self, airstack_env, velocity):
+        """Send TakeoffTask per robot in parallel; verify peak altitude and record metrics."""
+        cfg = airstack_env["cfg"]
+        robot_container = get_robot_containers(airstack_env["robot_pattern"])[0]
+        num_robots = airstack_env["num_robots"]
+        _run_parallel(num_robots,
+                      lambda n: _takeoff_one_robot(n, robot_container, cfg, velocity))
+
+    @pytest.mark.dependency(name="autonomy_hover", depends=["autonomy_takeoff"])
+    def test_hover(self, airstack_env, velocity):
+        """Observe odom for HOVER_DURATION_S seconds per robot in parallel; check stability."""
+        cfg = airstack_env["cfg"]
+        robot_container = get_robot_containers(airstack_env["robot_pattern"])[0]
+        num_robots = airstack_env["num_robots"]
+        _run_parallel(num_robots,
+                      lambda n: _hover_one_robot(n, robot_container, cfg, velocity))
+
+    @pytest.mark.dependency(name="autonomy_landing", depends=["autonomy_hover"])
+    def test_landing(self, airstack_env, velocity):
+        """Send LandTask per robot in parallel; verify final altitude and record metrics."""
+        cfg = airstack_env["cfg"]
+        robot_container = get_robot_containers(airstack_env["robot_pattern"])[0]
+        num_robots = airstack_env["num_robots"]
+        _run_parallel(num_robots,
+                      lambda n: _landing_one_robot(n, robot_container, cfg, velocity))

From dacaeb422e3f675f398dc01761034455d1a3dfb0 Mon Sep 17 00:00:00 2001
From: OasisArtisan <oalama@andrew.cmu.edu>
Date: Thu, 23 Apr 2026 09:34:17 -0400
Subject: [PATCH 14/24] Add airsim GT publishing and compute odom vs GT metrics

---
 .../ms_airsim_ros_bridge/bridge_node.py       | 102 +++++++++++++++-
 .../src/ms_airsim_ros_bridge/package.xml      |   1 +
 tests/test_autonomy.py                        | 111 ++++++++----------
 3 files changed, 154 insertions(+), 60 deletions(-)

diff --git a/simulation/ms-airsim/ros_ws/src/ms_airsim_ros_bridge/ms_airsim_ros_bridge/bridge_node.py b/simulation/ms-airsim/ros_ws/src/ms_airsim_ros_bridge/ms_airsim_ros_bridge/bridge_node.py
index de011b4c4..415a11adf 100644
--- a/simulation/ms-airsim/ros_ws/src/ms_airsim_ros_bridge/ms_airsim_ros_bridge/bridge_node.py
+++ b/simulation/ms-airsim/ros_ws/src/ms_airsim_ros_bridge/ms_airsim_ros_bridge/bridge_node.py
@@ -15,9 +15,32 @@
 import rclpy
 from rclpy.node import Node
 from sensor_msgs.msg import Image, CameraInfo
+from nav_msgs.msg import Odometry
 from rosgraph_msgs.msg import Clock
 
 
+_SQRT2_INV = 1.0 / math.sqrt(2.0)
+
+
+def _ned_vec_to_enu(x_ned, y_ned, z_ned):
+    """World-frame vector (position/velocity) in NED → ENU."""
+    return y_ned, x_ned, -z_ned
+
+
+def _ned_quat_to_enu(qx, qy, qz, qw):
+    """Body-FRD-in-world-NED quaternion (AirSim) → body-FLU-in-world-ENU (ROS).
+
+    Composition: q_enu = q_NED_TO_ENU * q_ned * q_FLU_TO_FRD, where
+    q_NED_TO_ENU is 180° about (1,1,0)/√2 and q_FLU_TO_FRD is 180° about +X.
+    Returns (x, y, z, w) in geometry_msgs/Quaternion order.
+    """
+    w_enu = -_SQRT2_INV * (qw + qz)
+    x_enu = -_SQRT2_INV * (qx + qy)
+    y_enu = _SQRT2_INV * (qy - qx)
+    z_enu = _SQRT2_INV * (qz - qw)
+    return x_enu, y_enu, z_enu, w_enu
+
+
 class MsAirSimRosBridge(Node):
 
     def __init__(self):
@@ -27,11 +50,13 @@ def __init__(self):
         self.declare_parameter('publish_rate', 15.0)
         self.declare_parameter('robot_name', 'robot_1')
         self.declare_parameter('clock_rate', 50.0)
+        self.declare_parameter('gt_rate', 50.0)
 
         ip = self.get_parameter('ms_airsim_ip').value
         rate = self.get_parameter('publish_rate').value
         robot_name = self.get_parameter('robot_name').value
         clock_rate = self.get_parameter('clock_rate').value
+        gt_rate = self.get_parameter('gt_rate').value
         self.vehicle_name = robot_name
 
         # Connect to AirSim (retry until ready)
@@ -85,6 +110,9 @@ def __init__(self):
         self.left_depth_pub = self.create_publisher(Image, f'{prefix}/left/depth_ground_truth', 1)
         self.right_depth_pub = self.create_publisher(Image, f'{prefix}/right/depth_ground_truth', 1)
         self.clock_pub = self.create_publisher(Clock, '/clock', 10)
+        self.gt_pub = self.create_publisher(
+            Odometry, f'/{robot_name}/odom_ground_truth', 10
+        )
 
         self._shutdown = threading.Event()
 
@@ -95,6 +123,16 @@ def __init__(self):
         self._clock_thread = threading.Thread(target=self._clock_loop, daemon=True)
         self._clock_thread.start()
 
+        # Dedicated ground-truth-odom thread with its own AirSim client.
+        # AirSim kinematics are world-NED / body-FRD; we convert to world-ENU /
+        # body-FLU so the topic matches the frame of MAVROS local_position/odom
+        # and the test's _gt_metrics can compare component-wise.
+        self._gt_client = airsim.MultirotorClient(ip=ip)
+        self._gt_client.confirmConnection()
+        self._gt_interval = 1.0 / gt_rate
+        self._gt_thread = threading.Thread(target=self._gt_loop, daemon=True)
+        self._gt_thread.start()
+
         # Background image fetcher with its own AirSim client
         self._image_client = airsim.MultirotorClient(ip=ip)
         self._image_client.confirmConnection()
@@ -111,7 +149,9 @@ def __init__(self):
         )
 
     def _clock_loop(self):
+        next_t = time.monotonic()
         while not self._shutdown.is_set():
+            next_t += self._clock_interval
             try:
                 state = self._clock_client.getMultirotorState(
                     vehicle_name=self.vehicle_name
@@ -123,7 +163,66 @@ def _clock_loop(self):
                 self.clock_pub.publish(clock_msg)
             except Exception:
                 pass
-            time.sleep(self._clock_interval)
+            slack = next_t - time.monotonic()
+            if slack > 0:
+                time.sleep(slack)
+
+    def _gt_loop(self):
+        next_t = time.monotonic()
+        while not self._shutdown.is_set():
+            next_t += self._gt_interval
+            try:
+                k = self._gt_client.simGetGroundTruthKinematics(
+                    vehicle_name=self.vehicle_name
+                )
+                sim_ts = self._gt_client.getMultirotorState(
+                    vehicle_name=self.vehicle_name
+                ).timestamp
+
+                msg = Odometry()
+                msg.header.stamp.sec = int(sim_ts // 1_000_000_000)
+                msg.header.stamp.nanosec = int(sim_ts % 1_000_000_000)
+                msg.header.frame_id = f'{self.vehicle_name}/map'
+                msg.child_frame_id = f'{self.vehicle_name}/base_link'
+
+                px, py, pz = _ned_vec_to_enu(
+                    k.position.x_val, k.position.y_val, k.position.z_val
+                )
+                msg.pose.pose.position.x = px
+                msg.pose.pose.position.y = py
+                msg.pose.pose.position.z = pz
+
+                qx, qy, qz, qw = _ned_quat_to_enu(
+                    k.orientation.x_val, k.orientation.y_val,
+                    k.orientation.z_val, k.orientation.w_val,
+                )
+                msg.pose.pose.orientation.x = qx
+                msg.pose.pose.orientation.y = qy
+                msg.pose.pose.orientation.z = qz
+                msg.pose.pose.orientation.w = qw
+
+                lx, ly, lz = _ned_vec_to_enu(
+                    k.linear_velocity.x_val, k.linear_velocity.y_val,
+                    k.linear_velocity.z_val,
+                )
+                msg.twist.twist.linear.x = lx
+                msg.twist.twist.linear.y = ly
+                msg.twist.twist.linear.z = lz
+
+                ax, ay, az = _ned_vec_to_enu(
+                    k.angular_velocity.x_val, k.angular_velocity.y_val,
+                    k.angular_velocity.z_val,
+                )
+                msg.twist.twist.angular.x = ax
+                msg.twist.twist.angular.y = ay
+                msg.twist.twist.angular.z = az
+
+                self.gt_pub.publish(msg)
+            except Exception:
+                pass
+            slack = next_t - time.monotonic()
+            if slack > 0:
+                time.sleep(slack)
 
     def _has_subscribers(self, *publishers):
         return any(p.get_subscription_count() > 0 for p in publishers)
@@ -219,6 +318,7 @@ def _make_cam_info(self, stamp, side, tx):
     def destroy_node(self):
         self._shutdown.set()
         self._clock_thread.join(timeout=2.0)
+        self._gt_thread.join(timeout=2.0)
         self._image_thread.join(timeout=2.0)
         super().destroy_node()
 
diff --git a/simulation/ms-airsim/ros_ws/src/ms_airsim_ros_bridge/package.xml b/simulation/ms-airsim/ros_ws/src/ms_airsim_ros_bridge/package.xml
index 71fded1bb..b32fa80c3 100644
--- a/simulation/ms-airsim/ros_ws/src/ms_airsim_ros_bridge/package.xml
+++ b/simulation/ms-airsim/ros_ws/src/ms_airsim_ros_bridge/package.xml
@@ -8,6 +8,7 @@
 
   <exec_depend>rclpy</exec_depend>
   <exec_depend>sensor_msgs</exec_depend>
+  <exec_depend>nav_msgs</exec_depend>
   <exec_depend>rosgraph_msgs</exec_depend>
 
   <export>
diff --git a/tests/test_autonomy.py b/tests/test_autonomy.py
index 89d036fc4..dbf3fbb93 100644
--- a/tests/test_autonomy.py
+++ b/tests/test_autonomy.py
@@ -33,6 +33,8 @@
 PX4_READY_TIMEOUT_S = 300.0
 PX4_POLL_INTERVAL_S = 2.0
 MOTION_ABOVE_START_M = 0.3  # z threshold for "drone started moving" (relative to z[0])
+SETTLING_WINDOW_S = 1.0     # seconds of trailing samples used for steady-state altitude
+MAX_GT_MATCH_AGE_S = 0.1    # drop an odom sample if nearest GT is >100ms away
 
 # Full column schemas of `ros2 topic echo --csv` output, in declaration order.
 # Covariance arrays expand to 36 comma-separated values each. Downstream code
@@ -49,12 +51,6 @@
        "twist.twist.angular.x", "twist.twist.angular.y", "twist.twist.angular.z"]
     + [f"twist.covariance[{i}]" for i in range(36)]
 )
-POSE_SCHEMA = [
-    "header.stamp.sec", "header.stamp.nanosec", "header.frame_id",
-    "pose.position.x", "pose.position.y", "pose.position.z",
-    "pose.orientation.x", "pose.orientation.y",
-    "pose.orientation.z", "pose.orientation.w",
-]
 
 METRIC_UNITS = {
     "ready_duration_sys_s": "s",
@@ -154,14 +150,6 @@ def _action_message(stdout):
 
 # ── metric computation ────────────────────────────────────────────────────
 
-def _roll_pitch(qx, qy, qz, qw):
-    """Quaternion → (roll, pitch) in radians. Yaw is unused at hover."""
-    roll = math.atan2(2.0 * (qw * qx + qy * qz), 1.0 - 2.0 * (qx * qx + qy * qy))
-    sinp = max(-1.0, min(1.0, 2.0 * (qw * qy - qz * qx)))
-    pitch = math.asin(sinp)
-    return roll, pitch
-
-
 def _valid_range(start, end):
     """True iff both indices are set and end follows start."""
     return start is not None and end is not None and end > start
@@ -183,9 +171,15 @@ def _tracking_metrics_takeoff(odom, target, velocity):
     zs = [r["pose.pose.position.z"] for r in odom]
     ts = [_stamp(r) for r in odom]
     peak = max(zs)
+    # Steady-state altitude at the moment of success: mean of samples within
+    # the trailing SETTLING_WINDOW_S. Captures where the drone actually parked,
+    # vs `peak` which captures transient overshoot.
+    cutoff = ts[-1] - SETTLING_WINDOW_S
+    settled = [z for z, t in zip(zs, ts) if t >= cutoff]
     out = {
-        "peak_altitude_m": round(peak, 3),
-        "altitude_error_m": round(abs(peak - target), 3),
+        # Signed: positive = settled above target, negative = below target.
+        "altitude_error_m": round(statistics.mean(settled) - target, 3),
+        # Unsigned transient overshoot: 0 if drone never went above target.
         "overshoot_m": round(max(0.0, peak - target), 3),
     }
     # Motion threshold is relative to starting altitude so drones that spawn
@@ -207,21 +201,18 @@ def _tracking_metrics_hover(odom, target):
     xs = [r["pose.pose.position.x"] for r in odom]
     ys = [r["pose.pose.position.y"] for r in odom]
     zs = [r["pose.pose.position.z"] for r in odom]
-    out = {
-        "hover_altitude_stddev_m": round(statistics.pstdev(zs) if len(zs) > 1 else 0.0, 3),
+    # Total 3D positional jitter around the mean point. Equal to
+    # sqrt(var(x) + var(y) + var(z)) — one axis-agnostic stability number.
+    if len(odom) > 1:
+        pos_stddev = math.sqrt(statistics.pvariance(xs)
+                               + statistics.pvariance(ys)
+                               + statistics.pvariance(zs))
+    else:
+        pos_stddev = 0.0
+    return {
         "hover_altitude_mean_error_m": round(abs(statistics.mean(zs) - target), 3),
+        "hover_position_stddev_m": round(pos_stddev, 3),
     }
-    x0, y0 = xs[0], ys[0]
-    drift = max(math.sqrt((x - x0) ** 2 + (y - y0) ** 2) for x, y in zip(xs, ys))
-    out["horizontal_drift_max_m"] = round(drift, 3)
-    if len(odom) > 1:
-        rolls, pitches = zip(*(_roll_pitch(
-            r["pose.pose.orientation.x"], r["pose.pose.orientation.y"],
-            r["pose.pose.orientation.z"], r["pose.pose.orientation.w"])
-            for r in odom))
-        out["attitude_stddev_rad"] = round(
-            statistics.pstdev(rolls) + statistics.pstdev(pitches), 3)
-    return out
 
 
 def _tracking_metrics_landing(odom, velocity):
@@ -257,12 +248,14 @@ def _gt_metrics(odom, gt):
         if not candidates:
             continue
         best = min(candidates, key=lambda r: abs(_stamp(r) - t))
+        if abs(_stamp(best) - t) > MAX_GT_MATCH_AGE_S:
+            continue  # stale GT — pairing would conflate motion with bias
         ox, oy, oz = (row["pose.pose.position.x"],
                       row["pose.pose.position.y"],
                       row["pose.pose.position.z"])
-        gx, gy, gz = (best["pose.position.x"],
-                      best["pose.position.y"],
-                      best["pose.position.z"])
+        gx, gy, gz = (best["pose.pose.position.x"],
+                      best["pose.pose.position.y"],
+                      best["pose.pose.position.z"])
         errs.append(math.sqrt((ox - gx) ** 2 + (oy - gy) ** 2 + (oz - gz) ** 2))
         z_biases.append(oz - gz)
     if not errs:
@@ -299,7 +292,7 @@ def _start_captures(robot_container, setup_bash, domain, duration_s, tag):
         robot_container, f"/robot_{domain}/interface/mavros/local_position/odom",
         domain, setup_bash, duration_s, odom_path)
     gt_proc, gt_fh, gt_ef = _start_csv_stream(
-        robot_container, f"/robot_{domain}/ground_truth/pose",
+        robot_container, f"/robot_{domain}/odom_ground_truth",
         domain, setup_bash, duration_s, gt_path)
     return {
         "duration_s": duration_s,
@@ -330,7 +323,7 @@ def _finish_captures(streams):
         odom_ef.close()
         gt_ef.close()
     odom = _parse_csv(odom_path, ODOM_SCHEMA)
-    gt = _parse_csv(gt_path, POSE_SCHEMA)
+    gt = _parse_csv(gt_path, ODOM_SCHEMA)
     if not odom:
         logger.warning("odom capture empty. stdout head=%r stderr head=%r",
                        open(odom_path).read(500),
@@ -374,9 +367,10 @@ def _takeoff_one_robot(n, robot_container, cfg, velocity):
     metrics = _tracking_metrics_takeoff(odom, target, velocity)
     metrics.update(_gt_metrics(odom, gt))
     _record(n, metrics)
-    peak = metrics["peak_altitude_m"]
-    assert peak >= target * 0.9, (
-        f"robot_{n} peak altitude {peak:.2f}m < target*0.9={target * 0.9:.2f}m")
+    err = metrics["altitude_error_m"]
+    assert abs(err) <= target * 0.1, (
+        f"robot_{n} settled altitude {target + err:.2f}m differs from "
+        f"target {target:.1f}m by more than 10%")
 
 
 def _hover_one_robot(n, robot_container, cfg, velocity):
@@ -448,11 +442,16 @@ def _chain_guard(self, request, airstack_env, _failed_envs):
 
     @pytest.mark.dependency(name="autonomy_ready")
     def test_px4_ready(self, airstack_env, velocity, _ready_envs):
-        """Wait until every robot's MAVROS reports connected=True. Skipped on
-        velocities after the first in the same airstack_env — the drone is
-        already proven alive and post-landing MAV_STATE fluctuations shouldn't
-        re-gate the chain. The takeoff action handles arming/mode itself, so
-        STANDBY is not a required precondition."""
+        """Wait until /robot_N/interface/mavros/local_position/odom is publishing.
+
+        That topic goes live only after PX4's EKF converges and sets a home
+        position — the exact precondition PX4's arming preflight requires and
+        the topic the test later captures during takeoff. `connected=True` on
+        mavros/state fires ~25s earlier and is insufficient (takeoff action
+        returns `failed to arm` in that window).
+
+        Skipped on velocities after the first in the same airstack_env.
+        """
         env_id = (airstack_env["sim"], airstack_env["num_robots"],
                   airstack_env["iteration"])
         if env_id in _ready_envs:
@@ -465,36 +464,30 @@ def test_px4_ready(self, airstack_env, velocity, _ready_envs):
 
         started = time.time()
         ready_at = {}
-        last_seen = {}
         pending = list(range(1, num_robots + 1))
         deadline = started + PX4_READY_TIMEOUT_S
 
         while pending and time.time() < deadline:
             for n in list(pending):
+                # --once exits 0 on the first message; the inner `timeout` makes
+                # it exit nonzero if nothing is published within the window.
                 result = ros2_exec(
                     robot_container,
-                    f"timeout 5 ros2 topic echo --once --csv "
-                    f"--field connected /robot_{n}/interface/mavros/state",
+                    f"timeout 5 ros2 topic echo --once "
+                    f"/robot_{n}/interface/mavros/local_position/odom",
                     domain_id=n, setup_bash=cfg["robot_setup_bash"], timeout=10,
                 )
-                for line in result.stdout.splitlines():
-                    s = line.strip()
-                    if s in ("True", "False"):
-                        last_seen[n] = s
-                        if s == "True":
-                            ready_at[n] = round(time.time() - started, 2)
-                            pending.remove(n)
-                        break
+                if result.returncode == 0:
+                    ready_at[n] = round(time.time() - started, 2)
+                    pending.remove(n)
             if pending:
-                logger.info("waiting for MAVROS connected=True; pending=%s "
-                            "last_seen=%s elapsed=%.0fs",
-                            pending, last_seen, time.time() - started)
+                logger.info("waiting for local_position/odom; pending=%s elapsed=%.0fs",
+                            pending, time.time() - started)
                 time.sleep(PX4_POLL_INTERVAL_S)
 
         if pending:
-            last = {n: last_seen.get(n, "no-sample") for n in sorted(pending)}
-            pytest.fail(f"robots {sorted(pending)} never reported connected=True "
-                        f"within {PX4_READY_TIMEOUT_S:.0f}s. Last seen: {last}")
+            pytest.fail(f"robots {sorted(pending)} never published "
+                        f"local_position/odom within {PX4_READY_TIMEOUT_S:.0f}s")
 
         for n, dur in ready_at.items():
             _record(n, {"ready_duration_sys_s": dur})

From 26392b6c5f22a261174749883524c75cc60ee38b Mon Sep 17 00:00:00 2001
From: OasisArtisan <oalama@andrew.cmu.edu>
Date: Thu, 23 Apr 2026 10:32:57 -0400
Subject: [PATCH 15/24] Hover measures drift from hover start, not from target.
 Pass rates are rendered from results.xml using parse_metrics.py

---
 tests/parse_metrics.py | 66 ++++++++++++++++++++++++++++++++++++++++--
 tests/test_autonomy.py | 42 +++++++++++++++++----------
 2 files changed, 90 insertions(+), 18 deletions(-)

diff --git a/tests/parse_metrics.py b/tests/parse_metrics.py
index 27b71a3fe..0bc5456f0 100644
--- a/tests/parse_metrics.py
+++ b/tests/parse_metrics.py
@@ -179,6 +179,27 @@ def parse_results_xml(path):
     return metrics
 
 
+def parse_passrates(path):
+    """Per (module, base_test) pass/fail/skip counts aggregated across -iterN
+    iterations. results.xml is the authoritative source — metrics.json can't
+    distinguish early-fail from skipped (neither produces entries)."""
+    if not path.exists():
+        return {}
+    counts = {}
+    for tc in ET.parse(path).iter("testcase"):
+        full = f"{tc.get('classname')}.{tc.get('name')}"
+        module, display = _split_test_name(full)
+        base = ITER_RE.sub("", display)
+        if tc.find("failure") is not None or tc.find("error") is not None:
+            outcome = "fail"
+        elif tc.find("skipped") is not None:
+            outcome = "skip"
+        else:
+            outcome = "pass"
+        counts.setdefault((module, base), {"pass": 0, "fail": 0, "skip": 0})[outcome] += 1
+    return counts
+
+
 def parse_metrics_json(path):
     if not path.exists():
         return {}
@@ -460,7 +481,8 @@ def _group_by_module(rows):
     return modules, grouped
 
 
-def format_markdown(main_rows, hz_rows, compute_rows, iter_counts, threshold, diff_mode):
+def format_markdown(main_rows, hz_rows, compute_rows, iter_counts,
+                    current_pr, baseline_pr, threshold, diff_mode):
     regressions = [False]
 
     def pivot_cell(pair):
@@ -493,11 +515,41 @@ def render_pivot(rows, leading):
         return [leading(r) + [pivot_cell(r["aggs"].get(agg)) for agg in AGGS]
                 for r in rows]
 
+    def _rate(c):
+        considered = c["pass"] + c["fail"]
+        return f"{c['pass'] * 100 / considered:.0f}%" if considered else "—"
+
+    def render_passrates(mod):
+        bases = sorted({b for (m, b) in current_pr if m == mod}
+                       | {b for (m, b) in baseline_pr if m == mod})
+        if not bases:
+            return None
+        rows = []
+        empty = {"pass": 0, "fail": 0, "skip": 0}
+        if diff_mode:
+            for b in bases:
+                cur, bl = current_pr.get((mod, b), empty), baseline_pr.get((mod, b), empty)
+                rows.append([
+                    b,
+                    f"{bl['pass']} → {cur['pass']}",
+                    f"{bl['fail']} → {cur['fail']}",
+                    f"{bl['skip']} → {cur['skip']}",
+                    f"{_rate(bl)} → {_rate(cur)}",
+                ])
+            headers = ["Test", "Pass", "Fail", "Skip", "Rate (baseline → current)"]
+        else:
+            for b in bases:
+                c = current_pr.get((mod, b), empty)
+                rows.append([b, c["pass"], c["fail"], c["skip"], _rate(c)])
+            headers = ["Test", "Pass", "Fail", "Skip", "Rate"]
+        return tabulate(rows, headers=headers, tablefmt="github")
+
     main_mods, main_by_module = _group_by_module(main_rows)
     hz_mods, hz_by_module = _group_by_module(hz_rows)
     compute_mods, compute_by_module = _group_by_module(compute_rows)
+    pr_mods = list(dict.fromkeys(m for (m, _) in list(current_pr) + list(baseline_pr)))
     modules = []
-    for m in main_mods + hz_mods + compute_mods:
+    for m in main_mods + hz_mods + compute_mods + pr_mods:
         if m not in modules:
             modules.append(m)
 
@@ -511,6 +563,10 @@ def render_pivot(rows, leading):
         b_n, c_n = iter_counts.get(mod, (None, None))
         annotation = _iter_annotation(b_n, c_n, diff_mode)
 
+        pr_table = render_passrates(mod)
+        if pr_table is not None:
+            sub.append("### Pass rates\n\n" + pr_table)
+
         main = main_by_module.get(mod, [])
         if main:
             rows, headers = render_main(main)
@@ -551,11 +607,15 @@ def main():
 
     current = merge_metrics(Path(args.current))
     baseline = merge_metrics(Path(args.baseline)) if args.baseline else {}
+    current_pr = parse_passrates(Path(args.current) / "results.xml")
+    baseline_pr = (parse_passrates(Path(args.baseline) / "results.xml")
+                   if args.baseline else {})
     diff_mode = bool(args.baseline)
 
     main_rows, hz_rows, compute_rows, iter_counts = build_rows(current, baseline)
     md, has_regression = format_markdown(
-        main_rows, hz_rows, compute_rows, iter_counts, args.threshold, diff_mode)
+        main_rows, hz_rows, compute_rows, iter_counts,
+        current_pr, baseline_pr, args.threshold, diff_mode)
 
     print(md)
     if args.output:
diff --git a/tests/test_autonomy.py b/tests/test_autonomy.py
index dbf3fbb93..757b3948c 100644
--- a/tests/test_autonomy.py
+++ b/tests/test_autonomy.py
@@ -197,20 +197,29 @@ def _tracking_metrics_takeoff(odom, target, velocity):
     return out
 
 
-def _tracking_metrics_hover(odom, target):
+def _tracking_metrics_hover(odom):
+    """Measure whether the drone stayed put relative to where takeoff left it.
+
+    Reference altitude is the mean over the first SETTLING_WINDOW_S of hover
+    (not the takeoff target), so takeoff inaccuracy doesn't leak into hover.
+    Hover tests "drone holds position", not "drone is at target".
+    """
     xs = [r["pose.pose.position.x"] for r in odom]
     ys = [r["pose.pose.position.y"] for r in odom]
     zs = [r["pose.pose.position.z"] for r in odom]
+    ts = [_stamp(r) for r in odom]
+
+    ref_cutoff = ts[0] + SETTLING_WINDOW_S
+    ref_z = statistics.mean(z for z, t in zip(zs, ts) if t <= ref_cutoff)
+
     # Total 3D positional jitter around the mean point. Equal to
     # sqrt(var(x) + var(y) + var(z)) — one axis-agnostic stability number.
-    if len(odom) > 1:
-        pos_stddev = math.sqrt(statistics.pvariance(xs)
-                               + statistics.pvariance(ys)
-                               + statistics.pvariance(zs))
-    else:
-        pos_stddev = 0.0
+    pos_stddev = math.sqrt(statistics.pvariance(xs)
+                           + statistics.pvariance(ys)
+                           + statistics.pvariance(zs)) if len(odom) > 1 else 0.0
     return {
-        "hover_altitude_mean_error_m": round(abs(statistics.mean(zs) - target), 3),
+        # Drift from starting altitude over the full hover window.
+        "hover_altitude_mean_error_m": round(abs(statistics.mean(zs) - ref_z), 3),
         "hover_position_stddev_m": round(pos_stddev, 3),
     }
 
@@ -374,7 +383,6 @@ def _takeoff_one_robot(n, robot_container, cfg, velocity):
 
 
 def _hover_one_robot(n, robot_container, cfg, velocity):
-    target = TARGET_ALTITUDE_M
     streams = _start_captures(robot_container, cfg["robot_setup_bash"],
                               n, HOVER_DURATION_S + 2, f"v{velocity}_hover")
     # Passive phase: no blocking action, so we sleep to let the capture
@@ -383,12 +391,12 @@ def _hover_one_robot(n, robot_container, cfg, velocity):
     odom, gt = _finish_captures(streams)
     if not odom:
         pytest.fail(f"robot_{n} hover: no odom samples captured")
-    metrics = _tracking_metrics_hover(odom, target)
+    metrics = _tracking_metrics_hover(odom)
     metrics.update(_gt_metrics(odom, gt))
     _record(n, metrics)
-    mean_err = metrics["hover_altitude_mean_error_m"]
-    assert mean_err < 0.5, (
-        f"robot_{n} hover altitude mean error {mean_err:.2f}m exceeds ±0.5m tolerance")
+    drift = metrics["hover_altitude_mean_error_m"]
+    assert drift < 0.5, (
+        f"robot_{n} drifted {drift:.2f}m in altitude during hover (>0.5m tolerance)")
 
 
 def _landing_one_robot(n, robot_container, cfg, velocity):
@@ -438,7 +446,11 @@ def _chain_guard(self, request, airstack_env, _failed_envs):
         yield
         rep = getattr(request.node, "_rep_call", None)
         if rep is not None and rep.failed:
-            _failed_envs.add(env_id)
+            # Hover failures don't poison the chain — we still want landing
+            # to run so the drone comes back to the ground, and the next
+            # velocity gets its chance.
+            if "test_hover" not in request.node.name:
+                _failed_envs.add(env_id)
 
     @pytest.mark.dependency(name="autonomy_ready")
     def test_px4_ready(self, airstack_env, velocity, _ready_envs):
@@ -511,7 +523,7 @@ def test_hover(self, airstack_env, velocity):
         _run_parallel(num_robots,
                       lambda n: _hover_one_robot(n, robot_container, cfg, velocity))
 
-    @pytest.mark.dependency(name="autonomy_landing", depends=["autonomy_hover"])
+    @pytest.mark.dependency(name="autonomy_landing", depends=["autonomy_takeoff"])
     def test_landing(self, airstack_env, velocity):
         """Send LandTask per robot in parallel; verify final altitude and record metrics."""
         cfg = airstack_env["cfg"]

From 3b2a82798413216926f9288c9ec8545423ea7c5c Mon Sep 17 00:00:00 2001
From: OasisArtisan <oalama@andrew.cmu.edu>
Date: Thu, 23 Apr 2026 11:10:27 -0400
Subject: [PATCH 16/24] Fix broken px4_ready: now uses MAVROS connected and odom
 publication for better reliability

---
 tests/test_autonomy.py | 40 ++++++++++++++++++++++++++++++++--------
 1 file changed, 32 insertions(+), 8 deletions(-)

diff --git a/tests/test_autonomy.py b/tests/test_autonomy.py
index 757b3948c..7432d84d3 100644
--- a/tests/test_autonomy.py
+++ b/tests/test_autonomy.py
@@ -476,30 +476,54 @@ def test_px4_ready(self, airstack_env, velocity, _ready_envs):
 
         started = time.time()
         ready_at = {}
+        # Per-robot progress through the two sequential gates.
+        connected = set()   # robots that have reported mavros/state.connected=True
         pending = list(range(1, num_robots + 1))
         deadline = started + PX4_READY_TIMEOUT_S
 
         while pending and time.time() < deadline:
             for n in list(pending):
-                # --once exits 0 on the first message; the inner `timeout` makes
-                # it exit nonzero if nothing is published within the window.
-                result = ros2_exec(
+                # Gate 1: MAVROS ↔ PX4 heartbeat. Fast, reliable signal that
+                # the stack is alive.
+                if n not in connected:
+                    r = ros2_exec(
+                        robot_container,
+                        f"timeout 5 ros2 topic echo --once --csv "
+                        f"--field connected /robot_{n}/interface/mavros/state",
+                        domain_id=n, setup_bash=cfg["robot_setup_bash"], timeout=10,
+                    )
+                    if any(line.strip() == "True" for line in r.stdout.splitlines()):
+                        connected.add(n)
+                    else:
+                        continue  # try again next poll
+
+                # Gate 2: local_position/odom actually publishing (EKF has a
+                # valid local origin). Catches the case where connected=True
+                # fires long before PX4 is ready for arming.
+                r = ros2_exec(
                     robot_container,
                     f"timeout 5 ros2 topic echo --once "
                     f"/robot_{n}/interface/mavros/local_position/odom",
                     domain_id=n, setup_bash=cfg["robot_setup_bash"], timeout=10,
                 )
-                if result.returncode == 0:
+                if r.returncode == 0 and "pose:" in r.stdout:
                     ready_at[n] = round(time.time() - started, 2)
                     pending.remove(n)
+
             if pending:
-                logger.info("waiting for local_position/odom; pending=%s elapsed=%.0fs",
-                            pending, time.time() - started)
+                logger.info("px4_ready: connected=%s pending=%s elapsed=%.0fs",
+                            sorted(connected), pending, time.time() - started)
                 time.sleep(PX4_POLL_INTERVAL_S)
 
         if pending:
-            pytest.fail(f"robots {sorted(pending)} never published "
-                        f"local_position/odom within {PX4_READY_TIMEOUT_S:.0f}s")
+            not_connected = [n for n in pending if n not in connected]
+            if not_connected:
+                pytest.fail(f"robots {sorted(not_connected)} never reported "
+                            f"MAVROS connected=True within "
+                            f"{PX4_READY_TIMEOUT_S:.0f}s")
+            pytest.fail(f"robots {sorted(pending)} connected but never "
+                        f"published local_position/odom within "
+                        f"{PX4_READY_TIMEOUT_S:.0f}s")
 
         for n, dur in ready_at.items():
             _record(n, {"ready_duration_sys_s": dur})

From 2922a7efe37b0337b8090c15b50003acdf605a88 Mon Sep 17 00:00:00 2001
From: OasisArtisan <oalama@andrew.cmu.edu>
Date: Thu, 23 Apr 2026 11:24:36 -0400
Subject: [PATCH 17/24] Standardize displayed parameterization order in results
 and tables

---
 tests/conftest.py | 35 ++++++++++++++++++++++++++++++-----
 1 file changed, 30 insertions(+), 5 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index cc1a788c7..675771b24 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -171,11 +171,36 @@ def sort_key(item):
         return (env, vel, phase(item))
 
     slots = [(i, it) for i, it in enumerate(items) if phase(it) is not None]
-    if not slots:
-        return
-    sorted_items = sorted((it for _, it in slots), key=sort_key)
-    for (i, _), new_item in zip(slots, sorted_items):
-        items[i] = new_item
+    if slots:
+        sorted_items = sorted((it for _, it in slots), key=sort_key)
+        for (i, _), new_item in zip(slots, sorted_items):
+            items[i] = new_item
+
+    # Rewrite bracketed test IDs into a consistent hierarchy: sim > robots >
+    # velocity > iteration. Bypasses pytest's own concatenation (which would
+    # otherwise order by reverse-parametrize-call order). Keeps pytest console,
+    # JUnit XML, and metrics.json all in the same natural order without
+    # refactoring the parametrize structure.
+    for item in items:
+        cs = getattr(item, "callspec", None)
+        if cs is None:
+            continue
+        env = cs.params.get("airstack_env")
+        parts = []
+        if env:
+            sim, n, i = env
+            parts.append(f"{sim}-rob#{n}")
+        if "velocity" in cs.params:
+            parts.append(f"v{cs.params['velocity']}")
+        if env:
+            parts.append(f"iter{i}")
+        if not parts:
+            continue
+        new_id = "-".join(parts)
+        if cs.id == new_id:
+            continue
+        item.name = item.name.replace(f"[{cs.id}]", f"[{new_id}]")
+        item._nodeid = item._nodeid.replace(f"[{cs.id}]", f"[{new_id}]")
 
 
 # ── logging / subprocess helpers ───────────────────────────────────────────

From acf59b1c49917db40fb8c694865887a07eb3d31e Mon Sep 17 00:00:00 2001
From: OasisArtisan <oalama@andrew.cmu.edu>
Date: Thu, 23 Apr 2026 11:40:28 -0400
Subject: [PATCH 18/24] Enforce module ordering

---
 tests/conftest.py | 40 ++++++++++++++++++++++++++++++----------
 1 file changed, 30 insertions(+), 10 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index 675771b24..d7a989f10 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -147,22 +147,42 @@ def pytest_generate_tests(metafunc):
     metafunc.parametrize("airstack_env", params, ids=ids, indirect=True, scope="class")
 
 
-# Sort autonomy tests by (airstack_env, velocity, phase) so the stack comes
-# up once per env and the drone goes ground→air→ground per velocity.
-_AUTONOMY_PHASE_ORDER = {
-    "test_px4_ready": 0,
-    "test_takeoff":   1,
-    "test_hover":     2,
-    "test_landing":   3,
-}
+# Run cheap/fast-fail tests first so real problems surface early:
+# docker image builds → colcon workspace builds → liveliness → autonomy.
+_MODULE_ORDER = [
+    "test_build_docker",
+    "test_build_packages",
+    "test_liveliness",
+    "test_autonomy",
+]
+
+# Within test_autonomy, each (env, velocity) runs phases in this chain order.
+_AUTONOMY_PHASE_ORDER = [
+    "test_px4_ready",
+    "test_takeoff",
+    "test_hover",
+    "test_landing",
+]
+
+
+def _rank(name, order):
+    """Index of `name` in `order`; `len(order)` if unknown (i.e., sort last)."""
+    return order.index(name) if name in order else len(order)
 
 
 def pytest_collection_modifyitems(items):
+    # 1. Cross-module: enforce `_MODULE_ORDER`. Stable sort keeps within-module
+    #    order intact, so pytest's default file/class order survives.
+    items.sort(key=lambda it: _rank(getattr(it.module, "__name__", ""), _MODULE_ORDER))
+
+    # 2. Within test_autonomy: sort by (airstack_env, velocity, phase) so each
+    #    (sim, robots, iter) env brings up the stack once and the drone goes
+    #    ground→air→ground per velocity.
     def phase(item):
         if getattr(item.module, "__name__", "") != "test_autonomy":
             return None
         name = item.originalname or item.name.split("[", 1)[0]
-        return _AUTONOMY_PHASE_ORDER.get(name)
+        return _rank(name, _AUTONOMY_PHASE_ORDER)
 
     def sort_key(item):
         cs = getattr(item, "callspec", None)
@@ -176,7 +196,7 @@ def sort_key(item):
         for (i, _), new_item in zip(slots, sorted_items):
             items[i] = new_item
 
-    # Rewrite bracketed test IDs into a consistent hierarchy: sim > robots >
+    # 3. Rewrite bracketed test IDs into a consistent hierarchy: sim > robots >
     # velocity > iteration. Bypasses pytest's own concatenation (which would
     # otherwise order by reverse-parametrize-call order). Keeps pytest console,
     # JUnit XML, and metrics.json all in the same natural order without

From 175891fc2129824df20df946f3c57e28c6f0feea Mon Sep 17 00:00:00 2001
From: OasisArtisan <oalama@andrew.cmu.edu>
Date: Thu, 23 Apr 2026 11:50:25 -0400
Subject: [PATCH 19/24] Warn if testing build packages and packages are already
 built

---
 airstack.sh                  |  3 +++
 tests/test_build_packages.py | 24 +++++++++++++++++++++++-
 2 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/airstack.sh b/airstack.sh
index d5be609a7..5d20e31a8 100755
--- a/airstack.sh
+++ b/airstack.sh
@@ -855,6 +855,9 @@ function cmd_clean {
         "$PROJECT_ROOT/simulation/ms-airsim/ros_ws/build"
         "$PROJECT_ROOT/simulation/ms-airsim/ros_ws/install"
         "$PROJECT_ROOT/simulation/ms-airsim/ros_ws/log"
+        "$PROJECT_ROOT/simulation/simple-sim/ros_ws/build"
+        "$PROJECT_ROOT/simulation/simple-sim/ros_ws/install"
+        "$PROJECT_ROOT/simulation/simple-sim/ros_ws/log"
     )
 
     log_info "Cleaning all ROS 2 build artifacts..."
diff --git a/tests/test_build_packages.py b/tests/test_build_packages.py
index 2f594d261..40bcf978b 100644
--- a/tests/test_build_packages.py
+++ b/tests/test_build_packages.py
@@ -1,5 +1,24 @@
+from pathlib import Path
+
 import pytest
-from conftest import airstack_cmd, wait_for_container, docker_exec, read_log_tail
+
+from conftest import (AIRSTACK_ROOT, airstack_cmd, docker_exec, logger,
+                      read_log_tail, wait_for_container)
+
+
+def _warn_if_prebuilt(*ws_paths):
+    """Log a warning if any of the given workspace directories already contain
+    build/install/log dirs. Doesn't fail the test — just signals that what we
+    measure may be an INCREMENTAL build, not a clean one."""
+    dirty = [p for p in ws_paths
+             if any((Path(AIRSTACK_ROOT) / p / sub).is_dir()
+                    for sub in ("build", "install"))]
+    if dirty:
+        logger.warning(
+            "Workspace(s) %s already have build artifacts — this test may "
+            "measure an incremental build, not a clean one. Run "
+            "`./airstack.sh clean` first if you want a cold-build measurement.",
+            dirty)
 
 
 @pytest.mark.build_packages
@@ -7,6 +26,7 @@
 class TestColconBuilds:
 
     def test_colcon_build_robot(self):
+        _warn_if_prebuilt("robot/ros_ws")
         try:
             result = airstack_cmd("up", "robot-desktop",
                                   env_overrides={"AUTOLAUNCH": "false", "DISPLAY": ""},
@@ -22,6 +42,7 @@ def test_colcon_build_robot(self):
             airstack_cmd("down")
 
     def test_colcon_build_gcs(self):
+        _warn_if_prebuilt("gcs/ros_ws")
         try:
             result = airstack_cmd("up", "gcs",
                                   env_overrides={"AUTOLAUNCH": "false", "DISPLAY": ""},
@@ -37,6 +58,7 @@ def test_colcon_build_gcs(self):
             airstack_cmd("down")
 
     def test_colcon_build_ms_airsim(self):
+        _warn_if_prebuilt("simulation/ms-airsim/ros_ws")
         try:
             result = airstack_cmd(
                 "up", "ms-airsim",

From 2b39a2bfea3d3c4d904cca163efe8e5e9bd8e941 Mon Sep 17 00:00:00 2001
From: Andrew Jong <ajong@andrew.cmu.edu>
Date: Thu, 23 Apr 2026 16:24:56 -0400
Subject: [PATCH 20/24] Set version to 0.18.0-alpha.5

---
 .env | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.env b/.env
index 87aec262a..2c04121b6 100644
--- a/.env
+++ b/.env
@@ -12,7 +12,7 @@ PROJECT_NAME="airstack"
 # If you've run ./airstack.sh setup, then this will auto-generate from the git commit hash every time a change is made 
 # to a Dockerfile or docker-compose.yaml file. Otherwise this can also be set explicitly to make a release version.
 # auto-generated from git commit hash
-VERSION="a07950c6"
+VERSION="0.18.0-alpha.5"
 # Choose "dev" or "prebuilt". "dev" is for mounted code that must be built live. "prebuilt" is for built ros_ws baked into the image
 DOCKER_IMAGE_BUILD_MODE="dev"  
 # Where to push and pull images from. Can replace with your docker hub username if using docker hub.

From 3dd3bd9b3698b04d5f4c96d50cb5833d64ad9812 Mon Sep 17 00:00:00 2001
From: Andrew Jong <ajong@andrew.cmu.edu>
Date: Thu, 23 Apr 2026 16:29:31 -0400
Subject: [PATCH 21/24] Rename test_autonomy to test_takeoff_hover_land

---
 tests/conftest.py                                      | 10 +++++-----
 tests/{test_autonomy.py => test_takeoff_hover_land.py} |  4 ++--
 2 files changed, 7 insertions(+), 7 deletions(-)
 rename tests/{test_autonomy.py => test_takeoff_hover_land.py} (99%)

diff --git a/tests/conftest.py b/tests/conftest.py
index d7a989f10..c5295c6b4 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -73,7 +73,7 @@ def pytest_addoption(parser):
                           "Default: headless (no X, good for CI).")
     parser.addoption("--takeoff-velocities", default="0.5,1,2",
                      help="Comma-separated takeoff/land velocities (m/s) to "
-                          "sweep in test_autonomy. Default: 0.5,1,2")
+                          "sweep in test_takeoff_hover_land. Default: 0.5,1,2")
 
 
 def pytest_configure(config):
@@ -153,10 +153,10 @@ def pytest_generate_tests(metafunc):
     "test_build_docker",
     "test_build_packages",
     "test_liveliness",
-    "test_autonomy",
+    "test_takeoff_hover_land",
 ]
 
-# Within test_autonomy, each (env, velocity) runs phases in this chain order.
+# Within test_takeoff_hover_land, each (env, velocity) runs phases in this chain order.
 _AUTONOMY_PHASE_ORDER = [
     "test_px4_ready",
     "test_takeoff",
@@ -175,11 +175,11 @@ def pytest_collection_modifyitems(items):
     #    order intact, so pytest's default file/class order survives.
     items.sort(key=lambda it: _rank(getattr(it.module, "__name__", ""), _MODULE_ORDER))
 
-    # 2. Within test_autonomy: sort by (airstack_env, velocity, phase) so each
+    # 2. Within test_takeoff_hover_land: sort by (airstack_env, velocity, phase) so each
     #    (sim, robots, iter) env brings up the stack once and the drone goes
     #    ground→air→ground per velocity.
     def phase(item):
-        if getattr(item.module, "__name__", "") != "test_autonomy":
+        if getattr(item.module, "__name__", "") != "test_takeoff_hover_land":
             return None
         name = item.originalname or item.name.split("[", 1)[0]
         return _rank(name, _AUTONOMY_PHASE_ORDER)
diff --git a/tests/test_autonomy.py b/tests/test_takeoff_hover_land.py
similarity index 99%
rename from tests/test_autonomy.py
rename to tests/test_takeoff_hover_land.py
index 7432d84d3..428e5eeaf 100644
--- a/tests/test_autonomy.py
+++ b/tests/test_takeoff_hover_land.py
@@ -1,4 +1,4 @@
-"""Autonomy tests — 4-phase chain per velocity.
+"""Takeoff-hover-land tests — 4-phase chain per velocity.
 
 Per (sim, num_robots, iter, velocity): ready → takeoff → hover → land.
 Drone returns to ground at end of each velocity so the next velocity
@@ -427,7 +427,7 @@ def _landing_one_robot(n, robot_container, cfg, velocity):
 
 @pytest.mark.autonomy
 @pytest.mark.timeout(1800)
-class TestAutonomy:
+class TestTakeoffHoverLand:
 
     @pytest.fixture(scope="session")
     def _failed_envs(self):

From ba3c7a19fc8d8f6f121fdce7b5982ff3ea6ca4f8 Mon Sep 17 00:00:00 2001
From: Andrew Jong <ajong@andrew.cmu.edu>
Date: Thu, 23 Apr 2026 17:39:32 -0400
Subject: [PATCH 22/24] Update docs

---
 mkdocs.yml                                    |  11 +-
 .../system_testing.md => tests/README.md      | 130 +++++++++++++++---
 2 files changed, 113 insertions(+), 28 deletions(-)
 rename docs/development/intermediate/testing/system_testing.md => tests/README.md (69%)

diff --git a/mkdocs.yml b/mkdocs.yml
index c14223f78..c4a9e36eb 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -63,14 +63,9 @@ nav:
           - docs/development/beginner/development_environment.md
           - docs/development/beginner/vscode/vscode_debug.md
           - docs/development/beginner/fork_your_own_project.md
-      - Intermediate Tutorials:
-          - Testing:
-              - docs/development/intermediate/testing/index.md
-              - docs/development/intermediate/testing/testing_frameworks.md
-              - docs/development/intermediate/testing/ci_cd.md
-              - docs/development/intermediate/testing/system_testing.md
-              - docs/development/intermediate/testing/integration_testing.md
-              - docs/development/intermediate/testing/unit_testing.md
+      - Intermediate Tutorials: 
+          - Testing: 
+              - tests/README.md
           - Frame Conventions: docs/development/intermediate/frame_conventions.md
           - Contributing: 
               - docs/development/intermediate/contributing.md
diff --git a/docs/development/intermediate/testing/system_testing.md b/tests/README.md
similarity index 69%
rename from docs/development/intermediate/testing/system_testing.md
rename to tests/README.md
index 07b2b0404..f1a126c68 100644
--- a/docs/development/intermediate/testing/system_testing.md
+++ b/tests/README.md
@@ -11,8 +11,10 @@ AirStack's system tests bring up the full Docker-based stack — simulator, robo
 | [`test_build_docker.py`](../../../../tests/test_build_docker.py) | `build_docker` | Docker image builds (robot-desktop, gcs, isaac-sim, ms-airsim); records image sizes | Docker daemon |
 | [`test_build_packages.py`](../../../../tests/test_build_packages.py) | `build_packages` | `colcon build` inside each container (robot, GCS, ms-airsim ROS workspace) | Docker daemon |
 | [`test_liveliness.py`](../../../../tests/test_liveliness.py) | `liveliness` | Full stack up: container health, tmux process liveness, sentinel ROS 2 nodes, sim topic publishing rates, compute usage, sustained stability | Docker daemon, GPU, sim license |
+| [`test_takeoff_hover_land.py`](test_takeoff_hover_land.py) | `autonomy` | End-to-end flight: PX4 readiness gate, takeoff to 10 m, hover stability, land — one chain per (sim, num_robots, iteration, velocity) | Docker daemon, GPU, sim license |
 
-Marks can be combined with pytest logic: `-m "build_docker or build_packages"`, `-m liveliness`.
+Marks can be combined with pytest logic:
+`-m "build_docker or build_packages"`, `-m liveliness`, `-m autonomy`.
 
 ---
 
@@ -50,43 +52,68 @@ tests/results/
 
 ---
 
-## Running Locally
+## Running Tests
 
-### Prerequisites
-
-- Docker daemon running with the `runner` user (or your user) in the `docker` group
-- NVIDIA drivers + `nvidia-container-toolkit` for liveliness tests
-- `pip install -r tests/requirements.txt`
+### `airstack test` (primary interface)
 
-### Direct (recommended for development)
+`airstack test` is the standard way to run tests. It builds the containerized
+test runner from `tests/docker/`, mounts the repo read-only, and forwards all
+arguments directly to pytest. No local Python environment needed.
 
 ```bash
-# From the repo root:
-export AIRSTACK_ROOT=$(pwd)
+# From the repo root (AirStack must be set up: airstack setup):
 
-# Build tests only (fast, no GPU needed)
-pytest tests/ -m "build_docker or build_packages" -v
+# Build tests only — fast, no GPU needed
+airstack test -m "build_docker or build_packages" -v
 
-# Full liveliness run — ms-airsim, 1 robot, 1 iteration, 60s stability window
-pytest tests/ -m liveliness \
+# Liveliness run — ms-airsim, 1 robot, 1 iteration, 60 s stability window
+airstack test -m liveliness \
   --sim msairsim \
   --num-robots 1 \
   --stress-iterations 1 \
   --stable-duration 60 \
   -v
 
+# Autonomy run — takeoff/hover/land at three velocities
+airstack test -m autonomy \
+  --sim msairsim \
+  --num-robots 1 \
+  --stress-iterations 1 \
+  --takeoff-velocities 0.5,1,2 \
+  -v
+
 # Show GUI windows (for local visual inspection)
-pytest tests/ -m liveliness --gui -v
+airstack test -m liveliness --gui -v
 ```
 
-### Docker-compose wrapper
+`airstack test` calls `xhost +` automatically so GUI-mode sim containers
+can reach the host X server; it is a no-op when `DISPLAY` is not set.
+
+### Prerequisites
 
-The `tests/docker/` directory provides a containerized test runner that has Docker CLI and all Python dependencies pre-installed.
+- Docker daemon running with your user in the `docker` group
+- NVIDIA drivers + `nvidia-container-toolkit` for liveliness/autonomy tests
+- `airstack setup` completed (adds `airstack` to `PATH`)
+
+### Direct pytest (for development / debugging)
+
+Run pytest directly when you need faster iteration (no container rebuild) or
+want to attach a debugger. Requires a local Python environment.
 
 ```bash
-export AIRSTACK_PATH=$(pwd)
-docker compose -f tests/docker/docker-compose.yaml run --rm test \
-  pytest -m "build_docker or build_packages" -v
+export AIRSTACK_ROOT=$(pwd)
+pip install -r tests/requirements.txt
+
+# Build tests only
+pytest tests/ -m "build_docker or build_packages" -v
+
+# Liveliness run
+pytest tests/ -m liveliness \
+  --sim msairsim \
+  --num-robots 1 \
+  --stress-iterations 1 \
+  --stable-duration 60 \
+  -v
 ```
 
 ### CLI option reference
@@ -99,6 +126,69 @@ docker compose -f tests/docker/docker-compose.yaml run --rm test \
 | `--stable-duration` | `120` | Seconds `test_stable` polls for |
 | `--stable-interval` | `10` | Seconds between polls in `test_stable` |
 | `--gui` | off | Show simulator GUI (disables headless mode) |
+| `--takeoff-velocities` | `0.5,1,2` | Takeoff/land speeds in m/s |
+
+---
+
+## Autonomy Tests (`test_takeoff_hover_land.py`)
+
+`TestTakeoffHoverLand` runs a **4-phase flight chain** for every combination of
+`(sim, num_robots, iteration, velocity)`. The drone returns to the ground after
+each velocity so the next velocity starts from a clean state.
+
+### Phase order
+
+| Phase | Test | What happens |
+| ----- | ---- | ------------ |
+| 1 | `test_px4_ready` | Waits for MAVROS + PX4 EKF ready; once per env |
+| 2 | `test_takeoff` | Sends TakeoffTask; asserts altitude within 10 % |
+| 3 | `test_hover` | Captures odom for 10 s; asserts altitude drift < 0.5 m |
+| 4 | `test_landing` | Sends LandTask; asserts final altitude < 0.5 m |
+
+If any phase other than `test_hover` fails, the remaining phases for that env
+are skipped (the chain guard prevents a stuck-in-air drone from blocking later
+velocity sweeps). A hover failure does **not** skip landing, so the drone always
+returns to the ground.
+
+### Recorded metrics
+
+| Metric key | Unit | Description |
+| ---------- | ---- | ----------- |
+| `ready_duration_sys_s` | s | Wall-clock time from test start until PX4 ready |
+| `takeoff_duration_sim_s` | s | Sim time from first motion to 95 % of target |
+| `land_duration_sim_s` | s | Sim time from 80 % peak descent to < 0.5 m |
+| `velocity_rmse_m_sim_s` | m/s | RMSE of dz/dt vs commanded velocity during climb/descent |
+| `altitude_error_m` | m | Signed steady-state error at takeoff success (+ = high) |
+| `overshoot_m` | m | Unsigned transient overshoot above target |
+| `hover_altitude_mean_error_m` | m | Mean altitude drift during hover |
+| `hover_position_stddev_m` | m | 3-D position jitter (sqrt of summed axis variances) |
+| `final_altitude_m` | m | Altitude at landing action completion |
+| `odometry_error_mean_m` | m | Mean 3-D position error vs ground-truth odom |
+| `odometry_error_max_m` | m | Peak 3-D error vs ground-truth odom |
+| `odometry_altitude_bias_m` | m | Signed z-axis bias vs ground-truth odom |
+
+Metrics are recorded per robot as `robot_N.<key>` and written to
+`tests/results/<timestamp>/metrics.json`.
+
+### Running autonomy tests
+
+```bash
+# Sweep velocities 0.5, 1, 2 m/s; 1 robot; ms-airsim
+airstack test -m autonomy \
+  --sim msairsim \
+  --num-robots 1 \
+  --stress-iterations 1 \
+  --takeoff-velocities 0.5,1,2 \
+  -v
+
+# Single velocity, Isaac Sim, 3 robots
+airstack test -m autonomy \
+  --sim isaacsim \
+  --num-robots 3 \
+  --stress-iterations 1 \
+  --takeoff-velocities 1 \
+  -v
+```
 
 ---
 

From 70651fd9113b1fbb8e8d0922b443e08fc7228d2b Mon Sep 17 00:00:00 2001
From: Andrew Jong <ajong@andrew.cmu.edu>
Date: Thu, 23 Apr 2026 17:40:01 -0400
Subject: [PATCH 23/24] Add help for airstack test

---
 airstack.sh | 37 +++++++++++++++++++++++++++++++++----
 1 file changed, 33 insertions(+), 4 deletions(-)

diff --git a/airstack.sh b/airstack.sh
index 5d20e31a8..9ff7fdcf3 100755
--- a/airstack.sh
+++ b/airstack.sh
@@ -187,11 +187,40 @@ function print_command_help {
             echo "  airstack rmi -f myimage"
             ;;
         test)
-            echo "Usage: airstack test [options]"
+            echo "Usage: airstack test [pytest options]"
             echo ""
-            echo "Options:"
-            echo "  --path=PATH    Path to test directory"
-            echo "  --filter=PATTERN  Filter tests by pattern"
+            echo "Build the containerized test runner (tests/docker/) and run pytest"
+            echo "inside it. All arguments are forwarded directly to pytest."
+            echo "Results are written to tests/results/<timestamp>/."
+            echo ""
+            echo "Test marks (-m):"
+            echo "  build_docker    Docker image build tests (no GPU needed)"
+            echo "  build_packages  colcon workspace build tests (no GPU needed)"
+            echo "  liveliness      Full stack up: nodes, topics, compute, stability"
+            echo "  autonomy        Takeoff / hover / land flight chain"
+            echo ""
+            echo "AirStack-specific options:"
+            echo "  --sim=TARGETS              Comma-separated sim targets"
+            echo "                             (default: msairsim,isaacsim)"
+            echo "  --num-robots=COUNTS        Comma-separated robot counts (default: 1,3)"
+            echo "  --stress-iterations=N      Up/down cycles per config (default: 3)"
+            echo "  --stable-duration=SECS     Seconds test_stable polls for (default: 120)"
+            echo "  --stable-interval=SECS     Seconds between polls (default: 10)"
+            echo "  --takeoff-velocities=LIST  Comma-separated takeoff/land speeds in m/s"
+            echo "                             for autonomy tests (default: 0.5,1,2)"
+            echo "  --gui                      Show simulator GUI (default: headless)"
+            echo ""
+            echo "Examples:"
+            echo "  # Build tests only — fast, no GPU needed"
+            echo "  airstack test -m 'build_docker or build_packages' -v"
+            echo ""
+            echo "  # Liveliness run — ms-airsim, 1 robot, 60 s stability window"
+            echo "  airstack test -m liveliness --sim msairsim --num-robots 1 \\"
+            echo "    --stress-iterations 1 --stable-duration 60 -v"
+            echo ""
+            echo "  # Autonomy run — takeoff/hover/land at 0.5, 1, and 2 m/s"
+            echo "  airstack test -m autonomy --sim msairsim --num-robots 1 \\"
+            echo "    --stress-iterations 1 --takeoff-velocities 0.5,1,2 -v"
             ;;
         docs)
             echo "Usage: airstack docs [serve]"

From 31745355b319fa6e9ef4bc1b22694bcc741d553d Mon Sep 17 00:00:00 2001
From: Andrew Jong <ajong@andrew.cmu.edu>
Date: Thu, 23 Apr 2026 17:51:52 -0400
Subject: [PATCH 24/24] Add video

---
 tests/README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/README.md b/tests/README.md
index f1a126c68..0c5b932ab 100644
--- a/tests/README.md
+++ b/tests/README.md
@@ -2,6 +2,8 @@
 
 AirStack's system tests bring up the full Docker-based stack — simulator, robot containers, and GCS — and verify end-to-end behavior: container health, ROS 2 node presence, sensor publishing rates, and compute resource usage. Tests are written in Python with pytest and live under `tests/` at the repo root.
 
+<iframe width="1120" height="630" src="https://www.youtube.com/embed/EzgGHnYDI_k?si=vpqER-TXud5XEMUX" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share" referrerpolicy="strict-origin-when-cross-origin" allowfullscreen></iframe>
+
 ---
 
 ## Test Suite Structure