From 19a2e455fb2672cb6c2991f9919801848d420d63 Mon Sep 17 00:00:00 2001 From: Andrew Jong Date: Mon, 27 Apr 2026 16:18:15 -0400 Subject: [PATCH 1/4] Add link to PAT --- tests/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/README.md b/tests/README.md index a5ad438ce..487cbaf6d 100644 --- a/tests/README.md +++ b/tests/README.md @@ -293,7 +293,7 @@ sudo chown -R runner:runner /opt/actions-runner ### 3. Store the GitHub PAT -Create a fine-grained or classic PAT with **`repo`** scope (for private repos) or **`public_repo`** scope (for public repos). Store it securely: +[Create a fine-grained PAT](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens#creating-a-fine-grained-personal-access-token) with **`repo`** scope (for private repos) or **`public_repo`** scope (for public repos). Store it securely: ```bash echo "ghp_YOUR_TOKEN_HERE" | sudo tee /etc/github-runner-pat From 0ebc02fc7cf93db6d69e40b5c0315c0d0b01da36 Mon Sep 17 00:00:00 2001 From: Andrew Jong Date: Mon, 27 Apr 2026 17:43:02 -0400 Subject: [PATCH 2/4] Change to new orchestrator instance workflow --- .github/orchestrator/README.md | 152 +++++++ .../airstack-orchestrator.service | 40 ++ .github/orchestrator/cloud-init.yaml.j2 | 71 ++++ .github/orchestrator/config.example.yaml | 56 +++ .github/orchestrator/orchestrator.py | 396 ++++++++++++++++++ .github/orchestrator/requirements.txt | 4 + .github/orchestrator/setup.sh | 84 ++++ .github/runners/airstack-runner.service | 26 -- .github/runners/register-runner.sh | 96 ----- .github/workflows/integration-tests.yml | 2 +- tests/README.md | 119 ++---- 11 files changed, 844 insertions(+), 202 deletions(-) create mode 100644 .github/orchestrator/README.md create mode 100644 .github/orchestrator/airstack-orchestrator.service create mode 100644 .github/orchestrator/cloud-init.yaml.j2 create mode 100644 .github/orchestrator/config.example.yaml create mode 100644 
.github/orchestrator/orchestrator.py create mode 100644 .github/orchestrator/requirements.txt create mode 100755 .github/orchestrator/setup.sh delete mode 100644 .github/runners/airstack-runner.service delete mode 100644 .github/runners/register-runner.sh diff --git a/.github/orchestrator/README.md b/.github/orchestrator/README.md new file mode 100644 index 000000000..a5d961840 --- /dev/null +++ b/.github/orchestrator/README.md @@ -0,0 +1,152 @@ +# AirStack CI Orchestrator + +Long-running service that watches GitHub for queued workflow jobs and spawns truly ephemeral OpenStack instances to execute each one. The orchestrator VM is the only host that holds the GitHub PAT and the OpenStack credential; the workers are destroyed after a single job. + +## Architecture + +``` +┌─────────────────────────────────────────────────────────────┐ +│ Orchestrator VM (airstack-ci-cd-orchestrator) │ +│ │ +│ airstack-orchestrator.service → orchestrator.py │ +│ spawn loop (every 15s): │ +│ • GET /repos//actions/runs?status=queued │ +│ • POST /repos//actions/runners/generate-jitconfig│ +│ • openstack server create (image, flavor, user_data) │ +│ • record (job_id → server_id) in state.json │ +│ reap loop (every 30s): │ +│ • job completed → openstack server delete │ +│ • job age > N min → force delete (straggler) │ +│ • owned but not in state → orphan reap │ +│ │ +│ /etc/airstack-orchestrator/ │ +│ config.yaml │ +│ github-pat │ +│ /home/orchestrator/.config/openstack/clouds.yaml │ +│ /var/lib/airstack-orchestrator/state.json │ +└─────────┬─────────────────────────────────┬─────────────────┘ + │ Nova / Neutron API │ GitHub REST API + ▼ ▼ +┌──────────────────────────────────┐ ┌──────────────────────┐ +│ Ephemeral worker (per job) │ │ GitHub Actions │ +│ Image: Ubuntu-24.04-GPU-Headless│ │ workflow_job queue │ +│ cloud-init: │ └──────────────────────┘ +│ install docker + nv toolkit │ +│ download GH runner │ +│ run.sh --jitconfig │ +│ shutdown -h +1 │ +└──────────────────────────────────┘ 
+``` + +Key properties: + +- **Truly ephemeral**: every job runs on a clean VM. No Docker layer cache pollution, no leftover networks, no carry-over from prior runs. +- **PAT isolation**: the GitHub PAT lives only on the orchestrator. Workers receive a single-use [JIT runner config](https://docs.github.com/en/rest/actions/self-hosted-runners?apiVersion=2022-11-28#create-configuration-for-a-just-in-time-runner-for-a-repository) — a base64 token bound to one runner registration, valid only for a short window. +- **Application-credential auth**: the orchestrator authenticates to OpenStack with an application credential (revocable, scoped, no password), not the user's `openrc.sh`. +- **Crash-safe reaping**: every server we spawn is tagged with `airstack-role=ephemeral-runner`. The reap loop force-deletes any owned server not present in `state.json`, so a crashed orchestrator can't leak instances. + +## One-time setup + +### 1. Create OpenStack application credential + +On your local workstation (not the orchestrator VM): + +```bash +source ~/.airlabcloud/openrc.sh +openstack application credential create airstack-orchestrator \ + --description "AirStack CI orchestrator — spawns ephemeral test runners" +``` + +The output prints `id` and `secret`. Build a `clouds.yaml`: + +```yaml +clouds: + airstack: + auth_type: v3applicationcredential + auth: + auth_url: https://airlab-cloud.andrew.cmu.edu:5000/v3/ + application_credential_id: + application_credential_secret: + region_name: Airlab + interface: public + identity_api_version: 3 +``` + +### 2. Stage credentials on the orchestrator VM + +```bash +# clouds.yaml: install for the orchestrator user (created in step 3) +scp clouds.yaml ubuntu@:/tmp/clouds.yaml + +# GitHub PAT: needs `Actions: read/write` and `Administration: read/write` +# (fine-grained) or classic `repo` scope. +scp ~/.airlabcloud/airstack-github-pat.txt \ + ubuntu@:/tmp/github-pat +``` + +### 3. 
Run setup.sh + +On the orchestrator VM: + +```bash +git clone https://github.com/castacks/AirStack.git /tmp/airstack +sudo bash /tmp/airstack/.github/orchestrator/setup.sh +``` + +`setup.sh` creates the `orchestrator` system user, builds the Python venv, copies `orchestrator.py` and `cloud-init.yaml.j2` into `/opt/airstack-orchestrator/`, scaffolds `/etc/airstack-orchestrator/`, installs the systemd unit, and consumes `/tmp/github-pat`. + +You still need to put the `clouds.yaml` in place under the orchestrator user's home: + +```bash +sudo install -d -o orchestrator -g orchestrator -m 0700 \ + /home/orchestrator/.config/openstack +sudo install -o orchestrator -g orchestrator -m 0600 \ + /tmp/clouds.yaml /home/orchestrator/.config/openstack/clouds.yaml +sudo shred -u /tmp/clouds.yaml +``` + +### 4. Fill in `/etc/airstack-orchestrator/config.yaml` + +Edit the placeholders the example ships with: + +| Field | What goes here | How to find it | +|------|---------------|----------------| +| `flavor_name` | OpenStack flavor with GPU + enough disk | `openstack flavor list` | +| `network_name` | Network the workers attach to | `openstack network list` | +| `keypair_name` | SSH keypair for break-glass access | `openstack keypair list` | +| `security_group` | Outbound 443 must be allowed | `openstack security group list` | +| `repo` | `owner/name` of the repo to poll | from GitHub URL | +| `runner_version` | Version tag from [actions/runner releases](https://github.com/actions/runner/releases) | check before each major upgrade | + +### 5. Start the service + +```bash +sudo systemctl enable --now airstack-orchestrator.service +journalctl -u airstack-orchestrator.service -f +``` + +You should see `orchestrator started: repo=... labels=... max_concurrent=N` and then periodic poll activity. + +## End-to-end verification + +```bash +# Trigger a fast build-only run. 
+gh workflow run integration-tests.yml -f marks=build_docker + +# Within ~30s, a server should appear: +openstack server list --metadata airstack-role=ephemeral-runner + +# Watch GitHub → Actions → Runners — the ephemeral runner should appear, +# pick up the job, then disappear. + +# Within ~30s of job completion, the server should be gone: +openstack server list --metadata airstack-role=ephemeral-runner +``` + +## Operational notes + +- **State file**: `/var/lib/airstack-orchestrator/state.json` is the in-flight job tracker. Wiping it triggers an orphan sweep on the next reap iteration — owned servers will be force-deleted. Don't wipe it while jobs are mid-flight unless that's what you want. +- **Stuck instance**: any server older than `max_job_minutes` (default 90) is force-deleted regardless of GitHub job status. Bump this if liveliness/autonomy runs grow longer than ~75 minutes. +- **PAT rotation**: `sudo install -o root -g orchestrator -m 0640 /tmp/new-pat /etc/airstack-orchestrator/github-pat && sudo systemctl restart airstack-orchestrator.service`. +- **Pause spawning** (e.g. for maintenance): `sudo systemctl stop airstack-orchestrator.service`. Already-spawned workers will still complete their jobs and self-shutdown; on restart, the reap loop deletes them. +- **Logs**: `journalctl -u airstack-orchestrator.service -f`. Cloud-init logs from individual workers are visible only via `openstack console log show ` while the worker is running. 
diff --git a/.github/orchestrator/airstack-orchestrator.service b/.github/orchestrator/airstack-orchestrator.service new file mode 100644 index 000000000..7123232eb --- /dev/null +++ b/.github/orchestrator/airstack-orchestrator.service @@ -0,0 +1,40 @@ +[Unit] +Description=AirStack CI Orchestrator (spawns ephemeral OpenStack runners) +Documentation=https://github.com/castacks/AirStack/tree/main/.github/orchestrator +After=network-online.target +Wants=network-online.target + +[Service] +Type=simple +User=orchestrator +Group=orchestrator +WorkingDirectory=/opt/airstack-orchestrator + +# Application credential lives in the orchestrator user's home so openstacksdk +# finds it via the default cloud-config search path. +Environment=HOME=/home/orchestrator +Environment=OS_CLIENT_CONFIG_FILE=/home/orchestrator/.config/openstack/clouds.yaml + +ExecStart=/opt/airstack-orchestrator/venv/bin/python \ + /opt/airstack-orchestrator/orchestrator.py \ + --config /etc/airstack-orchestrator/config.yaml \ + --pat /etc/airstack-orchestrator/github-pat \ + --state /var/lib/airstack-orchestrator/state.json \ + --template /opt/airstack-orchestrator/cloud-init.yaml.j2 + +Restart=always +RestartSec=10 + +# Allow draining loops on stop (SIGTERM handled by orchestrator.py). +TimeoutStopSec=30 +KillSignal=SIGTERM + +# Hardening +NoNewPrivileges=true +ProtectSystem=strict +ProtectHome=read-only +ReadWritePaths=/var/lib/airstack-orchestrator +PrivateTmp=true + +[Install] +WantedBy=multi-user.target diff --git a/.github/orchestrator/cloud-init.yaml.j2 b/.github/orchestrator/cloud-init.yaml.j2 new file mode 100644 index 000000000..feefde51f --- /dev/null +++ b/.github/orchestrator/cloud-init.yaml.j2 @@ -0,0 +1,71 @@ +#cloud-config +# Rendered per-spawn by orchestrator.py with two Jinja variables: +# encoded_jit_config - single-use base64 JIT config from GitHub +# runner_version - GitHub Actions runner version (e.g. 
2.319.1) +# +# The base image (Ubuntu-24.04-GPU-Headless) already has NVIDIA drivers. +# This cloud-init adds Docker (with the compose plugin), nvidia-container-toolkit, +# downloads the GitHub Actions runner, registers it with the JIT config, runs +# exactly one job (the JIT config + --ephemeral makes the runner exit after one +# job), and shuts the VM down. The orchestrator then deletes the server. + +package_update: true +package_upgrade: false +packages: + - jq + - curl + - ca-certificates + - gnupg + +write_files: + - path: /usr/local/bin/airstack-runner-bootstrap.sh + permissions: "0755" + owner: root:root + content: | + #!/usr/bin/env bash + set -euxo pipefail + + # Install Docker (with compose plugin) from Docker's official channel. + # get.docker.com handles apt repo setup + nvidia-container-toolkit-compatible + # docker-ce, plus the docker-compose-plugin we need for `airstack up`. + curl -fsSL https://get.docker.com | sh + + # nvidia-container-toolkit is required for GPU containers (liveliness / + # autonomy tests). The base image has the NVIDIA *drivers* but we still + # need the container runtime hooks here. + distribution=$(. /etc/os-release; echo "$ID$VERSION_ID") + curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \ + | gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg + curl -fsSL "https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list" \ + | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' \ + > /etc/apt/sources.list.d/nvidia-container-toolkit.list + apt-get update + apt-get install -y nvidia-container-toolkit + nvidia-ctk runtime configure --runtime=docker + systemctl restart docker + + usermod -aG docker ubuntu + + # GitHub Actions runner. 
+ RUNNER_VERSION="{{ runner_version }}" + RUNNER_DIR=/home/ubuntu/actions-runner + mkdir -p "$RUNNER_DIR" + cd "$RUNNER_DIR" + curl -fsSL -o runner.tar.gz \ + "https://github.com/actions/runner/releases/download/v${RUNNER_VERSION}/actions-runner-linux-x64-${RUNNER_VERSION}.tar.gz" + tar xzf runner.tar.gz + rm runner.tar.gz + chown -R ubuntu:ubuntu "$RUNNER_DIR" + + # Run exactly one job under the ubuntu user. The JIT config is single-use + # and ephemeral, so run.sh exits after one job completes. + sudo -u ubuntu --preserve-env=HOME -H bash -c \ + "cd '$RUNNER_DIR' && ./run.sh --jitconfig '{{ encoded_jit_config }}'" \ + || echo "runner exited non-zero (job failure or runner error)" + + # Backstop: power down. The orchestrator's reap loop is the authoritative + # deleter — it sees the GitHub job complete and calls Nova delete. + shutdown -h +1 + +runcmd: + - /usr/local/bin/airstack-runner-bootstrap.sh diff --git a/.github/orchestrator/config.example.yaml b/.github/orchestrator/config.example.yaml new file mode 100644 index 000000000..21c8011ef --- /dev/null +++ b/.github/orchestrator/config.example.yaml @@ -0,0 +1,56 @@ +# AirStack CI orchestrator configuration. +# Copy to /etc/airstack-orchestrator/config.yaml and fill in placeholders. + +# --- OpenStack target --- + +# Cloud profile name in ~/.config/openstack/clouds.yaml. +openstack_cloud: airstack + +# Ubuntu-24.04-GPU-Headless (confirmed available on airlab-cloud). +image_id: a891a6fe-5e4f-4b84-a6c9-482848c8f972 + +# OpenStack flavor with GPU + enough disk for Docker + sim images. +# Look up with: openstack flavor list +flavor_name: "" + +# OpenStack network the ephemeral instance attaches to. Must allow outbound +# 443 to api.github.com (no inbound is required: the runner makes an outbound +# long-poll connection to GitHub). +network_name: "" + +# OpenStack keypair injected into the instance for break-glass SSH access. +# The orchestrator never SSHes into workers itself. 
+keypair_name: "" + +# Security group applied to spawned instances. Outbound 443 must be allowed. +security_group: "" + +# --- GitHub --- + +# owner/name of the repo whose queued workflow_jobs to pick up. +repo: "castacks/AirStack" + +# Labels the orchestrator polls for. A queued workflow_job whose `labels` +# array is a superset of this list gets a server spawned for it. +runner_labels: + - self-hosted + - airstack-ephemeral + +# GitHub Actions runner version (must exist as a release tag at +# https://github.com/actions/runner/releases). +runner_version: "2.319.1" + +# --- Limits --- + +# Maximum simultaneous in-flight ephemeral instances. +max_concurrent: 3 + +# Hard ceiling for a single job. Past this age the reaper force-deletes the +# server even if GitHub still reports the job as in-progress. Must comfortably +# exceed the longest expected job (autonomy/liveliness runs). +max_job_minutes: 90 + +# --- Polling intervals (seconds) --- + +spawn_poll_interval_s: 15 +reap_poll_interval_s: 30 diff --git a/.github/orchestrator/orchestrator.py b/.github/orchestrator/orchestrator.py new file mode 100644 index 000000000..13a6a020e --- /dev/null +++ b/.github/orchestrator/orchestrator.py @@ -0,0 +1,396 @@ +#!/usr/bin/env python3 +"""AirStack CI orchestrator. + +Polls the GitHub API for queued workflow_jobs whose labels match this +orchestrator's runner_labels, and spawns truly ephemeral OpenStack instances +to execute them. Each ephemeral instance receives a single-use GitHub JIT +runner config via cloud-init; the GitHub PAT never leaves this orchestrator. + +Two cooperating loops: + - spawn loop: discover queued jobs, spawn one Nova server per job + - reap loop: delete servers whose jobs have completed, plus stragglers + older than max_job_minutes and orphans not in state.json + +State persists in /var/lib/airstack-orchestrator/state.json so the +orchestrator can survive restarts without leaking instances. 
+""" + +from __future__ import annotations + +import argparse +import base64 +import json +import logging +import os +import signal +import sys +import threading +import time +from datetime import datetime, timezone +from pathlib import Path +from typing import Any + +import openstack +import requests +import yaml +from jinja2 import Template + +DEFAULT_CONFIG_PATH = "/etc/airstack-orchestrator/config.yaml" +DEFAULT_PAT_PATH = "/etc/airstack-orchestrator/github-pat" +DEFAULT_STATE_PATH = "/var/lib/airstack-orchestrator/state.json" +DEFAULT_TEMPLATE_PATH = "/opt/airstack-orchestrator/cloud-init.yaml.j2" + +# Metadata key/value applied to every Nova server we spawn. Used by the +# orphan reaper to identify servers we own even when state.json is missing. +ROLE_META_KEY = "airstack-role" +ROLE_META_VAL = "ephemeral-runner" +JOB_META_KEY = "airstack-job-id" + +GITHUB_API = "https://api.github.com" + +log = logging.getLogger("orchestrator") + + +def load_yaml(path: str) -> dict: + with open(path) as f: + return yaml.safe_load(f) + + +def load_pat(path: str) -> str: + with open(path) as f: + return f.read().strip() + + +def load_state(path: str) -> dict: + if not os.path.exists(path): + return {"jobs": {}} + with open(path) as f: + return json.load(f) + + +def save_state(path: str, state: dict) -> None: + Path(path).parent.mkdir(parents=True, exist_ok=True) + tmp = path + ".tmp" + with open(tmp, "w") as f: + json.dump(state, f, indent=2, sort_keys=True) + os.replace(tmp, path) + + +def gh_request(method: str, path: str, pat: str, **kwargs: Any) -> Any: + url = f"{GITHUB_API}{path}" + headers = kwargs.pop("headers", {}) + headers.update( + { + "Authorization": f"Bearer {pat}", + "Accept": "application/vnd.github+json", + "X-GitHub-Api-Version": "2022-11-28", + } + ) + r = requests.request(method, url, headers=headers, timeout=30, **kwargs) + r.raise_for_status() + if not r.text: + return None + return r.json() + + +def find_queued_jobs(repo: str, runner_labels: list[str], 
pat: str) -> list[dict]: + """Return queued workflow_jobs whose labels include all runner_labels.""" + runs = gh_request( + "GET", f"/repos/{repo}/actions/runs?status=queued&per_page=20", pat + ) + label_set = set(runner_labels) + matches: list[dict] = [] + for run in runs.get("workflow_runs", []): + jobs = gh_request("GET", f"/repos/{repo}/actions/runs/{run['id']}/jobs", pat) + for job in jobs.get("jobs", []): + if job.get("status") != "queued": + continue + if not label_set.issubset(set(job.get("labels", []))): + continue + if job.get("runner_id"): + continue + matches.append( + { + "job_id": str(job["id"]), + "run_id": run["id"], + "name": job["name"], + "labels": job["labels"], + } + ) + return matches + + +def mint_jit_config( + repo: str, runner_name: str, runner_labels: list[str], pat: str, + runner_group_id: int = 1, +) -> str: + body = { + "name": runner_name, + "runner_group_id": runner_group_id, + "labels": runner_labels, + } + resp = gh_request( + "POST", + f"/repos/{repo}/actions/runners/generate-jitconfig", + pat, + json=body, + ) + return resp["encoded_jit_config"] + + +def get_job_status(repo: str, job_id: str, pat: str) -> dict | None: + """Return the job dict, or None if 404 (job purged).""" + url = f"{GITHUB_API}/repos/{repo}/actions/jobs/{job_id}" + r = requests.get( + url, + headers={ + "Authorization": f"Bearer {pat}", + "Accept": "application/vnd.github+json", + "X-GitHub-Api-Version": "2022-11-28", + }, + timeout=30, + ) + if r.status_code == 404: + return None + r.raise_for_status() + return r.json() + + +def render_cloud_init(template_path: str, encoded_jit_config: str, + runner_version: str) -> str: + with open(template_path) as f: + tmpl = Template(f.read()) + return tmpl.render( + encoded_jit_config=encoded_jit_config, + runner_version=runner_version, + ) + + +def spawn_server( + conn: openstack.connection.Connection, + config: dict, + name: str, + job_id: str, + user_data: str, +) -> str: + flavor = 
conn.compute.find_flavor(config["flavor_name"], ignore_missing=False) + network = conn.network.find_network(config["network_name"], ignore_missing=False) + server = conn.compute.create_server( + name=name, + image_id=config["image_id"], + flavor_id=flavor.id, + networks=[{"uuid": network.id}], + key_name=config["keypair_name"], + security_groups=[{"name": config["security_group"]}], + user_data=base64.b64encode(user_data.encode()).decode(), + metadata={ + ROLE_META_KEY: ROLE_META_VAL, + JOB_META_KEY: job_id, + }, + ) + return server.id + + +def delete_server(conn: openstack.connection.Connection, server_id: str) -> None: + try: + conn.compute.delete_server(server_id, ignore_missing=True, force=True) + except Exception as e: + log.warning("delete_server(%s) failed: %s", server_id, e) + + +def list_owned_servers(conn: openstack.connection.Connection) -> list[Any]: + """List all Nova servers that carry our role metadata.""" + owned = [] + for s in conn.compute.servers(details=True): + meta = getattr(s, "metadata", None) or {} + if meta.get(ROLE_META_KEY) == ROLE_META_VAL: + owned.append(s) + return owned + + +def now_utc_iso() -> str: + return datetime.now(timezone.utc).isoformat() + + +def parse_iso(s: str) -> datetime: + return datetime.fromisoformat(s) + + +class Orchestrator: + def __init__(self, config: dict, pat: str, state_path: str, template_path: str): + self.config = config + self.pat = pat + self.state_path = state_path + self.template_path = template_path + self.conn = openstack.connect(cloud=config.get("openstack_cloud", "airstack")) + self.repo = config["repo"] + self.runner_labels = config["runner_labels"] + self.runner_version = config["runner_version"] + self.max_concurrent = int(config.get("max_concurrent", 3)) + self.max_job_minutes = int(config.get("max_job_minutes", 90)) + self.spawn_interval = int(config.get("spawn_poll_interval_s", 15)) + self.reap_interval = int(config.get("reap_poll_interval_s", 30)) + self.stop_evt = threading.Event() + + def 
stop(self, *_: Any) -> None: + log.info("stop signal received; draining loops") + self.stop_evt.set() + + def spawn_once(self) -> None: + state = load_state(self.state_path) + active = len(state["jobs"]) + if active >= self.max_concurrent: + return + try: + queued = find_queued_jobs(self.repo, self.runner_labels, self.pat) + except Exception as e: + log.warning("find_queued_jobs failed: %s", e) + return + + for job in queued: + if active >= self.max_concurrent: + break + job_id = job["job_id"] + if job_id in state["jobs"]: + continue + ts = int(time.time()) + runner_name = f"ephemeral-{job_id}-{ts}" + try: + jit = mint_jit_config( + self.repo, runner_name, self.runner_labels, self.pat + ) + user_data = render_cloud_init( + self.template_path, jit, self.runner_version + ) + server_id = spawn_server( + self.conn, self.config, runner_name, job_id, user_data + ) + except Exception as e: + log.exception("spawn failed for job %s: %s", job_id, e) + continue + + state["jobs"][job_id] = { + "run_id": job["run_id"], + "server_id": server_id, + "runner_name": runner_name, + "spawned_at": now_utc_iso(), + "name": job["name"], + } + save_state(self.state_path, state) + active += 1 + log.info( + "spawned server %s for job %s (%s)", server_id, job_id, job["name"] + ) + + def reap_once(self) -> None: + state = load_state(self.state_path) + now = datetime.now(timezone.utc) + + # 1. Delete servers for completed jobs. + for job_id in list(state["jobs"].keys()): + entry = state["jobs"][job_id] + try: + job = get_job_status(self.repo, job_id, self.pat) + except Exception as e: + log.warning("get_job_status(%s) failed: %s", job_id, e) + continue + if job is None or job.get("status") == "completed": + log.info("reaping server %s (job %s done)", entry["server_id"], job_id) + delete_server(self.conn, entry["server_id"]) + del state["jobs"][job_id] + continue + + # 2. Force-reap stragglers older than max_job_minutes. 
+ spawned = parse_iso(entry["spawned_at"]) + age_min = (now - spawned).total_seconds() / 60.0 + if age_min > self.max_job_minutes: + log.warning( + "force-reaping server %s (job %s age %.1fm > %dm)", + entry["server_id"], job_id, age_min, self.max_job_minutes, + ) + delete_server(self.conn, entry["server_id"]) + del state["jobs"][job_id] + + save_state(self.state_path, state) + + # 3. Orphan sweep: any server we own that isn't in state and isn't + # in the brief just-spawned window. Catches state.json wipes and + # crashes between spawn and save_state. + try: + owned = list_owned_servers(self.conn) + except Exception as e: + log.warning("list_owned_servers failed: %s", e) + return + tracked_ids = {e["server_id"] for e in state["jobs"].values()} + for s in owned: + if s.id in tracked_ids: + continue + created = getattr(s, "created_at", None) + if created: + try: + age_min = (now - parse_iso(created.replace("Z", "+00:00"))).total_seconds() / 60.0 + except Exception: + age_min = self.max_job_minutes + 1 + else: + age_min = self.max_job_minutes + 1 + # Only reap orphans that have lived past one spawn interval + # (to avoid racing our own freshly-created server). 
+ if age_min < 2: + continue + log.warning( + "orphan-reaping server %s (not in state, age %.1fm)", s.id, age_min + ) + delete_server(self.conn, s.id) + + def run(self) -> None: + log.info( + "orchestrator started: repo=%s labels=%s max_concurrent=%d", + self.repo, self.runner_labels, self.max_concurrent, + ) + last_spawn = 0.0 + last_reap = 0.0 + while not self.stop_evt.is_set(): + now = time.monotonic() + if now - last_spawn >= self.spawn_interval: + try: + self.spawn_once() + except Exception: + log.exception("spawn loop iteration failed") + last_spawn = now + if now - last_reap >= self.reap_interval: + try: + self.reap_once() + except Exception: + log.exception("reap loop iteration failed") + last_reap = now + self.stop_evt.wait(timeout=1.0) + log.info("orchestrator stopped") + + +def main() -> int: + parser = argparse.ArgumentParser() + parser.add_argument("--config", default=DEFAULT_CONFIG_PATH) + parser.add_argument("--pat", default=DEFAULT_PAT_PATH) + parser.add_argument("--state", default=DEFAULT_STATE_PATH) + parser.add_argument("--template", default=DEFAULT_TEMPLATE_PATH) + parser.add_argument("--log-level", default="INFO") + args = parser.parse_args() + + logging.basicConfig( + level=getattr(logging, args.log_level.upper()), + format="%(asctime)s %(levelname)s %(name)s: %(message)s", + stream=sys.stdout, + ) + + config = load_yaml(args.config) + pat = load_pat(args.pat) + + orch = Orchestrator(config, pat, args.state, args.template) + signal.signal(signal.SIGINT, orch.stop) + signal.signal(signal.SIGTERM, orch.stop) + orch.run() + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/.github/orchestrator/requirements.txt b/.github/orchestrator/requirements.txt new file mode 100644 index 000000000..b69702b59 --- /dev/null +++ b/.github/orchestrator/requirements.txt @@ -0,0 +1,4 @@ +openstacksdk>=3.0,<5 +requests>=2.31 +PyYAML>=6.0 +Jinja2>=3.1 diff --git a/.github/orchestrator/setup.sh b/.github/orchestrator/setup.sh new file mode 
100755 index 000000000..4803b4a33 --- /dev/null +++ b/.github/orchestrator/setup.sh @@ -0,0 +1,84 @@ +#!/usr/bin/env bash +# One-time orchestrator-VM setup. Run as root on the airstack-ci-cd-orchestrator +# OpenStack instance after cloning the repo. +# +# Pre-reqs (do these *before* running this script): +# 1. ~/.config/openstack/clouds.yaml staged for the orchestrator user +# (application credential — see .github/orchestrator/README.md). +# 2. /tmp/github-pat exists with the GitHub PAT contents. +# 3. This repo cloned somewhere readable (this script copies code out of +# its containing directory). + +set -euo pipefail + +INSTALL_DIR=/opt/airstack-orchestrator +CONFIG_DIR=/etc/airstack-orchestrator +STATE_DIR=/var/lib/airstack-orchestrator +USER_NAME=orchestrator + +REPO_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +if [[ $EUID -ne 0 ]]; then + echo "ERROR: setup.sh must run as root" >&2 + exit 1 +fi + +echo "==> Creating orchestrator user" +if ! id "$USER_NAME" >/dev/null 2>&1; then + useradd --system --create-home --shell /usr/sbin/nologin "$USER_NAME" +fi + +echo "==> Installing system packages" +apt-get update +apt-get install -y python3 python3-venv python3-pip + +echo "==> Creating directories" +install -d -o "$USER_NAME" -g "$USER_NAME" -m 0750 "$INSTALL_DIR" +install -d -o root -g "$USER_NAME" -m 0750 "$CONFIG_DIR" +install -d -o "$USER_NAME" -g "$USER_NAME" -m 0750 "$STATE_DIR" + +echo "==> Copying orchestrator files to $INSTALL_DIR" +install -o "$USER_NAME" -g "$USER_NAME" -m 0755 \ + "$REPO_DIR/orchestrator.py" "$INSTALL_DIR/orchestrator.py" +install -o "$USER_NAME" -g "$USER_NAME" -m 0644 \ + "$REPO_DIR/cloud-init.yaml.j2" "$INSTALL_DIR/cloud-init.yaml.j2" + +echo "==> Building Python venv" +sudo -u "$USER_NAME" python3 -m venv "$INSTALL_DIR/venv" +sudo -u "$USER_NAME" "$INSTALL_DIR/venv/bin/pip" install --upgrade pip +sudo -u "$USER_NAME" "$INSTALL_DIR/venv/bin/pip" install -r "$REPO_DIR/requirements.txt" + +echo "==> Staging config (if not 
present)" +if [[ ! -f "$CONFIG_DIR/config.yaml" ]]; then + install -o root -g "$USER_NAME" -m 0640 \ + "$REPO_DIR/config.example.yaml" "$CONFIG_DIR/config.yaml" + echo " config.yaml installed from example — edit before starting service" +fi + +echo "==> Installing GitHub PAT (from /tmp/github-pat)" +if [[ ! -f /tmp/github-pat ]]; then + echo "ERROR: /tmp/github-pat not found. scp it over before running setup." >&2 + exit 1 +fi +install -o root -g "$USER_NAME" -m 0640 /tmp/github-pat "$CONFIG_DIR/github-pat" +shred -u /tmp/github-pat + +echo "==> Verifying clouds.yaml" +CLOUDS_YAML="/home/$USER_NAME/.config/openstack/clouds.yaml" +if [[ ! -f "$CLOUDS_YAML" ]]; then + echo "WARNING: $CLOUDS_YAML missing." >&2 + echo " Create it (application credential) before starting the service." >&2 +fi + +echo "==> Installing systemd unit" +install -o root -g root -m 0644 \ + "$REPO_DIR/airstack-orchestrator.service" \ + /etc/systemd/system/airstack-orchestrator.service +systemctl daemon-reload + +echo +echo "Setup complete. Next steps:" +echo " 1. Edit $CONFIG_DIR/config.yaml — fill flavor/network/keypair/security_group." +echo " 2. Verify $CLOUDS_YAML exists with the application credential." +echo " 3. systemctl enable --now airstack-orchestrator.service" +echo " 4. journalctl -u airstack-orchestrator.service -f" diff --git a/.github/runners/airstack-runner.service b/.github/runners/airstack-runner.service deleted file mode 100644 index 89cfe4d40..000000000 --- a/.github/runners/airstack-runner.service +++ /dev/null @@ -1,26 +0,0 @@ -[Unit] -Description=AirStack GitHub Actions Runner (ephemeral) -Documentation=https://docs.github.com/en/actions/hosting-your-own-runners -After=network-online.target docker.service -Wants=network-online.target - -[Service] -User=runner -Group=runner -WorkingDirectory=/opt/actions-runner - -# Place runtime configuration here (REPO_URL, RUNNER_LABELS, etc.). 
-# Each line: KEY=value (no quotes needed, no export keyword) -EnvironmentFile=/etc/github-runner-env - -ExecStart=/opt/actions-runner/register-runner.sh - -# Restart unconditionally so the loop survives transient API errors or reboots. -Restart=always -RestartSec=5 - -# Give the runner enough time to finish a long job before systemd kills it. -TimeoutStopSec=120 - -[Install] -WantedBy=multi-user.target diff --git a/.github/runners/register-runner.sh b/.github/runners/register-runner.sh deleted file mode 100644 index 1b5e0f3ac..000000000 --- a/.github/runners/register-runner.sh +++ /dev/null @@ -1,96 +0,0 @@ -#!/usr/bin/env bash -# AirStack ephemeral GitHub Actions runner loop. -# -# Registers a fresh runner, executes exactly one job, then loops to re-register. -# The --ephemeral flag tells the GitHub API to remove the runner after one job, -# preventing stale registrations and cross-job state pollution. -# -# Setup (one-time, on the OpenStack VM): -# 1. Create a non-root runner user: -# sudo useradd -m -s /bin/bash runner -# sudo usermod -aG docker runner -# -# 2. Download and unpack the GitHub Actions runner into RUNNER_DIR: -# sudo mkdir -p /opt/actions-runner -# cd /opt/actions-runner -# # Get the latest runner URL from: -# # https://github.com/actions/runner/releases -# curl -Lo actions-runner.tar.gz -# tar xzf actions-runner.tar.gz -# sudo chown -R runner:runner /opt/actions-runner -# -# 3. Store a GitHub PAT (repo scope for private repos, public_repo for public): -# echo "ghp_YOUR_TOKEN_HERE" | sudo tee /etc/github-runner-pat -# sudo chmod 600 /etc/github-runner-pat -# sudo chown runner:runner /etc/github-runner-pat -# -# 4. Copy this script into the runner directory and make it executable: -# sudo cp register-runner.sh /opt/actions-runner/register-runner.sh -# sudo chown runner:runner /opt/actions-runner/register-runner.sh -# sudo chmod +x /opt/actions-runner/register-runner.sh -# -# 5. 
Install the systemd unit (see airstack-runner.service) and enable it: -# sudo cp airstack-runner.service /etc/systemd/system/ -# sudo systemctl daemon-reload -# sudo systemctl enable --now airstack-runner.service -# -# Configuration: set these in /etc/github-runner-env (loaded by the systemd unit) -# or export them before running this script manually. - -set -euo pipefail - -REPO_URL="${REPO_URL:-https://github.com/YOUR_ORG/AirStack}" -# Derived from REPO_URL for the registration token API call, e.g. "YOUR_ORG/AirStack" -REPO_PATH="${REPO_PATH:-$(echo "$REPO_URL" | sed 's|https://github.com/||')}" -RUNNER_DIR="${RUNNER_DIR:-/opt/actions-runner}" -PAT_FILE="${PAT_FILE:-/etc/github-runner-pat}" -RUNNER_LABELS="${RUNNER_LABELS:-self-hosted,airstack,gpu}" -RUNNER_GROUP="${RUNNER_GROUP:-Default}" - -if [ ! -f "$PAT_FILE" ]; then - echo "ERROR: PAT file not found at $PAT_FILE" >&2 - exit 1 -fi - -echo "Starting ephemeral runner loop for $REPO_URL" - -while true; do - echo "[$(date -u +%FT%TZ)] Requesting registration token..." - - TOKEN=$(curl -sf -X POST \ - -H "Authorization: token $(cat "$PAT_FILE")" \ - -H "Accept: application/vnd.github+json" \ - "https://api.github.com/repos/${REPO_PATH}/actions/runners/registration-token" \ - | jq -r .token) - - if [ -z "$TOKEN" ] || [ "$TOKEN" = "null" ]; then - echo "ERROR: Failed to obtain registration token. Check PAT and repo path." >&2 - sleep 30 - continue - fi - - echo "[$(date -u +%FT%TZ)] Configuring runner (ephemeral)..." - - # --ephemeral: runner de-registers itself after completing one job. - # --replace: allows re-registration with the same name after a restart. - # Runner name encodes hostname + PID so parallel instances are unique. - "$RUNNER_DIR/config.sh" \ - --url "$REPO_URL" \ - --token "$TOKEN" \ - --name "openstack-$(hostname -s)-$$" \ - --labels "$RUNNER_LABELS" \ - --runnergroup "$RUNNER_GROUP" \ - --ephemeral \ - --unattended \ - --replace - - echo "[$(date -u +%FT%TZ)] Runner configured. 
Waiting for a job..." - - # run.sh blocks until the job completes, then returns (ephemeral runner exits cleanly). - "$RUNNER_DIR/run.sh" || true - - echo "[$(date -u +%FT%TZ)] Job finished. Re-registering..." - - # Brief pause to avoid hammering the API if config.sh / run.sh fail immediately. - sleep 2 -done diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml index b90643c75..b4b2bb68a 100644 --- a/.github/workflows/integration-tests.yml +++ b/.github/workflows/integration-tests.yml @@ -33,7 +33,7 @@ on: jobs: run-tests: name: Run Tests - runs-on: [self-hosted, airstack, gpu] + runs-on: [self-hosted, airstack-ephemeral] # Only run on PRs from the same repo (not forks) to prevent arbitrary code # execution on the self-hosted runner from untrusted contributors. if: > diff --git a/tests/README.md b/tests/README.md index 487cbaf6d..25d32e21b 100644 --- a/tests/README.md +++ b/tests/README.md @@ -251,7 +251,7 @@ Regressions are flagged with :red_circle:, improvements with :green_circle:. #### Jobs -**`run-tests`** runs on the self-hosted GPU runner (`[self-hosted, airstack, gpu]`). It installs dependencies, runs pytest, and uploads `tests/results/` as an artifact named `test-results--` with 90-day retention. +**`run-tests`** runs on a freshly-spawned ephemeral OpenStack instance (`[self-hosted, airstack-ephemeral]`). The instance is provisioned per-job by the orchestrator described below and destroyed once the job completes. The job installs dependencies, runs pytest, and uploads `tests/results/` as an artifact named `test-results--` with 90-day retention. **`report`** runs on `ubuntu-latest` after `run-tests` (even if it failed). It: @@ -267,93 +267,54 @@ The workflow uses [`dawidd6/action-download-artifact@v6`](https://github.com/daw --- -## Self-Hosted Runner Setup +## CI/CD Orchestrator (OpenStack-backed ephemeral runners) -AirStack's tests require a GPU and Docker, so they run on a self-hosted OpenStack VM. 
The setup uses the **ephemeral runner** pattern: each runner process registers, executes exactly one job, and then de-registers. This prevents cross-job environment contamination and stale runner accumulation. +AirStack's tests require a GPU, Docker, and a clean filesystem per run, so they execute on **truly ephemeral OpenStack instances** spawned per-job by an orchestrator. Each test job gets a fresh VM that is destroyed once the job completes — no Docker layer carryover, no leaked containers, no shared host state. -### 1. Create the runner user +### Architecture -```bash -sudo useradd -m -s /bin/bash runner -sudo usermod -aG docker runner # allows Docker commands without sudo -``` - -### 2. Install the GitHub Actions runner binary - -Download the latest runner tarball from [github.com/actions/runner/releases](https://github.com/actions/runner/releases) and unpack it: - -```bash -sudo mkdir -p /opt/actions-runner -cd /opt/actions-runner -# Replace the URL with the current release for linux-x64: -curl -Lo actions-runner.tar.gz https://github.com/actions/runner/releases/download/vX.Y.Z/actions-runner-linux-x64-X.Y.Z.tar.gz -sudo tar xzf actions-runner.tar.gz -C /opt/actions-runner -sudo chown -R runner:runner /opt/actions-runner -``` - -### 3. Store the GitHub PAT - -[Create a fine-grained PAT](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens#creating-a-fine-grained-personal-access-token) with **`repo`** scope (for private repos) or **`public_repo`** scope (for public repos). Store it securely: - -```bash -echo "ghp_YOUR_TOKEN_HERE" | sudo tee /etc/github-runner-pat -sudo chmod 600 /etc/github-runner-pat -sudo chown runner:runner /etc/github-runner-pat ``` - -### 4. 
Configure runner environment - -Create `/etc/github-runner-env` (loaded by the systemd unit): - -```ini -REPO_URL=https://github.com/YOUR_ORG/AirStack -REPO_PATH=YOUR_ORG/AirStack -RUNNER_LABELS=self-hosted,airstack,gpu -RUNNER_GROUP=Default -RUNNER_DIR=/opt/actions-runner -PAT_FILE=/etc/github-runner-pat +┌──────────────────────────────────────────────────────────────┐ +│ Orchestrator VM (airstack-ci-cd-orchestrator) │ +│ • polls GitHub for queued workflow_jobs │ +│ • mints single-use JIT runner tokens │ +│ • spawns / reaps ephemeral instances via OpenStack Nova │ +│ • holds the GitHub PAT and OpenStack application credential│ +└────────────┬───────────────────────────────────┬─────────────┘ + │ │ + ▼ ▼ +┌──────────────────────────────┐ ┌────────────────────────────────┐ +│ Ephemeral worker (per job) │ │ GitHub Actions queue │ +│ Image: Ubuntu-24.04-GPU- │ │ workflow_job status=queued │ +│ Headless │ │ labels: [self-hosted, │ +│ cloud-init bootstraps Docker │ │ airstack-ephemeral] │ +│ + nvidia-container-toolkit + │ └────────────────────────────────┘ +│ GH Actions runner; runs ONE │ +│ job, then is destroyed. │ +└──────────────────────────────┘ ``` -```bash -sudo chmod 600 /etc/github-runner-env -sudo chown runner:runner /etc/github-runner-env -``` +### Why this instead of a long-lived self-hosted runner -### 5. Install the registration script - -```bash -sudo cp .github/runners/register-runner.sh /opt/actions-runner/register-runner.sh -sudo chown runner:runner /opt/actions-runner/register-runner.sh -sudo chmod +x /opt/actions-runner/register-runner.sh -``` - -### 6. 
Install and enable the systemd service - -```bash -sudo cp .github/runners/airstack-runner.service /etc/systemd/system/ -sudo systemctl daemon-reload -sudo systemctl enable --now airstack-runner.service -``` - -Check status: - -```bash -sudo systemctl status airstack-runner.service -sudo journalctl -u airstack-runner.service -f -``` +| Concern | Mitigation | +|---------|------------| +| Cross-job state pollution (Docker cache, dangling networks, leftover artifacts) | Each job runs on a fresh VM. Spent VM is destroyed within ~30 s of job completion. | +| Fork PRs executing arbitrary code | Workflow's `if: github.event.pull_request.head.repo.full_name == github.repository` — fork PRs skipped. | +| Runner running as root | The runner runs as the unprivileged `ubuntu` user inside an instance whose only purpose is one job. | +| Docker socket gives root-equivalent access | Bounded to a single one-shot VM. The orchestrator host doesn't expose Docker at all. | +| Long-lived PAT on the runner host | The PAT lives only on the orchestrator. Workers receive a single-use **JIT runner config** — a base64 token bound to one runner registration. | +| Persistent OpenStack creds tied to a user password | Orchestrator authenticates with an **application credential** (revocable, scoped) instead of `openrc.sh`. | -### 7. Verify runner registration +### Setup -After the service starts it will loop waiting for a job. Confirm it appears in **GitHub → Repository → Settings → Actions → Runners** with the labels `self-hosted`, `airstack`, `gpu` and status **Idle**. +The orchestrator service code, cloud-init template, systemd unit, and full setup runbook live in [`.github/orchestrator/`](../.github/orchestrator/). See [`.github/orchestrator/README.md`](../.github/orchestrator/README.md) for: -Trigger a `workflow_dispatch` run and watch the runner pick it up, complete the job, and re-register. 
+- creating the OpenStack application credential and `clouds.yaml` +- staging the GitHub PAT +- running `setup.sh` on the orchestrator VM +- filling in flavor / network / keypair / security-group in `/etc/airstack-orchestrator/config.yaml` +- enabling and verifying the `airstack-orchestrator.service` systemd unit -### Security considerations +### Runner labels -| Concern | Mitigation | -|---------|------------| -| Fork PRs executing arbitrary code on the runner | Workflow has `if: github.event.pull_request.head.repo.full_name == github.repository` — fork PRs are skipped entirely | -| Cross-job state pollution | `--ephemeral` flag: runner de-registers and the process exits after each job; the systemd loop starts a clean process for the next job | -| Runner running as root | Dedicated non-root `runner` user; never set `RUNNER_ALLOW_RUNASROOT=1` | -| Docker socket gives root-equivalent access | Accepted risk for lab use; the fork PR guard above limits who can reach the runner | -| Long-lived PAT stored on disk | Scope the PAT to the minimum required; rotate it periodically; `chmod 600` and owned by `runner` only | +The workflow file requests `runs-on: [self-hosted, airstack-ephemeral]`. The orchestrator polls for queued jobs whose labels are a superset of `runner_labels` in its config, mints a JIT config registering the ephemeral runner under those same labels, and spawns the worker. To route jobs to a different pool (e.g. CPU-only workers) in the future, add a second label set in config and adjust the workflow's `runs-on`. 
From 5f34c94efa56f9916277a9c58ab71ee52c7db955 Mon Sep 17 00:00:00 2001 From: Andrew Jong Date: Mon, 27 Apr 2026 18:02:54 -0400 Subject: [PATCH 3/4] Add availability zone --- .github/orchestrator/README.md | 1 + .github/orchestrator/config.example.yaml | 4 ++++ .github/orchestrator/orchestrator.py | 6 +++++- 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/.github/orchestrator/README.md b/.github/orchestrator/README.md index a5d961840..ee0a9f8b3 100644 --- a/.github/orchestrator/README.md +++ b/.github/orchestrator/README.md @@ -115,6 +115,7 @@ Edit the placeholders the example ships with: | `network_name` | Network the workers attach to | `openstack network list` | | `keypair_name` | SSH keypair for break-glass access | `openstack keypair list` | | `security_group` | Outbound 443 must be allowed | `openstack security group list` | +| `availability_zone` | Optional AZ for the spawned instance; leave empty to let Nova pick | `openstack availability zone list` | | `repo` | `owner/name` of the repo to poll | from GitHub URL | | `runner_version` | Version tag from [actions/runner releases](https://github.com/actions/runner/releases) | check before each major upgrade | diff --git a/.github/orchestrator/config.example.yaml b/.github/orchestrator/config.example.yaml index 21c8011ef..f4088393e 100644 --- a/.github/orchestrator/config.example.yaml +++ b/.github/orchestrator/config.example.yaml @@ -25,6 +25,10 @@ keypair_name: "" # Security group applied to spawned instances. Outbound 443 must be allowed. security_group: "" +# OpenStack availability zone to spawn instances in (e.g. nova, gpu-zone-1). +# Leave empty to let Nova pick. +availability_zone: "" + # --- GitHub --- # owner/name of the repo whose queued workflow_jobs to pick up. 
diff --git a/.github/orchestrator/orchestrator.py b/.github/orchestrator/orchestrator.py index 13a6a020e..0169223e6 100644 --- a/.github/orchestrator/orchestrator.py +++ b/.github/orchestrator/orchestrator.py @@ -175,7 +175,7 @@ def spawn_server( ) -> str: flavor = conn.compute.find_flavor(config["flavor_name"], ignore_missing=False) network = conn.network.find_network(config["network_name"], ignore_missing=False) - server = conn.compute.create_server( + create_kwargs = dict( name=name, image_id=config["image_id"], flavor_id=flavor.id, @@ -188,6 +188,10 @@ def spawn_server( JOB_META_KEY: job_id, }, ) + az = config.get("availability_zone") + if az: + create_kwargs["availability_zone"] = az + server = conn.compute.create_server(**create_kwargs) return server.id From 69e485e17491e5f6a22064b9bef75032f2a9606f Mon Sep 17 00:00:00 2001 From: Andrew Jong Date: Mon, 27 Apr 2026 18:07:52 -0400 Subject: [PATCH 4/4] Bump version to 0.18.0-alpha.7 --- .env | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.env b/.env index 1edaa22f7..d0e7d3db5 100644 --- a/.env +++ b/.env @@ -11,7 +11,7 @@ PROJECT_NAME="airstack" # If you've run ./airstack.sh setup, then this will auto-generate from the git commit hash every time a change is made # to a Dockerfile or docker-compose.yaml file. Otherwise this can also be set explicitly to make a release version. -VERSION="0.18.0-alpha.6" +VERSION="0.18.0-alpha.7" # Choose "dev" or "prebuilt". "dev" is for mounted code that must be built live. "prebuilt" is for built ros_ws baked into the image DOCKER_IMAGE_BUILD_MODE="dev" # Where to push and pull images from. Can replace with your docker hub username if using docker hub.