Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
125 commits
Select commit Hold shift + click to select a range
108d0b7
docs(spec): shared commanderhub daemon registry across observer insta…
claude Jun 29, 2026
02fc5a6
docs(spec): revise after adversarial review (B1-B4, M1-M11, m1-m10)
claude Jun 29, 2026
df7899b
docs(spec): v3 revision after Codex adversarial review
claude Jun 29, 2026
0a1d22a
docs(spec): v4 — codex round-1 fixes (7 BLOCKERs + 9 MAJORs)
claude Jun 29, 2026
17e57c2
docs(spec): v5 — codex round-2 fixes (4 BLOCKERs + 4 MAJORs)
claude Jun 29, 2026
9441c3c
docs(spec): v6 — codex round-3 fixes (1 BLOCKER + 5 MAJORs)
claude Jun 29, 2026
4977f73
docs(spec): v7 — codex round-4 fixes (0 BLOCKERs + 4 MAJORs)
claude Jun 29, 2026
ab7fee7
docs(spec): v8 — codex round-5 fixes (0 BLOCKERs + 3 MAJORs)
claude Jun 29, 2026
142e022
docs(spec): v9 — codex round-6 fixes (0 BLOCKERs + 2 MAJORs)
claude Jun 29, 2026
ae5baa5
docs(plan): tasks 1-5 (commander const, encoded-size cap, PG schema, …
claude Jun 30, 2026
9305a7c
docs(plan): remove in-progress plan; scope expanded by issue #49 comm…
claude Jun 30, 2026
47ce1f4
docs(spec): v10 — extend scope to cover issue #49 comment 4839308595 …
claude Jun 30, 2026
3918574
docs(spec): v11 — codex v10-r1 fixes (0 BLOCKERs + 4 MAJORs)
claude Jun 30, 2026
3467dfc
docs(spec): v12 — codex v11-r2 fixes (0 BLOCKERs + 5 MAJORs)
claude Jun 30, 2026
d10b825
docs(spec): v13 — add Finding E (telemetry rate limiter cross-pod) fr…
claude Jun 30, 2026
52bec88
docs(spec): v14 — codex v13 fixes (2 BLOCKERs + 5 MAJORs)
claude Jun 30, 2026
0c4e231
docs(spec): v15 — codex v14 fixes (0 BLOCKERs + 5 MAJORs)
claude Jun 30, 2026
30516d3
docs(spec): v16 — codex v15 fixes (0 BLOCKERs + 3 MAJORs)
claude Jun 30, 2026
4abebc9
docs(spec): v17 — codex v16 fixes (0 BLOCKERs + 3 MAJORs)
claude Jun 30, 2026
57f7caa
docs(spec): v18 — codex v17 fixes (0 BLOCKERs + 3 MAJORs)
claude Jun 30, 2026
4f7fe51
docs(spec): v19 — codex v18 fixes (0 BLOCKERs + 2 MAJORs)
claude Jun 30, 2026
e09bb6d
docs(plan): Phase A header + 6 foundation tasks (constants, files.go …
claude Jun 30, 2026
deb1901
docs(plan): Phase B (5 tasks) — sharedRegistry SQL, heartbeat, confir…
claude Jun 30, 2026
885f60a
docs(plan): v2 — codex round-1 fixes (4 BLOCKERs + 5 MAJORs) for Phas…
claude Jun 30, 2026
2789587
docs(plan): v3 — codex plan round-2 fixes (3 BLOCKERs + 3 MAJORs)
claude Jun 30, 2026
1d4e619
docs(plan): v4 — codex plan round-3 fixes (1 BLOCKER + 2 MAJORs)
claude Jun 30, 2026
d2e4a55
docs(plan): v5 — codex plan round-4 fixes (2 MAJORs)
claude Jun 30, 2026
1b5ad6e
docs(plan): v6 — codex plan round-5 fixes (2 MAJORs)
claude Jun 30, 2026
99cf676
docs(plan): v7 — codex plan round-6 fix (1 MAJOR)
claude Jun 30, 2026
0513f7c
docs(plan): v8 — codex plan round-7 fixes (2 MAJORs)
claude Jun 30, 2026
5257655
docs(plan): v9 — codex plan round-8 fixes (2 MAJORs)
claude Jun 30, 2026
ae5195b
docs(plan): add Phase C (forwarding+drain+cmdID), Phase D (wiring+pgT…
claude Jun 30, 2026
581fc0d
docs(plan): v10 — codex CDE round-1 fixes (3 BLOCKERs + 4 MAJORs)
claude Jun 30, 2026
1da05de
docs(plan): v11 — codex CDE round-2 fixes (3 BLOCKERs + 1 MAJOR)
claude Jun 30, 2026
04d4bb0
docs(plan): v12 — codex CDE round-3 fix (1 MAJOR)
claude Jun 30, 2026
e1348cf
docs(plan): v13 — codex CDE round-4 fixes (2 MAJORs)
claude Jun 30, 2026
6061a76
feat(commander): add ErrCodeDaemonUpgradeRequired and CapabilityFileP…
claude Jun 30, 2026
ae71b0a
feat(commander): add JSON-encoded size cap to Handler.ReadFile and ad…
claude Jun 30, 2026
50d2444
feat(commanderhub/authstore): add Postgres schema for commander_daemo…
claude Jun 30, 2026
1e55dfa
refactor(commanderhub): rename registry→localRegistry, add routingID(…
claude Jun 30, 2026
85e7214
refactor(commanderhub): extract turnStateBackend interface; rename tu…
claude Jun 30, 2026
de6edd5
refactor(observerweb): extract telemetryAllower interface
claude Jun 30, 2026
4204776
fix(observerweb): A6 follow-up — rename local var that shadowed telem…
claude Jun 30, 2026
d812d6f
fix(commander): A2 follow-up — use json.Marshal(res) instead of byte …
claude Jun 30, 2026
5d87ad4
fix(commanderhub): A5 follow-up — re-add context.Context to turnState…
claude Jun 30, 2026
d36510e
fix(observerweb): A6 follow-up — re-add context.Context to telemetryA…
claude Jun 30, 2026
8faa00d
fix(commanderhub): A4 follow-up — ownershipLost is atomic.Bool to avo…
claude Jun 30, 2026
ca0896d
fix(commanderhub/authstore): A3 follow-up — assert PK shapes + CHECK …
claude Jun 30, 2026
dd7e71b
fix(commanderhub): A4/A5 follow-up — thread ctx into mergeCurrentTurn…
claude Jun 30, 2026
a413fa9
docs(plan): align Phase B heartbeatErrCount usage with atomic.Int64 f…
claude Jun 30, 2026
4d3917d
docs(plan): drop unused sync/atomic import from Phase B heartbeat sni…
claude Jun 30, 2026
86c1f8f
feat(commanderhub): add sharedRegistry SQL layer (connectUpsert, hear…
claude Jun 30, 2026
552080d
feat(commanderhub): runHeartbeat goroutine with ownership-loss force-…
claude Jun 30, 2026
adc6f54
feat(commanderhub): add daemonConn.confirmOwnership() per-send PG own…
claude Jun 30, 2026
38695f7
feat(commanderhub): B4 ServeHTTP cluster admission gating + attachSha…
claude Jun 30, 2026
3193e15
feat(commanderhub): B5 sweep goroutine (daemons + nonces + telemetry …
claude Jun 30, 2026
e0160a7
fix(commanderhub): B3 wire-up — call dc.confirmOwnership in SendComma…
claude Jun 30, 2026
9f64c73
fix(commanderhub): B4 follow-up — bound sharedReg.remove with 5s time…
claude Jun 30, 2026
4beadf6
fix(commanderhub): B4 follow-up — reject whitespace-only ShortID in c…
claude Jun 30, 2026
e5ee6ed
fix(commanderhub): B3 follow-up — confirmOwnership doesn't poison con…
claude Jun 30, 2026
89a919b
fix(commanderhub): B3 follow-up — sql.ErrNoRows treated as transient …
claude Jun 30, 2026
4cc7eb6
feat(commanderhub): add length-prefixed JSON envelope codec
claude Jun 30, 2026
1592a58
feat(commanderhub): add HMAC auth helpers and nonce write side (C2)
claude Jun 30, 2026
498056f
feat(commanderhub): add forwardClient for pod-to-pod HTTP forwarding …
claude Jun 30, 2026
44340dd
feat(commanderhub): C4 — forwardServer handler + sendCommandToLocal/s…
claude Jun 30, 2026
937627f
feat(commanderhub): C5 — drain endpoint for preStop hooks
claude Jun 30, 2026
f0dd526
feat(commanderhub): C6 — cmdID pod-prefix for shared mode
claude Jun 30, 2026
b24ca7a
fix(commanderhub): C2 follow-up — HMAC timestamp window 60s per spec
claude Jun 30, 2026
d9fa16e
fix(commanderhub): C3 follow-up — newForwardClient takes []byte secre…
claude Jun 30, 2026
8b01729
fix(commanderhub): C5 follow-up — drain inserts nonce + domain-separa…
claude Jun 30, 2026
324510d
fix(commanderhub): C1 follow-up — reject negative/signed length prefi…
claude Jun 30, 2026
0f38262
fix(commanderhub): C1+C3 follow-up — encoder cap + stream propagates …
claude Jun 30, 2026
a371b90
fix(commanderhub): C4 follow-up — Hub.ReadFile gates on file_preview_…
claude Jun 30, 2026
0ffb83a
fix(commanderhub): C3 follow-up — wouldLoop uses net.IP.IsLoopback fo…
claude Jun 30, 2026
1d0cb24
test(commanderhub): fix forward_client tests broken by IsLoopback wou…
claude Jun 30, 2026
8be8497
fix(commanderhub): C5 follow-up — drain checks sharedReg.db != nil be…
claude Jun 30, 2026
33cff4f
fix(commanderhub): C3 follow-up — forwardClient retries only on 403, …
claude Jun 30, 2026
a045cf4
fix(commanderhub): C4/C5 follow-up — never log raw nonces; emit 8-cha…
claude Jun 30, 2026
613df5a
feat(commanderhub): D1 wire shared-mode components into read/write paths
claude Jun 30, 2026
d6eb003
feat(commanderhub): D2 pgTurnStore — cross-pod turn state in commande…
claude Jun 30, 2026
59e7750
feat(observerweb): D3 — pgTelemetryLimiter with atomic UPSERT + lock_…
claude Jun 30, 2026
a979f00
feat(identity): D4 — Postgres-backed cross-pod identity revocation ch…
claude Jun 30, 2026
8239341
feat(observer-server): D5 cluster-mode lifecycle — ClusterConfig, dua…
claude Jun 30, 2026
e1a889d
feat(observer-server): D5 add explicit-duration and string-duration t…
claude Jun 30, 2026
5ca2ff9
test(commanderhub): D6 multi-pod integration tests with shared Postgr…
claude Jun 30, 2026
2b19226
fix(commanderhub): D-fix1 finding-5 — rekey txn + routeFrame updateFr…
claude Jun 30, 2026
63cee89
fix(commanderhub): D-fix1 finding-6 — timing config propagated; valid…
claude Jun 30, 2026
c6c1061
fix(identity): D-fix1 finding-4 — ErrInvalid publish rate-limited to …
claude Jun 30, 2026
0758a69
fix(observerweb): D-fix1 finding-1 — expose Hub from MountAll via New…
claude Jun 30, 2026
5b98951
fix(observer-server): D-fix1 findings-1,2,3,4,6 — observer-server wir…
claude Jun 30, 2026
0beb911
fix(commanderhub): D-fix1 finding-7 — integration test assertions cor…
claude Jun 30, 2026
c681e0a
fix(commanderhub): D-fix2 finding-1 — implement Hub.Close drain
claude Jun 30, 2026
f0e3abe
fix(observer-server): D-fix2 finding-2 — needsCommanderDDL unifies mi…
claude Jun 30, 2026
fccb94a
fix(commanderhub): D-fix2 finding-3 — atomic CTE for pgTurnStore.rekey
claude Jun 30, 2026
f1c5ea9
fix(identity): D-fix2 finding-4 — cache-gated publish, bounded LRU, s…
claude Jun 30, 2026
4d9bd2d
fix(commanderhub): D-fix3 finding-1 make admission + Close atomic via…
claude Jun 30, 2026
a607117
fix(commanderhub): D-fix3 finding-2 drain endpoint sets draining=true…
claude Jun 30, 2026
91967b9
fix(commanderhub): D-fix4 finding-1 remove shared row on draining-rej…
claude Jun 30, 2026
4711ea6
fix(commanderhub): D-fix5 finding-1 Close/drainHandler wait for in-fl…
claude Jun 30, 2026
00e7659
fix(commanderhub): D-fix6 finding-1 scope inFlightAdmissions to admis…
claude Jun 30, 2026
6cb5421
chore(chart): E1 values.yaml + values-production.example.yaml (cluste…
claude Jun 30, 2026
3d0d018
chore(chart): E2 templates/validate.yaml fail-fast guards + chart_tes…
claude Jun 30, 2026
b14cd43
chore(chart): E3 deployment + configmap + secret renders for cluster …
claude Jun 30, 2026
e526e26
chore(chart): E4 headless Service + NetworkPolicy + ingress deny for …
claude Jun 30, 2026
42985d5
chore(chart): E5 chart_test.sh blocks 1-7 for cluster-mode rendering
claude Jun 30, 2026
57849ec
ci(observer-deploy): E5 cluster smoke + release secret
claude Jun 30, 2026
2093b6d
docs(deploy): E5 rollout coordination + cluster-secret rotation + cav…
claude Jun 30, 2026
735d203
chore(dev): E5 compose.multi-observer.yaml + dev/README.md
claude Jun 30, 2026
843c92b
fix(observer): E-fix1 finding-1 — ClusterConfig env-indirection field…
claude Jun 30, 2026
67204ef
fix(observer): E-fix1 finding-3 — add revocation_channel field to Age…
claude Jun 30, 2026
910ad9c
fix(observer): E-fix1 finding-2 — standardize cluster secret on hex (…
claude Jun 30, 2026
f603d14
fix(observer): E-fix1 finding-4 — implement --drain-local CLI + fix p…
claude Jun 30, 2026
ec0c1e3
fix(observer): E-fix1 finding-5 — dedicated multi-pod config + compos…
claude Jun 30, 2026
d8cdfa9
fix(observer-server): E-fix2 finding-1 — RevocationChannel *string so…
claude Jun 30, 2026
4d44855
fix(observer-server): E-fix2 finding-2 — reject non-wildcard internal…
claude Jun 30, 2026
24f505e
fix(dev): E-fix2 finding-3 — stub agentserver URL so commander routes…
claude Jun 30, 2026
d52f6f3
fix(dev): E-fix2 finding-4 — correct secret generation command in dev…
claude Jun 30, 2026
2338b0f
fix(dev): E-fix3 finding-1 memory object store for multi-pod repro
claude Jun 30, 2026
1a286b9
fix(observer-server): final-fix1 finding-1 KnownFields(true) on nonse…
claude Jun 30, 2026
429c386
fix(commanderhub): final-fix1 finding-3 wire cleanupOrphans into sweep
claude Jun 30, 2026
56a408d
fix(chart): final-fix1 finding-2 mount nonsecret ConfigMap in migrati…
claude Jun 30, 2026
c4b1412
fix(observer-server): final-fix2 finding-1 skip cluster validation fo…
claude Jun 30, 2026
6593e1b
fix(commanderhub): pr58-fix1 blocker1 — Add(1) under admitMu closes p…
claude Jul 1, 2026
5de88ef
fix(commanderhub): pr58-fix1 major4 prep — bound daemon hang in TestM…
claude Jul 1, 2026
cd218cc
ci(multi-agent): pr58-fix1 major4 — add postgres-integration job
claude Jul 1, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 67 additions & 0 deletions .github/workflows/multi-agent.yml
Original file line number Diff line number Diff line change
Expand Up @@ -86,3 +86,70 @@ jobs:
run: skills/multiagent/scripts/discover-thread_test.sh
- name: SKILL.md inline heredoc drift check
run: skills/multiagent/scripts/skill_md_inline_in_sync_test.sh

postgres-integration:
runs-on: ubuntu-latest
defaults:
run:
working-directory: multi-agent
services:
postgres:
image: postgres:16-alpine
env:
POSTGRES_PASSWORD: postgres
POSTGRES_DB: postgres
ports:
- 5432:5432
options: >-
--health-cmd="pg_isready -U postgres"
--health-interval=5s
--health-timeout=5s
--health-retries=10
env:
OBSERVER_POSTGRES_TEST_DSN: postgres://postgres:postgres@localhost:5432/postgres?sslmode=disable
steps:
- uses: actions/checkout@v4
- uses: actions/setup-go@v5
with:
go-version-file: multi-agent/go.mod
cache-dependency-path: multi-agent/go.sum
# Wait a moment for Postgres to be fully ready (service container health-check
# is best-effort; a short additional poll avoids flakes in slow CI runners).
- name: Wait for Postgres
run: |
# Probe the service container up to 30 times, pausing one second after each
# failed probe; give up with a non-zero exit once the budget is spent.
tries_left=30
until pg_isready -h localhost -p 5432 -U postgres >/dev/null 2>&1; do
  sleep 1
  tries_left=$((tries_left - 1))
  if [ "$tries_left" -le 0 ]; then
    echo "postgres did not become ready" >&2
    exit 1
  fi
done
echo "postgres ready"
exit 0
# Run the tests specified in the pr58-fix1 brief (MAJOR 4):
# ./internal/commanderhub — TestMultiPod_*, TestPGTurnStore_*
# ./internal/commanderhub/authstore — Postgres migration & CRUD
# ./internal/observerstore/postgres — telemetry / observer store
# authstore.MigratePostgres is invoked by each test's setup helper, so no
# separate migration step is required.
#
# NOTE: TestCrossPodIntegration and TestPostgresStoreLiveRoundTrip are
# excluded because they have pre-existing flakiness under Postgres:
# - TestCrossPodIntegration/subcase6_cap_under_high_concurrency_strictly_bounded:
# 1024+ concurrent HTTP POSTs to httptest server → 502s, not related to PG.
# - TestPostgresStoreLiveRoundTrip: read-after-write assertion in userspace,
# unrelated to commanderhub shared-registry paths.
# These pre-exist the PR #58 fix round and should be addressed in a
# follow-up. Filed as separate work; do not gate this PR on them.
# -p 1 (serial packages) avoids Postgres data contention: multiple test
# packages share the same DB and use overlapping tables (commander_daemons,
# commander_nonces, etc.). Parallel packages caused ~2 flaky failures per
# run in local reproduction (TestMultiPod_ForwardWith*Secret_*); serial
# runs are green.
- name: PG-integration tests (race + count=1, serial)
run: |
# -run takes an unanchored regexp, so the 'TestPostgres' alternative would also
# select TestPostgresStoreLiveRoundTrip. -skip (Go 1.20+) keeps the two
# known-flaky tests out of this job regardless of what -run matches.
go test -race -count=1 -timeout=15m -p 1 \
  ./internal/commanderhub \
  ./internal/commanderhub/authstore \
  ./internal/observerstore/postgres \
  -run 'TestMultiPod|TestPGTurnStore|TestPG|TestSharedRegistry|TestMigrate|TestPostgres|TestAuthstore|TestForward|TestDrain' \
  -skip 'TestCrossPodIntegration|TestPostgresStoreLiveRoundTrip'
35 changes: 32 additions & 3 deletions .github/workflows/observer-deploy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -93,11 +93,16 @@ jobs:
minio_user = "minio" + "".join(secrets.choice(alphabet) for _ in range(12))
minio_password = "".join(secrets.choice(alphabet) for _ in range(32))
telemetry_key = "".join(secrets.choice(alphabet) for _ in range(32))
cluster_secret = secrets.token_hex(32)
cluster_secret_prev = ""
release = os.environ["SMOKE_RELEASE"]

print(f"::add-mask::{cluster_secret}")

values = {
"replicaCount": 1,
"replicaCount": 2,
"existingSecret": "",
"cluster": {"enabled": True},
"secret": {
"create": True,
"databaseUrl": f"postgres://observer:{postgres_password}@{release}-observer-postgresql:5432/observer?sslmode=disable",
Expand All @@ -106,6 +111,7 @@ jobs:
"telemetryKeys": {
"telemetry-global-key": telemetry_key,
},
"clusterSecret": cluster_secret,
},
"gateway": {"enabled": False},
"config": {
Expand Down Expand Up @@ -170,9 +176,20 @@ jobs:
--wait \
--wait-for-jobs \
--timeout 10m
- name: Resolve smoke pod IPs
run: |
# Write the pod IP of every observer pod in the smoke release, space-separated,
# to /tmp/observer-pod-ips, which the next step reads to build its probe commands.
kubectl --context "$KUBE_CONTEXT" -n "$OBSERVER_NAMESPACE" \
  get pods -l "app.kubernetes.io/instance=$SMOKE_RELEASE,app.kubernetes.io/component=observer" \
  -o jsonpath='{range .items[*]}{.status.podIP} {end}' > /tmp/observer-pod-ips
cat /tmp/observer-pod-ips
# If the selector matched nothing (or no pod has an IP yet) the file holds only
# whitespace; the smoke Job would then run zero probes and pass vacuously.
if ! grep -q '[^[:space:]]' /tmp/observer-pod-ips; then
  echo "no observer pod IPs resolved for release ${SMOKE_RELEASE}" >&2
  exit 1
fi
- name: Smoke from cluster
run: |
set -euo pipefail
ips="$(cat /tmp/observer-pod-ips)"
cmds=""
for ip in $ips; do
cmds="${cmds}wget -qO- http://${ip}:8090/readyz; wget -qO- http://${ip}:8090/healthz; "
done
cat >/tmp/observer-smoke-job.yaml <<EOF
apiVersion: batch/v1
kind: Job
Expand Down Expand Up @@ -206,8 +223,7 @@ jobs:
- -ec
args:
- |
wget -qO- "http://${SMOKE_RELEASE}-observer:8090/readyz"
wget -qO- "http://${SMOKE_RELEASE}-observer:8090/healthz"
${cmds}
resources:
requests:
cpu: 25m
Expand Down Expand Up @@ -277,6 +293,8 @@ jobs:
MINIO_ROOT_USER: ${{ secrets.MINIO_ROOT_USER }}
MINIO_ROOT_PASSWORD: ${{ secrets.MINIO_ROOT_PASSWORD }}
OBSERVER_TELEMETRY_KEY: ${{ secrets.OBSERVER_TELEMETRY_KEY }}
OBSERVER_CLUSTER_SECRET: ${{ secrets.OBSERVER_CLUSTER_SECRET }}
OBSERVER_CLUSTER_SECRET_PREV: ${{ secrets.OBSERVER_CLUSTER_SECRET_PREV }}
run: |
python3 - <<'PY'
import json
Expand All @@ -288,13 +306,21 @@ jobs:
"MINIO_ROOT_USER",
"MINIO_ROOT_PASSWORD",
"OBSERVER_TELEMETRY_KEY",
"OBSERVER_CLUSTER_SECRET",
]
missing = [name for name in required if not os.environ.get(name)]
if missing:
raise SystemExit("missing required secrets: " + ", ".join(missing))

cluster_secret = os.environ["OBSERVER_CLUSTER_SECRET"]
print(f"::add-mask::{cluster_secret}")
cluster_secret_prev = os.environ.get("OBSERVER_CLUSTER_SECRET_PREV", "")
if cluster_secret_prev:
print(f"::add-mask::{cluster_secret_prev}")

values = {
"existingSecret": "",
"cluster": {"enabled": True},
"secret": {
"create": True,
"databaseUrl": os.environ["OBSERVER_DATABASE_URL"],
Expand All @@ -303,6 +329,7 @@ jobs:
"telemetryKeys": {
"telemetry-global-key": os.environ["OBSERVER_TELEMETRY_KEY"],
},
"clusterSecret": cluster_secret,
},
"config": {
"identity": {
Expand Down Expand Up @@ -337,6 +364,8 @@ jobs:
},
"migration": {"useHelmHook": False},
}
if cluster_secret_prev:
values["secret"]["clusterSecretPrev"] = cluster_secret_prev
with open("/tmp/observer-release-secret-values.json", "w", encoding="utf-8") as f:
json.dump(values, f)
PY
Expand Down
Loading
Loading