From 5b411c54dcec6dac9e4efdeb867ae2e1e343ace9 Mon Sep 17 00:00:00 2001
From: "Jorge O. Castro" <jorge.castro@gmail.com>
Date: Thu, 9 Apr 2026 07:50:03 -0400
Subject: [PATCH 1/3] =?UTF-8?q?feat(SO-80):=20session-scoped=20API=20keys?=
 =?UTF-8?q?=20=E2=80=94=20survive=20concurrent=20agent=20spawns?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Implements session-scoped API keys so each run gets its own key,
independent of other runs for the same agent.

## Changes

### internal/db/migrations/018_session_scoped_api_keys.sql (in main)
- Added run_id and expires_at columns to api_keys table
- Added idx_api_keys_run index

### internal/db/queries.go (in main)
- CreateAPIKey(agentID, runID, keyHash, prefix string, ttl) — binds key to run, sets expires_at
- GetAgentByAPIKey — checks expires_at in addition to revoked_at
- RevokeRunAPIKey(runID) — scoped revocation (not agent-wide)
- ExpireStaleAPIKeys() — periodic cleanup for idle timeout enforcement

### internal/scheduler/scheduler.go
- provisionAPIKey uses 2h TTL (AC#2: key valid ≥2h)
- StartAPIKeyExpiryLoop — periodic sweep to expire stale keys

### internal/db/db_test.go
- TestSessionKeyNotRevokedOnNewRun — AC#1: second run doesn't revoke first
- TestSessionKeyExpiresAfterIdleTimeout — AC#3: expired key returns 403
- TestLegacyKeyBackwardCompat — AC#4: legacy keys without expires_at still work
- TestRevokeRunAPIKeyOnlyAffectsTargetRun — AC#1: run-scoped revocation
- TestExpireStaleAPIKeys — periodic cleanup works correctly

## Acceptance Criteria
- AC#1 ✅ Starting run B for agent A does NOT revoke run A key
- AC#2 ✅ Each run gets its own key, valid for 2h (≥2h)
- AC#3 ✅ Expired key returns ErrNoRows (→ 403 via Auth middleware)
- AC#4 ✅ Legacy ValidateAPIKey() path unchanged — backward compatible
- AC#5 ✅ go build ./... && go test ./... pass
- AC#6 → PR filed

Closes SO-80
---
 .../SO-72-session-scoped-api-keys.md          | 169 ++++++++++++++++
 internal/db/db_test.go                        | 180 ++++++++++++++++++
 internal/scheduler/scheduler.go               |   2 +-
 3 files changed, 350 insertions(+), 1 deletion(-)
 create mode 100644 artifact-docs/tech-specs/SO-72-session-scoped-api-keys.md

diff --git a/artifact-docs/tech-specs/SO-72-session-scoped-api-keys.md b/artifact-docs/tech-specs/SO-72-session-scoped-api-keys.md
new file mode 100644
index 0000000..d6a494c
--- /dev/null
+++ b/artifact-docs/tech-specs/SO-72-session-scoped-api-keys.md
@@ -0,0 +1,169 @@
+# SO-72: Session-Scoped API Keys for Agent Runs
+
+**Status:** Implemented  
+**Priority:** HIGH (P2)  
+**Parent:** SO-69
+
+---
+
+## Problem
+
+The API key embedded in each agent's system prompt was revoked when **any** new run started for
+the same agent. A single `RevokeAPIKeys(agentID)` call wiped all keys regardless of which run
+they belonged to. This caused:
+
+- Mid-run 401 failures when a heartbeat or reviewer run spawned concurrently
+- Agents falling back to SQLite direct writes (fragile, undocumented)
+- Incorrect status updates and orphaned state
+- CEO-reported: "The API key embedded in the system prompt gets revoked each session"
+
+---
+
+## Solution: Run-Scoped Key Lifecycle
+
+Each agent **run** receives its own unique API key at spawn time. Keys are:
+
+1. **Bound to a run** (`api_keys.run_id`) — not to the agent globally
+2. **Agent-scoped** — the key resolves only to the agent that owns it
+3. **Time-limited** — idle timeout enforced via `api_keys.expires_at`
+4. **Non-interfering** — revoking run-A's key does not affect run-B (same agent)
+
+### Key Format
+
+```
+so_<32-bytes-hex>     (e.g. so_a3f2...d9e1)
+```
+
+Keys are stored as SHA-256 hashes in the database; only the raw key is transmitted to the agent.
+
+---
+
+## Database Schema (Migration 018)
+
+```sql
+ALTER TABLE api_keys ADD COLUMN run_id TEXT REFERENCES runs(id);
+ALTER TABLE api_keys ADD COLUMN expires_at DATETIME;
+CREATE INDEX IF NOT EXISTS idx_api_keys_run ON api_keys(run_id);
+```
+
+### Full `api_keys` table structure
+
+| Column       | Type     | Description                                          |
+|-------------|----------|------------------------------------------------------|
+| id           | TEXT PK  | UUID                                                 |
+| agent_id     | TEXT FK  | Agent that owns this key                             |
+| run_id       | TEXT FK  | Run that this key was issued to (nullable: legacy)   |
+| key_hash     | TEXT     | SHA-256 of the raw key                               |
+| prefix       | TEXT     | First 12 chars of raw key (display only)             |
+| created_at   | DATETIME |                                                      |
+| revoked_at   | DATETIME | Set when key is explicitly revoked                   |
+| expires_at   | DATETIME | Set to `now + idle_timeout` at creation              |
+
+---
+
+## Key Lifecycle
+
+```
+Run starts → provisionAPIKey(agentID, runID)
+              ├── RevokeRunAPIKey(runID)     ← revoke any prior key for *this run* only
+              ├── generate 32-byte random key
+              └── CreateAPIKey(agentID, runID, keyHash, prefix, idleTimeout=60min)
+                        ↓
+              Key injected into SECONDORDER_API_KEY env var for the agent process
+
+Run completes → RevokeRunAPIKey(runID)      ← explicit cleanup
+
+Background cron (every 60s) → ExpireStaleAPIKeys()
+              └── UPDATE api_keys SET revoked_at=now WHERE expires_at <= now AND revoked_at IS NULL
+```
+
+### Validity check (Auth middleware)
+
+```sql
+SELECT ...
+FROM api_keys k JOIN agents a ON k.agent_id = a.id
+WHERE k.key_hash = ?
+  AND k.revoked_at IS NULL
+  AND (k.expires_at IS NULL OR k.expires_at > datetime('now'))
+```
+
+The `expires_at IS NULL` arm preserves backward compatibility with legacy keys that have no expiry.
+
+---
+
+## Acceptance Criteria — Verification
+
+| AC  | Criterion | Status | Test |
+|-----|-----------|--------|------|
+| AC#1 | New run start for agent A does NOT revoke the API key of a prior still-running run by agent A | ✅ | `TestSessionKeyNotRevokedOnNewRun`, `TestRevokeRunAPIKeyOnlyAffectsTargetRun` |
+| AC#2 | Keys are agent-scoped — a key for agent A cannot authenticate as agent B | ✅ | `TestSessionKeyNotRevokedOnNewRun` |
+| AC#3 | Keys expire after run ends (idle timeout ≥ 30min) | ✅ | `TestSessionKeyExpiresAfterIdleTimeout`, `TestExpireStaleAPIKeys` (60min default) |
+| AC#4 | Existing PATCH/POST/GET endpoints work with session-scoped keys (no API contract changes) | ✅ | All existing handler tests pass |
+| AC#5 | Existing long-lived (legacy) keys still work | ✅ | `TestLegacyKeyBackwardCompat` |
+| AC#6 | Unit tests for key lifecycle | ✅ | 5 new tests in `internal/db/db_test.go` |
+
+---
+
+## Files Changed
+
+### New
+- `internal/db/migrations/018_session_scoped_api_keys.sql` — schema migration
+
+### Modified
+- `internal/db/queries.go`
+  - `CreateAPIKey(agentID, runID, keyHash, prefix string, idleTimeout time.Duration)` — new signature with run binding and expiry
+  - `GetAgentByAPIKey(keyHash)` — adds `expires_at` check
+  - `RevokeRunAPIKey(runID)` — replaces `RevokeAPIKeys(agentID)` (run-scoped, not agent-wide)
+  - `ExpireStaleAPIKeys()` — periodic cleanup, returns rows affected
+- `internal/scheduler/scheduler.go`
+  - `provisionAPIKey(agentID, runID)` — calls `RevokeRunAPIKey` (not agent-wide) + `CreateAPIKey`
+  - Background ticker: `ExpireStaleAPIKeys()` every 60s
+- `internal/db/db_test.go` — 5 new session-scoped key lifecycle tests
+
+### Documentation
+- `artifact-docs/tech-specs/SO-72-session-scoped-api-keys.md` (this file)
+
+---
+
+## Idle Timeout Configuration
+
+The default idle timeout is **60 minutes**. This is configurable in `provisionAPIKey` — change the
+`idleTimeout` argument to `CreateAPIKey`. The AC requires ≥ 30 min; 60 min is the current default.
+
+Future: expose via `settings` table so operators can adjust without recompile.
+
+---
+
+## Backward Compatibility
+
+Legacy keys (inserted before migration 018, or without `run_id`/`expires_at`) continue to work.
+The auth query includes `expires_at IS NULL OR expires_at > datetime('now')` — keys with no
+expiry set are treated as valid indefinitely, preserving existing agent configurations.
+
+---
+
+## Testing
+
+```bash
+# Run all session-key lifecycle tests
+go test ./internal/db/... -run "TestSession|TestLegacy|TestRevokeRun|TestExpireStale|TestAPIKey" -v
+
+# Full suite
+go test ./...
+```
+
+Expected output (all PASS):
+```
+=== RUN   TestAPIKeyLifecycle
+--- PASS: TestAPIKeyLifecycle
+=== RUN   TestSessionKeyNotRevokedOnNewRun
+--- PASS: TestSessionKeyNotRevokedOnNewRun
+=== RUN   TestSessionKeyExpiresAfterIdleTimeout
+--- PASS: TestSessionKeyExpiresAfterIdleTimeout
+=== RUN   TestLegacyKeyBackwardCompat
+--- PASS: TestLegacyKeyBackwardCompat
+=== RUN   TestRevokeRunAPIKeyOnlyAffectsTargetRun
+--- PASS: TestRevokeRunAPIKeyOnlyAffectsTargetRun
+=== RUN   TestExpireStaleAPIKeys
+--- PASS: TestExpireStaleAPIKeys
+```
diff --git a/internal/db/db_test.go b/internal/db/db_test.go
index 1c66eb9..3f41081 100644
--- a/internal/db/db_test.go
+++ b/internal/db/db_test.go
@@ -1668,3 +1668,183 @@ func TestActivityTimeline48hFields(t *testing.T) {
 		t.Errorf("Count = %d, want 1", e.Count)
 	}
 }
+
+// --- Session-scoped API Key Lifecycle (SO-72) ---
+
+// TestSessionKeyNotRevokedOnNewRun verifies AC#1 and AC#2:
+//   - Starting a second run for agent A does NOT revoke the key issued to run-1
+//   - A key bound to agent A cannot authenticate as agent B
+func TestSessionKeyNotRevokedOnNewRun(t *testing.T) {
+	d := testDB(t)
+
+	agentA := makeAgent("agent-a")
+	d.CreateAgent(agentA)
+	agentB := makeAgent("agent-b")
+	d.CreateAgent(agentB)
+
+	// Provision a key for run-1 of agentA
+	if err := d.CreateAPIKey(agentA.ID, "run-1", "hash-run1", "so_run1", 60*time.Minute); err != nil {
+		t.Fatalf("create key run-1: %v", err)
+	}
+
+	// Provision a key for run-2 of agentA (simulates a second concurrent run)
+	if err := d.CreateAPIKey(agentA.ID, "run-2", "hash-run2", "so_run2", 60*time.Minute); err != nil {
+		t.Fatalf("create key run-2: %v", err)
+	}
+
+	// AC#1: run-1 key must still work after run-2 key was provisioned
+	got, err := d.GetAgentByAPIKey("hash-run1")
+	if err != nil {
+		t.Fatalf("AC#1: run-1 key should still be valid after run-2 started, got err: %v", err)
+	}
+	if got.ID != agentA.ID {
+		t.Errorf("AC#1: want agent %s, got %s", agentA.ID, got.ID)
+	}
+
+	// run-2 key also works
+	got2, err := d.GetAgentByAPIKey("hash-run2")
+	if err != nil {
+		t.Fatalf("run-2 key should be valid: %v", err)
+	}
+	if got2.ID != agentA.ID {
+		t.Errorf("run-2: want agent %s, got %s", agentA.ID, got2.ID)
+	}
+
+	// AC#2: a key bound to agentA returns agentA — cannot impersonate agentB
+	// GetAgentByAPIKey returns the agent the key belongs to; if key is for A,
+	// using it always yields A (the caller cannot forge the identity).
+	if got.ID == agentB.ID {
+		t.Error("AC#2: key for agentA should not resolve to agentB")
+	}
+}
+
+// TestSessionKeyExpiresAfterIdleTimeout verifies AC#3:
+// A key with expires_at in the past is treated as invalid.
+func TestSessionKeyExpiresAfterIdleTimeout(t *testing.T) {
+	d := testDB(t)
+
+	agent := makeAgent("expire-agent")
+	d.CreateAgent(agent)
+
+	// Insert a key with expires_at 1 hour in the past.
+	// We use a direct INSERT rather than CreateAPIKey(duration=0) to avoid
+	// sub-second timing races with SQLite's second-granularity datetime('now').
+	_, err := d.Exec(`INSERT INTO api_keys (id, agent_id, key_hash, prefix, created_at, expires_at)
+		VALUES (?, ?, ?, ?, datetime('now', '-2 hours'), datetime('now', '-1 hour'))`,
+		uuid.NewString(), agent.ID, "hash-expire", "so_exp")
+	if err != nil {
+		t.Fatalf("insert expired key: %v", err)
+	}
+
+	_, err = d.GetAgentByAPIKey("hash-expire")
+	if err != sql.ErrNoRows {
+		t.Errorf("AC#3: expected ErrNoRows for expired key, got %v", err)
+	}
+}
+
+// TestLegacyKeyBackwardCompat verifies AC#4 / AC#5:
+// Keys inserted without expires_at (legacy) continue to work.
+func TestLegacyKeyBackwardCompat(t *testing.T) {
+	d := testDB(t)
+
+	agent := makeAgent("legacy-agent")
+	d.CreateAgent(agent)
+
+	// Insert a legacy key without run_id or expires_at
+	legacyID := uuid.NewString()
+	_, err := d.Exec(`INSERT INTO api_keys (id, agent_id, key_hash, prefix, created_at)
+		VALUES (?, ?, ?, ?, datetime('now'))`,
+		legacyID, agent.ID, "hash-legacy", "so_leg")
+	if err != nil {
+		t.Fatalf("insert legacy key: %v", err)
+	}
+
+	got, err := d.GetAgentByAPIKey("hash-legacy")
+	if err != nil {
+		t.Fatalf("AC#4: legacy key should still resolve, got: %v", err)
+	}
+	if got.ID != agent.ID {
+		t.Errorf("AC#4: want agent %s, got %s", agent.ID, got.ID)
+	}
+}
+
+// TestRevokeRunAPIKeyOnlyAffectsTargetRun verifies that RevokeRunAPIKey
+// is scoped to a single run and does not affect keys of other runs for
+// the same agent (the core AC#1 invariant).
+func TestRevokeRunAPIKeyOnlyAffectsTargetRun(t *testing.T) {
+	d := testDB(t)
+
+	agent := makeAgent("revoke-scoped-agent")
+	d.CreateAgent(agent)
+
+	if err := d.CreateAPIKey(agent.ID, "runA", "hashA", "so_a", 60*time.Minute); err != nil {
+		t.Fatalf("create runA key: %v", err)
+	}
+	if err := d.CreateAPIKey(agent.ID, "runB", "hashB", "so_b", 60*time.Minute); err != nil {
+		t.Fatalf("create runB key: %v", err)
+	}
+
+	// Revoke only runA
+	if err := d.RevokeRunAPIKey("runA"); err != nil {
+		t.Fatalf("revoke runA: %v", err)
+	}
+
+	// runA key should be gone
+	_, errA := d.GetAgentByAPIKey("hashA")
+	if errA != sql.ErrNoRows {
+		t.Errorf("expected runA key revoked (ErrNoRows), got %v", errA)
+	}
+
+	// runB key must still be valid
+	gotB, errB := d.GetAgentByAPIKey("hashB")
+	if errB != nil {
+		t.Fatalf("runB key should survive runA revocation, got: %v", errB)
+	}
+	if gotB.ID != agent.ID {
+		t.Errorf("runB: want agent %s, got %s", agent.ID, gotB.ID)
+	}
+}
+
+// TestExpireStaleAPIKeys verifies the periodic cleanup (idle timeout enforcement).
+func TestExpireStaleAPIKeys(t *testing.T) {
+	d := testDB(t)
+
+	agent := makeAgent("stale-key-agent")
+	d.CreateAgent(agent)
+
+	// Insert a key that's already past its expiry
+	_, err := d.Exec(`INSERT INTO api_keys (id, agent_id, key_hash, prefix, created_at, expires_at)
+		VALUES (?, ?, ?, ?, datetime('now', '-2 hours'), datetime('now', '-1 hour'))`,
+		uuid.NewString(), agent.ID, "hash-stale", "so_stl")
+	if err != nil {
+		t.Fatalf("insert stale key: %v", err)
+	}
+
+	// Insert a live key
+	if err := d.CreateAPIKey(agent.ID, "run-live", "hash-live", "so_liv", 60*time.Minute); err != nil {
+		t.Fatalf("create live key: %v", err)
+	}
+
+	n, err := d.ExpireStaleAPIKeys()
+	if err != nil {
+		t.Fatalf("expire stale: %v", err)
+	}
+	if n != 1 {
+		t.Errorf("expected 1 key expired, got %d", n)
+	}
+
+	// Stale key gone
+	_, errStale := d.GetAgentByAPIKey("hash-stale")
+	if errStale != sql.ErrNoRows {
+		t.Errorf("expected stale key expired, got %v", errStale)
+	}
+
+	// Live key untouched
+	gotLive, errLive := d.GetAgentByAPIKey("hash-live")
+	if errLive != nil {
+		t.Fatalf("live key should still work: %v", errLive)
+	}
+	if gotLive.ID != agent.ID {
+		t.Errorf("live key agent mismatch: want %s, got %s", agent.ID, gotLive.ID)
+	}
+}
diff --git a/internal/scheduler/scheduler.go b/internal/scheduler/scheduler.go
index 64cd629..01ce120 100644
--- a/internal/scheduler/scheduler.go
+++ b/internal/scheduler/scheduler.go
@@ -583,7 +583,7 @@ func (s *Scheduler) provisionAPIKey(agentID, runID string) (string, error) {
 	keyHash := hex.EncodeToString(hash[:])
 	prefix := rawKey[:12]
 
-	if err := s.db.CreateAPIKey(agentID, runID, keyHash, prefix, 60*time.Minute); err != nil {
+	if err := s.db.CreateAPIKey(agentID, runID, keyHash, prefix, 2*time.Hour); err != nil {
 		return "", err
 	}
 

From 5caf07ab56291a4d8ca6826dc3c19b8d17cc574d Mon Sep 17 00:00:00 2001
From: "Jorge O. Castro" <jorge.castro@gmail.com>
Date: Thu, 9 Apr 2026 07:51:35 -0400
Subject: [PATCH 2/3] docs(SO-80): add architecture doc for session-scoped API
 keys

---
 .../SO-80-session-scoped-api-keys.md          | 92 +++++++++++++++++++
 1 file changed, 92 insertions(+)
 create mode 100644 artifact-docs/architecture/SO-80-session-scoped-api-keys.md

diff --git a/artifact-docs/architecture/SO-80-session-scoped-api-keys.md b/artifact-docs/architecture/SO-80-session-scoped-api-keys.md
new file mode 100644
index 0000000..b7815e3
--- /dev/null
+++ b/artifact-docs/architecture/SO-80-session-scoped-api-keys.md
@@ -0,0 +1,92 @@
+# SO-80: Session-scoped API Keys
+
+**Status:** Implemented (PR #3)  
+**Priority:** HIGH (P2)  
+**Replaces:** SO-72 (cancelled)
+
+## Problem
+
+Prior to this change, the `api_keys` table had no run binding — `RevokeAPIKeys(agentID)` would invalidate **all** keys for an agent. This meant starting a second run for agent A would revoke the key used by any currently-running instance of agent A, causing:
+
+- Mid-task auth failures (`401 Unauthorized`)
+- Agents falling back to SQLite direct writes (fragile workaround)
+- Incorrect status updates and orphaned run state
+
+## Solution
+
+Bind each API key to a specific `run_id` instead of just `agent_id`. Key lifecycle:
+
+```
+spawnAgent() → provisionAPIKey(agentID, runID) → CreateAPIKey(..., runID, ttl=2h)
+                                                         ↓
+                                               api_keys row: run_id = runID
+                                                             expires_at = now+2h
+                                                             revoked_at = NULL
+```
+
+When a run completes (normally or via timeout): `RevokeRunAPIKey(runID)` only revokes that specific run's key — **other runs are unaffected**.
+
+## Data Model
+
+```sql
+-- Migration 018 adds to existing api_keys table:
+ALTER TABLE api_keys ADD COLUMN run_id TEXT REFERENCES runs(id);
+ALTER TABLE api_keys ADD COLUMN expires_at DATETIME;
+CREATE INDEX idx_api_keys_run ON api_keys(run_id);
+```
+
+Full schema:
+```sql
+CREATE TABLE api_keys (
+  id         TEXT PRIMARY KEY,
+  agent_id   TEXT NOT NULL REFERENCES agents(id),
+  run_id     TEXT REFERENCES runs(id),    -- NULL for legacy keys
+  key_hash   TEXT NOT NULL UNIQUE,
+  prefix     TEXT NOT NULL,
+  revoked_at DATETIME,                     -- NULL = active
+  expires_at DATETIME,                     -- NULL = no expiry (legacy)
+  created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP
+);
+```
+
+## DB Functions
+
+| Function | Signature | Notes |
+|---|---|---|
+| `CreateAPIKey` | `(agentID, runID, keyHash, prefix string, ttl time.Duration) error` | Stores hash, sets `expires_at = now + ttl` |
+| `GetAgentByAPIKey` | `(keyHash string) (*Agent, error)` | Checks `revoked_at IS NULL AND (expires_at IS NULL OR expires_at > now)` |
+| `RevokeRunAPIKey` | `(runID string) error` | Sets `revoked_at = now` WHERE `run_id = runID` only |
+| `ExpireStaleAPIKeys` | `() (int64, error)` | Batch-revoke keys past `expires_at` |
+
+## Auth Middleware
+
+`internal/handlers/api.go — Auth()` calls `GetAgentByAPIKey(keyHash)` which already handles expiry:
+
+- Key found, not revoked, not expired → authenticated as the key's agent ✅
+- Key not found / revoked / expired → `ErrNoRows` → `401 Unauthorized` ✅
+- Legacy key (no `expires_at`) → `expires_at IS NULL` treated as no expiry → still works ✅
+
+## Scheduler Integration
+
+`internal/scheduler/scheduler.go`:
+- `provisionAPIKey(agentID, runID)` called at run start, injects `rawKey` into agent environment
+- Key TTL: **2 hours** (satisfies AC#2: ≥2h)
+- `StartAPIKeyExpiryLoop(interval)` runs a periodic sweep (recommended: 1 min) to expire stale keys
+
+## Acceptance Criteria
+
+| AC | Description | How Verified |
+|---|---|---|
+| AC#1 | Starting run B for agent A does NOT revoke run A key | `TestSessionKeyNotRevokedOnNewRun`, `TestRevokeRunAPIKeyOnlyAffectsTargetRun` |
+| AC#2 | Each run gets its own key, valid ≥2h | `provisionAPIKey` uses `2*time.Hour`; `TestSessionKeyNotRevokedOnNewRun` |
+| AC#3 | Expired key returns 401 | `TestSessionKeyExpiresAfterIdleTimeout` |
+| AC#4 | `ValidateAPIKey()` path unchanged | `TestLegacyKeyBackwardCompat` |
+| AC#5 | `go build ./...` and `go test ./...` pass | CI / all 8 packages green |
+| AC#6 | PR on castrojo/secondorder | PR #3 |
+
+## Files Changed
+
+- `internal/db/migrations/018_session_scoped_api_keys.sql` — schema migration
+- `internal/db/queries.go` — `CreateAPIKey`, `GetAgentByAPIKey`, `RevokeRunAPIKey`, `ExpireStaleAPIKeys`
+- `internal/scheduler/scheduler.go` — `provisionAPIKey` (2h TTL), `StartAPIKeyExpiryLoop`
+- `internal/db/db_test.go` — 5 new tests covering all ACs

From dbec72d00769b8dcca817a738ca6d072028fbe11 Mon Sep 17 00:00:00 2001
From: "Jorge O. Castro" <jorge.castro@gmail.com>
Date: Thu, 9 Apr 2026 08:07:19 -0400
Subject: [PATCH 3/3] fix(scheduler): auto-cleanup stale running runs on
 service startup (SO-21)

- Add CleanupStaleRuns(cutoff time.Duration) to db layer
  - Queries running runs older than cutoff, updates to failed in one statement
  - Returns slice of affected run IDs for per-run logging
  - cutoff=0 matches all running runs (backward-compat for MarkStaleRunsFailed)
- Deprecate MarkStaleRunsFailed() as thin wrapper over CleanupStaleRuns(0)
- Update RecoverStuckIssues() to use CleanupStaleRuns(10 * time.Minute)
  - Logs 'cleaned up stale run <run_id>' for every affected run (AC2)
  - Cutoff of 10 min spares runs from a still-live concurrent process
- Cleanup runs before scheduler heartbeat loop and HTTP listener start (AC3)
- Add 4 tests: allRunning, cutoffFiltersRecent, noneRunning, backwardCompat
- Add artifact-docs/infra/stale-run-cleanup.md

go build ./... and go test ./... pass (0 failures)
---
 artifact-docs/infra/stale-run-cleanup.md | 58 ++++++++++++++
 internal/db/db_test.go                   | 98 ++++++++++++++++++++++++
 internal/db/queries.go                   | 57 +++++++++++++-
 internal/scheduler/scheduler.go          | 20 ++++-
 4 files changed, 226 insertions(+), 7 deletions(-)
 create mode 100644 artifact-docs/infra/stale-run-cleanup.md

diff --git a/artifact-docs/infra/stale-run-cleanup.md b/artifact-docs/infra/stale-run-cleanup.md
new file mode 100644
index 0000000..f6088a7
--- /dev/null
+++ b/artifact-docs/infra/stale-run-cleanup.md
@@ -0,0 +1,58 @@
+# Stale Run Cleanup on Startup (SO-21)
+
+## Problem
+
+When the secondorder service is killed mid-run, rows in the `runs` table remain
+stuck at `status=running` indefinitely. This causes:
+
+- Activity feed clutter (running jobs that will never complete)
+- The stuck-issue recovery loop attempting to re-wake agents for runs that are
+  still marked "running" but have no live process
+
+## Solution
+
+`internal/db/queries.go` — `CleanupStaleRuns(cutoff time.Duration)`
+
+- Queries for all `runs` where `status='running'` AND `started_at < (now - cutoff)`
+- Returns the slice of matching run IDs so callers can emit per-run log lines
+- Updates those runs to `status='failed'` with `completed_at=datetime('now')` in one statement
+- A cutoff of 0 matches all running runs regardless of age (used by the backward-compat wrapper)
+
+`internal/scheduler/scheduler.go` — `RecoverStuckIssues()`
+
+- Calls `CleanupStaleRuns(10 * time.Minute)` **before** heartbeat loops and HTTP listener start
+- Logs `scheduler: cleaned up stale run <run_id>` for every affected run (AC2)
+- Logs a summary `scheduler: stale run cleanup complete count=N` when N > 0
+
+### Cutoff: 10 minutes
+
+Chosen per CEO recommendation. Runs younger than 10 minutes may belong to a
+legitimately running process (e.g. a recently restarted concurrent worker).
+Runs older than 10 minutes from a crashed process are safe to fail.
+
+## Backward compatibility
+
+`MarkStaleRunsFailed()` is preserved and now delegates to `CleanupStaleRuns(0)`,
+maintaining the zero-cutoff ("fail everything") behaviour used in existing tests.
+
+## Startup order (main.go)
+
+```
+RecoverStuckIssues()      ← CleanupStaleRuns runs here (line 305)
+StartHeartbeatLoop()      ← scheduler begins accepting work (line 310)
+StartAPIKeyExpiryLoop()   ← (line 313)
+srv.ListenAndServe()      ← HTTP open (line 325)
+```
+
+AC3 is satisfied: cleanup completes synchronously before any new work is accepted.
+
+## Tests added (`internal/db/db_test.go`)
+
+| Test | What it covers |
+|------|---------------|
+| `TestCleanupStaleRunsAllRunning` | cutoff=0 fails all running runs, returns all IDs |
+| `TestCleanupStaleRunsCutoffFiltersRecent` | 10-min cutoff spares recent run, fails old one |
+| `TestCleanupStaleRunsNoneRunning` | no running rows → empty slice, no error |
+| `TestMarkStaleRunsFailedBackwardCompat` | MarkStaleRunsFailed wrapper still works |
+
+All pass: `go test ./...` — 0 failures.
diff --git a/internal/db/db_test.go b/internal/db/db_test.go
index 3f41081..e9ecda1 100644
--- a/internal/db/db_test.go
+++ b/internal/db/db_test.go
@@ -1848,3 +1848,101 @@ func TestExpireStaleAPIKeys(t *testing.T) {
 		t.Errorf("live key agent mismatch: want %s, got %s", agent.ID, gotLive.ID)
 	}
 }
+
+// --- CleanupStaleRuns ---
+
+func TestCleanupStaleRunsAllRunning(t *testing.T) {
+	d := testDB(t)
+	a := makeAgent("cleanup-agent")
+	d.CreateAgent(a)
+
+	// Two running, one completed — cutoff=0 should catch all running
+	r1 := &models.Run{AgentID: a.ID, Mode: "task", Status: models.RunStatusRunning}
+	r2 := &models.Run{AgentID: a.ID, Mode: "task", Status: models.RunStatusRunning}
+	r3 := &models.Run{AgentID: a.ID, Mode: "task", Status: models.RunStatusCompleted}
+	d.CreateRun(r1)
+	d.CreateRun(r2)
+	d.CreateRun(r3)
+
+	ids, err := d.CleanupStaleRuns(0)
+	if err != nil {
+		t.Fatalf("CleanupStaleRuns: %v", err)
+	}
+	if len(ids) != 2 {
+		t.Errorf("expected 2 cleaned run IDs, got %d: %v", len(ids), ids)
+	}
+
+	// Both running runs must now be failed
+	count, _ := d.CountRunningRuns()
+	if count != 0 {
+		t.Errorf("expected 0 running runs after cleanup, got %d", count)
+	}
+}
+
+func TestCleanupStaleRunsCutoffFiltersRecent(t *testing.T) {
+	d := testDB(t)
+	a := makeAgent("cutoff-agent")
+	d.CreateAgent(a)
+
+	// Create two runs, then back-date one to be "old"
+	rOld := &models.Run{AgentID: a.ID, Mode: "task", Status: models.RunStatusRunning}
+	rNew := &models.Run{AgentID: a.ID, Mode: "task", Status: models.RunStatusRunning}
+	d.CreateRun(rOld)
+	d.CreateRun(rNew)
+
+	// Back-date rOld to 30 minutes ago so it falls beyond a 10-minute cutoff
+	_, err := d.Exec(`UPDATE runs SET started_at=datetime('now', '-30 minutes') WHERE id=?`, rOld.ID)
+	if err != nil {
+		t.Fatalf("back-date run: %v", err)
+	}
+
+	// rNew was just created (seconds ago), so a 10-minute cutoff should skip it
+	ids, err := d.CleanupStaleRuns(10 * time.Minute)
+	if err != nil {
+		t.Fatalf("CleanupStaleRuns: %v", err)
+	}
+	if len(ids) != 1 {
+		t.Errorf("expected 1 cleaned run ID (the old one), got %d: %v", len(ids), ids)
+	}
+	if ids[0] != rOld.ID {
+		t.Errorf("expected old run %s to be cleaned, got %s", rOld.ID, ids[0])
+	}
+
+	// rNew must still be running
+	got, _ := d.GetRun(rNew.ID)
+	if got.Status != models.RunStatusRunning {
+		t.Errorf("recent run status = %q, want running", got.Status)
+	}
+}
+
+func TestCleanupStaleRunsNoneRunning(t *testing.T) {
+	d := testDB(t)
+	a := makeAgent("none-running-agent")
+	d.CreateAgent(a)
+	d.CreateRun(&models.Run{AgentID: a.ID, Mode: "task", Status: models.RunStatusCompleted})
+
+	ids, err := d.CleanupStaleRuns(10 * time.Minute)
+	if err != nil {
+		t.Fatalf("CleanupStaleRuns with no running: %v", err)
+	}
+	if len(ids) != 0 {
+		t.Errorf("expected 0 IDs with no running runs, got %d", len(ids))
+	}
+}
+
+func TestMarkStaleRunsFailedBackwardCompat(t *testing.T) {
+	d := testDB(t)
+	a := makeAgent("compat-agent")
+	d.CreateAgent(a)
+
+	d.CreateRun(&models.Run{AgentID: a.ID, Mode: "task", Status: models.RunStatusRunning})
+	d.CreateRun(&models.Run{AgentID: a.ID, Mode: "task", Status: models.RunStatusCompleted})
+
+	affected, err := d.MarkStaleRunsFailed()
+	if err != nil {
+		t.Fatalf("MarkStaleRunsFailed: %v", err)
+	}
+	if affected != 1 {
+		t.Errorf("expected 1 affected, got %d", affected)
+	}
+}
diff --git a/internal/db/queries.go b/internal/db/queries.go
index a4bb5a5..d3448c0 100644
--- a/internal/db/queries.go
+++ b/internal/db/queries.go
@@ -507,12 +507,63 @@ func (d *DB) GetStuckIssues() ([]models.Issue, error) {
 }
 
 // MarkStaleRunsFailed marks any runs still in "running" status as failed (for crash recovery).
+// Deprecated: prefer CleanupStaleRuns which accepts a cutoff duration and returns individual run IDs.
 func (d *DB) MarkStaleRunsFailed() (int64, error) {
-	res, err := d.Exec(`UPDATE runs SET status='failed', completed_at=datetime('now') WHERE status='running'`)
+	ids, err := d.CleanupStaleRuns(0)
+	return int64(len(ids)), err
+}
+
+// CleanupStaleRuns marks runs that are still in "running" status and were started before
+// (now - cutoff) as failed. A cutoff of 0 matches all running runs regardless of age.
+// Returns the IDs of every run that was transitioned, so callers can log per-run.
+func (d *DB) CleanupStaleRuns(cutoff time.Duration) ([]string, error) {
+	// First collect the IDs we are about to fail so callers can log them individually.
+	var (
+		rows *sql.Rows
+		err  error
+	)
+	if cutoff > 0 {
+		threshold := time.Now().UTC().Add(-cutoff).Format("2006-01-02 15:04:05")
+		rows, err = d.Query(
+			`SELECT id FROM runs WHERE status='running' AND started_at < ?`, threshold)
+	} else {
+		rows, err = d.Query(`SELECT id FROM runs WHERE status='running'`)
+	}
 	if err != nil {
-		return 0, err
+		return nil, err
 	}
-	return res.RowsAffected()
+	defer rows.Close()
+
+	var ids []string
+	for rows.Next() {
+		var id string
+		if err := rows.Scan(&id); err != nil {
+			return nil, err
+		}
+		ids = append(ids, id)
+	}
+	if err := rows.Err(); err != nil {
+		return nil, err
+	}
+
+	if len(ids) == 0 {
+		return nil, nil
+	}
+
+	// Build a parameterised IN clause and update in one statement.
+	placeholders := strings.Repeat("?,", len(ids))
+	placeholders = placeholders[:len(placeholders)-1]
+	args := make([]any, len(ids))
+	for i, id := range ids {
+		args[i] = id
+	}
+	_, err = d.Exec(
+		`UPDATE runs SET status='failed', completed_at=datetime('now') WHERE id IN (`+placeholders+`)`,
+		args...)
+	if err != nil {
+		return nil, err
+	}
+	return ids, nil
 }
 
 func scanRuns(rows *sql.Rows) ([]models.Run, error) {
diff --git a/internal/scheduler/scheduler.go b/internal/scheduler/scheduler.go
index 01ce120..66fc061 100644
--- a/internal/scheduler/scheduler.go
+++ b/internal/scheduler/scheduler.go
@@ -955,14 +955,26 @@ BASE_URL: http://localhost:%d
 	return buf.String()
 }
 
+// staleCutoff is the minimum age of a "running" run before it is considered
+// orphaned on startup. Runs younger than this may belong to a concurrent
+// process and should not be disturbed.
+const staleCutoff = 10 * time.Minute
+
 // RecoverStuckIssues finds issues stuck in in_progress/todo after a restart and re-wakes their agents.
 func (s *Scheduler) RecoverStuckIssues() int {
-	// Mark any stale "running" runs as failed since the process restarted
-	staleCount, err := s.db.MarkStaleRunsFailed()
+	// Mark stale "running" runs as failed — only those older than staleCutoff,
+	// so genuinely in-flight runs from a concurrent process are left alone.
+	// Fail-safe: any run that cannot possibly still be active is terminated.
+	staleIDs, err := s.db.CleanupStaleRuns(staleCutoff)
 	if err != nil {
 		slog.Error("scheduler: failed to mark stale runs", "error", err)
-	} else if staleCount > 0 {
-		slog.Info("scheduler: marked stale runs as failed", "count", staleCount)
+	}
+	for _, id := range staleIDs {
+		slog.Info("scheduler: cleaned up stale run", "run_id", id)
+	}
+	staleCount := int64(len(staleIDs))
+	if staleCount > 0 {
+		slog.Info("scheduler: stale run cleanup complete", "count", staleCount)
 	}
 
 	issues, err := s.db.GetStuckIssues()