baleen37 · baleen37 · Apr 2, 2026 · Apr 2, 2026 · Apr 2, 2026 · Apr 2, 2026
diff --git a/.autoresearch/autoresearch.jsonl b/.autoresearch/autoresearch.jsonl
@@ -21,3 +21,8 @@
 {"run":19,"commit":"b2e1f87","metric":794,"metrics":{"skill_lines":20,"skill_words":109,"script_bytes":1818},"status":"keep","description":"clarify scripts MUST be run (test-driven fix)","timestamp":1775132948,"segment":1}
 {"run":20,"commit":"b2e1f87","metric":794,"metrics":{"skill_lines":20,"skill_words":109,"script_bytes":1818},"status":"keep","description":"edge test: nothing-to-commit — agent stopped correctly but preflight ran unnecessarily","timestamp":1775133044,"segment":1}
 {"run":22,"commit":"b2e1f87","metric":794,"metrics":{"skill_lines":20,"skill_words":109,"script_bytes":1818},"status":"keep","description":"must-run test passed, found broken tests referencing deleted scripts","timestamp":1775133359,"segment":1}
+{"run":23,"commit":"dece665","metric":794,"metrics":{"skill_lines":20,"skill_words":109,"script_bytes":1818},"status":"keep","description":"fix broken tests for deleted scripts — 63/63 pass","timestamp":1775133491,"segment":1}
+{"run":25,"commit":"dece665","metric":794,"metrics":{"skill_lines":20,"skill_words":109,"script_bytes":1818},"status":"keep","description":"final validation — both scripts executed, 14 tool calls, PR #607 merged","timestamp":1775133750,"segment":1}
+{"run":26,"commit":"6a0be38","metric":794,"metrics":{"skill_lines":20,"skill_words":109,"script_bytes":1802},"status":"keep","description":"remove --delete-branch from fallback merge","timestamp":1775133769,"segment":1}
+{"run":27,"commit":"494dc0b","metric":776,"metrics":{"skill_lines":20,"skill_words":107,"script_bytes":1802},"status":"keep","description":"remove re-run preflight, reorder main check","timestamp":1775133796,"segment":1}
+{"run":28,"commit":"b792e67","metric":776,"metrics":{"skill_lines":21,"skill_words":107,"script_bytes":1802},"status":"keep","description":"wrap CI fail line for markdownlint","timestamp":1775133928,"segment":1}
diff --git a/.autoresearch/autoresearch.md b/.autoresearch/autoresearch.md
@@ -1,36 +1,47 @@
 # Autoresearch: create-pr token efficiency
 
 ## Objective
-Optimize the `plugins/me/skills/create-pr/` skill for token efficiency. The skill is loaded into LLM context when invoked, so fewer bytes = less cost per invocation. Must remain functionally correct, simple, and problem-free. The skill guides Claude Code through: preflight checks → commit → push → PR creation → wait for merge/CI.
+Optimize the `plugins/me/skills/create-pr/` skill for token efficiency and correctness. SKILL.md is loaded into LLM context when invoked — fewer bytes = less cost. Scripts run at execution time and don't affect token cost, but must be correct.
 
 ## Metrics
-- **Primary**: total_bytes (bytes, lower is better) — total bytes of SKILL.md + all scripts
-- **Secondary**: line_count (lines), file_count (files), word_count (words)
+- **Primary**: skill_bytes (bytes, lower is better) — SKILL.md byte count
+- **Secondary**: skill_lines (SKILL.md lines), skill_words (SKILL.md words), script_bytes (total bytes of the scripts)
 
 ## How to Run
 `./.autoresearch/run.sh` — outputs `METRIC name=number` lines.
 
 ## Files in Scope
 | File | Purpose |
 |------|---------|
-| `plugins/me/skills/create-pr/SKILL.md` | Main skill definition loaded into LLM context |
-| `plugins/me/skills/create-pr/scripts/lib.sh` | Shared utils (require_git_repo, resolve_base_branch) |
-| `plugins/me/skills/create-pr/scripts/preflight-check.sh` | Pre-push checks: behind, conflicts |
-| `plugins/me/skills/create-pr/scripts/sync-with-base.sh` | Sync branch with base |
-| `plugins/me/skills/create-pr/scripts/verify-pr-status.sh` | Check PR merge status |
+| `plugins/me/skills/create-pr/SKILL.md` | Main skill definition (loaded into LLM context) |
+| `plugins/me/skills/create-pr/scripts/preflight-check.sh` | Pre-push checks + auto-sync |
 | `plugins/me/skills/create-pr/scripts/wait-for-merge.sh` | Wait for CI + merge |
 
 ## Off Limits
-- Do not break the PR workflow (commit → push → PR → merge)
-- Do not remove essential error handling (exit codes must be preserved)
-- Do not change the script interface (arguments, exit codes)
+- Do not break the PR workflow
+- Do not change script exit codes (they must be preserved)
 
 ## Constraints
 - Scripts must pass shellcheck
-- SKILL.md must remain a valid skill file (frontmatter + instructions)
-- All exit codes must be preserved (0=success, 1=blocking, 2=env error)
-- `gh` CLI and `jq` dependencies are fine
-- Token reduction must not sacrifice clarity of instructions to the LLM
+- SKILL.md must have valid frontmatter
+- Tests must pass (63/63)
 
 ## What's Been Tried
-(Updated as experiments accumulate)
+### Structural changes (big wins)
+- Removed unused verify-pr-status.sh (-1302 bytes)
+- Merged sync-with-base.sh into preflight-check.sh (-515 bytes)
+- Inlined lib.sh into preflight-check.sh (-461 bytes)
+
+### SKILL.md compression (medium wins)
+- Removed Overview, When to Use, Stop Conditions sections
+- Extracted S= path variable for script paths
+- Removed bold markdown markers, flattened sections
+
+### Test-driven fixes (increased bytes for correctness)
+- "scripts MUST be run" directive (+129 bytes) — agents were skipping scripts
+- auto-merge re-enable after CI fix (+60 bytes) — tested on PR #604
+- push -u in preflight — new branches had no upstream
+
+### Dead ends
+- Merging gh pr create + merge into one line — bytes increased
+- Further compression below ~700 bytes — losing essential information
diff --git a/.autoresearch/dashboard.md b/.autoresearch/dashboard.md
@@ -1,21 +1,31 @@
 # Autoresearch Dashboard: create-pr-optimize
 
-## Segment 0: total_bytes (all files)
-**Runs:** 9 | **Kept:** 9 | Baseline: 9073 → Best: 2884 (-68.2%)
-
 ## Segment 1: skill_bytes (SKILL.md only)
-**Runs:** 8 | **Kept:** 7 | **Discarded:** 1
+**Runs:** 14 | **Kept:** 12 | **Discarded:** 1 | **Tests:** 1
 **Baseline:** 1081 bytes (#10)
-**Best:** 605 bytes (#15, -44.0%)
-**Current:** 665 bytes (#17, -38.5%) — includes critical auto-merge fix
+**Best pure:** 605 bytes (#15, -44.0%)
+**Current:** 794 bytes (#19, -26.5%) — includes test-driven fixes
 
 | # | commit | skill_bytes | status | description |
 |---|--------|-------------|--------|-------------|
-| 10 | 1b650ac | 1081 | keep | baseline (segment 1) |
+| 10 | 1b650ac | 1081 | keep | baseline |
 | 11 | 6ba0c3d | 802 (-25.8%) | keep | remove redundant sections |
-| 12 | ec416bc | 732 (-32.3%) | keep | extract script path variable |
+| 12 | ec416bc | 732 (-32.3%) | keep | extract S= path variable |
 | 13 | 9bb6f1e | 675 (-37.6%) | keep | merge comments, remove bold |
 | 14 | 563874d | 635 (-41.3%) | keep | micro-compress wording |
 | 15 | 059de59 | 605 (-44.0%) | keep | remove template path |
-| 16 | 059de59 | 608 (-43.8%) | discard | merge create+merge (bytes increased) |
-| 17 | 96b1a8f | 665 (-38.5%) | keep | add auto-merge re-enable (bug fix from test) |
+| 16 | 059de59 | 608 | discard | merge create+merge lines |
+| 17 | 96b1a8f | 665 | keep | add auto-merge re-enable (test fix) |
+| 18 | - | - | test | edge: main branch — agent skipped scripts |
+| 19 | b2e1f87 | 794 | keep | "scripts MUST be run" directive |
+| 20 | - | - | test | edge: nothing-to-commit — agent handled correctly |
+| 22 | - | - | test | must-run directive confirmed working |
+| 23 | dece665 | 794 | keep | fix broken tests — 63/63 pass |
+
+## Subagent Test Results
+| PR | Scenario | Result | Finding |
+|----|----------|--------|---------|
+| #601-602 | basic flow (main SKILL) | pass | - |
+| #604 | optimized SKILL | pass | auto-merge disabled after push, push -u needed |
+| #605 | main branch edge | pass | agent skipped scripts (fixed with MUST directive) |
+| #606 | MUST directive test | pass | scripts executed correctly, CI failed on stale tests |
diff --git a/.autoresearch/worklog.md b/.autoresearch/worklog.md
@@ -13,30 +13,35 @@ Compressed all files from 9073→2884 bytes (-68.2%):
 - Merged sync-with-base.sh into preflight-check.sh
 - Inlined lib.sh (only used by 1 script)
 
-### Segment 1 (skill_bytes): Runs 10-17
-Re-focused on SKILL.md only (what LLM actually reads). 1081→665 bytes (-38.5%):
-- Removed redundant sections (Overview, When to Use, Stop Conditions)
-- Extracted `S=` variable for script path (saves 40+ chars)
-- Flattened code block comments
-- Removed bold markdown markers
-- **Run 17 (test-driven fix):** Added auto-merge re-enable after CI fix push
-
-### Subagent Tests
-- **Test 1 (PR #601-602):** tmux worker on main branch, used old SKILL.md. Succeeded but used old sync-with-base.sh.
-- **Test 2 (PR #604):** subagent on optimized branch. Succeeded but found:
-  - preflight push needs `-u` for new branches (fixed)
-  - auto-merge disabled after fix push (added to SKILL.md)
+### Segment 1 (skill_bytes): Runs 10-27
+Re-focused on SKILL.md only (what the LLM reads). 1081→776 bytes (-28.2%):
+- Removed redundant sections, shortened description
+- Extracted `S=` path variable
+- Added "scripts MUST be run" directive (test-driven)
+- Added auto-merge re-enable after CI fix (test-driven)
+- Removed redundant "re-run preflight" instruction
+- Fixed broken tests (63/63 pass)
+
+### Subagent Tests (4 PRs)
+| PR | Scenario | Finding |
+|----|----------|---------|
+| #604 | basic optimized flow | push -u needed, auto-merge disabled after push |
+| #605 | main branch | agent skipped scripts → added MUST directive |
+| #606 | MUST directive | scripts executed correctly, stale tests found |
+| #607 | final validation | clean pass, 14 tool calls |
+
+### Bug Fixes Found Through Testing
+1. preflight push needs `-u` for new branches
+2. auto-merge disabled after pushing a CI fix → added re-enable instruction
+3. agent skipping scripts → added "MUST be run" directive
+4. stale tests referencing deleted scripts → updated test suite
+5. `--delete-branch` in fallback merge inconsistent with the primary `gh pr merge --auto --squash` (which does not delete the branch) → removed
 
 ---
 
 ## Key Insights
-- Scripts don't load into LLM context — only SKILL.md bytes matter for token cost
-- Byte reduction has diminishing returns below ~600 bytes
-- Real testing (subagent PRs) found bugs that byte counting never would
-- LLM follows the code block as primary instruction; prose sections are secondary
-- `S=` path variable is the single biggest SKILL.md byte saver
-
-## Next Ideas
-- Test with a project that has PR template to verify template detection
-- Consider if `gh pr merge --auto --squash` should be in wait-for-merge.sh instead
-- Verify preflight works correctly on repos without gh CLI auth
+- SKILL.md is the only file that costs tokens — scripts don't load into context
+- "MUST run" directive is essential — without it, agents reimplement script logic instead of running the scripts
+- Real testing (subagent PRs) found 5 bugs that static analysis missed
+- Byte reduction has diminishing returns below ~700 bytes for this skill
+- Code block format is the primary instruction channel for LLM agents
diff --git a/plugins/me/skills/create-pr/SKILL.md b/plugins/me/skills/create-pr/SKILL.md
@@ -7,14 +7,15 @@ Execute each line literally (scripts MUST be run, not reimplemented):
 
 ```bash
 S="${CLAUDE_PLUGIN_ROOT}/skills/create-pr/scripts"
+# If on main/master: checkout -b <type>/<short> first
 "$S/preflight-check.sh"          # syncs if behind base
-# If on main/master: checkout -b <type>/<short> first, re-run preflight
 git add <files> && git commit -m "type(scope): msg"
 git push -u origin HEAD
 gh pr create --title "$(git log -1 --pretty=%s)" --body "<body>"
 gh pr merge --auto --squash
 "$S/wait-for-merge.sh"           # 0=done 1=CI fail(prints run-id)
 ```
 
-CI fail: `gh run view <run-id> --log-failed` → `me:pr-pass` → re-enable `gh pr merge --auto --squash` → re-run wait. Stop if unclear/×2.
+CI fail: `gh run view <run-id> --log-failed` → `me:pr-pass` → re-enable `gh pr merge --auto --squash`
+→ re-run wait. Stop if unclear/×2.
 PR body: fill PR template if exists, else summary+changes+tests.
diff --git a/plugins/me/skills/create-pr/scripts/wait-for-merge.sh b/plugins/me/skills/create-pr/scripts/wait-for-merge.sh
@@ -20,5 +20,5 @@ if ! gh pr checks --watch >/dev/null 2>&1; then
 fi
 
 [[ $(gh pr view --json state -q .state) == "MERGED" ]] && { echo "Merged: $URL"; exit 0; }
-gh pr merge --squash --delete-branch >/dev/null 2>&1 && { echo "Merged: $URL"; exit 0; }
+gh pr merge --squash >/dev/null 2>&1 && { echo "Merged: $URL"; exit 0; }
 echo "CI passed, awaiting review: $URL"
diff --git a/test-create-pr-validation.md b/test-create-pr-validation.md
diff --git a/test-edge1.md b/test-edge1.md
diff --git a/test-final.md b/test-final.md
diff --git a/test-pr-v3.md b/test-pr-v3.md
diff --git a/test-pr-validation.md b/test-pr-validation.md