From 26083557d34b106e69776e72ac2ca10f37eaa77e Mon Sep 17 00:00:00 2001 From: kaghni Date: Fri, 5 Jun 2026 03:50:52 +0000 Subject: [PATCH 01/30] feat(skillopt): read-side skill cohorts + session reconstruction Read side of the attribution the foundation writes. listSkillSessions reads the skills_active rows (filtered on description='skills_active', deduped per session) into per-session {skills, bucket}; cohortsForSkill partitions a skill's treatment/control; reconstructSession rebuilds the raw transcript from the session's captured rows. All queries injected (unit-testable); nothing touches the model channel. First piece of the deficiency detector. 10 unit tests. --- src/skillify/skill-cohorts.ts | 153 +++++++++++++++++++++++++++++ tests/shared/skill-cohorts.test.ts | 130 ++++++++++++++++++++++++ 2 files changed, 283 insertions(+) create mode 100644 src/skillify/skill-cohorts.ts create mode 100644 tests/shared/skill-cohorts.test.ts diff --git a/src/skillify/skill-cohorts.ts b/src/skillify/skill-cohorts.ts new file mode 100644 index 00000000..e436a8e2 --- /dev/null +++ b/src/skillify/skill-cohorts.ts @@ -0,0 +1,153 @@ +/** + * Read side of skill attribution. The foundation's skills-active.ts WRITES a + * `skills_active` row per session (which org skills were active + an A/B bucket); + * this module READS those rows + the captured transcript rows and turns them into + * the inputs the deficiency detector needs: + * + * - per-session attribution (which skills were active, the bucket), + * - treatment/control cohorts for a given skill, + * - raw-session reconstruction for the outcome scorer. + * + * Scope discipline (see design notes): we only ever touch sessions that have a + * skills_active row — never the whole table — so the detector scores a small, + * relevant cohort, not every session. + * + * Every query goes through an injected `QueryFn` (DeeplakeApi.query bound), so the + * cohort + reconstruction logic is unit-testable with zero live Deeplake. 
+ */ +import type { ActiveSkill } from "./skills-active.js"; +import { sqlStr } from "../utils/sql.js"; + +export type QueryFn = (sql: string) => Promise>>; + +export interface SessionAttribution { + sessionId: string; + skills: ActiveSkill[]; + bucket: number; + ts: string; // last_update_date of the skills_active row +} + +/** Stable identity for an org skill (matches the `--` dir convention). */ +export function skillKey(name: string, author: string): string { + return `${name}--${author}`; +} + +interface ParsedMsg { + type?: string; + content?: unknown; + session_id?: unknown; + skills?: unknown; + ab_bucket?: unknown; +} + +/** Deeplake may hand `message` back as a JSON string or an already-parsed object. */ +function parseMessage(m: unknown): ParsedMsg | null { + if (m == null) return null; + if (typeof m === "string") { + try { return JSON.parse(m) as ParsedMsg; } catch { return null; } + } + if (typeof m === "object") return m as ParsedMsg; + return null; +} + +function asActiveSkills(v: unknown): ActiveSkill[] { + if (!Array.isArray(v)) return []; + const out: ActiveSkill[] = []; + for (const s of v) { + if (s && typeof s === "object" + && typeof (s as ActiveSkill).name === "string" + && typeof (s as ActiveSkill).author === "string") { + const sk = s as ActiveSkill; + out.push({ name: sk.name, author: sk.author, version: typeof sk.version === "number" ? sk.version : 1 }); + } + } + return out; +} + +/** + * Every session that has a skills_active attribution row, newest first. + * `sinceIso` bounds the lookback window; `limit` caps the rows pulled. + * The `description = 'skills_active'` column filter is the index — it's the value + * skills-active.ts writes into the row's `description`, so this never scans + * transcript rows. 
+ */ +export async function listSkillSessions( + query: QueryFn, + sessionsTable: string, + opts: { sinceIso?: string; limit?: number } = {}, +): Promise { + const where = ["description = 'skills_active'"]; + if (opts.sinceIso) where.push(`last_update_date >= '${sqlStr(opts.sinceIso)}'`); + const limit = opts.limit && opts.limit > 0 ? ` LIMIT ${Math.floor(opts.limit)}` : ""; + const rows = await query( + `SELECT message, last_update_date FROM "${sessionsTable}" WHERE ${where.join(" AND ")} ORDER BY last_update_date DESC${limit}`, + ); + const out: SessionAttribution[] = []; + const seen = new Set(); // a session can have >1 skills_active row (one per start); keep the newest + for (const r of rows) { + const m = parseMessage(r.message); + if (!m || m.type !== "skills_active" || typeof m.session_id !== "string") continue; + if (seen.has(m.session_id)) continue; + seen.add(m.session_id); + out.push({ + sessionId: m.session_id, + skills: asActiveSkills(m.skills), + bucket: typeof m.ab_bucket === "number" ? m.ab_bucket : 0, + ts: typeof r.last_update_date === "string" ? r.last_update_date : "", + }); + } + return out; +} + +/** + * Partition sessions into treatment (the skill was active) vs control (it wasn't). + * NOTE: this is OBSERVATIONAL (the foundation records availability, it does not yet + * randomize withholding), so control is not a clean counterfactual — the detector + * treats treatment's ABSOLUTE outcome as the primary signal and uses control only + * as weak context until a real withholding arm lands. + */ +export function cohortsForSkill( + sessions: SessionAttribution[], + name: string, + author: string, +): { treatment: SessionAttribution[]; control: SessionAttribution[] } { + const key = skillKey(name, author); + const treatment: SessionAttribution[] = []; + const control: SessionAttribution[] = []; + for (const s of sessions) { + const has = s.skills.some((sk) => skillKey(sk.name, sk.author) === key); + (has ? 
treatment : control).push(s); + } + return { treatment, control }; +} + +/** + * Reconstruct a session's transcript (USER/ASSISTANT turns, tool noise dropped) + * from its captured rows, oldest-first. Long transcripts are head+tail elided to + * `maxChars` so a giant session can't blow the judge's context. + */ +export async function reconstructSession( + query: QueryFn, + sessionsTable: string, + sessionId: string, + maxChars = 14_000, +): Promise { + const sid = sqlStr(sessionId); + const rows = await query( + `SELECT message FROM "${sessionsTable}" WHERE path LIKE '/sessions/%${sid}%' ORDER BY creation_date ASC`, + ); + const parts: string[] = []; + for (const r of rows) { + const j = parseMessage(r.message); + if (!j) continue; + const text = typeof j.content === "string" ? j.content.trim() : ""; + if (!text) continue; + if (j.type === "user_message") parts.push(`USER: ${text}`); + else if (j.type === "assistant_message") parts.push(`ASSISTANT: ${text}`); + } + const joined = parts.join("\n\n"); + if (joined.length <= maxChars) return joined; + const head = joined.slice(0, Math.floor(maxChars * 0.55)); + const tail = joined.slice(joined.length - Math.floor(maxChars * 0.45)); + return `${head}\n\n…[${joined.length - maxChars} chars elided]…\n\n${tail}`; +} diff --git a/tests/shared/skill-cohorts.test.ts b/tests/shared/skill-cohorts.test.ts new file mode 100644 index 00000000..329ec964 --- /dev/null +++ b/tests/shared/skill-cohorts.test.ts @@ -0,0 +1,130 @@ +import { describe, it, expect, vi } from "vitest"; +import { + listSkillSessions, + cohortsForSkill, + reconstructSession, + skillKey, + type SessionAttribution, +} from "../../src/skillify/skill-cohorts.js"; + +const TABLE = "sessions"; + +/** A query mock that returns canned rows and records the SQL it was asked. 
*/ +function mockQuery(rows: Array>) { + const calls: string[] = []; + const fn = vi.fn(async (sql: string) => { calls.push(sql); return rows; }); + return { fn, calls }; +} + +const activeRow = (sessionId: string, skills: unknown, bucket: number, ts: string, asString = false) => { + const msg = { type: "skills_active", session_id: sessionId, skills, ab_bucket: bucket }; + return { message: asString ? JSON.stringify(msg) : msg, last_update_date: ts }; +}; + +describe("listSkillSessions", () => { + it("filters on description='skills_active' and orders newest-first with the limit", async () => { + const { fn, calls } = mockQuery([]); + await listSkillSessions(fn, TABLE, { sinceIso: "2026-06-01T00:00:00Z", limit: 50 }); + expect(calls[0]).toContain(`FROM "sessions"`); + expect(calls[0]).toContain("description = 'skills_active'"); + expect(calls[0]).toContain("last_update_date >= '2026-06-01T00:00:00Z'"); + expect(calls[0]).toContain("ORDER BY last_update_date DESC"); + expect(calls[0]).toContain("LIMIT 50"); + }); + + it("parses both JSON-string and object message payloads", async () => { + const { fn } = mockQuery([ + activeRow("S1", [{ name: "a", author: "x", version: 2 }], 1, "t2", false), // object + activeRow("S2", [{ name: "b", author: "y", version: 3 }], 0, "t1", true), // JSON string + ]); + const got = await listSkillSessions(fn, TABLE); + expect(got).toEqual([ + { sessionId: "S1", skills: [{ name: "a", author: "x", version: 2 }], bucket: 1, ts: "t2" }, + { sessionId: "S2", skills: [{ name: "b", author: "y", version: 3 }], bucket: 0, ts: "t1" }, + ]); + }); + + it("dedups a session to its newest row (rows arrive newest-first) and drops malformed", async () => { + const { fn } = mockQuery([ + activeRow("S1", [{ name: "a", author: "x", version: 1 }], 1, "newer"), + activeRow("S1", [{ name: "a", author: "x", version: 1 }], 1, "older"), // same session → skipped + { message: "not json", last_update_date: "t" }, // unparseable → skipped + { message: { type: 
"user_message", content: "hi" }, last_update_date: "t" }, // wrong type → skipped + { message: { type: "skills_active", skills: [] }, last_update_date: "t" }, // no session_id → skipped + ]); + const got = await listSkillSessions(fn, TABLE); + expect(got).toHaveLength(1); + expect(got[0]).toMatchObject({ sessionId: "S1", ts: "newer" }); + }); + + it("coerces missing/garbage skill fields safely (defaults version 1, drops non-objects)", async () => { + const { fn } = mockQuery([ + activeRow("S1", [{ name: "a", author: "x" }, "garbage", { name: "b" /* no author */ }], 0, "t"), + ]); + const got = await listSkillSessions(fn, TABLE); + expect(got[0].skills).toEqual([{ name: "a", author: "x", version: 1 }]); + }); + + it("omits the LIMIT clause when no limit is given", async () => { + const { fn, calls } = mockQuery([]); + await listSkillSessions(fn, TABLE); + expect(calls[0]).not.toContain("LIMIT"); + }); +}); + +describe("cohortsForSkill", () => { + const S = (id: string, skills: Array<[string, string]>): SessionAttribution => ({ + sessionId: id, bucket: 0, ts: "t", + skills: skills.map(([name, author]) => ({ name, author, version: 1 })), + }); + + it("splits sessions into treatment (skill present) and control (absent)", () => { + const sessions = [ + S("s1", [["posthog", "kamo"], ["other", "z"]]), // treatment + S("s2", [["other", "z"]]), // control + S("s3", [["posthog", "kamo"]]), // treatment + S("s4", []), // control (no skills) + S("s5", [["posthog", "DIFFERENT"]]), // control (same name, other author) + ]; + const { treatment, control } = cohortsForSkill(sessions, "posthog", "kamo"); + expect(treatment.map((s) => s.sessionId)).toEqual(["s1", "s3"]); + expect(control.map((s) => s.sessionId)).toEqual(["s2", "s4", "s5"]); // s5: name matches, author doesn't + }); + + it("skillKey is name--author", () => { + expect(skillKey("posthog", "kamo")).toBe("posthog--kamo"); + }); +}); + +describe("reconstructSession", () => { + it("orders by creation_date, keeps 
user/assistant turns, drops tool noise + empty", async () => { + const { fn, calls } = mockQuery([ + { message: { type: "user_message", content: "do X" } }, + { message: { type: "tool_call", tool_input: "{}", tool_response: "{}" } }, // dropped (no content) + { message: { type: "assistant_message", content: "did X" } }, + { message: { type: "assistant_message", content: " " } }, // dropped (blank) + { message: JSON.stringify({ type: "user_message", content: "thanks" }) }, // string payload + ]); + const out = await reconstructSession(fn, TABLE, "abc-123"); + expect(calls[0]).toContain("path LIKE '/sessions/%abc-123%'"); + expect(calls[0]).toContain("ORDER BY creation_date ASC"); + expect(out).toBe("USER: do X\n\nASSISTANT: did X\n\nUSER: thanks"); + }); + + it("head+tail elides a transcript longer than maxChars", async () => { + const big = "x".repeat(500); + const { fn } = mockQuery([ + { message: { type: "user_message", content: big } }, + { message: { type: "assistant_message", content: big } }, + ]); + const out = await reconstructSession(fn, TABLE, "s", 200); + expect(out).toContain("chars elided"); + expect(out.length).toBeLessThan(400); // ~maxChars + the elision marker, far below the ~1000 raw + }); + + it("escapes single quotes in the session id (no SQL break)", async () => { + const { fn, calls } = mockQuery([]); + await reconstructSession(fn, TABLE, "a'b"); + expect(calls[0]).toContain("/sessions/%a''b%"); + }); +}); From 338909acbe795215a7951f882ffd2053f5e8df15 Mon Sep 17 00:00:00 2001 From: kaghni Date: Fri, 5 Jun 2026 05:14:05 +0000 Subject: [PATCH 02/30] =?UTF-8?q?feat(skillopt):=20skill-invocation=20data?= =?UTF-8?q?=20layer=20(Skill=20tool=5Fuse=20=E2=86=92=20windows)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Read side keyed on actual invocations: capture.ts persists each Skill tool_use as a tool_call row (tool_name=Skill, tool_input={skill}). 
listSkillInvocations pulls org-skill (--) invocations; windowAroundInvocation slices the transcript before/after the invocation turn for tight, cheap judging. Supersedes availability-based attribution. Injected queries; 8 unit tests. --- src/skillify/skill-cohorts.ts | 153 ------------------ src/skillify/skill-invocations.ts | 158 ++++++++++++++++++ src/skillify/skills-active.ts | Bin 7404 -> 0 bytes tests/shared/skill-cohorts.test.ts | 130 --------------- tests/shared/skill-invocations.test.ts | 103 ++++++++++++ tests/shared/skills-active.test.ts | 211 ------------------------- 6 files changed, 261 insertions(+), 494 deletions(-) delete mode 100644 src/skillify/skill-cohorts.ts create mode 100644 src/skillify/skill-invocations.ts delete mode 100644 src/skillify/skills-active.ts delete mode 100644 tests/shared/skill-cohorts.test.ts create mode 100644 tests/shared/skill-invocations.test.ts delete mode 100644 tests/shared/skills-active.test.ts diff --git a/src/skillify/skill-cohorts.ts b/src/skillify/skill-cohorts.ts deleted file mode 100644 index e436a8e2..00000000 --- a/src/skillify/skill-cohorts.ts +++ /dev/null @@ -1,153 +0,0 @@ -/** - * Read side of skill attribution. The foundation's skills-active.ts WRITES a - * `skills_active` row per session (which org skills were active + an A/B bucket); - * this module READS those rows + the captured transcript rows and turns them into - * the inputs the deficiency detector needs: - * - * - per-session attribution (which skills were active, the bucket), - * - treatment/control cohorts for a given skill, - * - raw-session reconstruction for the outcome scorer. - * - * Scope discipline (see design notes): we only ever touch sessions that have a - * skills_active row — never the whole table — so the detector scores a small, - * relevant cohort, not every session. - * - * Every query goes through an injected `QueryFn` (DeeplakeApi.query bound), so the - * cohort + reconstruction logic is unit-testable with zero live Deeplake. 
- */ -import type { ActiveSkill } from "./skills-active.js"; -import { sqlStr } from "../utils/sql.js"; - -export type QueryFn = (sql: string) => Promise>>; - -export interface SessionAttribution { - sessionId: string; - skills: ActiveSkill[]; - bucket: number; - ts: string; // last_update_date of the skills_active row -} - -/** Stable identity for an org skill (matches the `--` dir convention). */ -export function skillKey(name: string, author: string): string { - return `${name}--${author}`; -} - -interface ParsedMsg { - type?: string; - content?: unknown; - session_id?: unknown; - skills?: unknown; - ab_bucket?: unknown; -} - -/** Deeplake may hand `message` back as a JSON string or an already-parsed object. */ -function parseMessage(m: unknown): ParsedMsg | null { - if (m == null) return null; - if (typeof m === "string") { - try { return JSON.parse(m) as ParsedMsg; } catch { return null; } - } - if (typeof m === "object") return m as ParsedMsg; - return null; -} - -function asActiveSkills(v: unknown): ActiveSkill[] { - if (!Array.isArray(v)) return []; - const out: ActiveSkill[] = []; - for (const s of v) { - if (s && typeof s === "object" - && typeof (s as ActiveSkill).name === "string" - && typeof (s as ActiveSkill).author === "string") { - const sk = s as ActiveSkill; - out.push({ name: sk.name, author: sk.author, version: typeof sk.version === "number" ? sk.version : 1 }); - } - } - return out; -} - -/** - * Every session that has a skills_active attribution row, newest first. - * `sinceIso` bounds the lookback window; `limit` caps the rows pulled. - * The `description = 'skills_active'` column filter is the index — it's the value - * skills-active.ts writes into the row's `description`, so this never scans - * transcript rows. 
- */ -export async function listSkillSessions( - query: QueryFn, - sessionsTable: string, - opts: { sinceIso?: string; limit?: number } = {}, -): Promise { - const where = ["description = 'skills_active'"]; - if (opts.sinceIso) where.push(`last_update_date >= '${sqlStr(opts.sinceIso)}'`); - const limit = opts.limit && opts.limit > 0 ? ` LIMIT ${Math.floor(opts.limit)}` : ""; - const rows = await query( - `SELECT message, last_update_date FROM "${sessionsTable}" WHERE ${where.join(" AND ")} ORDER BY last_update_date DESC${limit}`, - ); - const out: SessionAttribution[] = []; - const seen = new Set(); // a session can have >1 skills_active row (one per start); keep the newest - for (const r of rows) { - const m = parseMessage(r.message); - if (!m || m.type !== "skills_active" || typeof m.session_id !== "string") continue; - if (seen.has(m.session_id)) continue; - seen.add(m.session_id); - out.push({ - sessionId: m.session_id, - skills: asActiveSkills(m.skills), - bucket: typeof m.ab_bucket === "number" ? m.ab_bucket : 0, - ts: typeof r.last_update_date === "string" ? r.last_update_date : "", - }); - } - return out; -} - -/** - * Partition sessions into treatment (the skill was active) vs control (it wasn't). - * NOTE: this is OBSERVATIONAL (the foundation records availability, it does not yet - * randomize withholding), so control is not a clean counterfactual — the detector - * treats treatment's ABSOLUTE outcome as the primary signal and uses control only - * as weak context until a real withholding arm lands. - */ -export function cohortsForSkill( - sessions: SessionAttribution[], - name: string, - author: string, -): { treatment: SessionAttribution[]; control: SessionAttribution[] } { - const key = skillKey(name, author); - const treatment: SessionAttribution[] = []; - const control: SessionAttribution[] = []; - for (const s of sessions) { - const has = s.skills.some((sk) => skillKey(sk.name, sk.author) === key); - (has ? 
treatment : control).push(s); - } - return { treatment, control }; -} - -/** - * Reconstruct a session's transcript (USER/ASSISTANT turns, tool noise dropped) - * from its captured rows, oldest-first. Long transcripts are head+tail elided to - * `maxChars` so a giant session can't blow the judge's context. - */ -export async function reconstructSession( - query: QueryFn, - sessionsTable: string, - sessionId: string, - maxChars = 14_000, -): Promise { - const sid = sqlStr(sessionId); - const rows = await query( - `SELECT message FROM "${sessionsTable}" WHERE path LIKE '/sessions/%${sid}%' ORDER BY creation_date ASC`, - ); - const parts: string[] = []; - for (const r of rows) { - const j = parseMessage(r.message); - if (!j) continue; - const text = typeof j.content === "string" ? j.content.trim() : ""; - if (!text) continue; - if (j.type === "user_message") parts.push(`USER: ${text}`); - else if (j.type === "assistant_message") parts.push(`ASSISTANT: ${text}`); - } - const joined = parts.join("\n\n"); - if (joined.length <= maxChars) return joined; - const head = joined.slice(0, Math.floor(maxChars * 0.55)); - const tail = joined.slice(joined.length - Math.floor(maxChars * 0.45)); - return `${head}\n\n…[${joined.length - maxChars} chars elided]…\n\n${tail}`; -} diff --git a/src/skillify/skill-invocations.ts b/src/skillify/skill-invocations.ts new file mode 100644 index 00000000..3cb0fd31 --- /dev/null +++ b/src/skillify/skill-invocations.ts @@ -0,0 +1,158 @@ +/** + * Read side of skill *invocation* attribution — the basis for deficiency detection. + * + * A skill can only help or hurt if the agent actually INVOKED it. Claude Code + * records each invocation as a `Skill` tool_use, which capture.ts persists as a + * tool_call row: `message.tool_name === "Skill"`, `message.tool_input` a JSON + * string `{ skill: "--", args? }`. 
We key on these real invocations + * rather than availability (the dropped skills_active) because: + * - it's accurate — availability-without-invocation is pure noise, and + * - it pins the exact turn, so we can window the judge tightly around it. + * + * Org skills only: the invoked `skill` is `--`. Plugin-namespaced + * (`hivemind:...`) and bare skills are not org-mined skills and are skipped. + * + * Every query is injected (QueryFn), so this is unit-testable with no live Deeplake. + */ +import { sqlStr } from "../utils/sql.js"; + +export type QueryFn = (sql: string) => Promise>>; + +export interface SkillInvocation { + sessionId: string; + name: string; + author: string; + ts: string; // invocation timestamp (message.timestamp, else the row's last_update_date) +} + +interface ParsedMsg { + type?: string; + tool_name?: string; + tool_input?: unknown; + content?: unknown; + session_id?: unknown; + timestamp?: unknown; +} + +function parseMessage(m: unknown): ParsedMsg | null { + if (m == null) return null; + if (typeof m === "string") { + try { return JSON.parse(m) as ParsedMsg; } catch { return null; } + } + if (typeof m === "object") return m as ParsedMsg; + return null; +} + +/** The skill ref invoked by a tool_call message (e.g. "name--author"), else null. */ +export function invokedSkillRef(msg: ParsedMsg): string | null { + if (msg.type !== "tool_call" || msg.tool_name !== "Skill") return null; + let input: unknown = msg.tool_input; + if (typeof input === "string") { try { input = JSON.parse(input); } catch { return null; } } + const skill = (input as { skill?: unknown })?.skill; + return typeof skill === "string" && skill.length > 0 ? skill : null; +} + +/** Split "--" → parts. null for plugin-namespaced / bare / malformed refs. */ +export function splitOrgSkill(skill: string): { name: string; author: string } | null { + if (skill.includes(":")) return null; // plugin-namespaced (e.g. 
hivemind:hivemind-memory) + const i = skill.lastIndexOf("--"); + if (i <= 0 || i + 2 >= skill.length) return null; // bare or malformed + return { name: skill.slice(0, i), author: skill.slice(i + 2) }; +} + +/** + * Org-skill invocations across captured sessions, newest first. Coarse prefilter + * on `"Skill"` (robust to JSONB colon-spacing) then a precise in-code check, so a + * stray "Skill" in prose can't slip through as a real invocation. + */ +export async function listSkillInvocations( + query: QueryFn, + sessionsTable: string, + opts: { sinceIso?: string; limit?: number } = {}, +): Promise { + const where = [`CAST(message AS TEXT) LIKE '%"Skill"%'`]; + if (opts.sinceIso) where.push(`last_update_date >= '${sqlStr(opts.sinceIso)}'`); + const limit = opts.limit && opts.limit > 0 ? ` LIMIT ${Math.floor(opts.limit)}` : ""; + const rows = await query( + `SELECT message, last_update_date FROM "${sessionsTable}" WHERE ${where.join(" AND ")} ORDER BY last_update_date DESC${limit}`, + ); + const out: SkillInvocation[] = []; + for (const r of rows) { + const m = parseMessage(r.message); + if (!m) continue; + const ref = invokedSkillRef(m); + if (!ref) continue; + const parts = splitOrgSkill(ref); + if (!parts) continue; + const sessionId = typeof m.session_id === "string" ? m.session_id : ""; + if (!sessionId) continue; + out.push({ + sessionId, + name: parts.name, + author: parts.author, + ts: typeof m.timestamp === "string" ? m.timestamp + : (typeof r.last_update_date === "string" ? r.last_update_date : ""), + }); + } + return out; +} + +interface Turn { role: "USER" | "ASSISTANT"; text: string } + +/** + * Reconstruct the transcript turns of a session, and mark where (between which two + * turns) the given invocation happened — so callers can window around it. 
+ */ +async function sessionTurns( + query: QueryFn, sessionsTable: string, inv: SkillInvocation, +): Promise<{ turns: Turn[]; invIndex: number }> { + const sid = sqlStr(inv.sessionId); + const rows = await query( + `SELECT message FROM "${sessionsTable}" WHERE path LIKE '/sessions/%${sid}%' ORDER BY creation_date ASC`, + ); + const turns: Turn[] = []; + let invIndex = -1; + for (const r of rows) { + const j = parseMessage(r.message); + if (!j) continue; + // The invocation itself is a tool_call (not a turn): mark its position then skip. + const ref = invokedSkillRef(j); + if (ref) { + const p = splitOrgSkill(ref); + if (invIndex < 0 && p && p.name === inv.name && p.author === inv.author + && (typeof j.timestamp !== "string" || !inv.ts || j.timestamp === inv.ts)) { + invIndex = turns.length; + } + continue; + } + const text = typeof j.content === "string" ? j.content.trim() : ""; + if (!text) continue; + if (j.type === "user_message") turns.push({ role: "USER", text }); + else if (j.type === "assistant_message") turns.push({ role: "ASSISTANT", text }); + } + if (invIndex < 0) invIndex = turns.length; // invocation not located → treat as session end + return { turns, invIndex }; +} + +/** + * The transcript window around an invocation: `before` turns before it and `after` + * turns after — where the help-or-harm signal lives — head+tail elided to maxChars. + * `before`/`after` are tunable; defaults chosen as a small starting point. + */ +export async function windowAroundInvocation( + query: QueryFn, + sessionsTable: string, + inv: SkillInvocation, + opts: { before?: number; after?: number; maxChars?: number } = {}, +): Promise { + const before = opts.before ?? 3; + const after = opts.after ?? 6; + const maxChars = opts.maxChars ?? 
4000; + const { turns, invIndex } = await sessionTurns(query, sessionsTable, inv); + const slice = turns.slice(Math.max(0, invIndex - before), invIndex + after); + const joined = slice.map((t) => `${t.role}: ${t.text}`).join("\n\n"); + if (joined.length <= maxChars) return joined; + const head = joined.slice(0, Math.floor(maxChars * 0.55)); + const tail = joined.slice(joined.length - Math.floor(maxChars * 0.45)); + return `${head}\n\n…[${joined.length - maxChars} chars elided]…\n\n${tail}`; +} diff --git a/src/skillify/skills-active.ts b/src/skillify/skills-active.ts deleted file mode 100644 index d9b3c46d26b7f47f7ce2ecccaa210c04a94f91fa..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 7404 zcma)B?Q+}3742_5#VyAfftCPD>2xNcEqmnHZq>+gYTE9Pe25^i6cK>{g9RirGCTd% z2k7((^CUgz?k)++&P+XHNn9-M-p_N-UJf2U+>?hgy3UJ2(x$2NN!#RADYL4Ug-LB& zn}sQxq10wt)!CseO)b_~i%o}eJI|+cdHv=u@#wFoZ(ckXOF@<*Z%u6^y-D*TofJm$ zQl?eem=6t(k&zQHYSOxa$fC89N@f~UFY+?CO+J;=!84h((`(Zt5J!psn6HNJ0J%wv z*2pw1r7DY+Obl1@ENx6ClNC-{EYmtKuY#slnmLZ0ry0&s9py%~Z040U?qQZ^<$fbk zHL<3?N$G{Okb%@`nN^GYSBQpA{Y2hYS-O(4YUVt_`NcX{P)+9oXMe})-~RPqe{x>_ zY^LhuBEK<~u4(3G$0uS}wlND?*07{Gq*$D%PE3~#Vv-4dmR3}1wG+7<5*Nz~5rW^L z?HmhfmeB=97CF|THG*P_;Mmc$TACrl7?h`16Zx(!3Y=8c@T0~Rua)p%BjbM#l4+5) znHjhzgU4mMFi+z6aoWPC^^?JvN=MchSwc+G*#09as%cup8ZyKgqP^G@sWQQjm_l#> z4#|-#rHcUqY(7j29G1~u%0&=iMPXuhniK5ZJ5Mc~CDR%%h+D+cXG>FsgbsFp2S>^Q zL}ey6vl)UC&uS<}yh zHKnqugf%PO0CNi{05@pC#Fhz9U--&N3m&kNc~$rTU1<(KH)dI+*XDGYQ`x~DBU(2y zvob?UNUy9iGo0Dpqwr*zHuLT?e%?O4!wK+gUdy_>sp<&06Z=asYPLI)WB@2A>;UV@ z&pSJdD$QP{Wj-?&<=d>5#^*{{+#Hh(7Ji=-v1dN0Aq;OlOQ%LoQ3^L^qyTicC&Cm! 
znX9uO;j@cycjd;^M4zFQ?P6l;M|bUkr2IyDx&ic-tZ1Gx)PH^8tQ$lYUi_+K9xAeGk?ZMr{@^<6X$q(dWFoR zG$w;Z7@PT_*j-d#v`8Dvj>pFi6z<1D4`mGO%c}n7Hj7}z(-4hKSib*JW9t&pIrc=p z|4bZzWICWSD5md1kStp}@1!Z7d_IhtYd-dRMmMhK={ilB-0<8QMs- zisK3BQhbHpRxF|y-kPj(SI^aCv^@Kfd74e-53kK)y-+aE{BxP+NsgD zH%kjJDew*`Oje_W>$Yl(42wn4Hbx9EX|=;u>grY^y2vtiNPk;tS&iG0Wi?i~bTExg zo9Hn=P%4ZPUk*ws8AAUK``Bnhn!F}Y3|`Nn-?BCY{7r_PUOa~a489h#3J!vV1jS{u z$xsy!yPk(oD^b`QyFSv_R|0Ked_hrN=U`-6=$q7Z!3Y^2aGn5ga~$8;_~uLOpB2a# zvdpyEM`lc$76rmep^HeXgP|i}g!Qgw<#EecOK>ca$H;>KMgj1G=k6m&Uttp-X_v}< zRxZB;jHy#}2chJMS$;RSTV2-BUxo2cNW+0ubSt0Wf!@IUTW^5ZnTy z0myy_xS;hW35NHn$q9nDr7!41dw|9c3-S0k7%*Kfn3uo(`#-)AJW$9TivR$w=mAQF zT6D=xvt#4}AUzZKs9wQ7-}JppPCy%PUVIic#cnSjWcv@UXVb;in{tzvLlRWU(6GScym;KOi9*r>*OcmJGjL)`SB zwfnGr7$~vAuY$^ZPv%{^xB@|ggwk6I5@4GsU`pN$>;{C4FzabZq0Rm?X!LNU=qW8asLAMxBN^2rXj z+(%mao7nTNb}$(9%BW@-9>t!tcLqEXy}WoIAE#2m*MZbu<{4%!pJ@&L{bxEG3ODPs zEWjnK(I*z^`{nK}jJPt#^3jK{j*q8bXVX73#k6GkDCeU``1LsGaF!*7DX*G2Ru2vW zJk8~&lg^GQg7*!sMG+?~~dMcG82nh*d zjNI*s!jW2)G9KG57Aew&W3St|!R*UU>wMX0$)H%2>AI;OQVcg~N*nWbC+BD1zL5Rx zXB@n4>@8$JVzoVAtl==VA`a5Ap%O=GgL6$>8M5E zM1m*sl0LyW5PYz{%`FBx7@UEwI2$``lnO5}S_$*E7r2DgtsB8P>rr1dX`#-kFyyoH zx6$j1N6ziQ)_Q?!W)Pe=q9m7M1J5NV?dZrJq|OCv12vA7WC4f&k>k0dS97aP?1|wC@QZd0G+nwbfNIf~>`aI{- z$(G|Ml@^j2B?vGWR!L+j@g8?k*Fz+2Xrv!G!LbA13U$9#Sz$ zzsgJB)w=sPhIALZ;{p0o6B8_HhyC$^L)0+Q;>wnVcX8?&XQK{B(9Up4^@C9btZ;kx zAsXMWY!pLY*A*~k>So*kg?+s)1SdKKVQYqqF%w4W7b|=T!TK8Gvd==fG?)oy8IB*z z#k=!!j0GymQnNro*SL0M0JVIHvsqjthYll7WpQKUY=RFibM&@py;?R^;=YS`_wMXD?=7S}-Z$u$Lp113nB7AJn5&%9 z=;nD9H`}=NI>al{5CNlQ9d3N#kwe%j`ZJg(7b={LF=hs~drlBkkK`b@RHh*mM zEnEiyz33p9lT~BTgPrvv=xn6pPd59>Z?_t9wzsWkpw=K50Y?u6? 
zD}5Qufm8nd>G``CBZ>9_VV|bPdjC-H#0~VLjb#Lz{{3gcwD4f-&Gr6p$WN0)t)yL% z0hIK$jam=xp0%W|39Ua@>J%{|2m(gdF=p25T`?UTa|*Z|@)$ug+}BJHh#j^3+=Z diff --git a/tests/shared/skill-cohorts.test.ts b/tests/shared/skill-cohorts.test.ts deleted file mode 100644 index 329ec964..00000000 --- a/tests/shared/skill-cohorts.test.ts +++ /dev/null @@ -1,130 +0,0 @@ -import { describe, it, expect, vi } from "vitest"; -import { - listSkillSessions, - cohortsForSkill, - reconstructSession, - skillKey, - type SessionAttribution, -} from "../../src/skillify/skill-cohorts.js"; - -const TABLE = "sessions"; - -/** A query mock that returns canned rows and records the SQL it was asked. */ -function mockQuery(rows: Array>) { - const calls: string[] = []; - const fn = vi.fn(async (sql: string) => { calls.push(sql); return rows; }); - return { fn, calls }; -} - -const activeRow = (sessionId: string, skills: unknown, bucket: number, ts: string, asString = false) => { - const msg = { type: "skills_active", session_id: sessionId, skills, ab_bucket: bucket }; - return { message: asString ? 
JSON.stringify(msg) : msg, last_update_date: ts }; -}; - -describe("listSkillSessions", () => { - it("filters on description='skills_active' and orders newest-first with the limit", async () => { - const { fn, calls } = mockQuery([]); - await listSkillSessions(fn, TABLE, { sinceIso: "2026-06-01T00:00:00Z", limit: 50 }); - expect(calls[0]).toContain(`FROM "sessions"`); - expect(calls[0]).toContain("description = 'skills_active'"); - expect(calls[0]).toContain("last_update_date >= '2026-06-01T00:00:00Z'"); - expect(calls[0]).toContain("ORDER BY last_update_date DESC"); - expect(calls[0]).toContain("LIMIT 50"); - }); - - it("parses both JSON-string and object message payloads", async () => { - const { fn } = mockQuery([ - activeRow("S1", [{ name: "a", author: "x", version: 2 }], 1, "t2", false), // object - activeRow("S2", [{ name: "b", author: "y", version: 3 }], 0, "t1", true), // JSON string - ]); - const got = await listSkillSessions(fn, TABLE); - expect(got).toEqual([ - { sessionId: "S1", skills: [{ name: "a", author: "x", version: 2 }], bucket: 1, ts: "t2" }, - { sessionId: "S2", skills: [{ name: "b", author: "y", version: 3 }], bucket: 0, ts: "t1" }, - ]); - }); - - it("dedups a session to its newest row (rows arrive newest-first) and drops malformed", async () => { - const { fn } = mockQuery([ - activeRow("S1", [{ name: "a", author: "x", version: 1 }], 1, "newer"), - activeRow("S1", [{ name: "a", author: "x", version: 1 }], 1, "older"), // same session → skipped - { message: "not json", last_update_date: "t" }, // unparseable → skipped - { message: { type: "user_message", content: "hi" }, last_update_date: "t" }, // wrong type → skipped - { message: { type: "skills_active", skills: [] }, last_update_date: "t" }, // no session_id → skipped - ]); - const got = await listSkillSessions(fn, TABLE); - expect(got).toHaveLength(1); - expect(got[0]).toMatchObject({ sessionId: "S1", ts: "newer" }); - }); - - it("coerces missing/garbage skill fields safely (defaults 
version 1, drops non-objects)", async () => { - const { fn } = mockQuery([ - activeRow("S1", [{ name: "a", author: "x" }, "garbage", { name: "b" /* no author */ }], 0, "t"), - ]); - const got = await listSkillSessions(fn, TABLE); - expect(got[0].skills).toEqual([{ name: "a", author: "x", version: 1 }]); - }); - - it("omits the LIMIT clause when no limit is given", async () => { - const { fn, calls } = mockQuery([]); - await listSkillSessions(fn, TABLE); - expect(calls[0]).not.toContain("LIMIT"); - }); -}); - -describe("cohortsForSkill", () => { - const S = (id: string, skills: Array<[string, string]>): SessionAttribution => ({ - sessionId: id, bucket: 0, ts: "t", - skills: skills.map(([name, author]) => ({ name, author, version: 1 })), - }); - - it("splits sessions into treatment (skill present) and control (absent)", () => { - const sessions = [ - S("s1", [["posthog", "kamo"], ["other", "z"]]), // treatment - S("s2", [["other", "z"]]), // control - S("s3", [["posthog", "kamo"]]), // treatment - S("s4", []), // control (no skills) - S("s5", [["posthog", "DIFFERENT"]]), // control (same name, other author) - ]; - const { treatment, control } = cohortsForSkill(sessions, "posthog", "kamo"); - expect(treatment.map((s) => s.sessionId)).toEqual(["s1", "s3"]); - expect(control.map((s) => s.sessionId)).toEqual(["s2", "s4", "s5"]); // s5: name matches, author doesn't - }); - - it("skillKey is name--author", () => { - expect(skillKey("posthog", "kamo")).toBe("posthog--kamo"); - }); -}); - -describe("reconstructSession", () => { - it("orders by creation_date, keeps user/assistant turns, drops tool noise + empty", async () => { - const { fn, calls } = mockQuery([ - { message: { type: "user_message", content: "do X" } }, - { message: { type: "tool_call", tool_input: "{}", tool_response: "{}" } }, // dropped (no content) - { message: { type: "assistant_message", content: "did X" } }, - { message: { type: "assistant_message", content: " " } }, // dropped (blank) - { message: 
JSON.stringify({ type: "user_message", content: "thanks" }) }, // string payload - ]); - const out = await reconstructSession(fn, TABLE, "abc-123"); - expect(calls[0]).toContain("path LIKE '/sessions/%abc-123%'"); - expect(calls[0]).toContain("ORDER BY creation_date ASC"); - expect(out).toBe("USER: do X\n\nASSISTANT: did X\n\nUSER: thanks"); - }); - - it("head+tail elides a transcript longer than maxChars", async () => { - const big = "x".repeat(500); - const { fn } = mockQuery([ - { message: { type: "user_message", content: big } }, - { message: { type: "assistant_message", content: big } }, - ]); - const out = await reconstructSession(fn, TABLE, "s", 200); - expect(out).toContain("chars elided"); - expect(out.length).toBeLessThan(400); // ~maxChars + the elision marker, far below the ~1000 raw - }); - - it("escapes single quotes in the session id (no SQL break)", async () => { - const { fn, calls } = mockQuery([]); - await reconstructSession(fn, TABLE, "a'b"); - expect(calls[0]).toContain("/sessions/%a''b%"); - }); -}); diff --git a/tests/shared/skill-invocations.test.ts b/tests/shared/skill-invocations.test.ts new file mode 100644 index 00000000..ecdfadbb --- /dev/null +++ b/tests/shared/skill-invocations.test.ts @@ -0,0 +1,103 @@ +import { describe, it, expect, vi } from "vitest"; +import { + invokedSkillRef, + splitOrgSkill, + listSkillInvocations, + windowAroundInvocation, + type SkillInvocation, +} from "../../src/skillify/skill-invocations.js"; + +const TABLE = "sessions"; +function mockQuery(rows: Array>) { + const calls: string[] = []; + return { fn: vi.fn(async (sql: string) => { calls.push(sql); return rows; }), calls }; +} +const toolCall = (skill: string, sessionId = "S1", ts = "t", asString = false) => { + const msg = { type: "tool_call", tool_name: "Skill", tool_input: JSON.stringify({ skill }), session_id: sessionId, timestamp: ts }; + return { message: asString ? 
JSON.stringify(msg) : msg, last_update_date: ts }; +}; + +describe("invokedSkillRef", () => { + it("returns the skill ref for a Skill tool_call (object or stringified input)", () => { + expect(invokedSkillRef({ type: "tool_call", tool_name: "Skill", tool_input: JSON.stringify({ skill: "a--b" }) })).toBe("a--b"); + expect(invokedSkillRef({ type: "tool_call", tool_name: "Skill", tool_input: { skill: "a--b" } as unknown })).toBe("a--b"); + }); + it("returns null for non-Skill tools and non-tool_call messages", () => { + expect(invokedSkillRef({ type: "tool_call", tool_name: "Bash", tool_input: "{}" })).toBeNull(); + expect(invokedSkillRef({ type: "assistant_message", content: "use the Skill tool" })).toBeNull(); + expect(invokedSkillRef({ type: "tool_call", tool_name: "Skill", tool_input: "not json" })).toBeNull(); + }); +}); + +describe("splitOrgSkill", () => { + it("splits --, last -- wins", () => { + expect(splitOrgSkill("posthog-smoke--kamo.aghbalyan")).toEqual({ name: "posthog-smoke", author: "kamo.aghbalyan" }); + expect(splitOrgSkill("some-skill--first-last")).toEqual({ name: "some-skill", author: "first-last" }); + }); + it("rejects plugin-namespaced, bare, and malformed refs", () => { + expect(splitOrgSkill("hivemind:hivemind-memory")).toBeNull(); // plugin + expect(splitOrgSkill("update-config")).toBeNull(); // bare + expect(splitOrgSkill("baz--")).toBeNull(); // empty author + }); +}); + +describe("listSkillInvocations", () => { + it("coarse-prefilters on \"Skill\" then keeps only org-skill tool_calls", async () => { + const { fn, calls } = mockQuery([ + toolCall("posthog-smoke--kamo"), // org → kept + toolCall("hivemind:hivemind-memory"), // plugin → dropped + toolCall("update-config"), // bare → dropped + { message: { type: "assistant_message", content: "mentions Skill" }, last_update_date: "t" }, // prose → dropped + toolCall("pg-debug--sasun", "S2", "t2", true), // org, stringified message → kept + ]); + const got = await listSkillInvocations(fn, TABLE, 
{ sinceIso: "2026-06-01", limit: 100 }); + expect(calls[0]).toContain(`CAST(message AS TEXT) LIKE '%"Skill"%'`); + expect(calls[0]).toContain("last_update_date >= '2026-06-01'"); + expect(calls[0]).toContain("LIMIT 100"); + expect(got).toEqual([ + { sessionId: "S1", name: "posthog-smoke", author: "kamo", ts: "t" }, + { sessionId: "S2", name: "pg-debug", author: "sasun", ts: "t2" }, + ]); + }); +}); + +describe("windowAroundInvocation", () => { + const inv: SkillInvocation = { sessionId: "S1", name: "posthog-smoke", author: "kamo", ts: "t5" }; + // turns: u1, a1, [skill invoked here], u2(pushback), a2 → window before=1/after=2 ⇒ a1..a2 + const rows = [ + { message: { type: "user_message", content: "first" } }, + { message: { type: "assistant_message", content: "ack" } }, + { message: { type: "tool_call", tool_name: "Skill", tool_input: JSON.stringify({ skill: "posthog-smoke--kamo" }), timestamp: "t5" } }, + { message: { type: "tool_call", tool_name: "Bash", tool_input: "{}" } }, // non-skill tool → ignored + { message: { type: "user_message", content: "no that's wrong" } }, + { message: { type: "assistant_message", content: "fixing" } }, + ]; + + it("windows `before` turns before and `after` after the invocation", async () => { + const { fn, calls } = mockQuery(rows); + const out = await windowAroundInvocation(fn, TABLE, inv, { before: 1, after: 2 }); + expect(calls[0]).toContain("path LIKE '/sessions/%S1%'"); + // invIndex = 2 (two turns before the skill tool_call). before 1 → from turn 1; after 2 → turns 2,3. 
+ expect(out).toBe("ASSISTANT: ack\n\nUSER: no that's wrong\n\nASSISTANT: fixing"); + }); + + it("falls back to session end when the invocation can't be located", async () => { + const { fn } = mockQuery([ + { message: { type: "user_message", content: "hi" } }, + { message: { type: "assistant_message", content: "bye" } }, + ]); + const out = await windowAroundInvocation(fn, TABLE, inv, { before: 5, after: 5 }); + expect(out).toBe("USER: hi\n\nASSISTANT: bye"); // whole (short) transcript + }); + + it("elides a window longer than maxChars", async () => { + const big = "x".repeat(400); + const { fn } = mockQuery([ + { message: { type: "user_message", content: big } }, + { message: { type: "assistant_message", content: big } }, + ]); + const out = await windowAroundInvocation(fn, TABLE, inv, { before: 5, after: 5, maxChars: 150 }); + expect(out).toContain("chars elided"); + expect(out.length).toBeLessThan(300); + }); +}); diff --git a/tests/shared/skills-active.test.ts b/tests/shared/skills-active.test.ts deleted file mode 100644 index a7d1c295..00000000 --- a/tests/shared/skills-active.test.ts +++ /dev/null @@ -1,211 +0,0 @@ -import { describe, it, expect, beforeEach, afterEach } from "vitest"; -import fs from "node:fs"; -import os from "node:os"; -import path from "node:path"; -import { - listActiveOrgSkills, - sessionBucket, - buildSkillsActiveInsert, - buildSkillsActivePath, - skillRootsForCwd, - defaultSkillsRoot, -} from "../../src/skillify/skills-active.js"; -import type { PulledManifest } from "../../src/skillify/manifest.js"; - -/** Build a pull manifest from `(dirName, name, author)` triples (fills the rest with defaults). 
*/ -function manifestOf(...rows: Array<{ dirName: string; name: string; author: string }>): PulledManifest { - return { - version: 1, - entries: rows.map(r => ({ - dirName: r.dirName, - name: r.name, - author: r.author, - projectKey: "pk", - remoteVersion: 1, - install: "global" as const, - installRoot: "/install/root", - pulledAt: "2026-01-01T00:00:00.000Z", - symlinks: [], - })), - }; -} - -describe("listActiveOrgSkills", () => { - let root: string; - beforeEach(() => { - root = fs.mkdtempSync(path.join(os.tmpdir(), "skills-active-")); - }); - afterEach(() => { - fs.rmSync(root, { recursive: true, force: true }); - }); - - it("returns only manifest-recorded (pull-managed) dirs; excludes local-only + files", () => { - fs.mkdirSync(path.join(root, "posthog-event-smoke-testing--kamo.aghbalyan")); - fs.mkdirSync(path.join(root, "pg-deeplake-test-crash-debugging--sasun")); - fs.mkdirSync(path.join(root, "deploy--blue-green")); // local-only `--` dir, NOT pulled — excluded - fs.mkdirSync(path.join(root, "plan-confirm-then-execute")); // bare local — excluded - fs.writeFileSync(path.join(root, "notes--x.txt"), "x"); // file, not dir — excluded - const manifest = manifestOf( - { dirName: "posthog-event-smoke-testing--kamo.aghbalyan", name: "posthog-event-smoke-testing", author: "kamo.aghbalyan" }, - { dirName: "pg-deeplake-test-crash-debugging--sasun", name: "pg-deeplake-test-crash-debugging", author: "sasun" }, - ); - - const got = listActiveOrgSkills([root], manifest); - expect(got).toEqual([ - { name: "pg-deeplake-test-crash-debugging", author: "sasun", version: 1 }, - { name: "posthog-event-smoke-testing", author: "kamo.aghbalyan", version: 1 }, - ]); // sorted by name; exactly the 2 manifest-recorded skills; version defaults to 1 (no SKILL.md) - expect(got).toHaveLength(2); // local `deploy--blue-green` + bare + file all dropped - }); - - it("excludes a local-only dir whose name contains `--` when the manifest is empty (no false positive)", () => { - 
fs.mkdirSync(path.join(root, "deploy--blue-green")); // org-shaped name, but never pulled - expect(listActiveOrgSkills([root], manifestOf())).toEqual([]); - }); - - it("takes name/author from the manifest, not a dirname split (multi-`--` dir stays correct)", () => { - fs.mkdirSync(path.join(root, "some--weird--dirname")); - const manifest = manifestOf({ dirName: "some--weird--dirname", name: "some-skill", author: "first-last" }); - expect(listActiveOrgSkills([root], manifest)).toEqual([{ name: "some-skill", author: "first-last", version: 1 }]); - }); - - it("returns [] for a missing skills root (never throws)", () => { - expect(listActiveOrgSkills([path.join(root, "does-not-exist")], manifestOf())).toEqual([]); - }); - - it("scans project + global roots and dedups a skill present in both (P2: --to project)", () => { - const projectRoot = fs.mkdtempSync(path.join(os.tmpdir(), "skills-active-proj-")); - try { - fs.mkdirSync(path.join(root, "a-skill--alice")); // global-pulled - fs.mkdirSync(path.join(projectRoot, "a-skill--alice")); // also project-pulled (dup) - fs.mkdirSync(path.join(projectRoot, "b-skill--bob")); // project-only org skill - fs.mkdirSync(path.join(projectRoot, "local-only--x")); // not in manifest → excluded - const manifest = manifestOf( - { dirName: "a-skill--alice", name: "a-skill", author: "alice" }, - { dirName: "b-skill--bob", name: "b-skill", author: "bob" }, - ); - const got = listActiveOrgSkills([root, projectRoot], manifest); - expect(got).toEqual([ - { name: "a-skill", author: "alice", version: 1 }, // counted once despite two roots - { name: "b-skill", author: "bob", version: 1 }, // picked up from the project root - ]); - expect(got).toHaveLength(2); - } finally { - fs.rmSync(projectRoot, { recursive: true, force: true }); - } - }); - - it("reads the skill version from the installed SKILL.md frontmatter (enables v1-vs-v2)", () => { - fs.mkdirSync(path.join(root, "evolving-skill--sasun")); - fs.writeFileSync( - path.join(root, 
"evolving-skill--sasun", "SKILL.md"), - "---\nname: evolving-skill\nversion: 5\n---\nbody", - ); - const manifest = manifestOf({ dirName: "evolving-skill--sasun", name: "evolving-skill", author: "sasun" }); - expect(listActiveOrgSkills([root], manifest)).toEqual([{ name: "evolving-skill", author: "sasun", version: 5 }]); - }); -}); - -describe("skillRootsForCwd", () => { - it("returns only the global root when no cwd is given", () => { - expect(skillRootsForCwd()).toEqual([defaultSkillsRoot()]); - }); - it("adds the project-scoped /.claude/skills root when cwd is given", () => { - expect(skillRootsForCwd("/home/u/proj")).toEqual([ - defaultSkillsRoot(), - path.join("/home/u/proj", ".claude", "skills"), - ]); - }); -}); - -describe("sessionBucket", () => { - it("is deterministic for the same session id", () => { - expect(sessionBucket("abc-123")).toBe(sessionBucket("abc-123")); - }); - it("stays within [0, buckets)", () => { - for (const id of ["a", "b", "c", "xyz", "1874a6b2"]) { - const b = sessionBucket(id, 2); - expect(b).toBeGreaterThanOrEqual(0); - expect(b).toBeLessThan(2); - } - }); - it("assigns both buckets across many ids (not constant)", () => { - const seen = new Set(); - for (let i = 0; i < 200; i++) seen.add(sessionBucket(`session-${i}`)); - expect(seen).toEqual(new Set([0, 1])); // both arms populated → real randomization - }); -}); - -describe("buildSkillsActivePath", () => { - const config = { userName: "kamo", orgName: "activeloop", workspaceId: "default" }; - - it("namespaces under /skills_active/, NOT /sessions/ (so summary readers exclude it)", () => { - const p = buildSkillsActivePath(config, "S1"); - expect(p.startsWith("/skills_active/")).toBe(true); - expect(p.startsWith("/sessions/")).toBe(false); - // The exact filter the summary / raw-transcript readers use must NOT match this path. 
- expect(p.includes("/sessions/")).toBe(false); - }); - - it("embeds the full {user, org, workspace, session} tuple", () => { - expect(buildSkillsActivePath(config, "S1")).toBe( - "/skills_active/kamo/kamo_activeloop_default_S1.json", - ); - }); - - it("falls back to `default` workspace when workspaceId is absent", () => { - // covers the `?? \"default\"` branch (mirrors buildSessionPath) - const p = buildSkillsActivePath( - { userName: "kamo", orgName: "activeloop", workspaceId: undefined as unknown as string }, - "S1", - ); - expect(p).toBe("/skills_active/kamo/kamo_activeloop_default_S1.json"); - }); -}); - -describe("buildSkillsActiveInsert", () => { - const base = { - sessionsTable: "sessions", - sessionPath: "/sessions/kamo/kamo_activeloop_hivemind_S1.jsonl", - filename: "kamo_activeloop_hivemind_S1.jsonl", - userName: "kamo", - projectName: "hivemind", - pluginVersion: "0.7.99", - sessionId: "S1", - cwd: "/home/kamo/proj", - skills: [{ name: "pg-deeplake-test-crash-debugging", author: "sasun", version: 3 }], - bucket: 1, - ts: "2026-06-03T00:00:00.000Z", - }; - - it("emits exactly ONE insert into the sessions table (no second mutation)", () => { - const sql = buildSkillsActiveInsert(base); - expect((sql.match(/INSERT INTO/g) ?? []).length).toBe(1); - expect((sql.match(/UPDATE /g) ?? 
[]).length).toBe(0); - expect(sql).toContain('INSERT INTO "sessions"'); - }); - - it("writes a skills_active message with the skills, count, and bucket", () => { - const sql = buildSkillsActiveInsert(base); - const m = sql.match(/'(\{.*\})'::jsonb/s); - expect(m).toBeTruthy(); - const entry = JSON.parse(m![1]); - expect(entry.type).toBe("skills_active"); - expect(entry.session_id).toBe("S1"); - expect(entry.skills).toEqual([{ name: "pg-deeplake-test-crash-debugging", author: "sasun", version: 3 }]); - expect(entry.skills_count).toBe(1); - expect(entry.ab_bucket).toBe(1); - }); - - it("leaves message_embedding NULL (no daemon round-trip at SessionStart)", () => { - const sql = buildSkillsActiveInsert(base); - expect(sql).toMatch(/::jsonb,\s*NULL,/); - }); - - it("does NOT masquerade as a captured turn type", () => { - const sql = buildSkillsActiveInsert(base); - expect(sql).not.toContain('"type":"user_message"'); - expect(sql).not.toContain('"type":"tool_call"'); - expect(sql).not.toContain('"type":"assistant_message"'); - }); -}); From 1929c96e2264c5ef3e9ffc6c92d409edb86b04af Mon Sep 17 00:00:00 2001 From: kaghni Date: Fri, 5 Jun 2026 05:14:05 +0000 Subject: [PATCH 03/30] refactor(skillopt): drop skills_active availability attribution Skill *invocation* (Skill tool_use, already captured) is a more accurate signal than availability and pins the turn for windowing, so the SessionStart skills_active write + helper + the availability-based cohorts module are removed. The weekly trigger/worker stay. session-start-hook tests revert to the pre-attribution query sequence; trigger tests unchanged. 
--- node_modules | 1 + src/hooks/session-start.ts | 42 --------------- tests/claude-code/session-start-hook.test.ts | 56 ++++---------------- 3 files changed, 12 insertions(+), 87 deletions(-) create mode 120000 node_modules diff --git a/node_modules b/node_modules new file mode 120000 index 00000000..1dd8c0a6 --- /dev/null +++ b/node_modules @@ -0,0 +1 @@ +/home/ubuntu/al-projects/hivemind/node_modules \ No newline at end of file diff --git a/src/hooks/session-start.ts b/src/hooks/session-start.ts index af944953..0e63390b 100644 --- a/src/hooks/session-start.ts +++ b/src/hooks/session-start.ts @@ -14,7 +14,6 @@ import { loadConfig } from "../config.js"; import { DeeplakeApi } from "../deeplake-api.js"; import { sqlStr } from "../utils/sql.js"; import { projectNameFromCwd } from "../utils/project-name.js"; -import { listActiveOrgSkills, sessionBucket, buildSkillsActiveInsert, buildSkillsActivePath, skillRootsForCwd } from "../skillify/skills-active.js"; import { readStdin } from "../utils/stdin.js"; import { log as _log } from "../utils/debug.js"; import { getInstalledVersion } from "../utils/version-check.js"; @@ -223,11 +222,6 @@ async function main(): Promise { // freezes SessionStart. Hard opt-out via HIVEMIND_AUTOPULL_DISABLED=1. // All failures swallowed inside autoPullSkills (documented as // never-rejecting), so no try/catch needed here. - // - // Runs BEFORE the skill-attribution snapshot below so that a skill pulled - // (or upgraded) during THIS SessionStart is reflected in the recorded - // skills_active set — otherwise the row would capture a stale/empty set - // while the session can already use the freshly-pulled skill. const pullResult = await autoPullSkills(); log(`autopull: pulled=${pullResult.pulled} skipped=${pullResult.skipped}`); @@ -244,42 +238,6 @@ async function main(): Promise { await api.ensureSessionsTable(sessionsTable); await createPlaceholder(api, table, input.session_id, input.cwd ?? 
"", config.userName, config.orgName, config.workspaceId, pluginVersion); log("placeholder created"); - - // Skill attribution (measurement): record which org-shared skills were in - // context this session + a deterministic A/B bucket. This is the label that - // makes skill value measurable (sessions with vs without skill X / v1 vs v2). - // Org skills are identified via the pull manifest (authoritative), not the - // `--` dirname pattern. Snapshot runs after auto-pull (above) so it reflects - // freshly-pulled skills. Opt-out: HIVEMIND_SKILL_ATTRIBUTION=0. - // Swallowed — must never fail SessionStart. - if (process.env.HIVEMIND_SKILL_ATTRIBUTION !== "0") { - try { - // Scan global + project-scoped (/.claude/skills) roots so - // skills pulled with `--to project` are attributed too. - const skills = listActiveOrgSkills(skillRootsForCwd(input.cwd)); - // Distinct `/skills_active/` namespace (NOT `/sessions/`) so the summary / - // raw-transcript readers never mistake this attribution row for a transcript. - const attrSessionPath = buildSkillsActivePath(config, input.session_id); - const attrFilename = attrSessionPath.slice(attrSessionPath.lastIndexOf("/") + 1); - const sql = buildSkillsActiveInsert({ - sessionsTable, - sessionPath: attrSessionPath, - filename: attrFilename, - userName: config.userName, - projectName: projectNameFromCwd(input.cwd), - pluginVersion, - sessionId: input.session_id, - cwd: input.cwd, - skills, - bucket: sessionBucket(input.session_id), - ts: new Date().toISOString(), - }); - await api.query(sql); - log(`skills_active recorded: ${skills.length} org skills, bucket ${sessionBucket(input.session_id)}`); - } catch (e: any) { - log(`skills_active attribution failed (swallowed): ${e?.message ?? e}`); - } - } } else { const reason = process.env.HIVEMIND_CAPTURE === "false" ? 
"HIVEMIND_CAPTURE=false" diff --git a/tests/claude-code/session-start-hook.test.ts b/tests/claude-code/session-start-hook.test.ts index de5fa30a..c1639878 100644 --- a/tests/claude-code/session-start-hook.test.ts +++ b/tests/claude-code/session-start-hook.test.ts @@ -102,7 +102,6 @@ const stdoutSpy = vi.spyOn(process.stdout, "write"); async function runHook(env: Record = {}): Promise { delete process.env.HIVEMIND_WIKI_WORKER; delete process.env.HIVEMIND_CAPTURE; - delete process.env.HIVEMIND_SKILL_ATTRIBUTION; for (const [k, v] of Object.entries(env)) { if (v === undefined) delete process.env[k]; else process.env[k] = v; @@ -235,28 +234,21 @@ describe("session-start hook — placeholder branching", () => { expect(ensureTableMock).toHaveBeenCalled(); expect(ensureSessionsTableMock).toHaveBeenCalledWith("sessions"); // 1 SELECT (existing-summary check) + 1 INSERT (placeholder) - // + 1 INSERT (skills_active attribution) + 2 renderer SELECTs - // (listRules + listOpenGoals) = 5 queries. - expect(queryMock).toHaveBeenCalledTimes(5); + // + 2 renderer SELECTs (listRules + listOpenGoals) = 4 queries. + expect(queryMock).toHaveBeenCalledTimes(4); expect(queryMock.mock.calls[0][0]).toMatch(/^SELECT path FROM/); expect(queryMock.mock.calls[1][0]).toMatch(/^INSERT INTO/); - // skills_active attribution row — shape, not just count (asserts the new - // write is the attribution INSERT, so a second stray mutation can't sneak in). 
- expect(queryMock.mock.calls[2][0]).toMatch(/^INSERT INTO "sessions"/); - expect(queryMock.mock.calls[2][0]).toContain("skills_active"); - expect(queryMock.mock.calls[3][0]).toMatch(/^SELECT .* FROM "hivemind_rules"/); - expect(queryMock.mock.calls[4][0]).toMatch(/^SELECT .* FROM "hivemind_goals"/); + expect(queryMock.mock.calls[2][0]).toMatch(/^SELECT .* FROM "hivemind_rules"/); + expect(queryMock.mock.calls[3][0]).toMatch(/^SELECT .* FROM "hivemind_goals"/); expect(debugLogMock).toHaveBeenCalledWith("placeholder created"); }); it("skips placeholder INSERT when summary already exists (resumed session)", async () => { queryMock.mockResolvedValueOnce([{ path: "/summaries/alice/sid-1.md" }]); await runHook(); - // 1 placeholder SELECT (returns row, no INSERT) + 1 skills_active - // attribution INSERT (runs regardless of placeholder branch) + 2 renderer - // SELECTs (rules + goals) = 4 queries. - expect(queryMock).toHaveBeenCalledTimes(4); - expect(queryMock.mock.calls[1][0]).toContain("skills_active"); + // 1 placeholder SELECT (returns row, no INSERT) + 2 renderer SELECTs + // (rules + goals) = 3 queries. 
+ expect(queryMock).toHaveBeenCalledTimes(3); }); it("non-empty rules block is appended to additionalContext", async () => { @@ -272,7 +264,6 @@ describe("session-start hook — placeholder branching", () => { }; queryMock.mockResolvedValueOnce([]); // placeholder SELECT queryMock.mockResolvedValueOnce([]); // placeholder INSERT - queryMock.mockResolvedValueOnce([]); // skills_active attribution INSERT queryMock.mockResolvedValueOnce([rule]); // renderer rules queryMock.mockResolvedValueOnce([]); // renderer goals (empty) const out = await runHook(); @@ -296,13 +287,12 @@ describe("session-start hook — placeholder branching", () => { }; queryMock.mockResolvedValueOnce([]); // placeholder SELECT queryMock.mockResolvedValueOnce([]); // placeholder INSERT - queryMock.mockResolvedValueOnce([]); // skills_active attribution INSERT queryMock.mockResolvedValueOnce([rule]); // renderer rules queryMock.mockResolvedValueOnce([]); // renderer goals (empty) const out = await runHook(); const parsed = JSON.parse(out!); expect(parsed.hookSpecificOutput.additionalContext).toContain("no DROP TABLE on prod"); - expect(queryMock).toHaveBeenCalledTimes(5); + expect(queryMock).toHaveBeenCalledTimes(4); }); it("skips the renderer SELECTs when the trusted table list omits rules + goals", async () => { @@ -310,12 +300,11 @@ describe("session-start hook — placeholder branching", () => { // SELECT. Only the placeholder SELECT + INSERT run. knownTablesMock.mockResolvedValue([]); await runHook(); - // placeholder SELECT + placeholder INSERT + skills_active attribution - // INSERT = 3 (renderer fires no SELECT when no tables are trusted). - expect(queryMock).toHaveBeenCalledTimes(3); + // placeholder SELECT + placeholder INSERT = 2 (renderer fires no SELECT + // when no tables are trusted). 
+ expect(queryMock).toHaveBeenCalledTimes(2); expect(queryMock.mock.calls[0][0]).toMatch(/^SELECT path FROM/); expect(queryMock.mock.calls[1][0]).toMatch(/^INSERT INTO/); - expect(queryMock.mock.calls[2][0]).toContain("skills_active"); }); it("HIVEMIND_CAPTURE=false: no placeholder, no DDL (ensure), but renderer still runs", async () => { @@ -336,29 +325,6 @@ describe("session-start hook — placeholder branching", () => { ); }); - it("HIVEMIND_SKILL_ATTRIBUTION=0: skips the skills_active write entirely", async () => { - await runHook({ HIVEMIND_SKILL_ATTRIBUTION: "0" }); - // placeholder SELECT + INSERT + 2 renderer SELECTs = 4 (NO attribution row). - expect(queryMock).toHaveBeenCalledTimes(4); - // negative assertion: the attribution INSERT must not be present at all. - for (const call of queryMock.mock.calls) { - expect(call[0]).not.toContain("skills_active"); - } - }); - - it("swallows a failed skills_active attribution write (never breaks SessionStart)", async () => { - // Content-based (not position-based) so it's robust to query ordering: the - // attribution INSERT is the only query carrying the skills_active marker. - queryMock.mockImplementation((sql: string) => - sql.includes("skills_active") ? 
Promise.reject(new Error("attr boom")) : Promise.resolve([]), - ); - const out = await runHook(); - expect(out).toBeTruthy(); // hook still completes and emits context - expect(debugLogMock).toHaveBeenCalledWith( - expect.stringContaining("skills_active attribution failed (swallowed): attr boom"), - ); - }); - it("logs the SkillOpt fired branch when the weekly trigger spawns a worker", async () => { runWeeklySkillOptMock.mockReturnValue({ fired: true, reason: "spawned" }); await runHook(); From 72dca56ea10acb7d2349b87345506c753b6971ad Mon Sep 17 00:00:00 2001 From: kaghni Date: Fri, 5 Jun 2026 05:16:56 +0000 Subject: [PATCH 04/30] feat(skillopt): heuristic correction anchor (reward level 1) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pure, free pushback detector: a USER turn right after an ASSISTANT turn matching correction patterns = a hard, observable failure signal, independent of any LLM. Tuned for recall (false positives just cost a judge call; false negatives only under-detect — never churns a good skill). Exposes windowedTurns() from skill-invocations so the anchor reads the same windowed slice the judge will. 6 anchor tests + patterns to tune on real data. --- src/skillify/session-anchor.ts | 43 +++++++++++++++++++++++++++++ src/skillify/skill-invocations.ts | 19 +++++++++---- tests/shared/session-anchor.test.ts | 41 +++++++++++++++++++++++++++ 3 files changed, 98 insertions(+), 5 deletions(-) create mode 100644 src/skillify/session-anchor.ts create mode 100644 tests/shared/session-anchor.test.ts diff --git a/src/skillify/session-anchor.ts b/src/skillify/session-anchor.ts new file mode 100644 index 00000000..677b8598 --- /dev/null +++ b/src/skillify/session-anchor.ts @@ -0,0 +1,43 @@ +/** + * Heuristic "anchor" — a HARD, observable signal in the transcript that a session + * went badly, independent of any LLM judgment: the user pushed back on / corrected + * what the assistant just did. 
Pure + free (no LLM, no I/O). + * + * It's the level-1 filter in the outcome pipeline: only windows with an anchor go + * to the (paid) success-judge, and a session is labelled a failure only when the + * anchor AND the judge agree. So this is deliberately tuned for RECALL over + * precision — a false positive just costs one judge call (which then drops it), + * but a false negative under-detects (conservative — it never churns a good skill). + * Patterns are meant to be tuned against real sessions; this is a starting set. + */ +import type { Turn } from "./skill-invocations.js"; + +export type AnchorKind = "correction" | "none"; +export interface Anchor { + anchored: boolean; + kind: AnchorKind; + evidence: string; // the user turn that triggered it (truncated) +} + +// User pushback: rejection / correction of what the assistant just produced. +const PUSHBACK = /\b(no|nope|wrong|incorrect|not what|that'?s not|does ?n'?t work|did ?n'?t work|do ?n'?t work|wo ?n'?t work|is ?n'?t|that'?s wrong|broke|broken|still (failing|broken|not working|wrong|the same)|try again|undo|revert that|that fail)/i; + +// Clear benign negatives we don't want to fire on (keeps obvious false positives +// out of the judge to save tokens). Intentionally narrow — when in doubt, fire. +const BENIGN = /\b(no (problem|worries|need|biggie)|no,? thanks|all good|works? (now|great|fine|perfectly)|that works|perfect|looks good|thank)/i; + +/** + * Detect a correction anchor in a windowed slice of turns. Only a USER turn that + * immediately follows an ASSISTANT turn can be pushback (the first user turn is + * the request, not a reaction). 
+ */ +export function detectAnchor(turns: Turn[]): Anchor { + for (let i = 1; i < turns.length; i++) { + const t = turns[i]; + if (t.role !== "USER" || turns[i - 1].role !== "ASSISTANT") continue; + if (PUSHBACK.test(t.text) && !BENIGN.test(t.text)) { + return { anchored: true, kind: "correction", evidence: t.text.slice(0, 200) }; + } + } + return { anchored: false, kind: "none", evidence: "" }; +} diff --git a/src/skillify/skill-invocations.ts b/src/skillify/skill-invocations.ts index 3cb0fd31..e5ef664f 100644 --- a/src/skillify/skill-invocations.ts +++ b/src/skillify/skill-invocations.ts @@ -97,7 +97,7 @@ export async function listSkillInvocations( return out; } -interface Turn { role: "USER" | "ASSISTANT"; text: string } +export interface Turn { role: "USER" | "ASSISTANT"; text: string } /** * Reconstruct the transcript turns of a session, and mark where (between which two @@ -139,17 +139,26 @@ async function sessionTurns( * turns after — where the help-or-harm signal lives — head+tail elided to maxChars. * `before`/`after` are tunable; defaults chosen as a small starting point. */ +export async function windowedTurns( + query: QueryFn, + sessionsTable: string, + inv: SkillInvocation, + opts: { before?: number; after?: number } = {}, +): Promise { + const before = opts.before ?? 3; + const after = opts.after ?? 6; + const { turns, invIndex } = await sessionTurns(query, sessionsTable, inv); + return turns.slice(Math.max(0, invIndex - before), invIndex + after); +} + export async function windowAroundInvocation( query: QueryFn, sessionsTable: string, inv: SkillInvocation, opts: { before?: number; after?: number; maxChars?: number } = {}, ): Promise { - const before = opts.before ?? 3; - const after = opts.after ?? 6; const maxChars = opts.maxChars ?? 
4000; - const { turns, invIndex } = await sessionTurns(query, sessionsTable, inv); - const slice = turns.slice(Math.max(0, invIndex - before), invIndex + after); + const slice = await windowedTurns(query, sessionsTable, inv, opts); const joined = slice.map((t) => `${t.role}: ${t.text}`).join("\n\n"); if (joined.length <= maxChars) return joined; const head = joined.slice(0, Math.floor(maxChars * 0.55)); diff --git a/tests/shared/session-anchor.test.ts b/tests/shared/session-anchor.test.ts new file mode 100644 index 00000000..1780584f --- /dev/null +++ b/tests/shared/session-anchor.test.ts @@ -0,0 +1,41 @@ +import { describe, it, expect } from "vitest"; +import { detectAnchor } from "../../src/skillify/session-anchor.js"; +import type { Turn } from "../../src/skillify/skill-invocations.js"; + +const u = (text: string): Turn => ({ role: "USER", text }); +const a = (text: string): Turn => ({ role: "ASSISTANT", text }); + +describe("detectAnchor", () => { + it("fires on user pushback right after an assistant turn", () => { + const r = detectAnchor([u("add a smoke test"), a("done, here it is"), u("no that's wrong, it mocks the client")]); + expect(r.anchored).toBe(true); + expect(r.kind).toBe("correction"); + expect(r.evidence).toContain("wrong"); + }); + + it("does NOT fire on the opening request (no preceding assistant turn)", () => { + const r = detectAnchor([u("this won't work without a flush — add a smoke test")]); + expect(r.anchored).toBe(false); + }); + + it("does NOT fire on a user turn that follows another user turn", () => { + const r = detectAnchor([u("first"), u("that didn't work")]); // no assistant in between + expect(r.anchored).toBe(false); + }); + + it("suppresses clear benign negatives (no problem / works now / thanks)", () => { + expect(detectAnchor([a("fixed it"), u("no problem, thanks!")]).anchored).toBe(false); + expect(detectAnchor([a("try this"), u("works now, perfect")]).anchored).toBe(false); + }); + + it("catches several real correction 
phrasings", () => { + for (const p of ["that doesn't work", "still failing", "that's incorrect", "try again", "nope", "you broke the build"]) { + expect(detectAnchor([a("here"), u(p)]).anchored, p).toBe(true); + } + }); + + it("returns none when the user is satisfied / silent", () => { + expect(detectAnchor([u("do X"), a("done")]).anchored).toBe(false); + expect(detectAnchor([]).anchored).toBe(false); + }); +}); From 8398a2f27466bc62e90612018e6f87f487a70d19 Mon Sep 17 00:00:00 2001 From: kaghni Date: Fri, 5 Jun 2026 05:19:28 +0000 Subject: [PATCH 05/30] feat(skillopt): success-judge (reward level 2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit LLM judge of the windowed slice: was the task accomplished CORRECTLY? Prompted to ignore user mood (a praised-but-wrong answer is a FAILURE) — the anti-sycophancy axis the research validated. Runs on the user's claude -p (cheap default model, all tools denied), injected for tests. Conservative: unparseable/errored/empty judgments return success=1 so a flaky judge can only under-detect, never manufacture deficiency. 7 unit tests. --- src/skillify/success-judge.ts | 93 ++++++++++++++++++++++++++++++ tests/shared/success-judge.test.ts | 44 ++++++++++++++ 2 files changed, 137 insertions(+) create mode 100644 src/skillify/success-judge.ts create mode 100644 tests/shared/success-judge.test.ts diff --git a/src/skillify/success-judge.ts b/src/skillify/success-judge.ts new file mode 100644 index 00000000..3ad672a1 --- /dev/null +++ b/src/skillify/success-judge.ts @@ -0,0 +1,93 @@ +/** + * Success-judge — reward level 2. Given a windowed transcript slice, asks the + * model the ONE question that resists sycophancy: was the user's task actually + * accomplished CORRECTLY? (Ignore whether the user seemed happy — a praised-but- + * wrong answer is a failure.) Returns success 0|1 + confidence + reason. 
+ * + * Runs on the USER's own agent (claude -p) — cost lands on the user, so the + * default model is cheap and this is only ever called on anchor-flagged windows + * (level 1), never on every session. + * + * The model call is injected (ModelCall) so the judging logic is unit-tested with + * zero real LLM calls; the default shells out to `claude -p`, all tools denied. + * + * Conservative on failure: an unparseable/errored/empty judgment returns + * success=1 (do NOT count as a failure), so a flaky judge can never manufacture + * deficiency — it can only fail to detect (which the next run catches). + */ +import { spawn } from "node:child_process"; + +export interface SuccessVerdict { + success: 0 | 1; + confidence: number; // 0..1 + reason: string; +} + +/** (systemPrompt, userPrompt) -> raw model text. Injected for tests. */ +export type ModelCall = (systemPrompt: string, userPrompt: string) => Promise; + +const SYSTEM = + "You are a strict engineering reviewer. Judge ONLY whether the user's task was " + + "actually accomplished CORRECTLY in this session slice. Ignore whether the user " + + "seemed happy or polite — a praised-but-wrong answer is a FAILURE. Reply with " + + 'ONLY a JSON object: {"success": 0 or 1, "confidence": 0.0-1.0, "reason": ' + + '"<=200 chars citing concrete evidence"}.'; + +function buildUserPrompt(window: string): string { + return `Session slice (USER/ASSISTANT turns around a skill invocation):\n\n${window}\n\n` + + "Did the user's task get accomplished correctly? JSON only."; +} + +function extractJson(raw: string): Record | null { + let s = raw.trim(); + const fence = s.match(/```(?:json)?\s*([\s\S]*?)```/); + if (fence) s = fence[1].trim(); + const a = s.indexOf("{"); + const b = s.lastIndexOf("}"); + if (a === -1 || b <= a) return null; + try { return JSON.parse(s.slice(a, b + 1)) as Record; } catch { return null; } +} + +/** Parse a model response into a verdict; unparseable → conservative success=1. 
*/ +export function parseVerdict(raw: string): SuccessVerdict { + const j = extractJson(raw); + if (!j) return { success: 1, confidence: 0, reason: "unparseable judge output" }; + const fail = j.success === 0 || j.success === "0" || j.success === false; + const confidence = typeof j.confidence === "number" ? Math.max(0, Math.min(1, j.confidence)) : 0.5; + const reason = typeof j.reason === "string" ? j.reason.slice(0, 240) : ""; + return { success: fail ? 0 : 1, confidence, reason }; +} + +/** Default backend: claude -p, cheap model, all tools denied (pure-text judgment). */ +function claudeJudge(model = "haiku"): ModelCall { + return (system, user) => new Promise((resolve, reject) => { + const args = [ + "-p", user, "--model", model, "--no-session-persistence", + "--output-format", "json", "--system-prompt", system, + "--disallowed-tools", "Bash", "Edit", "Write", "Read", "Glob", "Grep", "WebFetch", "WebSearch", "Task", + ]; + const child = spawn("claude", args, { stdio: ["ignore", "pipe", "pipe"] }); + let out = ""; + let err = ""; + const timer = setTimeout(() => { child.kill("SIGKILL"); reject(new Error("judge timed out")); }, 120_000); + child.stdout.on("data", (d) => { out += String(d); }); + child.stderr.on("data", (d) => { err += String(d); }); + child.on("error", (e) => { clearTimeout(timer); reject(e); }); + child.on("close", (code) => { + clearTimeout(timer); + if (code !== 0) return reject(new Error(`claude exit ${code}: ${err.slice(0, 200)}`)); + try { resolve(String((JSON.parse(out) as { result?: unknown }).result ?? "")); } + catch { resolve(out); } + }); + }); +} + +export async function judgeSuccess(window: string, opts: { model?: ModelCall } = {}): Promise { + if (!window.trim()) return { success: 1, confidence: 0, reason: "empty window" }; + const model = opts.model ?? 
claudeJudge(); + try { + return parseVerdict(await model(SYSTEM, buildUserPrompt(window))); + } catch (e: unknown) { + return { success: 1, confidence: 0, reason: `judge failed: ${(e as Error)?.message ?? String(e)}` }; + } +} diff --git a/tests/shared/success-judge.test.ts b/tests/shared/success-judge.test.ts new file mode 100644 index 00000000..7386c65b --- /dev/null +++ b/tests/shared/success-judge.test.ts @@ -0,0 +1,44 @@ +import { describe, it, expect, vi } from "vitest"; +import { parseVerdict, judgeSuccess } from "../../src/skillify/success-judge.js"; + +describe("parseVerdict", () => { + it("parses a clean JSON verdict", () => { + expect(parseVerdict('{"success":0,"confidence":0.9,"reason":"mocks the client"}')) + .toEqual({ success: 0, confidence: 0.9, reason: "mocks the client" }); + }); + it("tolerates ```json fences and surrounding prose", () => { + const raw = "Here is my judgment:\n```json\n{\"success\": 1, \"confidence\": 0.8, \"reason\": \"ok\"}\n```\nDone."; + expect(parseVerdict(raw)).toEqual({ success: 1, confidence: 0.8, reason: "ok" }); + }); + it("treats success false/\"0\" as failure and clamps confidence", () => { + expect(parseVerdict('{"success":false,"confidence":2,"reason":"x"}')).toMatchObject({ success: 0, confidence: 1 }); + expect(parseVerdict('{"success":"0","confidence":-1,"reason":"x"}')).toMatchObject({ success: 0, confidence: 0 }); + }); + it("is conservative (success=1) on unparseable output", () => { + expect(parseVerdict("the model rambled with no json")).toMatchObject({ success: 1, confidence: 0 }); + }); +}); + +describe("judgeSuccess", () => { + it("returns the judged verdict from the injected model", async () => { + const model = vi.fn(async (_system: string, _user: string) => '{"success":0,"confidence":0.95,"reason":"no flush, event never sends"}'); + const v = await judgeSuccess("USER: do X\n\nASSISTANT: mocked it", { model }); + expect(v.success).toBe(0); + expect(model).toHaveBeenCalledOnce(); + // the judge must be 
told to ignore mood (anti-sycophancy) + asked for JSON + expect(model.mock.calls[0][0]).toMatch(/praised-but-wrong|Ignore whether the user/i); + }); + + it("is conservative (success=1) when the model call throws — a flaky judge can't manufacture failure", async () => { + const v = await judgeSuccess("USER: x\n\nASSISTANT: y", { model: vi.fn(async () => { throw new Error("boom"); }) }); + expect(v.success).toBe(1); + expect(v.reason).toContain("judge failed"); + }); + + it("short-circuits an empty window without calling the model", async () => { + const model = vi.fn(async () => "{}"); + const v = await judgeSuccess(" ", { model }); + expect(v.success).toBe(1); + expect(model).not.toHaveBeenCalled(); + }); +}); From f4efb8ae3ae43110714304b21ce5d2e5a769c6c4 Mon Sep 17 00:00:00 2001 From: kaghni Date: Fri, 5 Jun 2026 05:22:02 +0000 Subject: [PATCH 06/30] =?UTF-8?q?feat(skillopt):=20deficiency=20detector?= =?UTF-8?q?=20(invocation=20=E2=86=92=20anchor=20=E2=86=92=20judge=20?= =?UTF-8?q?=E2=86=92=20flag)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Orchestrates the engine's detection step: group org-skill invocations, window each, run the free anchor, and judge ONLY anchored windows. A confirmed failure needs anchor AND judge agreement (precision). A skill is deficient at >=minInvocations AND >=failureRateThreshold confirmed-failure rate. Returns per-skill stats + deficientCount (the caller's >=5 fire gate). All injected; 2 tests assert the min-n gate and that the judge runs only on anchored windows. 
--- src/skillify/deficiency-detector.ts | 98 ++++++++++++++++++++++++ tests/shared/deficiency-detector.test.ts | 69 +++++++++++++++++ 2 files changed, 167 insertions(+) create mode 100644 src/skillify/deficiency-detector.ts create mode 100644 tests/shared/deficiency-detector.test.ts diff --git a/src/skillify/deficiency-detector.ts b/src/skillify/deficiency-detector.ts new file mode 100644 index 00000000..70152597 --- /dev/null +++ b/src/skillify/deficiency-detector.ts @@ -0,0 +1,98 @@ +/** + * Deficiency detector — the core of the engine's "which skills are bad" step. + * + * For each org-skill invocation: window the transcript around it, run the FREE + * level-1 anchor (user pushback?), and only if anchored spend a level-2 judge + * call (was the task actually accomplished?). A "confirmed failure" requires BOTH + * — high precision, so we never churn a good skill. Aggregate per skill: a skill + * is deficient if it has enough invocations AND a high confirmed-failure rate. + * + * Token discipline: the judge runs ONLY on anchored windows (a fraction), on a + * windowed slice (not whole sessions). Everything injectable (query + judge model) + * so the whole orchestration is unit-tested with zero live Deeplake / LLM. + * + * The ≥5 fire gate lives with the caller (worker): we just return deficientCount. 
+ */ +import { + listSkillInvocations, windowedTurns, type QueryFn, type SkillInvocation, +} from "./skill-invocations.js"; +import { detectAnchor } from "./session-anchor.js"; +import { judgeSuccess, type ModelCall } from "./success-judge.js"; + +export interface SkillDeficiency { + name: string; + author: string; + invocations: number; // org-skill invocations examined + anchored: number; // had a level-1 anchor → judged + confirmedFailures: number; // anchor AND judge said success=0 + failureRate: number; // confirmedFailures / invocations + deficient: boolean; // failureRate >= threshold AND invocations >= minInvocations + examples: string[]; // a few failure reasons (for the proposer) +} + +export interface DetectorConfig { + minInvocations?: number; // min-n per skill before we trust the rate (default 8) + failureRateThreshold?: number; // confirmed-failure rate to flag deficient (default 0.4) + window?: { before?: number; after?: number; maxChars?: number }; + judge?: ModelCall; // injected; default = real claude judge + sinceIso?: string; // lookback bound + limit?: number; // cap invocation rows pulled +} + +const skillKey = (name: string, author: string) => `${name}--${author}`; + +export interface DetectionResult { + skills: SkillDeficiency[]; + deficientCount: number; +} + +export async function detectDeficientSkills( + query: QueryFn, + sessionsTable: string, + cfg: DetectorConfig = {}, +): Promise { + const minInvocations = cfg.minInvocations ?? 8; + const threshold = cfg.failureRateThreshold ?? 
0.4; + + const invocations = await listSkillInvocations(query, sessionsTable, { sinceIso: cfg.sinceIso, limit: cfg.limit }); + + const groups = new Map(); + for (const inv of invocations) { + const k = skillKey(inv.name, inv.author); + const arr = groups.get(k); + if (arr) arr.push(inv); else groups.set(k, [inv]); + } + + const skills: SkillDeficiency[] = []; + for (const list of groups.values()) { + let anchored = 0; + let confirmed = 0; + const examples: string[] = []; + for (const inv of list) { + const turns = await windowedTurns(query, sessionsTable, inv, cfg.window); + const anchor = detectAnchor(turns); + if (!anchor.anchored) continue; // free filter — no judge call + anchored++; + const window = turns.map((t) => `${t.role}: ${t.text}`).join("\n\n"); + const verdict = await judgeSuccess(window, { model: cfg.judge }); + if (verdict.success === 0) { // confirmed: anchor AND judge agree + confirmed++; + if (examples.length < 3) examples.push(verdict.reason || anchor.evidence); + } + } + const failureRate = list.length ? 
confirmed / list.length : 0; + skills.push({ + name: list[0].name, + author: list[0].author, + invocations: list.length, + anchored, + confirmedFailures: confirmed, + failureRate, + deficient: list.length >= minInvocations && failureRate >= threshold, + examples, + }); + } + + skills.sort((a, b) => b.failureRate - a.failureRate || b.invocations - a.invocations); + return { skills, deficientCount: skills.filter((s) => s.deficient).length }; +} diff --git a/tests/shared/deficiency-detector.test.ts b/tests/shared/deficiency-detector.test.ts new file mode 100644 index 00000000..6e00b3d2 --- /dev/null +++ b/tests/shared/deficiency-detector.test.ts @@ -0,0 +1,69 @@ +import { describe, it, expect, vi } from "vitest"; +import { detectDeficientSkills } from "../../src/skillify/deficiency-detector.js"; + +const TABLE = "sessions"; + +const invRow = (skill: string, sid: string) => ({ + message: { type: "tool_call", tool_name: "Skill", tool_input: JSON.stringify({ skill }), session_id: sid, timestamp: sid }, + last_update_date: sid, +}); +const transcript = (skill: string, sid: string, pushback: boolean) => [ + { message: { type: "user_message", content: "do it" } }, + { message: { type: "assistant_message", content: "done" } }, + { message: { type: "tool_call", tool_name: "Skill", tool_input: JSON.stringify({ skill }), timestamp: sid } }, + { message: { type: "user_message", content: pushback ? 
"no that's wrong, it mocks the client" : "thanks, perfect" } }, +]; + +function world() { + const invs: Array> = []; + const transcripts = new Map>>(); + const add = (skill: string, sid: string, pushback: boolean) => { + invs.push(invRow(skill, sid)); + transcripts.set(sid, transcript(skill, sid, pushback)); + }; + for (let i = 0; i < 10; i++) add("bad--auth", `bad${i}`, i < 5); // 5/10 pushback → deficient + for (let i = 0; i < 10; i++) add("good--auth", `good${i}`, false); // 0 pushback → healthy + for (let i = 0; i < 3; i++) add("sparse--auth", `sparse${i}`, true); // all fail but too few (min-n) + return { invs, transcripts }; +} + +describe("detectDeficientSkills", () => { + it("flags only skills with enough invocations AND a high confirmed-failure rate", async () => { + const { invs, transcripts } = world(); + const judge = vi.fn(async (_s: string, _u: string) => '{"success":0,"confidence":0.9,"reason":"mocks the client"}'); + const query = vi.fn(async (sql: string) => { + if (sql.includes('"Skill"') && sql.includes("ORDER BY last_update_date")) return invs; // the invocation list + const m = sql.match(/\/sessions\/%([^%]+)%/); // a window query + return m ? (transcripts.get(m[1]) ?? 
[]) : []; + }); + + const { skills, deficientCount } = await detectDeficientSkills(query, TABLE, { judge }); + const bad = skills.find((s) => s.name === "bad")!; + const good = skills.find((s) => s.name === "good")!; + const sparse = skills.find((s) => s.name === "sparse")!; + + expect(bad).toMatchObject({ invocations: 10, anchored: 5, confirmedFailures: 5, deficient: true }); + expect(bad.failureRate).toBeCloseTo(0.5); + expect(good).toMatchObject({ invocations: 10, anchored: 0, confirmedFailures: 0, deficient: false }); + expect(sparse).toMatchObject({ invocations: 3, confirmedFailures: 3, deficient: false }); // min-n blocks it + expect(deficientCount).toBe(1); + + // token discipline: judge runs ONLY on anchored windows (5 bad + 3 sparse = 8), never the 10 good + expect(judge).toHaveBeenCalledTimes(8); + }); + + it("respects a custom threshold + min-n", async () => { + const { invs, transcripts } = world(); + const judge = vi.fn(async (_s: string, _u: string) => '{"success":0,"confidence":0.9,"reason":"x"}'); + const query = vi.fn(async (sql: string) => { + if (sql.includes('"Skill"') && sql.includes("ORDER BY last_update_date")) return invs; + const m = sql.match(/\/sessions\/%([^%]+)%/); + return m ? (transcripts.get(m[1]) ?? 
[]) : []; + }); + // minInvocations 3, threshold 0.9 → only "sparse" (rate 1.0, 3 inv) qualifies; "bad" (0.5) doesn't + const { deficientCount, skills } = await detectDeficientSkills(query, TABLE, { judge, minInvocations: 3, failureRateThreshold: 0.9 }); + expect(skills.find((s) => s.name === "sparse")!.deficient).toBe(true); + expect(skills.find((s) => s.name === "bad")!.deficient).toBe(false); + expect(deficientCount).toBe(1); + }); +}); From 93149a0d7109853a8f7effae462162aab8c908f1 Mon Sep 17 00:00:00 2001 From: kaghni Date: Fri, 5 Jun 2026 05:26:26 +0000 Subject: [PATCH 07/30] feat(skillopt): structured edits + budget + slow-update region Port of SkillOpt's edit-application: append/insert_after/replace/delete ops, the edit budget (textual learning rate), and the protected region (the paper's slow-update) that fast edits must not touch. Pure + deterministic; 7 tests cover each op, the budget, and the protection. --- src/skillify/skill-edits.ts | 105 +++++++++++++++++++++++++++++++ tests/shared/skill-edits.test.ts | 55 ++++++++++++++++ 2 files changed, 160 insertions(+) create mode 100644 src/skillify/skill-edits.ts create mode 100644 tests/shared/skill-edits.test.ts diff --git a/src/skillify/skill-edits.ts b/src/skillify/skill-edits.ts new file mode 100644 index 00000000..218ba6a0 --- /dev/null +++ b/src/skillify/skill-edits.ts @@ -0,0 +1,105 @@ +/** + * Structured, bounded edits over a markdown SKILL.md — the paper's edit operations + * (append / insert_after / replace / delete) and "textual learning rate" (edit + * budget). Port of SkillOpt's skillopt/optimizer/skill.py. + * + * A protected region — between and + * — holds longitudinal guidance that fast per-edit changes must NOT touch (the + * paper's slow-update). Edits targeting it are skipped, and `append` lands above it. + * + * Pure + deterministic — no I/O, fully unit-testable. 
+ */ +export type EditOp = "append" | "insert_after" | "replace" | "delete"; +export interface Edit { + op: EditOp; + target?: string; // anchor text for insert_after / replace / delete + content?: string; // new text for append / insert_after / replace +} + +export const SU_START = ""; +export const SU_END = ""; + +function protectedRange(skill: string): [number, number] | null { + const a = skill.indexOf(SU_START); + const b = skill.indexOf(SU_END); + if (a === -1 || b === -1 || b < a) return null; + return [a, b + SU_END.length]; +} + +function targetsProtected(skill: string, target: string): boolean { + const r = protectedRange(skill); + if (!r || !target) return false; + const idx = skill.indexOf(target); + return idx !== -1 && idx >= r[0] && idx < r[1]; +} + +/** Enforce the edit budget ("textual learning rate"): keep at most `budget` edits. */ +export function selectEdits(edits: Edit[], budget: number): Edit[] { + return edits.slice(0, Math.max(0, budget)); +} + +export interface ApplyResult { + skill: string; + report: string[]; + applied: number; // how many edits actually changed the doc +} + +/** Apply bounded structured edits; protected-region targets are skipped. */ +export function applyEdits(skill: string, edits: Edit[]): ApplyResult { + let s = skill; + const report: string[] = []; + let applied = 0; + const ok = (msg: string) => { applied++; report.push(`OK ${msg}`); }; + + for (const e of edits) { + if (e.target && targetsProtected(s, e.target)) { + report.push(`SKIP ${e.op}: targets protected slow-update region`); + continue; + } + switch (e.op) { + case "append": { + const content = (e.content ?? "").trim(); + if (!content) { report.push("SKIP append: empty content"); break; } + const r = protectedRange(s); + if (r) s = s.slice(0, r[0]) + content + "\n\n" + s.slice(r[0]); + else s = s.replace(/\s*$/, "") + "\n\n" + content + "\n"; + ok(`append (+${content.length} chars)`); + break; + } + case "insert_after": { + const target = e.target ?? 
""; + const content = (e.content ?? "").trim(); + if (!target || !content) { report.push("SKIP insert_after: missing target/content"); break; } + const idx = s.indexOf(target); + if (idx === -1) { report.push("SKIP insert_after: target not found"); break; } + const lineEnd = s.indexOf("\n", idx + target.length); + const at = lineEnd === -1 ? s.length : lineEnd; + s = s.slice(0, at) + "\n" + content + s.slice(at); + ok("insert_after"); + break; + } + case "replace": { + const target = e.target ?? ""; + const content = e.content ?? ""; + if (!target) { report.push("SKIP replace: missing target"); break; } + const idx = s.indexOf(target); + if (idx === -1) { report.push("SKIP replace: target not found"); break; } + s = s.slice(0, idx) + content + s.slice(idx + target.length); + ok("replace"); + break; + } + case "delete": { + const target = e.target ?? ""; + if (!target) { report.push("SKIP delete: missing target"); break; } + const idx = s.indexOf(target); + if (idx === -1) { report.push("SKIP delete: target not found"); break; } + s = s.slice(0, idx) + s.slice(idx + target.length); + ok("delete"); + break; + } + default: + report.push(`SKIP unknown op: ${(e as Edit).op}`); + } + } + return { skill: s, report, applied }; +} diff --git a/tests/shared/skill-edits.test.ts b/tests/shared/skill-edits.test.ts new file mode 100644 index 00000000..3322cb48 --- /dev/null +++ b/tests/shared/skill-edits.test.ts @@ -0,0 +1,55 @@ +import { describe, it, expect } from "vitest"; +import { applyEdits, selectEdits, SU_START, SU_END } from "../../src/skillify/skill-edits.js"; + +describe("applyEdits", () => { + const base = "## Rules\n1. mock the client\n2. skip flush"; + + it("append adds content at the end", () => { + const r = applyEdits(base, [{ op: "append", content: "3. verify via the API" }]); + expect(r.skill).toContain("3. 
verify via the API"); + expect(r.applied).toBe(1); + }); + + it("insert_after inserts on the line after the target", () => { + const r = applyEdits(base, [{ op: "insert_after", target: "1. mock the client", content: "(NEVER mock — it hides failures)" }]); + expect(r.skill).toMatch(/1\. mock the client\n\(NEVER mock — it hides failures\)\n2\. skip flush/); + }); + + it("replace swaps the target text", () => { + const r = applyEdits(base, [{ op: "replace", target: "skip flush", content: "ALWAYS flush" }]); + expect(r.skill).toContain("2. ALWAYS flush"); + expect(r.skill).not.toContain("skip flush"); + }); + + it("delete removes the target text", () => { + const r = applyEdits(base, [{ op: "delete", target: "\n2. skip flush" }]); + expect(r.skill).toBe("## Rules\n1. mock the client"); + }); + + it("skips edits whose target isn't found (and counts only applied)", () => { + const r = applyEdits(base, [{ op: "replace", target: "nonexistent", content: "x" }, { op: "append", content: "added" }]); + expect(r.applied).toBe(1); + expect(r.report.some((l) => l.includes("SKIP replace: target not found"))).toBe(true); + }); + + it("protects the slow-update region: skips edits targeting it, appends ABOVE it", () => { + const doc = `## Rules\n1. a\n\n${SU_START}\nLongitudinal: prefer X over Y.\n${SU_END}`; + const r = applyEdits(doc, [ + { op: "delete", target: "prefer X over Y" }, // targets protected → skipped + { op: "append", content: "2. b" }, // lands above the region + ]); + expect(r.skill).toContain("prefer X over Y"); // protected content untouched + expect(r.report.some((l) => l.includes("protected slow-update region"))).toBe(true); + // appended content sits before the protected block + expect(r.skill.indexOf("2. 
b")).toBeLessThan(r.skill.indexOf(SU_START)); + }); +}); + +describe("selectEdits (edit budget)", () => { + const edits = [1, 2, 3, 4].map((i) => ({ op: "append" as const, content: `${i}` })); + it("keeps at most `budget` edits", () => { + expect(selectEdits(edits, 2)).toHaveLength(2); + expect(selectEdits(edits, 0)).toHaveLength(0); + expect(selectEdits(edits, 99)).toHaveLength(4); + }); +}); From a93ed3d4b416472f36ace19271d3ac2d76411950 Mon Sep 17 00:00:00 2001 From: kaghni Date: Fri, 5 Jun 2026 05:26:26 +0000 Subject: [PATCH 08/30] feat(skillopt): proposer (reflect -> structured edits) + shared claude backend MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The paper's backward pass: from a deficient skill's body + its confirmed failures, the optimizer diagnoses the single recurring weakness and emits bounded structured edits (slow-update region off-limits), applied locally to a candidate body — nothing published. Extracts the shared claude -p backend (claude-model.ts) used by both judge and proposer; success-judge refactored onto it. Injected model; tolerant edit parse; 5 proposer tests. --- src/skillify/claude-model.ts | 35 ++++++++++++ src/skillify/skill-proposer.ts | 87 +++++++++++++++++++++++++++++ src/skillify/success-judge.ts | 33 ++--------- tests/shared/skill-proposer.test.ts | 51 +++++++++++++++++ 4 files changed, 177 insertions(+), 29 deletions(-) create mode 100644 src/skillify/claude-model.ts create mode 100644 src/skillify/skill-proposer.ts create mode 100644 tests/shared/skill-proposer.test.ts diff --git a/src/skillify/claude-model.ts b/src/skillify/claude-model.ts new file mode 100644 index 00000000..47b5e8f0 --- /dev/null +++ b/src/skillify/claude-model.ts @@ -0,0 +1,35 @@ +/** + * Shared `claude -p` backend for the engine's LLM steps (success-judge, proposer). + * All tools denied → pure-text generation. Runs on the USER's own agent, so cost + * lands on the user. 
Returned as an injectable ModelCall so every LLM step is + * unit-testable with zero real calls. + */ +import { spawn } from "node:child_process"; + +/** (systemPrompt, userPrompt) -> raw model text. */ +export type ModelCall = (systemPrompt: string, userPrompt: string) => Promise; + +const DENY = ["Bash", "Edit", "Write", "Read", "Glob", "Grep", "WebFetch", "WebSearch", "Task"]; + +export function claudeModel(model: string, opts: { timeoutMs?: number } = {}): ModelCall { + const timeoutMs = opts.timeoutMs ?? 120_000; + return (system, user) => new Promise((resolve, reject) => { + const args = [ + "-p", user, "--model", model, "--no-session-persistence", + "--output-format", "json", "--system-prompt", system, "--disallowed-tools", ...DENY, + ]; + const child = spawn("claude", args, { stdio: ["ignore", "pipe", "pipe"] }); + let out = ""; + let err = ""; + const timer = setTimeout(() => { child.kill("SIGKILL"); reject(new Error("claude timed out")); }, timeoutMs); + child.stdout.on("data", (d) => { out += String(d); }); + child.stderr.on("data", (d) => { err += String(d); }); + child.on("error", (e) => { clearTimeout(timer); reject(e); }); + child.on("close", (code) => { + clearTimeout(timer); + if (code !== 0) return reject(new Error(`claude exit ${code}: ${err.slice(0, 200)}`)); + try { resolve(String((JSON.parse(out) as { result?: unknown }).result ?? "")); } + catch { resolve(out); } + }); + }); +} diff --git a/src/skillify/skill-proposer.ts b/src/skillify/skill-proposer.ts new file mode 100644 index 00000000..aa9c0159 --- /dev/null +++ b/src/skillify/skill-proposer.ts @@ -0,0 +1,87 @@ +/** + * Proposer — the engine's "reflect → edit" step (the paper's backward pass). + * Given a deficient skill's body + the concrete failures the detector confirmed, + * the optimizer diagnoses the single recurring weakness and proposes a SMALL set + * of structured edits, bounded by the edit budget ("textual learning rate"). The + * protected slow-update region is off-limits. 
Edits are applied locally to produce + * the candidate body — NOTHING is published here (publish is a separate, gated step). + * + * Runs on the user's agent via an injected ModelCall (default = claude sonnet), + * so the reflect logic is unit-testable with zero real LLM calls. + */ +import { applyEdits, selectEdits, SU_START, SU_END, type Edit, type EditOp } from "./skill-edits.js"; +import { claudeModel, type ModelCall } from "./claude-model.js"; + +export interface Proposal { + edits: Edit[]; // edits kept after the budget + editedBody: string; // skill body after applying them + report: string[]; // per-edit OK/SKIP log + changed: boolean; // did anything actually change? +} + +export interface ProposeConfig { + editBudget?: number; // max edits to keep (default 3) + model?: ModelCall; // injected; default = claude sonnet +} + +const SYSTEM = + "You improve an engineering SKILL document that has been producing repeated, " + + "confirmed failures. Diagnose the SINGLE recurring weakness behind the failures " + + "and propose a SMALL set of structured edits that fix it. Do NOT rewrite the " + + `whole doc, and do NOT touch anything between ${SU_START} and ${SU_END}. Reply ` + + 'with ONLY a JSON array of edits, each: {"op":"append|insert_after|replace|' + + 'delete","target":"","content":""}. Prefer the smallest change that fixes the weakness.'; + +function buildUserPrompt(body: string, failures: string[]): string { + const cases = failures.slice(0, 8).map((f, i) => `${i + 1}. ${f}`).join("\n"); + return `CURRENT SKILL:\n${body}\n\nCONFIRMED FAILURES it produced (user pushed back AND a judge confirmed the task was not accomplished):\n${cases}\n\nPropose the bounded edits. JSON array only.`; +} + +const OPS = new Set(["append", "insert_after", "replace", "delete"]); + +/** Tolerant parse of a JSON array of edits (handles ```fences / surrounding prose). 
*/ +export function parseEdits(raw: string): Edit[] { + let s = raw.trim(); + const fence = s.match(/```(?:json)?\s*([\s\S]*?)```/); + if (fence) s = fence[1].trim(); + const a = s.indexOf("["); + const b = s.lastIndexOf("]"); + if (a === -1 || b <= a) return []; + let arr: unknown; + try { arr = JSON.parse(s.slice(a, b + 1)); } catch { return []; } + if (!Array.isArray(arr)) return []; + const out: Edit[] = []; + for (const e of arr) { + if (!e || typeof e !== "object") continue; + const op = (e as { op?: unknown }).op; + if (typeof op !== "string" || !OPS.has(op as EditOp)) continue; + const target = (e as { target?: unknown }).target; + const content = (e as { content?: unknown }).content; + out.push({ + op: op as EditOp, + ...(typeof target === "string" ? { target } : {}), + ...(typeof content === "string" ? { content } : {}), + }); + } + return out; +} + +export async function proposeSkillEdit( + skillBody: string, + failures: string[], + cfg: ProposeConfig = {}, +): Promise { + const budget = cfg.editBudget ?? 3; + const model = cfg.model ?? claudeModel("sonnet"); + let raw: string; + try { + raw = await model(SYSTEM, buildUserPrompt(skillBody, failures)); + } catch { + return { edits: [], editedBody: skillBody, report: ["proposer model call failed"], changed: false }; + } + const edits = selectEdits(parseEdits(raw), budget); + const { skill, report, applied } = applyEdits(skillBody, edits); + return { edits, editedBody: skill, report, changed: applied > 0 }; +} diff --git a/src/skillify/success-judge.ts b/src/skillify/success-judge.ts index 3ad672a1..c2d65626 100644 --- a/src/skillify/success-judge.ts +++ b/src/skillify/success-judge.ts @@ -15,7 +15,9 @@ * success=1 (do NOT count as a failure), so a flaky judge can never manufacture * deficiency — it can only fail to detect (which the next run catches). 
*/ -import { spawn } from "node:child_process"; +import { claudeModel, type ModelCall } from "./claude-model.js"; + +export type { ModelCall }; export interface SuccessVerdict { success: 0 | 1; @@ -23,9 +25,6 @@ export interface SuccessVerdict { reason: string; } -/** (systemPrompt, userPrompt) -> raw model text. Injected for tests. */ -export type ModelCall = (systemPrompt: string, userPrompt: string) => Promise; - const SYSTEM = "You are a strict engineering reviewer. Judge ONLY whether the user's task was " + "actually accomplished CORRECTLY in this session slice. Ignore whether the user " + @@ -58,33 +57,9 @@ export function parseVerdict(raw: string): SuccessVerdict { return { success: fail ? 0 : 1, confidence, reason }; } -/** Default backend: claude -p, cheap model, all tools denied (pure-text judgment). */ -function claudeJudge(model = "haiku"): ModelCall { - return (system, user) => new Promise((resolve, reject) => { - const args = [ - "-p", user, "--model", model, "--no-session-persistence", - "--output-format", "json", "--system-prompt", system, - "--disallowed-tools", "Bash", "Edit", "Write", "Read", "Glob", "Grep", "WebFetch", "WebSearch", "Task", - ]; - const child = spawn("claude", args, { stdio: ["ignore", "pipe", "pipe"] }); - let out = ""; - let err = ""; - const timer = setTimeout(() => { child.kill("SIGKILL"); reject(new Error("judge timed out")); }, 120_000); - child.stdout.on("data", (d) => { out += String(d); }); - child.stderr.on("data", (d) => { err += String(d); }); - child.on("error", (e) => { clearTimeout(timer); reject(e); }); - child.on("close", (code) => { - clearTimeout(timer); - if (code !== 0) return reject(new Error(`claude exit ${code}: ${err.slice(0, 200)}`)); - try { resolve(String((JSON.parse(out) as { result?: unknown }).result ?? 
"")); } - catch { resolve(out); } - }); - }); -} - export async function judgeSuccess(window: string, opts: { model?: ModelCall } = {}): Promise { if (!window.trim()) return { success: 1, confidence: 0, reason: "empty window" }; - const model = opts.model ?? claudeJudge(); + const model = opts.model ?? claudeModel("haiku"); // cheap default; judge only runs on anchored windows try { return parseVerdict(await model(SYSTEM, buildUserPrompt(window))); } catch (e: unknown) { diff --git a/tests/shared/skill-proposer.test.ts b/tests/shared/skill-proposer.test.ts new file mode 100644 index 00000000..cb1a1c00 --- /dev/null +++ b/tests/shared/skill-proposer.test.ts @@ -0,0 +1,51 @@ +import { describe, it, expect, vi } from "vitest"; +import { parseEdits, proposeSkillEdit } from "../../src/skillify/skill-proposer.js"; + +describe("parseEdits", () => { + it("parses a JSON array, tolerating fences/prose, dropping invalid ops + non-objects", () => { + const raw = "Sure:\n```json\n[" + + '{"op":"replace","target":"mock the client","content":"NEVER mock"},' + + '{"op":"bogus","target":"x"},' + // invalid op → dropped + '"nope",' + // non-object → dropped + '{"op":"append","content":"verify via API"}' + + "]\n```"; + const edits = parseEdits(raw); + expect(edits).toEqual([ + { op: "replace", target: "mock the client", content: "NEVER mock" }, + { op: "append", content: "verify via API" }, + ]); + }); + it("returns [] when there's no array", () => { + expect(parseEdits("the model refused")).toEqual([]); + }); +}); + +describe("proposeSkillEdit", () => { + const body = "## Rules\n1. mock the client\n2. 
skip flush"; + const failures = ["mocked the client so the test passes even when the event never sends"]; + + it("applies the proposed edits to produce a candidate body", async () => { + const model = vi.fn(async (_s: string, _u: string) => + '[{"op":"replace","target":"mock the client","content":"NEVER mock — assert on the real client"}]'); + const p = await proposeSkillEdit(body, failures, { model }); + expect(p.changed).toBe(true); + expect(p.editedBody).toContain("NEVER mock — assert on the real client"); + // the optimizer is told to diagnose the recurring weakness + emit JSON edits + expect(model.mock.calls[0][0]).toMatch(/recurring weakness/i); + expect(model.mock.calls[0][1]).toContain("CONFIRMED FAILURES"); + }); + + it("enforces the edit budget", async () => { + const model = vi.fn(async (_s: string, _u: string) => + '[{"op":"append","content":"a"},{"op":"append","content":"b"},{"op":"append","content":"c"}]'); + const p = await proposeSkillEdit(body, failures, { model, editBudget: 1 }); + expect(p.edits).toHaveLength(1); + expect(p.editedBody).toContain("\na"); + expect(p.editedBody).not.toContain("\nb"); + }); + + it("is a no-op when the model fails or proposes nothing", async () => { + expect((await proposeSkillEdit(body, failures, { model: vi.fn(async () => { throw new Error("x"); }) })).changed).toBe(false); + expect((await proposeSkillEdit(body, failures, { model: vi.fn(async (_s: string, _u: string) => "no edits") })).changed).toBe(false); + }); +}); From 5fc4940a87488fe7bba82b27f612b1b98fd5f0cb Mon Sep 17 00:00:00 2001 From: kaghni Date: Fri, 5 Jun 2026 05:28:21 +0000 Subject: [PATCH 09/30] feat(skillopt): live publish mechanism (version bump + backup) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Writes an accepted edit to the live SKILL.md via the native skills dir (never the model channel — PR #223), bumps the frontmatter version (v1-vs-v2), and backs up the prior version for one-command revert. 
Mechanism only: the worker won't call it on an unvalidated edit (offline gate isn't trustworthy) — reserved for the real-usage A/B gate. 6 fs tests. --- src/skillify/skill-publisher.ts | 59 ++++++++++++++++++++++++++++ tests/shared/skill-publisher.test.ts | 57 +++++++++++++++++++++++++++ 2 files changed, 116 insertions(+) create mode 100644 src/skillify/skill-publisher.ts create mode 100644 tests/shared/skill-publisher.test.ts diff --git a/src/skillify/skill-publisher.ts b/src/skillify/skill-publisher.ts new file mode 100644 index 00000000..68a3f52d --- /dev/null +++ b/src/skillify/skill-publisher.ts @@ -0,0 +1,59 @@ +/** + * Publish mechanism: write an accepted skill edit to the LIVE SKILL.md via the + * native skills dir (the only legitimate channel — never the model's prompt + * context; see PR #223). Bumps the frontmatter version (enables v1-vs-v2) and + * keeps a backup so a bad edit is one `cp` from revert. + * + * This is the mechanism only. The worker does NOT call it on an unvalidated edit + * (the offline gate isn't trustworthy — see the spike findings); it writes a + * review proposal instead, and live publish is reserved for an edit that has + * passed the real-usage A/B gate (deferred). Pure fs; testable against a tmp dir. + */ +import fs from "node:fs"; +import path from "node:path"; + +export interface PublishResult { + path: string; + oldVersion: number; + newVersion: number; + backupPath: string; +} + +/** Split a SKILL.md into its frontmatter block (incl. fences) and the body. */ +export function splitFrontmatter(md: string): { frontmatter: string; body: string } { + const m = md.match(/^(---\n[\s\S]*?\n---\n)([\s\S]*)$/); + if (m) return { frontmatter: m[1], body: m[2] }; + return { frontmatter: "", body: md }; +} + +/** Bump `version: N` in a frontmatter block (absent → treat as 1 → 2). 
*/ +export function bumpVersion(frontmatter: string): { frontmatter: string; oldVersion: number; newVersion: number } { + const m = frontmatter.match(/^version:\s*(\d+)\s*$/m); + const oldVersion = m ? parseInt(m[1], 10) : 1; + const newVersion = oldVersion + 1; + const next = m + ? frontmatter.replace(/^version:\s*\d+\s*$/m, `version: ${newVersion}`) + : frontmatter.replace(/\n---\n$/, `\nversion: ${newVersion}\n---\n`); + return { frontmatter: next, oldVersion, newVersion }; +} + +/** + * Write `editedBody` to the skill's live SKILL.md, version bumped, original backed + * up to SKILL.v.bak.md. Throws if the skill dir / file isn't present. + */ +export function publishSkillEdit( + skillsRoot: string, + name: string, + author: string, + editedBody: string, +): PublishResult { + const dir = path.join(skillsRoot, `${name}--${author}`); + const file = path.join(dir, "SKILL.md"); + const existing = fs.readFileSync(file, "utf8"); + const { frontmatter } = splitFrontmatter(existing); + const { frontmatter: bumped, oldVersion, newVersion } = bumpVersion(frontmatter); + const backupPath = path.join(dir, `SKILL.v${oldVersion}.bak.md`); + fs.writeFileSync(backupPath, existing); + fs.writeFileSync(file, `${bumped}${editedBody.trimEnd()}\n`); + return { path: file, oldVersion, newVersion, backupPath }; +} diff --git a/tests/shared/skill-publisher.test.ts b/tests/shared/skill-publisher.test.ts new file mode 100644 index 00000000..065b5929 --- /dev/null +++ b/tests/shared/skill-publisher.test.ts @@ -0,0 +1,57 @@ +import { describe, it, expect, beforeEach, afterEach } from "vitest"; +import fs from "node:fs"; +import os from "node:os"; +import path from "node:path"; +import { splitFrontmatter, bumpVersion, publishSkillEdit } from "../../src/skillify/skill-publisher.js"; + +describe("splitFrontmatter", () => { + it("splits frontmatter from body", () => { + const { frontmatter, body } = splitFrontmatter("---\nname: x\nversion: 3\n---\n## Body\nhi"); + 
expect(frontmatter).toBe("---\nname: x\nversion: 3\n---\n"); + expect(body).toBe("## Body\nhi"); + }); + it("handles a doc with no frontmatter", () => { + expect(splitFrontmatter("just body")).toEqual({ frontmatter: "", body: "just body" }); + }); +}); + +describe("bumpVersion", () => { + it("increments an existing version", () => { + const r = bumpVersion("---\nname: x\nversion: 4\n---\n"); + expect(r.oldVersion).toBe(4); + expect(r.newVersion).toBe(5); + expect(r.frontmatter).toContain("version: 5"); + }); + it("inserts version 2 when absent (original treated as 1)", () => { + const r = bumpVersion("---\nname: x\n---\n"); + expect(r).toMatchObject({ oldVersion: 1, newVersion: 2 }); + expect(r.frontmatter).toMatch(/version: 2\n---\n$/); + }); +}); + +describe("publishSkillEdit", () => { + let root: string; + beforeEach(() => { root = fs.mkdtempSync(path.join(os.tmpdir(), "pub-")); }); + afterEach(() => { fs.rmSync(root, { recursive: true, force: true }); }); + + it("writes the bumped body and backs up the original", () => { + const dir = path.join(root, "posthog--kamo"); + fs.mkdirSync(dir); + fs.writeFileSync(path.join(dir, "SKILL.md"), "---\nname: posthog\nauthor: kamo\nversion: 2\n---\n## Rules\n1. mock the client\n"); + + const res = publishSkillEdit(root, "posthog", "kamo", "## Rules\n1. NEVER mock — assert on the real client"); + + expect(res).toMatchObject({ oldVersion: 2, newVersion: 3 }); + const written = fs.readFileSync(res.path, "utf8"); + expect(written).toContain("version: 3"); + expect(written).toContain("NEVER mock — assert on the real client"); + expect(written).not.toContain("1. mock the client\n"); + // backup preserves the prior version verbatim + expect(fs.readFileSync(res.backupPath, "utf8")).toContain("version: 2"); + expect(fs.readFileSync(res.backupPath, "utf8")).toContain("1. 
mock the client"); + }); + + it("throws when the skill isn't installed (caller decides what to do)", () => { + expect(() => publishSkillEdit(root, "missing", "x", "body")).toThrow(); + }); +}); From de5bac2f298ebc65652ebef9f83228d3c225f64d Mon Sep 17 00:00:00 2001 From: kaghni Date: Fri, 5 Jun 2026 05:30:28 +0000 Subject: [PATCH 10/30] feat(skillopt): weekly cycle orchestration + >=5 fire gate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit runSkillOptCycle ties it together: detect deficient skills, fire only at >=fireThreshold (default 5 — act on a pattern, not noise), then for each: read the body, propose a bounded edit, write a REVIEW PROPOSAL (candidate.md + proposal.json with full evidence). Never auto-overwrites a live skill — live publish is gated on the deferred A/B. All I/O injected; 4 tests cover the gate, proposals, skip-when- not-installed, and a custom threshold. --- src/skillify/skillopt-engine.ts | 98 ++++++++++++++++++++++++++++ tests/shared/skillopt-engine.test.ts | 86 ++++++++++++++++++++++++ 2 files changed, 184 insertions(+) create mode 100644 src/skillify/skillopt-engine.ts create mode 100644 tests/shared/skillopt-engine.test.ts diff --git a/src/skillify/skillopt-engine.ts b/src/skillify/skillopt-engine.ts new file mode 100644 index 00000000..8af765d0 --- /dev/null +++ b/src/skillify/skillopt-engine.ts @@ -0,0 +1,98 @@ +/** + * The weekly SkillOpt cycle, wired end to end and fully injectable: + * + * detect deficient skills → ≥5 fire gate → for each: read body, propose a + * bounded edit, write a REVIEW PROPOSAL (not a live publish). + * + * Why proposals, not auto-publish: the offline gate isn't trustworthy (spike + * finding), so we never auto-overwrite a live skill. The engine surfaces concrete, + * evidence-backed edit proposals; turning one live is gated on the real-usage A/B + * (deferred) or a human. 
Everything is injected (query, judge/proposer models, the + * skill reader, the proposal writer), so this orchestration is unit-tested with no + * Deeplake / LLM / fs. + */ +import fs from "node:fs"; +import path from "node:path"; +import { detectDeficientSkills, type DetectorConfig } from "./deficiency-detector.js"; +import { proposeSkillEdit, type ProposeConfig } from "./skill-proposer.js"; +import { splitFrontmatter } from "./skill-publisher.js"; +import type { QueryFn } from "./skill-invocations.js"; +import type { Edit } from "./skill-edits.js"; + +export interface ProposalRecord { + name: string; + author: string; + invocations: number; + confirmedFailures: number; + failureRate: number; + examples: string[]; + edits: Edit[]; + report: string[]; + candidateBody: string; + createdAt: string; +} + +export interface CycleDeps { + query: QueryFn; + sessionsTable: string; + readSkillBody: (name: string, author: string) => string | null; // null when not installed locally + writeProposal: (rec: ProposalRecord) => void; + detector?: DetectorConfig; + proposer?: ProposeConfig; + fireThreshold?: number; // deficient-skill count to fire (default 5) + maxProposals?: number; // cap edits proposed per cycle (default 10) + now: string; // ISO timestamp (injected — Date is awkward in workers) +} + +export interface CycleResult { + deficientCount: number; + fired: boolean; + proposals: Array<{ name: string; author: string; changed: boolean; failureRate: number }>; +} + +export async function runSkillOptCycle(deps: CycleDeps): Promise { + const fireThreshold = deps.fireThreshold ?? 5; + const { skills, deficientCount } = await detectDeficientSkills(deps.query, deps.sessionsTable, deps.detector); + + // The ≥N gate: only act on a real PATTERN of deficiency, not one or two noisy skills. + if (deficientCount < fireThreshold) { + return { deficientCount, fired: false, proposals: [] }; + } + + const targets = skills.filter((s) => s.deficient).slice(0, deps.maxProposals ?? 
10); + const proposals: CycleResult["proposals"] = []; + for (const s of targets) { + const body = deps.readSkillBody(s.name, s.author); + if (!body) continue; // not installed locally → nothing to edit + const p = await proposeSkillEdit(body, s.examples, deps.proposer); + if (p.changed) { + deps.writeProposal({ + name: s.name, author: s.author, + invocations: s.invocations, confirmedFailures: s.confirmedFailures, failureRate: s.failureRate, + examples: s.examples, edits: p.edits, report: p.report, + candidateBody: p.editedBody, createdAt: deps.now, + }); + } + proposals.push({ name: s.name, author: s.author, changed: p.changed, failureRate: s.failureRate }); + } + return { deficientCount, fired: true, proposals }; +} + +/** Default proposal writer: /--/{proposal.json,candidate.md}. */ +export function writeProposalToDisk(proposalsRoot: string, rec: ProposalRecord): string { + const dir = path.join(proposalsRoot, `${rec.name}--${rec.author}`); + fs.mkdirSync(dir, { recursive: true }); + fs.writeFileSync(path.join(dir, "candidate.md"), rec.candidateBody.trimEnd() + "\n"); + fs.writeFileSync(path.join(dir, "proposal.json"), JSON.stringify(rec, null, 2) + "\n"); + return dir; +} + +/** Read a skill's SKILL.md body (frontmatter stripped) from a skills root; null if absent. 
*/ +export function readSkillBodyFromDisk(skillsRoot: string, name: string, author: string): string | null { + try { + const md = fs.readFileSync(path.join(skillsRoot, `${name}--${author}`, "SKILL.md"), "utf8"); + return splitFrontmatter(md).body.trim(); + } catch { + return null; + } +} diff --git a/tests/shared/skillopt-engine.test.ts b/tests/shared/skillopt-engine.test.ts new file mode 100644 index 00000000..6ecfbbfb --- /dev/null +++ b/tests/shared/skillopt-engine.test.ts @@ -0,0 +1,86 @@ +import { describe, it, expect, vi } from "vitest"; +import { runSkillOptCycle, type ProposalRecord } from "../../src/skillify/skillopt-engine.js"; + +const invRow = (skill: string, sid: string) => ({ + message: { type: "tool_call", tool_name: "Skill", tool_input: JSON.stringify({ skill }), session_id: sid, timestamp: sid }, + last_update_date: sid, +}); +const transcript = (skill: string, sid: string, pushback: boolean) => [ + { message: { type: "user_message", content: "do it" } }, + { message: { type: "assistant_message", content: "done" } }, + { message: { type: "tool_call", tool_name: "Skill", tool_input: JSON.stringify({ skill }), timestamp: sid } }, + { message: { type: "user_message", content: pushback ? "no that's wrong, it mocks the client" : "thanks, perfect" } }, +]; + +/** nBad skills, each 10 invocations with 5 pushback → each deficient. */ +function world(nBad: number) { + const invs: Array> = []; + const transcripts = new Map>>(); + for (let b = 0; b < nBad; b++) { + for (let i = 0; i < 10; i++) { + const sid = `b${b}s${i}`; + invs.push(invRow(`bad${b}--auth`, sid)); + transcripts.set(sid, transcript(`bad${b}--auth`, sid, i < 5)); + } + } + const query = vi.fn(async (sql: string) => { + if (sql.includes('"Skill"') && sql.includes("ORDER BY last_update_date")) return invs; + const m = sql.match(/\/sessions\/%([^%]+)%/); + return m ? (transcripts.get(m[1]) ?? 
[]) : []; + }); + return query; +} + +const judge = () => vi.fn(async (_s: string, _u: string) => '{"success":0,"confidence":0.9,"reason":"mocks the client"}'); +const proposerModel = () => vi.fn(async (_s: string, _u: string) => '[{"op":"append","content":"Always verify via the PostHog API."}]'); + +describe("runSkillOptCycle", () => { + it("fires when >=5 skills are deficient and writes a proposal per editable skill", async () => { + const written: ProposalRecord[] = []; + const res = await runSkillOptCycle({ + query: world(6), sessionsTable: "sessions", now: "2026-06-05T00:00:00Z", + readSkillBody: () => "## Rules\n1. mock the client", + writeProposal: (r) => written.push(r), + detector: { judge: judge() }, proposer: { model: proposerModel() }, + }); + expect(res.fired).toBe(true); + expect(res.deficientCount).toBe(6); + expect(written).toHaveLength(6); + expect(written[0].candidateBody).toContain("Always verify via the PostHog API."); + expect(written[0]).toMatchObject({ invocations: 10, confirmedFailures: 5 }); + }); + + it("does NOT fire below the threshold (no proposals, even though detection ran)", async () => { + const writeProposal = vi.fn(); + const res = await runSkillOptCycle({ + query: world(4), sessionsTable: "sessions", now: "t", + readSkillBody: () => "## Rules", writeProposal, + detector: { judge: judge() }, proposer: { model: proposerModel() }, + }); + expect(res).toMatchObject({ fired: false, deficientCount: 4 }); + expect(res.proposals).toHaveLength(0); + expect(writeProposal).not.toHaveBeenCalled(); + }); + + it("skips a deficient skill that isn't installed locally (no body to edit)", async () => { + const written: ProposalRecord[] = []; + const res = await runSkillOptCycle({ + query: world(6), sessionsTable: "sessions", now: "t", + readSkillBody: (name) => (name === "bad0" ? null : "## Rules\n1. 
mock the client"), + writeProposal: (r) => written.push(r), + detector: { judge: judge() }, proposer: { model: proposerModel() }, + }); + expect(res.fired).toBe(true); + expect(written).toHaveLength(5); // bad0 skipped + expect(written.some((w) => w.name === "bad0")).toBe(false); + }); + + it("honors a custom fireThreshold", async () => { + const res = await runSkillOptCycle({ + query: world(3), sessionsTable: "sessions", now: "t", fireThreshold: 3, + readSkillBody: () => "## Rules", writeProposal: vi.fn(), + detector: { judge: judge() }, proposer: { model: proposerModel() }, + }); + expect(res.fired).toBe(true); + }); +}); From 3b7fa7adc20a00dc965b304ba2b69787898258a6 Mon Sep 17 00:00:00 2001 From: kaghni Date: Fri, 5 Jun 2026 05:32:57 +0000 Subject: [PATCH 11/30] feat(skillopt): wire the real cycle into the weekly worker Replaces the stub: the detached weekly worker now loads config, builds the query, and runs runSkillOptCycle (detect -> >=5 gate -> propose -> write review proposals) over a 30-day invocation lookback. Verified end-to-end against a real org: 40 Skill invocations scanned, 0 anchored -> 0 judge calls -> 0 deficient -> exits at the gate (~14s, $0). No auto-publish; proposals land under /skillopt/proposals. --- src/skillify/skillopt-worker.ts | 57 ++++++++++++++++++++++----------- 1 file changed, 38 insertions(+), 19 deletions(-) diff --git a/src/skillify/skillopt-worker.ts b/src/skillify/skillopt-worker.ts index 77d815bc..10dc8040 100644 --- a/src/skillify/skillopt-worker.ts +++ b/src/skillify/skillopt-worker.ts @@ -1,32 +1,51 @@ #!/usr/bin/env node /** - * Detached weekly SkillOpt worker (spawned by skillopt-trigger). Runs the loop ONCE: - * 1. detect a deficient skill (behavioral: sessions that loaded it still scored low) - * 2. optimizer proposes a bounded edit (v2) - * 3. real-rollout gate: keep v2 only if it measurably beats v1 - * 4. 
silent canary publish + post-publish monitor / auto-revert + * Detached weekly SkillOpt worker (spawned by skillopt-trigger). Runs the cycle ONCE: + * 1. detect deficient skills from real invocations (anchor + judge, windowed) + * 2. ≥5 fire gate (act on a pattern, not noise) + * 3. propose a bounded edit per deficient skill and write a REVIEW PROPOSAL * - * Uses the user's own agent (claude -p / codex), so no org API key. Runs in the background; the - * user never notices. HIVEMIND_SKILLOPT_WORKER=1 is set by the trigger as a recursion guard. - * - * STATUS: scaffold. Steps 1/3/4 depend on prerequisites not yet shipped (deployed attribution data - * for detection + monitoring, and a local rollout sandbox). The loop ENGINE (rollout->optimize->gate) - * is prototyped in experiments/skillopt-spike (skillopt-loop.ts, validated both directions). This - * entry exists so the trigger has a real, spawnable target and the wiring is testable end to end. + * It does NOT auto-publish: the offline gate isn't trustworthy (spike finding), so + * live publish is reserved for the real-usage A/B (deferred). Runs on the user's own + * agent (claude -p) — no org key, cost lands on the user — in the background, weekly. + * HIVEMIND_SKILLOPT_WORKER=1 is set by the trigger as a recursion guard. 
*/ +import os from "node:os"; +import path from "node:path"; import { log as _log } from "../utils/debug.js"; +import { loadConfig } from "../config.js"; +import { DeeplakeApi } from "../deeplake-api.js"; +import { getStateDir } from "./state-dir.js"; +import { runSkillOptCycle, writeProposalToDisk, readSkillBodyFromDisk } from "./skillopt-engine.js"; const log = (m: string) => _log("skillopt-worker", m); async function main(): Promise { log("skillopt worker started (detached, weekly)"); - // TODO(skillopt): wire the validated loop engine here once prerequisites land: - // const skill = await detectDeficientSkill(); // needs deployed attribution + satisfaction - // if (!skill) { log("no deficient skill found"); return; } - // const v2 = await optimize(skill); // optimizer proposes a bounded edit - // const gain = await gateViaRealRollout(skill, v2); // keep only if v2 beats v1 (validated) - // if (gain > THRESHOLD) await canaryPublish(skill, v2); // silent; monitor + auto-revert - log("skillopt worker: loop body not yet enabled (prerequisites pending) — exiting cleanly"); + const config = loadConfig(); + if (!config?.token) { log("no config/credentials — exiting"); return; } + + const api = new DeeplakeApi(config.token, config.apiUrl, config.orgId, config.workspaceId, config.tableName); + const query = (sql: string) => api.query(sql) as Promise>>; + const skillsRoot = path.join(os.homedir(), ".claude", "skills"); + const proposalsRoot = path.join(getStateDir(), "skillopt", "proposals"); + const sinceIso = new Date(Date.now() - 30 * 24 * 60 * 60 * 1000).toISOString(); // 30-day lookback + + const res = await runSkillOptCycle({ + query, + sessionsTable: config.sessionsTableName, + readSkillBody: (name, author) => readSkillBodyFromDisk(skillsRoot, name, author), + writeProposal: (rec) => writeProposalToDisk(proposalsRoot, rec), + detector: { sinceIso, limit: 5000 }, + now: new Date().toISOString(), + }); + + if (!res.fired) { + log(`skillopt: ${res.deficientCount} 
deficient skill(s) — below the fire gate, no action`); + } else { + const changed = res.proposals.filter((p) => p.changed).length; + log(`skillopt: fired — ${res.deficientCount} deficient, ${changed} edit proposal(s) written to ${proposalsRoot}`); + } } main().catch((e) => { log(`fatal (swallowed): ${(e as Error)?.message ?? e}`); process.exit(0); }); From e24319018f2241561e0a43bd128660c728f223d8 Mon Sep 17 00:00:00 2001 From: kaghni Date: Fri, 5 Jun 2026 05:37:03 +0000 Subject: [PATCH 12/30] =?UTF-8?q?feat(skillopt):=20meta-skill=20=E2=80=94?= =?UTF-8?q?=20optimizer=20cross-run=20memory?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The paper's meta-skill: an append-only JSONL recording every proposed edit per skill (with an order-independent fingerprint). The cycle feeds prior-tried edits to the proposer as context and dedups — never re-writing an edit already proposed for a skill, so the optimizer doesn't churn. Status field (proposed→applied/reverted) is ready to close the loop once the A/B gate records outcomes. Wired into the worker; meta + dedup tests. 
--- src/skillify/skill-proposer.ts | 14 +++-- src/skillify/skillopt-engine.ts | 15 ++++- src/skillify/skillopt-meta.ts | 84 ++++++++++++++++++++++++++++ src/skillify/skillopt-worker.ts | 8 +++ tests/shared/skillopt-engine.test.ts | 21 +++++++ tests/shared/skillopt-meta.test.ts | 54 ++++++++++++++++++ 6 files changed, 188 insertions(+), 8 deletions(-) create mode 100644 src/skillify/skillopt-meta.ts create mode 100644 tests/shared/skillopt-meta.test.ts diff --git a/src/skillify/skill-proposer.ts b/src/skillify/skill-proposer.ts index aa9c0159..1032b1bd 100644 --- a/src/skillify/skill-proposer.ts +++ b/src/skillify/skill-proposer.ts @@ -20,8 +20,9 @@ export interface Proposal { } export interface ProposeConfig { - editBudget?: number; // max edits to keep (default 3) - model?: ModelCall; // injected; default = claude sonnet + editBudget?: number; // max edits to keep (default 3) + model?: ModelCall; // injected; default = claude sonnet + priorEdits?: string[]; // meta-skill: edits already tried for this skill (don't repeat) } const SYSTEM = @@ -34,9 +35,12 @@ const SYSTEM = 'insert_after/replace/delete>","content":""}. Prefer the smallest change that fixes the weakness.'; -function buildUserPrompt(body: string, failures: string[]): string { +function buildUserPrompt(body: string, failures: string[], priorEdits: string[]): string { const cases = failures.slice(0, 8).map((f, i) => `${i + 1}. ${f}`).join("\n"); - return `CURRENT SKILL:\n${body}\n\nCONFIRMED FAILURES it produced (user pushed back AND a judge confirmed the task was not accomplished):\n${cases}\n\nPropose the bounded edits. JSON array only.`; + const prior = priorEdits.length + ? 
`\n\nALREADY TRIED for this skill on earlier runs (do NOT repeat these — propose something different, or nothing):\n${priorEdits.slice(0, 12).map((p) => `- ${p}`).join("\n")}` + : ""; + return `CURRENT SKILL:\n${body}\n\nCONFIRMED FAILURES it produced (user pushed back AND a judge confirmed the task was not accomplished):\n${cases}${prior}\n\nPropose the bounded edits. JSON array only.`; } const OPS = new Set(["append", "insert_after", "replace", "delete"]); @@ -77,7 +81,7 @@ export async function proposeSkillEdit( const model = cfg.model ?? claudeModel("sonnet"); let raw: string; try { - raw = await model(SYSTEM, buildUserPrompt(skillBody, failures)); + raw = await model(SYSTEM, buildUserPrompt(skillBody, failures, cfg.priorEdits ?? [])); } catch { return { edits: [], editedBody: skillBody, report: ["proposer model call failed"], changed: false }; } diff --git a/src/skillify/skillopt-engine.ts b/src/skillify/skillopt-engine.ts index 8af765d0..24d1c591 100644 --- a/src/skillify/skillopt-engine.ts +++ b/src/skillify/skillopt-engine.ts @@ -42,6 +42,11 @@ export interface CycleDeps { fireThreshold?: number; // deficient-skill count to fire (default 5) maxProposals?: number; // cap edits proposed per cycle (default 10) now: string; // ISO timestamp (injected — Date is awkward in workers) + meta?: { // optimizer cross-run memory (skillopt-meta); optional + prior: (name: string, author: string) => string[]; + has: (name: string, author: string, edits: Edit[]) => boolean; + record: (name: string, author: string, edits: Edit[]) => void; + }; } export interface CycleResult { @@ -64,16 +69,20 @@ export async function runSkillOptCycle(deps: CycleDeps): Promise { for (const s of targets) { const body = deps.readSkillBody(s.name, s.author); if (!body) continue; // not installed locally → nothing to edit - const p = await proposeSkillEdit(body, s.examples, deps.proposer); - if (p.changed) { + const priorEdits = deps.meta?.prior(s.name, s.author) ?? 
[]; + const p = await proposeSkillEdit(body, s.examples, { ...deps.proposer, priorEdits }); + // dedup against the meta memory — don't re-write an edit already tried for this skill. + const isDup = p.changed && (deps.meta?.has(s.name, s.author, p.edits) ?? false); + if (p.changed && !isDup) { deps.writeProposal({ name: s.name, author: s.author, invocations: s.invocations, confirmedFailures: s.confirmedFailures, failureRate: s.failureRate, examples: s.examples, edits: p.edits, report: p.report, candidateBody: p.editedBody, createdAt: deps.now, }); + deps.meta?.record(s.name, s.author, p.edits); } - proposals.push({ name: s.name, author: s.author, changed: p.changed, failureRate: s.failureRate }); + proposals.push({ name: s.name, author: s.author, changed: p.changed && !isDup, failureRate: s.failureRate }); } return { deficientCount, fired: true, proposals }; } diff --git a/src/skillify/skillopt-meta.ts b/src/skillify/skillopt-meta.ts new file mode 100644 index 00000000..f86473f6 --- /dev/null +++ b/src/skillify/skillopt-meta.ts @@ -0,0 +1,84 @@ +/** + * Meta-skill — the optimizer's cross-run memory (the paper's meta-skill). Records + * every edit proposed for a skill so later runs (a) don't re-propose an edit that + * was already tried, and (b) feed "what's been tried" to the proposer. When the A/B + * gate lands, the recorded `status` (proposed → applied/reverted) closes the loop so + * the optimizer learns which kinds of edits actually help a given skill. + * + * Append-only JSONL at /skillopt/meta.jsonl. Pure helpers + injected path, + * so it's unit-tested with a tmp file. 
+ */ +import fs from "node:fs"; +import path from "node:path"; +import type { Edit } from "./skill-edits.js"; + +export type MetaStatus = "proposed" | "applied" | "reverted"; + +export interface MetaEntry { + skill: string; // "--" + ops: string[]; // short per-edit summaries (op + anchor/preview) + fingerprint: string; // stable hash of the edits, for dedup + proposedAt: string; + status: MetaStatus; +} + +export const skillRef = (name: string, author: string) => `${name}--${author}`; + +/** Short human summary of one edit. */ +function summarizeEdit(e: Edit): string { + const anchor = e.target ? ` @"${e.target.slice(0, 40)}"` : ""; + const preview = e.content ? `: ${e.content.slice(0, 60).replace(/\s+/g, " ")}` : ""; + return `${e.op}${anchor}${preview}`; +} + +/** Order-independent fingerprint of an edit set (so the same edits dedup). */ +export function fingerprintEdits(edits: Edit[]): string { + return edits + .map((e) => `${e.op}|${e.target ?? ""}|${e.content ?? ""}`) + .sort() + .join("\n"); +} + +export function loadMeta(file: string): MetaEntry[] { + let raw: string; + try { raw = fs.readFileSync(file, "utf8"); } catch { return []; } + const out: MetaEntry[] = []; + for (const line of raw.split("\n")) { + const t = line.trim(); + if (!t) continue; + try { + const e = JSON.parse(t) as MetaEntry; + if (e && typeof e.skill === "string" && typeof e.fingerprint === "string") out.push(e); + } catch { /* skip malformed line */ } + } + return out; +} + +export function appendMeta(file: string, entry: MetaEntry): void { + fs.mkdirSync(path.dirname(file), { recursive: true }); + fs.appendFileSync(file, JSON.stringify(entry) + "\n"); +} + +/** Has this exact edit set already been proposed for this skill? 
(avoid churn) */ +export function alreadyProposed(meta: MetaEntry[], name: string, author: string, edits: Edit[]): boolean { + const ref = skillRef(name, author); + const fp = fingerprintEdits(edits); + return meta.some((m) => m.skill === ref && m.fingerprint === fp); +} + +/** Summaries of edits previously tried for this skill — context for the proposer. */ +export function priorEditSummaries(meta: MetaEntry[], name: string, author: string): string[] { + const ref = skillRef(name, author); + return meta.filter((m) => m.skill === ref).flatMap((m) => m.ops); +} + +/** Build a meta entry for a freshly-proposed edit set. */ +export function metaEntryFor(name: string, author: string, edits: Edit[], now: string): MetaEntry { + return { + skill: skillRef(name, author), + ops: edits.map(summarizeEdit), + fingerprint: fingerprintEdits(edits), + proposedAt: now, + status: "proposed", + }; +} diff --git a/src/skillify/skillopt-worker.ts b/src/skillify/skillopt-worker.ts index 10dc8040..8027272a 100644 --- a/src/skillify/skillopt-worker.ts +++ b/src/skillify/skillopt-worker.ts @@ -17,6 +17,7 @@ import { loadConfig } from "../config.js"; import { DeeplakeApi } from "../deeplake-api.js"; import { getStateDir } from "./state-dir.js"; import { runSkillOptCycle, writeProposalToDisk, readSkillBodyFromDisk } from "./skillopt-engine.js"; +import { loadMeta, appendMeta, priorEditSummaries, alreadyProposed, metaEntryFor } from "./skillopt-meta.js"; const log = (m: string) => _log("skillopt-worker", m); @@ -29,6 +30,8 @@ async function main(): Promise { const query = (sql: string) => api.query(sql) as Promise>>; const skillsRoot = path.join(os.homedir(), ".claude", "skills"); const proposalsRoot = path.join(getStateDir(), "skillopt", "proposals"); + const metaFile = path.join(getStateDir(), "skillopt", "meta.jsonl"); + const metaCache = loadMeta(metaFile); const sinceIso = new Date(Date.now() - 30 * 24 * 60 * 60 * 1000).toISOString(); // 30-day lookback const res = await 
runSkillOptCycle({ @@ -36,6 +39,11 @@ async function main(): Promise { sessionsTable: config.sessionsTableName, readSkillBody: (name, author) => readSkillBodyFromDisk(skillsRoot, name, author), writeProposal: (rec) => writeProposalToDisk(proposalsRoot, rec), + meta: { + prior: (n, a) => priorEditSummaries(metaCache, n, a), + has: (n, a, edits) => alreadyProposed(metaCache, n, a, edits), + record: (n, a, edits) => { const e = metaEntryFor(n, a, edits, new Date().toISOString()); appendMeta(metaFile, e); metaCache.push(e); }, + }, detector: { sinceIso, limit: 5000 }, now: new Date().toISOString(), }); diff --git a/tests/shared/skillopt-engine.test.ts b/tests/shared/skillopt-engine.test.ts index 6ecfbbfb..d297fda6 100644 --- a/tests/shared/skillopt-engine.test.ts +++ b/tests/shared/skillopt-engine.test.ts @@ -75,6 +75,27 @@ describe("runSkillOptCycle", () => { expect(written.some((w) => w.name === "bad0")).toBe(false); }); + it("dedups against meta memory: a skill whose edit was already proposed isn't re-written", async () => { + const written: ProposalRecord[] = []; + const recorded: string[] = []; + const res = await runSkillOptCycle({ + query: world(6), sessionsTable: "sessions", now: "t", + readSkillBody: () => "## Rules\n1. 
mock the client", + writeProposal: (r) => written.push(r), + detector: { judge: judge() }, proposer: { model: proposerModel() }, + meta: { + prior: () => ["append: earlier idea"], // fed to the proposer as context + has: (name) => name === "bad0", // bad0 already tried → dedup + record: (name) => recorded.push(name), + }, + }); + expect(res.fired).toBe(true); + expect(written).toHaveLength(5); // bad0 deduped + expect(written.some((w) => w.name === "bad0")).toBe(false); + expect(recorded).not.toContain("bad0"); // not recorded again + expect(res.proposals.find((p) => p.name === "bad0")!.changed).toBe(false); + }); + it("honors a custom fireThreshold", async () => { const res = await runSkillOptCycle({ query: world(3), sessionsTable: "sessions", now: "t", fireThreshold: 3, diff --git a/tests/shared/skillopt-meta.test.ts b/tests/shared/skillopt-meta.test.ts new file mode 100644 index 00000000..f92d8516 --- /dev/null +++ b/tests/shared/skillopt-meta.test.ts @@ -0,0 +1,54 @@ +import { describe, it, expect, beforeEach, afterEach } from "vitest"; +import fs from "node:fs"; +import os from "node:os"; +import path from "node:path"; +import { + fingerprintEdits, alreadyProposed, priorEditSummaries, metaEntryFor, loadMeta, appendMeta, +} from "../../src/skillify/skillopt-meta.js"; +import type { Edit } from "../../src/skillify/skill-edits.js"; + +const edits: Edit[] = [{ op: "append", content: "always flush" }, { op: "replace", target: "mock", content: "do not mock" }]; + +describe("fingerprintEdits", () => { + it("is order-independent (same set → same fingerprint)", () => { + expect(fingerprintEdits(edits)).toBe(fingerprintEdits([...edits].reverse())); + }); + it("differs for different content", () => { + expect(fingerprintEdits(edits)).not.toBe(fingerprintEdits([{ op: "append", content: "other" }])); + }); +}); + +describe("alreadyProposed / priorEditSummaries", () => { + const meta = [metaEntryFor("posthog", "kamo", edits, "t1"), metaEntryFor("other", "x", [{ op: 
"append", content: "z" }], "t2")]; + it("matches a prior proposal by skill + fingerprint", () => { + expect(alreadyProposed(meta, "posthog", "kamo", [...edits].reverse())).toBe(true); + expect(alreadyProposed(meta, "posthog", "kamo", [{ op: "append", content: "new" }])).toBe(false); + expect(alreadyProposed(meta, "nope", "kamo", edits)).toBe(false); // different skill + }); + it("surfaces prior edit summaries only for the given skill", () => { + const prior = priorEditSummaries(meta, "posthog", "kamo"); + expect(prior.length).toBe(2); + expect(prior.join(" ")).toContain("append"); + expect(priorEditSummaries(meta, "posthog", "kamo").join(" ")).not.toContain('append: z'); // other skill's edit + }); +}); + +describe("loadMeta / appendMeta", () => { + let file: string; + beforeEach(() => { file = path.join(fs.mkdtempSync(path.join(os.tmpdir(), "meta-")), "meta.jsonl"); }); + afterEach(() => { fs.rmSync(path.dirname(file), { recursive: true, force: true }); }); + + it("round-trips entries and skips malformed lines", () => { + appendMeta(file, metaEntryFor("a", "b", edits, "t1")); + appendMeta(file, metaEntryFor("c", "d", [{ op: "append", content: "x" }], "t2")); + fs.appendFileSync(file, "{ not json }\n\n"); + const loaded = loadMeta(file); + expect(loaded).toHaveLength(2); + expect(loaded[0].skill).toBe("a--b"); + expect(loaded[0].status).toBe("proposed"); + }); + + it("returns [] for a missing file", () => { + expect(loadMeta(path.join(os.tmpdir(), "does-not-exist-xyz.jsonl"))).toEqual([]); + }); +}); From b03c10a9514abb705b219b6072eaf6b519d60b44 Mon Sep 17 00:00:00 2001 From: kaghni Date: Fri, 5 Jun 2026 06:06:16 +0000 Subject: [PATCH 13/30] feat(skillopt): edit-outcome gate (longitudinal before/after validation) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The validation organ. 
A randomized A/B needs the skill version at invocation time (a capture change we lack), so the feasible gate is longitudinal: after an edit publishes, compare the skill's confirmed-failure rate after vs before — a real drop keeps it, a real rise reverts (from the SKILL.v backup), unclear/too-few = inconclusive. Observational (needs margin + min sample); randomized A/B is the upgrade once invocation-version capture lands. Extracts scoreInvocations (shared by detector + gate); adds untilIso to listSkillInvocations. 7 tests. --- src/skillify/deficiency-detector.ts | 50 +++++++++++----- src/skillify/skill-edit-gate.ts | 89 ++++++++++++++++++++++++++++ src/skillify/skill-invocations.ts | 3 +- tests/shared/skill-edit-gate.test.ts | 58 ++++++++++++++++++ 4 files changed, 184 insertions(+), 16 deletions(-) create mode 100644 src/skillify/skill-edit-gate.ts create mode 100644 tests/shared/skill-edit-gate.test.ts diff --git a/src/skillify/deficiency-detector.ts b/src/skillify/deficiency-detector.ts index 70152597..e209df0b 100644 --- a/src/skillify/deficiency-detector.ts +++ b/src/skillify/deficiency-detector.ts @@ -41,6 +41,40 @@ export interface DetectorConfig { const skillKey = (name: string, author: string) => `${name}--${author}`; +export interface ScoreConfig { + window?: { before?: number; after?: number; maxChars?: number }; + judge?: ModelCall; +} + +/** + * Score a set of invocations: window each, run the free anchor, and judge ONLY the + * anchored ones. Shared by the detector (per-skill deficiency) and the edit gate + * (a skill's failure rate in a time window). 
+ */ +export async function scoreInvocations( + query: QueryFn, + sessionsTable: string, + invocations: SkillInvocation[], + cfg: ScoreConfig = {}, +): Promise<{ anchored: number; confirmed: number; examples: string[] }> { + let anchored = 0; + let confirmed = 0; + const examples: string[] = []; + for (const inv of invocations) { + const turns = await windowedTurns(query, sessionsTable, inv, cfg.window); + const anchor = detectAnchor(turns); + if (!anchor.anchored) continue; // free filter — no judge call + anchored++; + const window = turns.map((t) => `${t.role}: ${t.text}`).join("\n\n"); + const verdict = await judgeSuccess(window, { model: cfg.judge }); + if (verdict.success === 0) { + confirmed++; + if (examples.length < 3) examples.push(verdict.reason || anchor.evidence); + } + } + return { anchored, confirmed, examples }; +} + export interface DetectionResult { skills: SkillDeficiency[]; deficientCount: number; @@ -65,21 +99,7 @@ export async function detectDeficientSkills( const skills: SkillDeficiency[] = []; for (const list of groups.values()) { - let anchored = 0; - let confirmed = 0; - const examples: string[] = []; - for (const inv of list) { - const turns = await windowedTurns(query, sessionsTable, inv, cfg.window); - const anchor = detectAnchor(turns); - if (!anchor.anchored) continue; // free filter — no judge call - anchored++; - const window = turns.map((t) => `${t.role}: ${t.text}`).join("\n\n"); - const verdict = await judgeSuccess(window, { model: cfg.judge }); - if (verdict.success === 0) { // confirmed: anchor AND judge agree - confirmed++; - if (examples.length < 3) examples.push(verdict.reason || anchor.evidence); - } - } + const { anchored, confirmed, examples } = await scoreInvocations(query, sessionsTable, list, cfg); const failureRate = list.length ? 
confirmed / list.length : 0; skills.push({ name: list[0].name, diff --git a/src/skillify/skill-edit-gate.ts b/src/skillify/skill-edit-gate.ts new file mode 100644 index 00000000..50d01450 --- /dev/null +++ b/src/skillify/skill-edit-gate.ts @@ -0,0 +1,89 @@ +/** + * Edit-outcome gate — the validation organ (the paper's gate, adapted). + * + * A randomized A/B is the ideal, but it needs the skill VERSION recorded at + * invocation time (a capture change we don't have yet — the Skill tool_use only + * carries the skill name). So the feasible gate is LONGITUDINAL: after an edit is + * published, compare the skill's confirmed-failure rate in the window AFTER publish + * vs BEFORE. A real drop = the edit helped → keep; a real rise = it hurt → revert + * (one `cp` from the SKILL.v.bak backup). No clear signal / too few post-publish + * uses → inconclusive (wait, or revert when stale). + * + * It's OBSERVATIONAL (confounded — the population shifts week to week), so it needs + * a margin + a minimum sample. Randomized A/B is the clean upgrade once invocation- + * version capture lands. Reuses scoreInvocations, so the same anchor+judge that + * detects deficiency also validates the fix. Injected query/judge → unit-testable. 
+ */ +import { listSkillInvocations, type QueryFn } from "./skill-invocations.js"; +import { scoreInvocations } from "./deficiency-detector.js"; +import type { ModelCall } from "./claude-model.js"; + +export interface WindowStats { + invocations: number; + anchored: number; + confirmed: number; + failureRate: number; // confirmed / invocations +} + +export interface GateDecision { + before: WindowStats; + after: WindowStats; + delta: number; // before.failureRate - after.failureRate (positive = improved) + decision: "keep" | "revert" | "inconclusive"; +} + +interface MeasureOpts { + sinceIso?: string; + untilIso?: string; + limit?: number; + window?: { before?: number; after?: number; maxChars?: number }; + judge?: ModelCall; +} + +/** Confirmed-failure rate for one skill over a time window. */ +export async function measureSkillFailureRate( + query: QueryFn, + sessionsTable: string, + name: string, + author: string, + opts: MeasureOpts = {}, +): Promise { + const all = await listSkillInvocations(query, sessionsTable, { sinceIso: opts.sinceIso, untilIso: opts.untilIso, limit: opts.limit }); + const mine = all.filter((i) => i.name === name && i.author === author); + const { anchored, confirmed } = await scoreInvocations(query, sessionsTable, mine, { window: opts.window, judge: opts.judge }); + return { invocations: mine.length, anchored, confirmed, failureRate: mine.length ? confirmed / mine.length : 0 }; +} + +/** Pure decision from before/after stats. */ +export function gateEditOutcome( + before: WindowStats, + after: WindowStats, + opts: { margin?: number; minAfter?: number } = {}, +): GateDecision { + const margin = opts.margin ?? 0.2; + const minAfter = opts.minAfter ?? 
5; + const delta = before.failureRate - after.failureRate; + let decision: GateDecision["decision"]; + if (after.invocations < minAfter) decision = "inconclusive"; // not enough post-publish use + else if (delta >= margin) decision = "keep"; // failure rate dropped → helped + else if (after.failureRate - before.failureRate >= margin) decision = "revert"; // got measurably worse + else decision = "inconclusive"; // no clear signal + return { before, after, delta, decision }; +} + +/** Full gate: measure before/after a publish timestamp and decide. */ +export async function gateEdit( + query: QueryFn, + sessionsTable: string, + name: string, + author: string, + publishIso: string, + opts: { windowDays?: number; nowIso?: string; margin?: number; minAfter?: number } & MeasureOpts = {}, +): Promise { + const windowDays = opts.windowDays ?? 14; + const beforeSince = new Date(Date.parse(publishIso) - windowDays * 24 * 60 * 60 * 1000).toISOString(); + const shared = { limit: opts.limit, window: opts.window, judge: opts.judge }; + const before = await measureSkillFailureRate(query, sessionsTable, name, author, { ...shared, sinceIso: beforeSince, untilIso: publishIso }); + const after = await measureSkillFailureRate(query, sessionsTable, name, author, { ...shared, sinceIso: publishIso, untilIso: opts.nowIso }); + return gateEditOutcome(before, after, opts); +} diff --git a/src/skillify/skill-invocations.ts b/src/skillify/skill-invocations.ts index e5ef664f..544cd3a3 100644 --- a/src/skillify/skill-invocations.ts +++ b/src/skillify/skill-invocations.ts @@ -68,10 +68,11 @@ export function splitOrgSkill(skill: string): { name: string; author: string } | export async function listSkillInvocations( query: QueryFn, sessionsTable: string, - opts: { sinceIso?: string; limit?: number } = {}, + opts: { sinceIso?: string; untilIso?: string; limit?: number } = {}, ): Promise { const where = [`CAST(message AS TEXT) LIKE '%"Skill"%'`]; if (opts.sinceIso) where.push(`last_update_date >= 
'${sqlStr(opts.sinceIso)}'`); + if (opts.untilIso) where.push(`last_update_date < '${sqlStr(opts.untilIso)}'`); const limit = opts.limit && opts.limit > 0 ? ` LIMIT ${Math.floor(opts.limit)}` : ""; const rows = await query( `SELECT message, last_update_date FROM "${sessionsTable}" WHERE ${where.join(" AND ")} ORDER BY last_update_date DESC${limit}`, diff --git a/tests/shared/skill-edit-gate.test.ts b/tests/shared/skill-edit-gate.test.ts new file mode 100644 index 00000000..283ff1fb --- /dev/null +++ b/tests/shared/skill-edit-gate.test.ts @@ -0,0 +1,58 @@ +import { describe, it, expect, vi } from "vitest"; +import { gateEditOutcome, gateEdit, type WindowStats } from "../../src/skillify/skill-edit-gate.js"; + +const stats = (invocations: number, failureRate: number): WindowStats => + ({ invocations, anchored: Math.round(invocations * failureRate), confirmed: Math.round(invocations * failureRate), failureRate }); + +describe("gateEditOutcome", () => { + it("KEEP when the failure rate dropped by >= margin", () => { + expect(gateEditOutcome(stats(10, 0.6), stats(10, 0.1)).decision).toBe("keep"); + }); + it("REVERT when it got measurably worse", () => { + expect(gateEditOutcome(stats(10, 0.1), stats(10, 0.5)).decision).toBe("revert"); + }); + it("INCONCLUSIVE when there's too little post-publish use", () => { + expect(gateEditOutcome(stats(10, 0.6), stats(3, 0.0)).decision).toBe("inconclusive"); + }); + it("INCONCLUSIVE when the change is within the margin (noise)", () => { + expect(gateEditOutcome(stats(10, 0.30), stats(10, 0.25)).decision).toBe("inconclusive"); + }); +}); + +const invRow = (skill: string, sid: string) => ({ + message: { type: "tool_call", tool_name: "Skill", tool_input: JSON.stringify({ skill }), session_id: sid, timestamp: sid }, + last_update_date: sid, +}); +const transcript = (skill: string, sid: string, pushback: boolean) => [ + { message: { type: "user_message", content: "do it" } }, + { message: { type: "assistant_message", content: "done 
(mocked)" } }, + { message: { type: "tool_call", tool_name: "Skill", tool_input: JSON.stringify({ skill }), timestamp: sid } }, + { message: { type: "user_message", content: pushback ? "no that's wrong, it mocks the client" : "looks good thanks" } }, +]; + +describe("gateEdit (longitudinal before/after)", () => { + it("keeps an edit whose failure rate dropped after publish", async () => { + const PUB = "2026-06-05T00:00:00.000Z"; + const transcripts = new Map>>(); + const beforeInvs: Array> = []; + const afterInvs: Array> = []; + for (let i = 0; i < 8; i++) { + const b = `bef${i}`, a = `aft${i}`; + beforeInvs.push(invRow("x--a", b)); transcripts.set(b, transcript("x--a", b, true)); // before: all pushback + afterInvs.push(invRow("x--a", a)); transcripts.set(a, transcript("x--a", a, false)); // after: none + } + const judge = vi.fn(async (_s: string, _u: string) => '{"success":0,"confidence":0.9,"reason":"mocks"}'); + const query = vi.fn(async (sql: string) => { + if (sql.includes('"Skill"') && sql.includes("ORDER BY last_update_date")) { + return sql.includes(`< '${PUB}'`) ? beforeInvs : afterInvs; // before window has the untilIso bound + } + const m = sql.match(/\/sessions\/%([^%]+)%/); + return m ? (transcripts.get(m[1]) ?? []) : []; + }); + + const res = await gateEdit(query, "sessions", "x", "a", PUB, { windowDays: 14, nowIso: "2026-06-12T00:00:00.000Z", judge, minAfter: 5 }); + expect(res.before.failureRate).toBeCloseTo(1.0); + expect(res.after.failureRate).toBeCloseTo(0.0); + expect(res.decision).toBe("keep"); + }); +}); From a1905fda0c7319e716a9dc6ec15f62442bd304f1 Mon Sep 17 00:00:00 2001 From: kaghni Date: Fri, 5 Jun 2026 06:22:46 +0000 Subject: [PATCH 14/30] review: remove committed node_modules symlink (coderabbit) A machine-specific absolute node_modules symlink got committed in 1929c96e because .gitignore only had `node_modules/` (matches a directory, not the symlink file). 
Untracked it and tightened the ignore to `node_modules` so the symlink can't be re-added. The symlink stays locally (for builds); it's just no longer in git. --- .gitignore | 2 +- node_modules | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) delete mode 120000 node_modules diff --git a/.gitignore b/.gitignore index a0b7da6d..d6754c7c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,4 @@ -node_modules/ +node_modules dist/ tmp/ *.js.map diff --git a/node_modules b/node_modules deleted file mode 120000 index 1dd8c0a6..00000000 --- a/node_modules +++ /dev/null @@ -1 +0,0 @@ -/home/ubuntu/al-projects/hivemind/node_modules \ No newline at end of file From 67ee35c0a0a52834c2830dea181f439b4bb098ab Mon Sep 17 00:00:00 2001 From: kaghni Date: Fri, 5 Jun 2026 06:32:43 +0000 Subject: [PATCH 15/30] review: cap the judged window at maxChars (codex P2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit scoreInvocations built the judge window from windowedTurns without applying the maxChars cap, so a captured turn containing a pasted log/diff could be sent whole to claude -p — slow, costly, or failing. Extracted elide() from windowAroundInvocation and apply it to the window before judging. Test asserts a 5000-char paste is capped under maxChars. --- src/skillify/deficiency-detector.ts | 4 ++-- src/skillify/skill-invocations.ts | 15 +++++++++------ tests/shared/deficiency-detector.test.ts | 21 +++++++++++++++++++++ 3 files changed, 32 insertions(+), 8 deletions(-) diff --git a/src/skillify/deficiency-detector.ts b/src/skillify/deficiency-detector.ts index e209df0b..6ed606ab 100644 --- a/src/skillify/deficiency-detector.ts +++ b/src/skillify/deficiency-detector.ts @@ -14,7 +14,7 @@ * The ≥5 fire gate lives with the caller (worker): we just return deficientCount. 
*/ import { - listSkillInvocations, windowedTurns, type QueryFn, type SkillInvocation, + listSkillInvocations, windowedTurns, elide, type QueryFn, type SkillInvocation, } from "./skill-invocations.js"; import { detectAnchor } from "./session-anchor.js"; import { judgeSuccess, type ModelCall } from "./success-judge.js"; @@ -65,7 +65,7 @@ export async function scoreInvocations( const anchor = detectAnchor(turns); if (!anchor.anchored) continue; // free filter — no judge call anchored++; - const window = turns.map((t) => `${t.role}: ${t.text}`).join("\n\n"); + const window = elide(turns.map((t) => `${t.role}: ${t.text}`).join("\n\n"), cfg.window?.maxChars ?? 4000); const verdict = await judgeSuccess(window, { model: cfg.judge }); if (verdict.success === 0) { confirmed++; diff --git a/src/skillify/skill-invocations.ts b/src/skillify/skill-invocations.ts index 544cd3a3..1a6d5c64 100644 --- a/src/skillify/skill-invocations.ts +++ b/src/skillify/skill-invocations.ts @@ -152,17 +152,20 @@ export async function windowedTurns( return turns.slice(Math.max(0, invIndex - before), invIndex + after); } +/** Head+tail elide a string to maxChars (so a pasted log/diff can't blow a prompt). */ +export function elide(text: string, maxChars: number): string { + if (text.length <= maxChars) return text; + const head = text.slice(0, Math.floor(maxChars * 0.55)); + const tail = text.slice(text.length - Math.floor(maxChars * 0.45)); + return `${head}\n\n…[${text.length - maxChars} chars elided]…\n\n${tail}`; +} + export async function windowAroundInvocation( query: QueryFn, sessionsTable: string, inv: SkillInvocation, opts: { before?: number; after?: number; maxChars?: number } = {}, ): Promise { - const maxChars = opts.maxChars ?? 
4000; const slice = await windowedTurns(query, sessionsTable, inv, opts); - const joined = slice.map((t) => `${t.role}: ${t.text}`).join("\n\n"); - if (joined.length <= maxChars) return joined; - const head = joined.slice(0, Math.floor(maxChars * 0.55)); - const tail = joined.slice(joined.length - Math.floor(maxChars * 0.45)); - return `${head}\n\n…[${joined.length - maxChars} chars elided]…\n\n${tail}`; + return elide(slice.map((t) => `${t.role}: ${t.text}`).join("\n\n"), opts.maxChars ?? 4000); } diff --git a/tests/shared/deficiency-detector.test.ts b/tests/shared/deficiency-detector.test.ts index 6e00b3d2..57972fa4 100644 --- a/tests/shared/deficiency-detector.test.ts +++ b/tests/shared/deficiency-detector.test.ts @@ -52,6 +52,27 @@ describe("detectDeficientSkills", () => { expect(judge).toHaveBeenCalledTimes(8); }); + it("caps the judged window at maxChars (a pasted log can't blow the judge call)", async () => { + const huge = "L".repeat(5000); + const skill = "bigskill--x", sid = "S1"; + const transcripts = new Map>>([[sid, [ + { message: { type: "user_message", content: "do it" } }, + { message: { type: "assistant_message", content: huge } }, // pasted log + { message: { type: "tool_call", tool_name: "Skill", tool_input: JSON.stringify({ skill }), timestamp: sid } }, + { message: { type: "user_message", content: "no that's wrong" } }, + ]]]); + let judgedLen = 0; + const judge = vi.fn(async (_s: string, user: string) => { judgedLen = user.length; return '{"success":0,"confidence":0.9,"reason":"x"}'; }); + const query = vi.fn(async (sql: string) => { + if (sql.includes('"Skill"') && sql.includes("ORDER BY last_update_date")) return [invRow(skill, sid)]; + const m = sql.match(/\/sessions\/%([^%]+)%/); + return m ? (transcripts.get(m[1]) ?? 
[]) : []; + }); + await detectDeficientSkills(query, TABLE, { judge, minInvocations: 1, window: { maxChars: 300 } }); + expect(judgedLen).toBeGreaterThan(0); // judge was called (anchored) + expect(judgedLen).toBeLessThan(800); // capped — not the ~5000-char paste + }); + it("respects a custom threshold + min-n", async () => { const { invs, transcripts } = world(); const judge = vi.fn(async (_s: string, _u: string) => '{"success":0,"confidence":0.9,"reason":"x"}'); From f4047fc82a6fb2528b7ad59721f8ba4fafab5d26 Mon Sep 17 00:00:00 2001 From: kaghni Date: Fri, 5 Jun 2026 06:32:43 +0000 Subject: [PATCH 16/30] review: deny all write-capable Claude tools in the judge/proposer (codex P2) The judge & proposer get untrusted captured transcript text in their prompts, so --disallowed-tools is the prompt-injection guard. The list missed write-capable tools (MultiEdit, NotebookEdit, TodoWrite), leaving an escape hatch in installs that expose them. Enumerated them so a malicious failure example can't act. --- src/skillify/claude-model.ts | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/skillify/claude-model.ts b/src/skillify/claude-model.ts index 47b5e8f0..30be3701 100644 --- a/src/skillify/claude-model.ts +++ b/src/skillify/claude-model.ts @@ -9,7 +9,14 @@ import { spawn } from "node:child_process"; /** (systemPrompt, userPrompt) -> raw model text. */ export type ModelCall = (systemPrompt: string, userPrompt: string) => Promise; -const DENY = ["Bash", "Edit", "Write", "Read", "Glob", "Grep", "WebFetch", "WebSearch", "Task"]; +// Deny EVERY write/exec/network tool — the judge & proposer get untrusted captured +// transcript text in their prompts, so a prompt-injected failure example must not be +// able to act. Enumerate the write-capable ones (MultiEdit/NotebookEdit/TodoWrite) +// too, not just the obvious Edit/Write. 
+const DENY = [ + "Bash", "Edit", "MultiEdit", "Write", "NotebookEdit", "Read", "Glob", "Grep", + "WebFetch", "WebSearch", "Task", "TodoWrite", +]; export function claudeModel(model: string, opts: { timeoutMs?: number } = {}): ModelCall { const timeoutMs = opts.timeoutMs ?? 120_000; From fc3825a383f4d0a50d4af7d7e38744d9071aa8a5 Mon Sep 17 00:00:00 2001 From: kaghni Date: Fri, 5 Jun 2026 06:32:43 +0000 Subject: [PATCH 17/30] review: worker reads project-scoped skill root too (codex P2) The worker only read ~/.claude/skills, so a deficient skill pulled with --to project (/.claude/skills) had its invocation detected but no body to edit -> the proposal was silently skipped. The detached worker inherits the SessionStart cwd, so it now falls back to /.claude/skills. --- src/skillify/skillopt-worker.ts | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/skillify/skillopt-worker.ts b/src/skillify/skillopt-worker.ts index 8027272a..1e0596c2 100644 --- a/src/skillify/skillopt-worker.ts +++ b/src/skillify/skillopt-worker.ts @@ -28,7 +28,12 @@ async function main(): Promise { const api = new DeeplakeApi(config.token, config.apiUrl, config.orgId, config.workspaceId, config.tableName); const query = (sql: string) => api.query(sql) as Promise>>; + // Read both the global root and the project-scoped root (skills pulled with + // `--to project` live under /.claude/skills; the detached worker inherits + // the SessionStart cwd). Without the project root, a deficient project-pulled + // skill would be silently skipped (readSkillBody → null). 
const skillsRoot = path.join(os.homedir(), ".claude", "skills"); + const projectRoot = path.join(process.cwd(), ".claude", "skills"); const proposalsRoot = path.join(getStateDir(), "skillopt", "proposals"); const metaFile = path.join(getStateDir(), "skillopt", "meta.jsonl"); const metaCache = loadMeta(metaFile); @@ -37,7 +42,7 @@ async function main(): Promise { const res = await runSkillOptCycle({ query, sessionsTable: config.sessionsTableName, - readSkillBody: (name, author) => readSkillBodyFromDisk(skillsRoot, name, author), + readSkillBody: (name, author) => readSkillBodyFromDisk(skillsRoot, name, author) ?? readSkillBodyFromDisk(projectRoot, name, author), writeProposal: (rec) => writeProposalToDisk(proposalsRoot, rec), meta: { prior: (n, a) => priorEditSummaries(metaCache, n, a), From a3e31861b5cab17fa802711b01df64d1ac633458 Mon Sep 17 00:00:00 2001 From: kaghni Date: Fri, 5 Jun 2026 06:43:20 +0000 Subject: [PATCH 18/30] review: run judge/proposer with capture disabled (codex P2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On a machine with the Hivemind hooks installed, each judge/proposer claude -p call was captured as a real session row — polluting the very sessions data the detector scans. Spawn the child with HIVEMIND_CAPTURE=false, as the skillify gate runner does. 
--- src/skillify/claude-model.ts | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/skillify/claude-model.ts b/src/skillify/claude-model.ts index 30be3701..b11f15be 100644 --- a/src/skillify/claude-model.ts +++ b/src/skillify/claude-model.ts @@ -25,7 +25,13 @@ export function claudeModel(model: string, opts: { timeoutMs?: number } = {}): M "-p", user, "--model", model, "--no-session-persistence", "--output-format", "json", "--system-prompt", system, "--disallowed-tools", ...DENY, ]; - const child = spawn("claude", args, { stdio: ["ignore", "pipe", "pipe"] }); + // HIVEMIND_CAPTURE=false so these judge/proposer calls are NOT captured as + // real sessions — otherwise the engine pollutes the very sessions data it + // scans (and the synthetic prompts would show up as transcript rows). + const child = spawn("claude", args, { + stdio: ["ignore", "pipe", "pipe"], + env: { ...process.env, HIVEMIND_CAPTURE: "false" }, + }); let out = ""; let err = ""; const timer = setTimeout(() => { child.kill("SIGKILL"); reject(new Error("claude timed out")); }, timeoutMs); From a118d0a1ad268346007d97f769cc7100bcb6f526 Mon Sep 17 00:00:00 2001 From: kaghni Date: Fri, 5 Jun 2026 06:43:20 +0000 Subject: [PATCH 19/30] review: strong pushback overrides benign in the anchor (codex P2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit "thanks, but this is still failing" matched PUSHBACK but BENIGN's bare "thank" also matched, so detectAnchor returned false and the failure was dropped — bad for a recall-oriented stage. Split into STRONG corrections (always anchor, even amid polite words) and the ambiguous bare "no" (benign-gated). Test covers polite corrections. 
--- src/skillify/session-anchor.ts | 18 +++++++++++------- tests/shared/session-anchor.test.ts | 5 +++++ 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/src/skillify/session-anchor.ts b/src/skillify/session-anchor.ts index 677b8598..0b9fac9e 100644 --- a/src/skillify/session-anchor.ts +++ b/src/skillify/session-anchor.ts @@ -19,23 +19,27 @@ export interface Anchor { evidence: string; // the user turn that triggered it (truncated) } -// User pushback: rejection / correction of what the assistant just produced. -const PUSHBACK = /\b(no|nope|wrong|incorrect|not what|that'?s not|does ?n'?t work|did ?n'?t work|do ?n'?t work|wo ?n'?t work|is ?n'?t|that'?s wrong|broke|broken|still (failing|broken|not working|wrong|the same)|try again|undo|revert that|that fail)/i; +// Unambiguous correction — ALWAYS an anchor, even amid polite words. This must +// win over BENIGN so "thanks, but this is still failing" still fires. +const STRONG = /\b(wrong|incorrect|not what|that'?s not|does ?n'?t work|did ?n'?t work|do ?n'?t work|wo ?n'?t work|is ?n'?t|broke|broken|still (failing|broken|not working|wrong|the same)|try again|undo|revert that|that fail|not right)/i; -// Clear benign negatives we don't want to fire on (keeps obvious false positives -// out of the judge to save tokens). Intentionally narrow — when in doubt, fire. -const BENIGN = /\b(no (problem|worries|need|biggie)|no,? thanks|all good|works? (now|great|fine|perfectly)|that works|perfect|looks good|thank)/i; +// Ambiguous negation: "no" is pushback ("no, that's off") but also benign +// ("no problem"), so it only anchors when the turn isn't a clear benign phrase. +const AMBIGUOUS = /\b(no|nope)\b/i; +const BENIGN = /\b(no (problem|worries|need|biggie)|no,? thanks|all good|works? (now|great|fine|perfectly)|that works|perfect|looks good)\b/i; /** * Detect a correction anchor in a windowed slice of turns. 
Only a USER turn that * immediately follows an ASSISTANT turn can be pushback (the first user turn is - * the request, not a reaction). + * the request, not a reaction). Recall-oriented: a strong correction phrase fires + * regardless of polite framing; only the bare "no" is benign-gated. */ export function detectAnchor(turns: Turn[]): Anchor { for (let i = 1; i < turns.length; i++) { const t = turns[i]; if (t.role !== "USER" || turns[i - 1].role !== "ASSISTANT") continue; - if (PUSHBACK.test(t.text) && !BENIGN.test(t.text)) { + const anchored = STRONG.test(t.text) || (AMBIGUOUS.test(t.text) && !BENIGN.test(t.text)); + if (anchored) { return { anchored: true, kind: "correction", evidence: t.text.slice(0, 200) }; } } diff --git a/tests/shared/session-anchor.test.ts b/tests/shared/session-anchor.test.ts index 1780584f..62dc7ee3 100644 --- a/tests/shared/session-anchor.test.ts +++ b/tests/shared/session-anchor.test.ts @@ -34,6 +34,11 @@ describe("detectAnchor", () => { } }); + it("fires on polite-but-failing corrections (strong pushback overrides benign words)", () => { + expect(detectAnchor([a("here"), u("thanks, but this is still failing")]).anchored).toBe(true); + expect(detectAnchor([a("done"), u("perfect start, but that's still wrong")]).anchored).toBe(true); + }); + it("returns none when the user is satisfied / silent", () => { expect(detectAnchor([u("do X"), a("done")]).anchored).toBe(false); expect(detectAnchor([]).anchored).toBe(false); From 50913652fb241ab4e385c1f79d097d903b9019b2 Mon Sep 17 00:00:00 2001 From: kaghni Date: Fri, 5 Jun 2026 18:23:26 +0000 Subject: [PATCH 20/30] review: reject protected-region overlaps, not just inside-starts (codex P2) targetsProtected only checked the target's start offset, so a replace/delete whose target began just before SLOW_UPDATE_START and spanned into the block slipped through and could remove protected longitudinal guidance. Now rejects any target RANGE that overlaps [r0, r1). Test covers a spanning target. 
--- src/skillify/skill-edits.ts | 6 +++++- tests/shared/skill-edits.test.ts | 9 +++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/src/skillify/skill-edits.ts b/src/skillify/skill-edits.ts index 218ba6a0..115c1aa9 100644 --- a/src/skillify/skill-edits.ts +++ b/src/skillify/skill-edits.ts @@ -30,7 +30,11 @@ function targetsProtected(skill: string, target: string): boolean { const r = protectedRange(skill); if (!r || !target) return false; const idx = skill.indexOf(target); - return idx !== -1 && idx >= r[0] && idx < r[1]; + if (idx === -1) return false; + // Reject if the target RANGE [idx, idx+len) overlaps the protected range at all — + // not just if it starts inside it (a target that begins just before SLOW_UPDATE_START + // and spans into the block must not be allowed to delete protected guidance). + return idx < r[1] && idx + target.length > r[0]; } /** Enforce the edit budget ("textual learning rate"): keep at most `budget` edits. */ diff --git a/tests/shared/skill-edits.test.ts b/tests/shared/skill-edits.test.ts index 3322cb48..9143270c 100644 --- a/tests/shared/skill-edits.test.ts +++ b/tests/shared/skill-edits.test.ts @@ -43,6 +43,15 @@ describe("applyEdits", () => { // appended content sits before the protected block expect(r.skill.indexOf("2. b")).toBeLessThan(r.skill.indexOf(SU_START)); }); + + it("rejects a target that SPANS INTO the protected region (not just one starting inside)", () => { + const doc = `## Rules\n1. a\n${SU_START}\nLongitudinal guidance.\n${SU_END}`; + // target begins before SLOW_UPDATE_START but extends into the protected block + const r = applyEdits(doc, [{ op: "delete", target: `1. 
a\n${SU_START}\nLongitudinal` }]); + expect(r.applied).toBe(0); + expect(r.skill).toContain("Longitudinal guidance."); + expect(r.report.some((l) => l.includes("protected slow-update region"))).toBe(true); + }); }); describe("selectEdits (edit budget)", () => { From b934458b084ece8127bdc8427b6ca846b690d929 Mon Sep 17 00:00:00 2001 From: kaghni Date: Fri, 5 Jun 2026 18:23:26 +0000 Subject: [PATCH 21/30] review: anchor only post-invocation turns + validate skill refs as paths (codex P2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two findings sharing skill-invocations.ts: - detectAnchor scanned the whole window incl. pre-invocation turns, so a correction from an EARLIER failed attempt (skill used as a repair) was misattributed to this skill. windowedTurns now returns a pivot (first post-invocation turn); detectAnchor takes fromIndex and only fires when both the reaction AND the assistant it reacts to are post-invocation. Test transcripts reordered to real capture order (user → tool_call → assistant → reaction). - splitOrgSkill now rejects refs containing path separators or '..' before name/author are used to build skills/proposals filesystem paths (path-escape guard, matching the pull path's untrusted treatment). 
--- src/skillify/deficiency-detector.ts | 4 ++-- src/skillify/session-anchor.ts | 15 +++++++++------ src/skillify/skill-invocations.ts | 21 +++++++++++++++++---- tests/shared/deficiency-detector.test.ts | 4 ++-- tests/shared/session-anchor.test.ts | 8 ++++++++ tests/shared/skill-edit-gate.test.ts | 2 +- tests/shared/skill-invocations.test.ts | 6 ++++++ tests/shared/skillopt-engine.test.ts | 2 +- 8 files changed, 46 insertions(+), 16 deletions(-) diff --git a/src/skillify/deficiency-detector.ts b/src/skillify/deficiency-detector.ts index 6ed606ab..73b77514 100644 --- a/src/skillify/deficiency-detector.ts +++ b/src/skillify/deficiency-detector.ts @@ -61,8 +61,8 @@ export async function scoreInvocations( let confirmed = 0; const examples: string[] = []; for (const inv of invocations) { - const turns = await windowedTurns(query, sessionsTable, inv, cfg.window); - const anchor = detectAnchor(turns); + const { turns, pivot } = await windowedTurns(query, sessionsTable, inv, cfg.window); + const anchor = detectAnchor(turns, pivot); // anchor only on post-invocation reaction if (!anchor.anchored) continue; // free filter — no judge call anchored++; const window = elide(turns.map((t) => `${t.role}: ${t.text}`).join("\n\n"), cfg.window?.maxChars ?? 4000); diff --git a/src/skillify/session-anchor.ts b/src/skillify/session-anchor.ts index 0b9fac9e..83056d8d 100644 --- a/src/skillify/session-anchor.ts +++ b/src/skillify/session-anchor.ts @@ -29,15 +29,18 @@ const AMBIGUOUS = /\b(no|nope)\b/i; const BENIGN = /\b(no (problem|worries|need|biggie)|no,? thanks|all good|works? (now|great|fine|perfectly)|that works|perfect|looks good)\b/i; /** - * Detect a correction anchor in a windowed slice of turns. Only a USER turn that - * immediately follows an ASSISTANT turn can be pushback (the first user turn is - * the request, not a reaction). Recall-oriented: a strong correction phrase fires - * regardless of polite framing; only the bare "no" is benign-gated. 
+ * Detect a correction anchor in a windowed slice of turns. A pushback is a USER turn + * reacting to an ASSISTANT turn — and BOTH must be POST-invocation (index ≥ fromIndex), + * so a correction that happened BEFORE the skill ran (e.g. the skill was a repair + * attempt) isn't misattributed to this skill. fromIndex defaults to 0 (scan all). + * Recall-oriented: a strong correction fires regardless of polite framing; only the + * bare "no" is benign-gated. */ -export function detectAnchor(turns: Turn[]): Anchor { - for (let i = 1; i < turns.length; i++) { +export function detectAnchor(turns: Turn[], fromIndex = 0): Anchor { + for (let i = Math.max(1, fromIndex); i < turns.length; i++) { const t = turns[i]; if (t.role !== "USER" || turns[i - 1].role !== "ASSISTANT") continue; + if (i - 1 < fromIndex) continue; // the assistant being reacted to must be post-invocation const anchored = STRONG.test(t.text) || (AMBIGUOUS.test(t.text) && !BENIGN.test(t.text)); if (anchored) { return { anchored: true, kind: "correction", evidence: t.text.slice(0, 200) }; diff --git a/src/skillify/skill-invocations.ts b/src/skillify/skill-invocations.ts index 1a6d5c64..791a65d7 100644 --- a/src/skillify/skill-invocations.ts +++ b/src/skillify/skill-invocations.ts @@ -55,6 +55,10 @@ export function invokedSkillRef(msg: ParsedMsg): string | null { /** Split "--" → parts. null for plugin-namespaced / bare / malformed refs. */ export function splitOrgSkill(skill: string): { name: string; author: string } | null { if (skill.includes(":")) return null; // plugin-namespaced (e.g. hivemind:hivemind-memory) + // name/author are used to build filesystem paths (skills dir, proposals dir), so a + // captured tool_input must not smuggle path separators / traversal — same untrusted + // treatment the pull path applies to these segments. 
+ if (skill.includes("/") || skill.includes("\\") || skill.includes("..")) return null; const i = skill.lastIndexOf("--"); if (i <= 0 || i + 2 >= skill.length) return null; // bare or malformed return { name: skill.slice(0, i), author: skill.slice(i + 2) }; @@ -140,16 +144,25 @@ async function sessionTurns( * turns after — where the help-or-harm signal lives — head+tail elided to maxChars. * `before`/`after` are tunable; defaults chosen as a small starting point. */ +/** A windowed slice plus `pivot` = the index in `turns` of the first POST-invocation + * turn (turns before it are the pre-invocation context — kept for the judge, but the + * anchor must not scan them, or a prior correction gets misattributed to this skill). */ +export interface WindowSlice { + turns: Turn[]; + pivot: number; +} + export async function windowedTurns( query: QueryFn, sessionsTable: string, inv: SkillInvocation, opts: { before?: number; after?: number } = {}, -): Promise { +): Promise { const before = opts.before ?? 3; const after = opts.after ?? 6; const { turns, invIndex } = await sessionTurns(query, sessionsTable, inv); - return turns.slice(Math.max(0, invIndex - before), invIndex + after); + const start = Math.max(0, invIndex - before); + return { turns: turns.slice(start, invIndex + after), pivot: invIndex - start }; } /** Head+tail elide a string to maxChars (so a pasted log/diff can't blow a prompt). */ @@ -166,6 +179,6 @@ export async function windowAroundInvocation( inv: SkillInvocation, opts: { before?: number; after?: number; maxChars?: number } = {}, ): Promise { - const slice = await windowedTurns(query, sessionsTable, inv, opts); - return elide(slice.map((t) => `${t.role}: ${t.text}`).join("\n\n"), opts.maxChars ?? 4000); + const { turns } = await windowedTurns(query, sessionsTable, inv, opts); + return elide(turns.map((t) => `${t.role}: ${t.text}`).join("\n\n"), opts.maxChars ?? 
4000); } diff --git a/tests/shared/deficiency-detector.test.ts b/tests/shared/deficiency-detector.test.ts index 57972fa4..5c9ee877 100644 --- a/tests/shared/deficiency-detector.test.ts +++ b/tests/shared/deficiency-detector.test.ts @@ -9,8 +9,8 @@ const invRow = (skill: string, sid: string) => ({ }); const transcript = (skill: string, sid: string, pushback: boolean) => [ { message: { type: "user_message", content: "do it" } }, - { message: { type: "assistant_message", content: "done" } }, { message: { type: "tool_call", tool_name: "Skill", tool_input: JSON.stringify({ skill }), timestamp: sid } }, + { message: { type: "assistant_message", content: "done" } }, { message: { type: "user_message", content: pushback ? "no that's wrong, it mocks the client" : "thanks, perfect" } }, ]; @@ -57,8 +57,8 @@ describe("detectDeficientSkills", () => { const skill = "bigskill--x", sid = "S1"; const transcripts = new Map>>([[sid, [ { message: { type: "user_message", content: "do it" } }, - { message: { type: "assistant_message", content: huge } }, // pasted log { message: { type: "tool_call", tool_name: "Skill", tool_input: JSON.stringify({ skill }), timestamp: sid } }, + { message: { type: "assistant_message", content: huge } }, // pasted log { message: { type: "user_message", content: "no that's wrong" } }, ]]]); let judgedLen = 0; diff --git a/tests/shared/session-anchor.test.ts b/tests/shared/session-anchor.test.ts index 62dc7ee3..942462c7 100644 --- a/tests/shared/session-anchor.test.ts +++ b/tests/shared/session-anchor.test.ts @@ -39,6 +39,14 @@ describe("detectAnchor", () => { expect(detectAnchor([a("done"), u("perfect start, but that's still wrong")]).anchored).toBe(true); }); + it("ignores PRE-invocation pushback (fromIndex) — no misattribution to a repair-attempt skill", () => { + // turns: [a, USER pushback (pre-invocation), a (skill output), USER ok] — pivot=2 + const turns = [a("attempt 1"), u("no that's wrong"), a("retried with the skill"), u("looks good")]; + 
expect(detectAnchor(turns, 2).anchored).toBe(false); // the pre-invocation correction is not scanned + // a genuine POST-invocation pushback still fires + expect(detectAnchor([a("attempt 1"), u("no wrong"), a("fixed"), u("still failing")], 2).anchored).toBe(true); + }); + it("returns none when the user is satisfied / silent", () => { expect(detectAnchor([u("do X"), a("done")]).anchored).toBe(false); expect(detectAnchor([]).anchored).toBe(false); diff --git a/tests/shared/skill-edit-gate.test.ts b/tests/shared/skill-edit-gate.test.ts index 283ff1fb..4cbb760e 100644 --- a/tests/shared/skill-edit-gate.test.ts +++ b/tests/shared/skill-edit-gate.test.ts @@ -25,8 +25,8 @@ const invRow = (skill: string, sid: string) => ({ }); const transcript = (skill: string, sid: string, pushback: boolean) => [ { message: { type: "user_message", content: "do it" } }, - { message: { type: "assistant_message", content: "done (mocked)" } }, { message: { type: "tool_call", tool_name: "Skill", tool_input: JSON.stringify({ skill }), timestamp: sid } }, + { message: { type: "assistant_message", content: "done (mocked)" } }, { message: { type: "user_message", content: pushback ? "no that's wrong, it mocks the client" : "looks good thanks" } }, ]; diff --git a/tests/shared/skill-invocations.test.ts b/tests/shared/skill-invocations.test.ts index ecdfadbb..ec8e584d 100644 --- a/tests/shared/skill-invocations.test.ts +++ b/tests/shared/skill-invocations.test.ts @@ -39,6 +39,12 @@ describe("splitOrgSkill", () => { expect(splitOrgSkill("update-config")).toBeNull(); // bare expect(splitOrgSkill("baz--")).toBeNull(); // empty author }); + it("rejects refs with path separators / traversal (no path escape)", () => { + expect(splitOrgSkill("../../etc--x")).toBeNull(); + expect(splitOrgSkill("ok--..%2f")).toBeNull(); // contains .. 
+ expect(splitOrgSkill("a/b--c")).toBeNull(); // separator + expect(splitOrgSkill("a--b/c")).toBeNull(); // separator in author + }); }); describe("listSkillInvocations", () => { diff --git a/tests/shared/skillopt-engine.test.ts b/tests/shared/skillopt-engine.test.ts index d297fda6..cd9bf3cc 100644 --- a/tests/shared/skillopt-engine.test.ts +++ b/tests/shared/skillopt-engine.test.ts @@ -7,8 +7,8 @@ const invRow = (skill: string, sid: string) => ({ }); const transcript = (skill: string, sid: string, pushback: boolean) => [ { message: { type: "user_message", content: "do it" } }, - { message: { type: "assistant_message", content: "done" } }, { message: { type: "tool_call", tool_name: "Skill", tool_input: JSON.stringify({ skill }), timestamp: sid } }, + { message: { type: "assistant_message", content: "done" } }, { message: { type: "user_message", content: pushback ? "no that's wrong, it mocks the client" : "thanks, perfect" } }, ]; From b3db22edfe473d3ecc86b1dd88d1f2834cb95d34 Mon Sep 17 00:00:00 2001 From: kaghni Date: Fri, 5 Jun 2026 18:37:59 +0000 Subject: [PATCH 22/30] review: no-tools allow-list for judge/proposer, not a deny-list (codex P1) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A deny-list only blocks named built-ins — any other tool in the user's env (LS, NotebookRead, configured MCP tools) stayed callable, breaking the pure-text isolation for calls that include untrusted captured transcript text. Switched to `--tools ""` (empty allow-list = zero tools available, authoritative over built-ins AND MCP). Verified the flag isolates with a real claude -p call.
--- src/skillify/claude-model.ts | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/src/skillify/claude-model.ts b/src/skillify/claude-model.ts index b11f15be..7be4bf87 100644 --- a/src/skillify/claude-model.ts +++ b/src/skillify/claude-model.ts @@ -9,21 +9,16 @@ import { spawn } from "node:child_process"; /** (systemPrompt, userPrompt) -> raw model text. */ export type ModelCall = (systemPrompt: string, userPrompt: string) => Promise; -// Deny EVERY write/exec/network tool — the judge & proposer get untrusted captured -// transcript text in their prompts, so a prompt-injected failure example must not be -// able to act. Enumerate the write-capable ones (MultiEdit/NotebookEdit/TodoWrite) -// too, not just the obvious Edit/Write. -const DENY = [ - "Bash", "Edit", "MultiEdit", "Write", "NotebookEdit", "Read", "Glob", "Grep", - "WebFetch", "WebSearch", "Task", "TodoWrite", -]; - export function claudeModel(model: string, opts: { timeoutMs?: number } = {}): ModelCall { const timeoutMs = opts.timeoutMs ?? 120_000; return (system, user) => new Promise((resolve, reject) => { const args = [ "-p", user, "--model", model, "--no-session-persistence", - "--output-format", "json", "--system-prompt", system, "--disallowed-tools", ...DENY, + "--output-format", "json", "--system-prompt", system, + // Empty allow-list = NO tools available. Authoritative: it covers built-ins AND + // any MCP/configured tools (a deny-list can't enumerate those), so prompt-injected + // transcript text in the judge/proposer prompt can never trigger tool use. 
+ "--tools", "", ]; // HIVEMIND_CAPTURE=false so these judge/proposer calls are NOT captured as // real sessions — otherwise the engine pollutes the very sessions data it From ba78b2878b6a7b6be51d69ee775c2b07a03f46d2 Mon Sep 17 00:00:00 2001 From: kaghni Date: Fri, 5 Jun 2026 18:37:59 +0000 Subject: [PATCH 23/30] review: resolve skill body via pull manifest, not the worker cwd (codex P2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Invocations come from ALL projects, but the worker resolved project-scoped skills from its own SessionStart cwd — so a skill pulled --to project in another cwd was skipped, or a same-named skill in the current cwd was edited instead. readSkillBodyViaManifest now resolves the authoritative installRoot from the pull manifest (global ~/.claude/skills as fallback). Test covers the project-root read. --- src/skillify/skillopt-engine.ts | 24 ++++++++++++++++++++++++ src/skillify/skillopt-worker.ts | 14 +++++++------- tests/shared/skillopt-engine.test.ts | 23 ++++++++++++++++++++++- 3 files changed, 53 insertions(+), 8 deletions(-) diff --git a/src/skillify/skillopt-engine.ts b/src/skillify/skillopt-engine.ts index 24d1c591..4d1a5f8f 100644 --- a/src/skillify/skillopt-engine.ts +++ b/src/skillify/skillopt-engine.ts @@ -18,6 +18,7 @@ import { proposeSkillEdit, type ProposeConfig } from "./skill-proposer.js"; import { splitFrontmatter } from "./skill-publisher.js"; import type { QueryFn } from "./skill-invocations.js"; import type { Edit } from "./skill-edits.js"; +import type { PulledManifest } from "./manifest.js"; export interface ProposalRecord { name: string; @@ -105,3 +106,26 @@ export function readSkillBodyFromDisk(skillsRoot: string, name: string, author: return null; } } + +/** + * Resolve a skill's body from its ACTUAL install location via the pull manifest, + * trying every recorded installRoot, then a fallback root. 
Authoritative — handles + * skills pulled with `--to project` into any cwd (invocations come from all + * projects, so the worker can't assume its own cwd), and avoids editing a + * same-named skill that happens to sit in the current cwd. + */ +export function readSkillBodyViaManifest( + name: string, + author: string, + manifest: PulledManifest, + fallbackRoot?: string, +): string | null { + const dirName = `${name}--${author}`; + const roots = manifest.entries.filter((e) => e.dirName === dirName).map((e) => e.installRoot); + if (fallbackRoot) roots.push(fallbackRoot); + for (const root of roots) { + const body = readSkillBodyFromDisk(root, name, author); + if (body) return body; + } + return null; +} diff --git a/src/skillify/skillopt-worker.ts b/src/skillify/skillopt-worker.ts index 1e0596c2..ffaa1284 100644 --- a/src/skillify/skillopt-worker.ts +++ b/src/skillify/skillopt-worker.ts @@ -16,8 +16,9 @@ import { log as _log } from "../utils/debug.js"; import { loadConfig } from "../config.js"; import { DeeplakeApi } from "../deeplake-api.js"; import { getStateDir } from "./state-dir.js"; -import { runSkillOptCycle, writeProposalToDisk, readSkillBodyFromDisk } from "./skillopt-engine.js"; +import { runSkillOptCycle, writeProposalToDisk, readSkillBodyViaManifest } from "./skillopt-engine.js"; import { loadMeta, appendMeta, priorEditSummaries, alreadyProposed, metaEntryFor } from "./skillopt-meta.js"; +import { loadManifest } from "./manifest.js"; const log = (m: string) => _log("skillopt-worker", m); @@ -28,12 +29,11 @@ async function main(): Promise { const api = new DeeplakeApi(config.token, config.apiUrl, config.orgId, config.workspaceId, config.tableName); const query = (sql: string) => api.query(sql) as Promise>>; - // Read both the global root and the project-scoped root (skills pulled with - // `--to project` live under /.claude/skills; the detached worker inherits - // the SessionStart cwd). 
Without the project root, a deficient project-pulled - // skill would be silently skipped (readSkillBody → null). + // Resolve skill bodies via the pull manifest's recorded installRoot (authoritative) + // — invocations come from ALL projects, so we can't assume the worker's own cwd. + // The global ~/.claude/skills is a fallback for skills not in the manifest. + const manifest = loadManifest(); const skillsRoot = path.join(os.homedir(), ".claude", "skills"); - const projectRoot = path.join(process.cwd(), ".claude", "skills"); const proposalsRoot = path.join(getStateDir(), "skillopt", "proposals"); const metaFile = path.join(getStateDir(), "skillopt", "meta.jsonl"); const metaCache = loadMeta(metaFile); @@ -42,7 +42,7 @@ async function main(): Promise { const res = await runSkillOptCycle({ query, sessionsTable: config.sessionsTableName, - readSkillBody: (name, author) => readSkillBodyFromDisk(skillsRoot, name, author) ?? readSkillBodyFromDisk(projectRoot, name, author), + readSkillBody: (name, author) => readSkillBodyViaManifest(name, author, manifest, skillsRoot), writeProposal: (rec) => writeProposalToDisk(proposalsRoot, rec), meta: { prior: (n, a) => priorEditSummaries(metaCache, n, a), diff --git a/tests/shared/skillopt-engine.test.ts b/tests/shared/skillopt-engine.test.ts index cd9bf3cc..b06fda27 100644 --- a/tests/shared/skillopt-engine.test.ts +++ b/tests/shared/skillopt-engine.test.ts @@ -1,5 +1,9 @@ import { describe, it, expect, vi } from "vitest"; -import { runSkillOptCycle, type ProposalRecord } from "../../src/skillify/skillopt-engine.js"; +import fs from "node:fs"; +import os from "node:os"; +import path from "node:path"; +import { runSkillOptCycle, readSkillBodyViaManifest, type ProposalRecord } from "../../src/skillify/skillopt-engine.js"; +import type { PulledManifest } from "../../src/skillify/manifest.js"; const invRow = (skill: string, sid: string) => ({ message: { type: "tool_call", tool_name: "Skill", tool_input: JSON.stringify({ skill }), 
session_id: sid, timestamp: sid }, @@ -96,6 +100,23 @@ describe("runSkillOptCycle", () => { expect(res.proposals.find((p) => p.name === "bad0")!.changed).toBe(false); }); + it("reads a project-pulled skill body via the manifest's installRoot (not the cwd)", () => { + const projRoot = fs.mkdtempSync(path.join(os.tmpdir(), "proj-")); + try { + fs.mkdirSync(path.join(projRoot, "x--a"), { recursive: true }); + fs.writeFileSync(path.join(projRoot, "x--a", "SKILL.md"), "---\nname: x\nauthor: a\n---\n## Body\nproject body"); + const manifest = { + version: 1, + entries: [{ dirName: "x--a", name: "x", author: "a", installRoot: projRoot, projectKey: "", remoteVersion: 1, install: "project", installedAtVersion: 1, pulledAt: "", symlinks: [] }], + } as unknown as PulledManifest; + expect(readSkillBodyViaManifest("x", "a", manifest, "/nonexistent-global")).toBe("## Body\nproject body"); + // no manifest entry + no fallback body → null (not a silent wrong-skill edit) + expect(readSkillBodyViaManifest("y", "b", manifest, "/nonexistent-global")).toBeNull(); + } finally { + fs.rmSync(projRoot, { recursive: true, force: true }); + } + }); + it("honors a custom fireThreshold", async () => { const res = await runSkillOptCycle({ query: world(3), sessionsTable: "sessions", now: "t", fireThreshold: 3, From fe1b913ff25a5cc3f49d498553660631942338a5 Mon Sep 17 00:00:00 2001 From: kaghni Date: Fri, 5 Jun 2026 18:54:23 +0000 Subject: [PATCH 24/30] review: skip this package's SessionStart hook on internal claude -p calls (codex P2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit HIVEMIND_CAPTURE=false stopped capture, but the spawned judge/proposer claude -p still ran our SessionStart hook — injecting the large Deeplake context into the prompt and doing auto-pull/graph work, once per anchored invocation. Set HIVEMIND_WIKI_WORKER=1 (the guard internal runners use) so the hook returns immediately: no prompt contamination, no repeated background work. 
--- src/skillify/claude-model.ts | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/skillify/claude-model.ts b/src/skillify/claude-model.ts index 7be4bf87..f0f7641b 100644 --- a/src/skillify/claude-model.ts +++ b/src/skillify/claude-model.ts @@ -20,12 +20,14 @@ export function claudeModel(model: string, opts: { timeoutMs?: number } = {}): M // transcript text in the judge/proposer prompt can never trigger tool use. "--tools", "", ]; - // HIVEMIND_CAPTURE=false so these judge/proposer calls are NOT captured as - // real sessions — otherwise the engine pollutes the very sessions data it - // scans (and the synthetic prompts would show up as transcript rows). + // HIVEMIND_CAPTURE=false so these calls aren't captured as real sessions, AND + // HIVEMIND_WIKI_WORKER=1 so the spawned claude -p skips this package's SessionStart + // hook entirely (no Deeplake-context injection into the prompt, no auto-pull/graph + // work) — one child per anchored invocation would otherwise contaminate the judge + // prompt and pile up background work. Same guard the other internal runners use. const child = spawn("claude", args, { stdio: ["ignore", "pipe", "pipe"], - env: { ...process.env, HIVEMIND_CAPTURE: "false" }, + env: { ...process.env, HIVEMIND_CAPTURE: "false", HIVEMIND_WIKI_WORKER: "1" }, }); let out = ""; let err = ""; From 62907c66464c05b1838d3c1ed69e75134f63cbdf Mon Sep 17 00:00:00 2001 From: kaghni Date: Fri, 5 Jun 2026 18:54:23 +0000 Subject: [PATCH 25/30] review: match session rows exactly when reconstructing windows (codex P3) path LIKE '%sid%' could pull rows from an unrelated session (sid as a substring, or SQL LIKE wildcards in the id), feeding the judge a mixed transcript. Escape LIKE wildcards (+ ESCAPE clause) and drop any row whose message.session_id isn't the exact session. Test covers a cross-session collision row. 
--- src/skillify/skill-invocations.ts | 12 ++++++++++-- tests/shared/skill-invocations.test.ts | 13 +++++++++++++ 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/src/skillify/skill-invocations.ts b/src/skillify/skill-invocations.ts index 791a65d7..ae9ed9b7 100644 --- a/src/skillify/skill-invocations.ts +++ b/src/skillify/skill-invocations.ts @@ -108,18 +108,26 @@ export interface Turn { role: "USER" | "ASSISTANT"; text: string } * Reconstruct the transcript turns of a session, and mark where (between which two * turns) the given invocation happened — so callers can window around it. */ +/** Escape SQL LIKE wildcards (% _ \) so a session id with those chars matches literally. */ +function likeEscape(s: string): string { + return s.replace(/([\\%_])/g, "\\$1"); +} + async function sessionTurns( query: QueryFn, sessionsTable: string, inv: SkillInvocation, ): Promise<{ turns: Turn[]; invIndex: number }> { - const sid = sqlStr(inv.sessionId); + const sid = sqlStr(likeEscape(inv.sessionId)); const rows = await query( - `SELECT message FROM "${sessionsTable}" WHERE path LIKE '/sessions/%${sid}%' ORDER BY creation_date ASC`, + `SELECT message FROM "${sessionsTable}" WHERE path LIKE '/sessions/%${sid}%' ESCAPE '\\' ORDER BY creation_date ASC`, ); const turns: Turn[] = []; let invIndex = -1; for (const r of rows) { const j = parseMessage(r.message); if (!j) continue; + // Exact session match: `path LIKE %sid%` can match a substring/wildcard collision, + // so drop any row whose recorded session_id isn't this exact session. + if (typeof j.session_id === "string" && j.session_id !== inv.sessionId) continue; // The invocation itself is a tool_call (not a turn): mark its position then skip. 
const ref = invokedSkillRef(j); if (ref) { diff --git a/tests/shared/skill-invocations.test.ts b/tests/shared/skill-invocations.test.ts index ec8e584d..8e8e1bfd 100644 --- a/tests/shared/skill-invocations.test.ts +++ b/tests/shared/skill-invocations.test.ts @@ -106,4 +106,17 @@ describe("windowAroundInvocation", () => { expect(out).toContain("chars elided"); expect(out.length).toBeLessThan(300); }); + + it("drops rows from a different session + escapes LIKE wildcards (exact match)", async () => { + const { fn, calls } = mockQuery([ + { message: { type: "user_message", content: "first", session_id: "S1" } }, + { message: { type: "tool_call", tool_name: "Skill", tool_input: JSON.stringify({ skill: "posthog-smoke--kamo" }), session_id: "S1", timestamp: "t5" } }, + { message: { type: "assistant_message", content: "did X", session_id: "S1" } }, + { message: { type: "user_message", content: "LEAK from other session", session_id: "S2" } }, // collision → dropped + ]); + const out = await windowAroundInvocation(fn, TABLE, inv, { before: 5, after: 5 }); + expect(calls[0]).toContain("ESCAPE '\\'"); + expect(out).toContain("did X"); + expect(out).not.toContain("LEAK from other session"); + }); }); From 78d68a82554a2ec990039341c015ad34572f1f68 Mon Sep 17 00:00:00 2001 From: kaghni Date: Fri, 5 Jun 2026 18:54:23 +0000 Subject: [PATCH 26/30] review: create frontmatter when bumping a SKILL.md without one (codex P3) bumpVersion('') left the frontmatter empty while reporting newVersion: 2, so a published edit of a no-frontmatter skill carried no version and the gate couldn't tell versions apart. Now emits a minimal '---\nversion: N\n---\n' block. Test added. 
--- src/skillify/skill-publisher.ts | 11 ++++++++--- tests/shared/skill-publisher.test.ts | 5 +++++ 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/src/skillify/skill-publisher.ts b/src/skillify/skill-publisher.ts index 68a3f52d..a647dcb5 100644 --- a/src/skillify/skill-publisher.ts +++ b/src/skillify/skill-publisher.ts @@ -31,9 +31,14 @@ export function bumpVersion(frontmatter: string): { frontmatter: string; oldVers const m = frontmatter.match(/^version:\s*(\d+)\s*$/m); const oldVersion = m ? parseInt(m[1], 10) : 1; const newVersion = oldVersion + 1; - const next = m - ? frontmatter.replace(/^version:\s*\d+\s*$/m, `version: ${newVersion}`) - : frontmatter.replace(/\n---\n$/, `\nversion: ${newVersion}\n---\n`); + let next: string; + if (m) { + next = frontmatter.replace(/^version:\s*\d+\s*$/m, `version: ${newVersion}`); // has a version line + } else if (/\n---\n$/.test(frontmatter)) { + next = frontmatter.replace(/\n---\n$/, `\nversion: ${newVersion}\n---\n`); // frontmatter, no version + } else { + next = `---\nversion: ${newVersion}\n---\n`; // no frontmatter → create one + } return { frontmatter: next, oldVersion, newVersion }; } diff --git a/tests/shared/skill-publisher.test.ts b/tests/shared/skill-publisher.test.ts index 065b5929..62916504 100644 --- a/tests/shared/skill-publisher.test.ts +++ b/tests/shared/skill-publisher.test.ts @@ -27,6 +27,11 @@ describe("bumpVersion", () => { expect(r).toMatchObject({ oldVersion: 1, newVersion: 2 }); expect(r.frontmatter).toMatch(/version: 2\n---\n$/); }); + it("creates a frontmatter block when the doc has none", () => { + const r = bumpVersion(""); + expect(r).toMatchObject({ oldVersion: 1, newVersion: 2 }); + expect(r.frontmatter).toBe("---\nversion: 2\n---\n"); + }); }); describe("publishSkillEdit", () => { From 3335238b1146b9c28edb18584c7d52a684cb8681 Mon Sep 17 00:00:00 2001 From: kaghni Date: Fri, 5 Jun 2026 19:12:12 +0000 Subject: [PATCH 27/30] review: resolve the claude binary via findAgentBin, 
not PATH (codex P2) A detached hook worker may not have claude on PATH (e.g. ~/.claude/local/claude); spawn("claude") would ENOENT and the callers swallow it as no-change, so the weekly worker silently produced no proposals in those layouts. Resolve the binary the same way the rest of skillify does (findAgentBin('claude_code')). --- src/skillify/claude-model.ts | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/skillify/claude-model.ts b/src/skillify/claude-model.ts index f0f7641b..eba80ffc 100644 --- a/src/skillify/claude-model.ts +++ b/src/skillify/claude-model.ts @@ -5,6 +5,7 @@ * unit-testable with zero real calls. */ import { spawn } from "node:child_process"; +import { findAgentBin } from "./gate-runner.js"; /** (systemPrompt, userPrompt) -> raw model text. */ export type ModelCall = (systemPrompt: string, userPrompt: string) => Promise; @@ -25,7 +26,10 @@ export function claudeModel(model: string, opts: { timeoutMs?: number } = {}): M // hook entirely (no Deeplake-context injection into the prompt, no auto-pull/graph // work) — one child per anchored invocation would otherwise contaminate the judge // prompt and pile up background work. Same guard the other internal runners use. - const child = spawn("claude", args, { + // Resolve the claude binary the same way the rest of skillify does — a detached + // hook worker may not have it on PATH (e.g. ~/.claude/local/claude), and a bare + // "claude" would ENOENT and the callers would swallow it as no-change. 
+ const child = spawn(findAgentBin("claude_code"), args, { stdio: ["ignore", "pipe", "pipe"], env: { ...process.env, HIVEMIND_CAPTURE: "false", HIVEMIND_WIKI_WORKER: "1" }, }); From f8427c0669df9cbd2fc8e091b1a642850733bf9b Mon Sep 17 00:00:00 2001 From: kaghni Date: Fri, 5 Jun 2026 19:12:12 +0000 Subject: [PATCH 28/30] review: don't burn the weekly throttle when logged out (codex P2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit runWeeklySkillOpt stamped lastRun before the worker checks creds, so firing at a logged-out SessionStart consumed the week — a user who logged in minutes later was skipped as throttled until next week. Added a canFire gate (default: creds present) that returns reason 'no-creds' before any stamp/spawn. Test covers it. --- src/skillify/skillopt-trigger.ts | 13 ++++++++++++- tests/shared/skillopt-trigger.test.ts | 8 ++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/src/skillify/skillopt-trigger.ts b/src/skillify/skillopt-trigger.ts index f96a757a..81504d31 100644 --- a/src/skillify/skillopt-trigger.ts +++ b/src/skillify/skillopt-trigger.ts @@ -16,9 +16,15 @@ import { fileURLToPath } from "node:url"; import { log as _log } from "../utils/debug.js"; import { getStateDir } from "./state-dir.js"; import { tryAcquireWorkerLock, releaseWorkerLock } from "./state.js"; +import { loadCredentials } from "../commands/auth.js"; const log = (m: string) => _log("skillopt-trigger", m); +/** The worker needs creds to query Deeplake; default fire-gate skips when logged out. */ +function defaultHasCreds(): boolean { + try { return Boolean(loadCredentials()?.token); } catch { return false; } +} + export const WEEK_MS = 7 * 24 * 60 * 60 * 1000; /** Cross-process lock key arbitrating the weekly fire (see runWeeklySkillOpt). 
*/ const LOCK_KEY = "skillopt-weekly"; @@ -81,11 +87,12 @@ export interface FireDeps { tryLock?: () => boolean; // cross-process arbiter; default: real worker lock releaseLock?: () => void; // default: release the real worker lock reload?: () => SkillOptState; // fresh state re-read INSIDE the lock; default: loadState + canFire?: () => boolean; // gate before stamping; default: creds present } export interface FireResult { fired: boolean; - reason?: "disabled" | "in-worker" | "throttled" | "locked" | "spawned"; + reason?: "disabled" | "in-worker" | "throttled" | "locked" | "no-creds" | "spawned"; } /** @@ -102,6 +109,10 @@ export function runWeeklySkillOpt(deps: FireDeps = {}): FireResult { // Cheap pre-lock check: skip the lock entirely when clearly throttled. if (!shouldFire(state.lastRun, now)) return { fired: false, reason: "throttled" }; + // Don't burn the weekly throttle when logged out — stamping lastRun before the + // worker confirms creds would skip a user who logs in shortly after until next week. + if (!(deps.canFire ?? defaultHasCreds)()) return { fired: false, reason: "no-creds" }; + // Cross-process arbiter: two SessionStart hooks racing at the weekly boundary // could both pass the throttle and spawn duplicate workers (doubling user-side // cost once the worker does real LLM work). An atomic openSync(wx) worker-lock diff --git a/tests/shared/skillopt-trigger.test.ts b/tests/shared/skillopt-trigger.test.ts index 7049e5da..ff3c3751 100644 --- a/tests/shared/skillopt-trigger.test.ts +++ b/tests/shared/skillopt-trigger.test.ts @@ -34,6 +34,7 @@ describe("runWeeklySkillOpt (auto-fire decision)", () => { tryLock: () => true, // injected so the unit test touches no real lock file releaseLock: release, reload: () => over.state ?? 
{}, // in-lock re-read mirrors the pre-lock state by default + canFire: () => true, // injected so the unit test doesn't read real credentials ...over, }); return { res, saved, spawn, release }; @@ -47,6 +48,13 @@ describe("runWeeklySkillOpt (auto-fire decision)", () => { expect(release).toHaveBeenCalledTimes(1); // lock released after firing }); + it("does NOT fire or stamp when logged out (preserves the throttle for a fresh login)", () => { + const { res, saved, spawn } = harness({ state: {}, canFire: () => false }); + expect(res).toEqual({ fired: false, reason: "no-creds" }); + expect(spawn).not.toHaveBeenCalled(); + expect(saved).toEqual([]); // no stamp → next session after login fires + }); + it("does NOT fire when another process holds the weekly lock (cross-process race)", () => { const { res, saved, spawn } = harness({ state: {}, tryLock: () => false }); expect(res).toEqual({ fired: false, reason: "locked" }); From ae278af5ea4d5476ee689093bfe704b97335b57a Mon Sep 17 00:00:00 2001 From: kaghni Date: Fri, 5 Jun 2026 19:24:52 +0000 Subject: [PATCH 29/30] review: strict MCP config on model calls + match worker's config gate (codex P2) Two follow-ons from round 6: - claude-model: --tools "" denies tool USE but claude -p still LOADS the user's MCP config; a broken/oversized MCP schema fails every judge/proposer call before JSON. Added --strict-mcp-config to ignore user MCP entirely. - trigger: the canFire gate used the credentials FILE, but the worker uses loadConfig() (which also accepts HIVEMIND_TOKEN/ORG_ID env creds). Mismatch skipped env-cred users forever / stamped on a token-only malformed file. Now both use loadConfig()?.token. 
--- src/skillify/claude-model.ts | 4 ++++ src/skillify/skillopt-trigger.ts | 10 +++++++--- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/src/skillify/claude-model.ts b/src/skillify/claude-model.ts index eba80ffc..34ea08e6 100644 --- a/src/skillify/claude-model.ts +++ b/src/skillify/claude-model.ts @@ -20,6 +20,10 @@ export function claudeModel(model: string, opts: { timeoutMs?: number } = {}): M // any MCP/configured tools (a deny-list can't enumerate those), so prompt-injected // transcript text in the judge/proposer prompt can never trigger tool use. "--tools", "", + // --strict-mcp-config ignores the user's MCP config entirely (--tools only denies + // USE, not LOADING) — a broken/oversized user MCP schema would otherwise fail every + // judge/proposer call before it returns JSON, silently stopping proposals. + "--strict-mcp-config", ]; // HIVEMIND_CAPTURE=false so these calls aren't captured as real sessions, AND // HIVEMIND_WIKI_WORKER=1 so the spawned claude -p skips this package's SessionStart diff --git a/src/skillify/skillopt-trigger.ts b/src/skillify/skillopt-trigger.ts index 81504d31..cc9e7d9e 100644 --- a/src/skillify/skillopt-trigger.ts +++ b/src/skillify/skillopt-trigger.ts @@ -16,13 +16,17 @@ import { fileURLToPath } from "node:url"; import { log as _log } from "../utils/debug.js"; import { getStateDir } from "./state-dir.js"; import { tryAcquireWorkerLock, releaseWorkerLock } from "./state.js"; -import { loadCredentials } from "../commands/auth.js"; +import { loadConfig } from "../config.js"; const log = (m: string) => _log("skillopt-trigger", m); -/** The worker needs creds to query Deeplake; default fire-gate skips when logged out. */ +/** + * Fire-gate: the worker queries Deeplake via loadConfig(), which accepts both the + * credentials file AND env creds (HIVEMIND_TOKEN/HIVEMIND_ORG_ID). Use the SAME check + * here so we neither skip env-cred users forever nor stamp on a malformed token-only file. 
+ */ function defaultHasCreds(): boolean { - try { return Boolean(loadCredentials()?.token); } catch { return false; } + try { return Boolean(loadConfig()?.token); } catch { return false; } } export const WEEK_MS = 7 * 24 * 60 * 60 * 1000; From 06ca4cb54f229f8added68f03501ac87d222fff8 Mon Sep 17 00:00:00 2001 From: kaghni Date: Fri, 5 Jun 2026 20:24:10 +0000 Subject: [PATCH 30/30] feat(skillopt): env-configurable worker thresholds (defaults unchanged) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The weekly worker's lookback + detector thresholds are now overridable via env — HIVEMIND_SKILLOPT_{LOOKBACK_DAYS,MIN_INVOCATIONS,FAILURE_RATE,FIRE_THRESHOLD} — with the same defaults (30d, detector min-n, 0.4 rate, >=5 fire). A positive override wins; non-numeric/<=0 falls back. Useful for tuning + smaller-data testing. --- src/skillify/skillopt-worker.ts | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/skillify/skillopt-worker.ts b/src/skillify/skillopt-worker.ts index ffaa1284..d9f1ecd7 100644 --- a/src/skillify/skillopt-worker.ts +++ b/src/skillify/skillopt-worker.ts @@ -37,7 +37,12 @@ async function main(): Promise { const proposalsRoot = path.join(getStateDir(), "skillopt", "proposals"); const metaFile = path.join(getStateDir(), "skillopt", "meta.jsonl"); const metaCache = loadMeta(metaFile); - const sinceIso = new Date(Date.now() - 30 * 24 * 60 * 60 * 1000).toISOString(); // 30-day lookback + // Lookback + thresholds are env-tunable (defaults: 30-day window, the detector's + // own min-n, and a ≥5-deficient fire gate). A positive override wins; anything + // non-numeric/≤0 falls back to the default. + const envNum = (k: string): number | undefined => { const n = Number(process.env[k]); return Number.isFinite(n) && n > 0 ? n : undefined; }; + const lookbackDays = envNum("HIVEMIND_SKILLOPT_LOOKBACK_DAYS") ?? 
30; + const sinceIso = new Date(Date.now() - lookbackDays * 24 * 60 * 60 * 1000).toISOString(); const res = await runSkillOptCycle({ query, @@ -49,7 +54,12 @@ async function main(): Promise { has: (n, a, edits) => alreadyProposed(metaCache, n, a, edits), record: (n, a, edits) => { const e = metaEntryFor(n, a, edits, new Date().toISOString()); appendMeta(metaFile, e); metaCache.push(e); }, }, - detector: { sinceIso, limit: 5000 }, + detector: { + sinceIso, limit: 5000, + minInvocations: envNum("HIVEMIND_SKILLOPT_MIN_INVOCATIONS"), + failureRateThreshold: envNum("HIVEMIND_SKILLOPT_FAILURE_RATE"), + }, + fireThreshold: envNum("HIVEMIND_SKILLOPT_FIRE_THRESHOLD"), now: new Date().toISOString(), });