diff --git a/.gitignore b/.gitignore index a0b7da6d..d6754c7c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,4 @@ -node_modules/ +node_modules dist/ tmp/ *.js.map diff --git a/src/hooks/session-start.ts b/src/hooks/session-start.ts index af944953..0e63390b 100644 --- a/src/hooks/session-start.ts +++ b/src/hooks/session-start.ts @@ -14,7 +14,6 @@ import { loadConfig } from "../config.js"; import { DeeplakeApi } from "../deeplake-api.js"; import { sqlStr } from "../utils/sql.js"; import { projectNameFromCwd } from "../utils/project-name.js"; -import { listActiveOrgSkills, sessionBucket, buildSkillsActiveInsert, buildSkillsActivePath, skillRootsForCwd } from "../skillify/skills-active.js"; import { readStdin } from "../utils/stdin.js"; import { log as _log } from "../utils/debug.js"; import { getInstalledVersion } from "../utils/version-check.js"; @@ -223,11 +222,6 @@ async function main(): Promise { // freezes SessionStart. Hard opt-out via HIVEMIND_AUTOPULL_DISABLED=1. // All failures swallowed inside autoPullSkills (documented as // never-rejecting), so no try/catch needed here. - // - // Runs BEFORE the skill-attribution snapshot below so that a skill pulled - // (or upgraded) during THIS SessionStart is reflected in the recorded - // skills_active set — otherwise the row would capture a stale/empty set - // while the session can already use the freshly-pulled skill. const pullResult = await autoPullSkills(); log(`autopull: pulled=${pullResult.pulled} skipped=${pullResult.skipped}`); @@ -244,42 +238,6 @@ async function main(): Promise { await api.ensureSessionsTable(sessionsTable); await createPlaceholder(api, table, input.session_id, input.cwd ?? "", config.userName, config.orgName, config.workspaceId, pluginVersion); log("placeholder created"); - - // Skill attribution (measurement): record which org-shared skills were in - // context this session + a deterministic A/B bucket. This is the label that - // makes skill value measurable (sessions with vs without skill X / v1 vs v2). - // Org skills are identified via the pull manifest (authoritative), not the - // `--` dirname pattern. Snapshot runs after auto-pull (above) so it reflects - // freshly-pulled skills. Opt-out: HIVEMIND_SKILL_ATTRIBUTION=0. - // Swallowed — must never fail SessionStart. - if (process.env.HIVEMIND_SKILL_ATTRIBUTION !== "0") { - try { - // Scan global + project-scoped (/.claude/skills) roots so - // skills pulled with `--to project` are attributed too. - const skills = listActiveOrgSkills(skillRootsForCwd(input.cwd)); - // Distinct `/skills_active/` namespace (NOT `/sessions/`) so the summary / - // raw-transcript readers never mistake this attribution row for a transcript. - const attrSessionPath = buildSkillsActivePath(config, input.session_id); - const attrFilename = attrSessionPath.slice(attrSessionPath.lastIndexOf("/") + 1); - const sql = buildSkillsActiveInsert({ - sessionsTable, - sessionPath: attrSessionPath, - filename: attrFilename, - userName: config.userName, - projectName: projectNameFromCwd(input.cwd), - pluginVersion, - sessionId: input.session_id, - cwd: input.cwd, - skills, - bucket: sessionBucket(input.session_id), - ts: new Date().toISOString(), - }); - await api.query(sql); - log(`skills_active recorded: ${skills.length} org skills, bucket ${sessionBucket(input.session_id)}`); - } catch (e: any) { - log(`skills_active attribution failed (swallowed): ${e?.message ?? e}`); - } - } } else { const reason = process.env.HIVEMIND_CAPTURE === "false" ? "HIVEMIND_CAPTURE=false" diff --git a/src/skillify/claude-model.ts b/src/skillify/claude-model.ts new file mode 100644 index 00000000..34ea08e6 --- /dev/null +++ b/src/skillify/claude-model.ts @@ -0,0 +1,53 @@ +/** + * Shared `claude -p` backend for the engine's LLM steps (success-judge, proposer). + * All tools denied → pure-text generation. Runs on the USER's own agent, so cost + * lands on the user. Returned as an injectable ModelCall so every LLM step is + * unit-testable with zero real calls. + */ +import { spawn } from "node:child_process"; +import { findAgentBin } from "./gate-runner.js"; + +/** (systemPrompt, userPrompt) -> raw model text. */ +export type ModelCall = (systemPrompt: string, userPrompt: string) => Promise; + +export function claudeModel(model: string, opts: { timeoutMs?: number } = {}): ModelCall { + const timeoutMs = opts.timeoutMs ?? 120_000; + return (system, user) => new Promise((resolve, reject) => { + const args = [ + "-p", user, "--model", model, "--no-session-persistence", + "--output-format", "json", "--system-prompt", system, + // Empty allow-list = NO tools available. Authoritative: it covers built-ins AND + // any MCP/configured tools (a deny-list can't enumerate those), so prompt-injected + // transcript text in the judge/proposer prompt can never trigger tool use. + "--tools", "", + // --strict-mcp-config ignores the user's MCP config entirely (--tools only denies + // USE, not LOADING) — a broken/oversized user MCP schema would otherwise fail every + // judge/proposer call before it returns JSON, silently stopping proposals. + "--strict-mcp-config", + ]; + // HIVEMIND_CAPTURE=false so these calls aren't captured as real sessions, AND + // HIVEMIND_WIKI_WORKER=1 so the spawned claude -p skips this package's SessionStart + // hook entirely (no Deeplake-context injection into the prompt, no auto-pull/graph + // work) — one child per anchored invocation would otherwise contaminate the judge + // prompt and pile up background work. Same guard the other internal runners use. + // Resolve the claude binary the same way the rest of skillify does — a detached + // hook worker may not have it on PATH (e.g. ~/.claude/local/claude), and a bare + // "claude" would ENOENT and the callers would swallow it as no-change. + const child = spawn(findAgentBin("claude_code"), args, { + stdio: ["ignore", "pipe", "pipe"], + env: { ...process.env, HIVEMIND_CAPTURE: "false", HIVEMIND_WIKI_WORKER: "1" }, + }); + let out = ""; + let err = ""; + const timer = setTimeout(() => { child.kill("SIGKILL"); reject(new Error("claude timed out")); }, timeoutMs); + child.stdout.on("data", (d) => { out += String(d); }); + child.stderr.on("data", (d) => { err += String(d); }); + child.on("error", (e) => { clearTimeout(timer); reject(e); }); + child.on("close", (code) => { + clearTimeout(timer); + if (code !== 0) return reject(new Error(`claude exit ${code}: ${err.slice(0, 200)}`)); + try { resolve(String((JSON.parse(out) as { result?: unknown }).result ?? "")); } + catch { resolve(out); } + }); + }); +} diff --git a/src/skillify/deficiency-detector.ts b/src/skillify/deficiency-detector.ts new file mode 100644 index 00000000..73b77514 --- /dev/null +++ b/src/skillify/deficiency-detector.ts @@ -0,0 +1,118 @@ +/** + * Deficiency detector — the core of the engine's "which skills are bad" step. + * + * For each org-skill invocation: window the transcript around it, run the FREE + * level-1 anchor (user pushback?), and only if anchored spend a level-2 judge + * call (was the task actually accomplished?). A "confirmed failure" requires BOTH + * — high precision, so we never churn a good skill. Aggregate per skill: a skill + * is deficient if it has enough invocations AND a high confirmed-failure rate. + * + * Token discipline: the judge runs ONLY on anchored windows (a fraction), on a + * windowed slice (not whole sessions). Everything injectable (query + judge model) + * so the whole orchestration is unit-tested with zero live Deeplake / LLM. + * + * The ≥5 fire gate lives with the caller (worker): we just return deficientCount. + */ +import { + listSkillInvocations, windowedTurns, elide, type QueryFn, type SkillInvocation, +} from "./skill-invocations.js"; +import { detectAnchor } from "./session-anchor.js"; +import { judgeSuccess, type ModelCall } from "./success-judge.js"; + +export interface SkillDeficiency { + name: string; + author: string; + invocations: number; // org-skill invocations examined + anchored: number; // had a level-1 anchor → judged + confirmedFailures: number; // anchor AND judge said success=0 + failureRate: number; // confirmedFailures / invocations + deficient: boolean; // failureRate >= threshold AND invocations >= minInvocations + examples: string[]; // a few failure reasons (for the proposer) +} + +export interface DetectorConfig { + minInvocations?: number; // min-n per skill before we trust the rate (default 8) + failureRateThreshold?: number; // confirmed-failure rate to flag deficient (default 0.4) + window?: { before?: number; after?: number; maxChars?: number }; + judge?: ModelCall; // injected; default = real claude judge + sinceIso?: string; // lookback bound + limit?: number; // cap invocation rows pulled +} + +const skillKey = (name: string, author: string) => `${name}--${author}`; + +export interface ScoreConfig { + window?: { before?: number; after?: number; maxChars?: number }; + judge?: ModelCall; +} + +/** + * Score a set of invocations: window each, run the free anchor, and judge ONLY the + * anchored ones. Shared by the detector (per-skill deficiency) and the edit gate + * (a skill's failure rate in a time window). + */ +export async function scoreInvocations( + query: QueryFn, + sessionsTable: string, + invocations: SkillInvocation[], + cfg: ScoreConfig = {}, +): Promise<{ anchored: number; confirmed: number; examples: string[] }> { + let anchored = 0; + let confirmed = 0; + const examples: string[] = []; + for (const inv of invocations) { + const { turns, pivot } = await windowedTurns(query, sessionsTable, inv, cfg.window); + const anchor = detectAnchor(turns, pivot); // anchor only on post-invocation reaction + if (!anchor.anchored) continue; // free filter — no judge call + anchored++; + const window = elide(turns.map((t) => `${t.role}: ${t.text}`).join("\n\n"), cfg.window?.maxChars ?? 4000); + const verdict = await judgeSuccess(window, { model: cfg.judge }); + if (verdict.success === 0) { + confirmed++; + if (examples.length < 3) examples.push(verdict.reason || anchor.evidence); + } + } + return { anchored, confirmed, examples }; +} + +export interface DetectionResult { + skills: SkillDeficiency[]; + deficientCount: number; +} + +export async function detectDeficientSkills( + query: QueryFn, + sessionsTable: string, + cfg: DetectorConfig = {}, +): Promise { + const minInvocations = cfg.minInvocations ?? 8; + const threshold = cfg.failureRateThreshold ?? 0.4; + + const invocations = await listSkillInvocations(query, sessionsTable, { sinceIso: cfg.sinceIso, limit: cfg.limit }); + + const groups = new Map(); + for (const inv of invocations) { + const k = skillKey(inv.name, inv.author); + const arr = groups.get(k); + if (arr) arr.push(inv); else groups.set(k, [inv]); + } + + const skills: SkillDeficiency[] = []; + for (const list of groups.values()) { + const { anchored, confirmed, examples } = await scoreInvocations(query, sessionsTable, list, cfg); + const failureRate = list.length ? confirmed / list.length : 0; + skills.push({ + name: list[0].name, + author: list[0].author, + invocations: list.length, + anchored, + confirmedFailures: confirmed, + failureRate, + deficient: list.length >= minInvocations && failureRate >= threshold, + examples, + }); + } + + skills.sort((a, b) => b.failureRate - a.failureRate || b.invocations - a.invocations); + return { skills, deficientCount: skills.filter((s) => s.deficient).length }; +} diff --git a/src/skillify/session-anchor.ts b/src/skillify/session-anchor.ts new file mode 100644 index 00000000..83056d8d --- /dev/null +++ b/src/skillify/session-anchor.ts @@ -0,0 +1,50 @@ +/** + * Heuristic "anchor" — a HARD, observable signal in the transcript that a session + * went badly, independent of any LLM judgment: the user pushed back on / corrected + * what the assistant just did. Pure + free (no LLM, no I/O). + * + * It's the level-1 filter in the outcome pipeline: only windows with an anchor go + * to the (paid) success-judge, and a session is labelled a failure only when the + * anchor AND the judge agree. So this is deliberately tuned for RECALL over + * precision — a false positive just costs one judge call (which then drops it), + * but a false negative under-detects (conservative — it never churns a good skill). + * Patterns are meant to be tuned against real sessions; this is a starting set. + */ +import type { Turn } from "./skill-invocations.js"; + +export type AnchorKind = "correction" | "none"; +export interface Anchor { + anchored: boolean; + kind: AnchorKind; + evidence: string; // the user turn that triggered it (truncated) +} + +// Unambiguous correction — ALWAYS an anchor, even amid polite words. This must +// win over BENIGN so "thanks, but this is still failing" still fires. +const STRONG = /\b(wrong|incorrect|not what|that'?s not|does ?n'?t work|did ?n'?t work|do ?n'?t work|wo ?n'?t work|is ?n'?t|broke|broken|still (failing|broken|not working|wrong|the same)|try again|undo|revert that|that fail|not right)/i; + +// Ambiguous negation: "no" is pushback ("no, that's off") but also benign +// ("no problem"), so it only anchors when the turn isn't a clear benign phrase. +const AMBIGUOUS = /\b(no|nope)\b/i; +const BENIGN = /\b(no (problem|worries|need|biggie)|no,? thanks|all good|works? (now|great|fine|perfectly)|that works|perfect|looks good)\b/i; + +/** + * Detect a correction anchor in a windowed slice of turns. A pushback is a USER turn + * reacting to an ASSISTANT turn — and BOTH must be POST-invocation (index ≥ fromIndex), + * so a correction that happened BEFORE the skill ran (e.g. the skill was a repair + * attempt) isn't misattributed to this skill. fromIndex defaults to 0 (scan all). + * Recall-oriented: a strong correction fires regardless of polite framing; only the + * bare "no" is benign-gated. + */ +export function detectAnchor(turns: Turn[], fromIndex = 0): Anchor { + for (let i = Math.max(1, fromIndex); i < turns.length; i++) { + const t = turns[i]; + if (t.role !== "USER" || turns[i - 1].role !== "ASSISTANT") continue; + if (i - 1 < fromIndex) continue; // the assistant being reacted to must be post-invocation + const anchored = STRONG.test(t.text) || (AMBIGUOUS.test(t.text) && !BENIGN.test(t.text)); + if (anchored) { + return { anchored: true, kind: "correction", evidence: t.text.slice(0, 200) }; + } + } + return { anchored: false, kind: "none", evidence: "" }; +} diff --git a/src/skillify/skill-edit-gate.ts b/src/skillify/skill-edit-gate.ts new file mode 100644 index 00000000..50d01450 --- /dev/null +++ b/src/skillify/skill-edit-gate.ts @@ -0,0 +1,89 @@ +/** + * Edit-outcome gate — the validation organ (the paper's gate, adapted). + * + * A randomized A/B is the ideal, but it needs the skill VERSION recorded at + * invocation time (a capture change we don't have yet — the Skill tool_use only + * carries the skill name). So the feasible gate is LONGITUDINAL: after an edit is + * published, compare the skill's confirmed-failure rate in the window AFTER publish + * vs BEFORE. A real drop = the edit helped → keep; a real rise = it hurt → revert + * (one `cp` from the SKILL.v.bak backup). No clear signal / too few post-publish + * uses → inconclusive (wait, or revert when stale). + * + * It's OBSERVATIONAL (confounded — the population shifts week to week), so it needs + * a margin + a minimum sample. Randomized A/B is the clean upgrade once invocation- + * version capture lands. Reuses scoreInvocations, so the same anchor+judge that + * detects deficiency also validates the fix. Injected query/judge → unit-testable. + */ +import { listSkillInvocations, type QueryFn } from "./skill-invocations.js"; +import { scoreInvocations } from "./deficiency-detector.js"; +import type { ModelCall } from "./claude-model.js"; + +export interface WindowStats { + invocations: number; + anchored: number; + confirmed: number; + failureRate: number; // confirmed / invocations +} + +export interface GateDecision { + before: WindowStats; + after: WindowStats; + delta: number; // before.failureRate - after.failureRate (positive = improved) + decision: "keep" | "revert" | "inconclusive"; +} + +interface MeasureOpts { + sinceIso?: string; + untilIso?: string; + limit?: number; + window?: { before?: number; after?: number; maxChars?: number }; + judge?: ModelCall; +} + +/** Confirmed-failure rate for one skill over a time window. */ +export async function measureSkillFailureRate( + query: QueryFn, + sessionsTable: string, + name: string, + author: string, + opts: MeasureOpts = {}, +): Promise { + const all = await listSkillInvocations(query, sessionsTable, { sinceIso: opts.sinceIso, untilIso: opts.untilIso, limit: opts.limit }); + const mine = all.filter((i) => i.name === name && i.author === author); + const { anchored, confirmed } = await scoreInvocations(query, sessionsTable, mine, { window: opts.window, judge: opts.judge }); + return { invocations: mine.length, anchored, confirmed, failureRate: mine.length ? confirmed / mine.length : 0 }; +} + +/** Pure decision from before/after stats. */ +export function gateEditOutcome( + before: WindowStats, + after: WindowStats, + opts: { margin?: number; minAfter?: number } = {}, +): GateDecision { + const margin = opts.margin ?? 0.2; + const minAfter = opts.minAfter ?? 5; + const delta = before.failureRate - after.failureRate; + let decision: GateDecision["decision"]; + if (after.invocations < minAfter) decision = "inconclusive"; // not enough post-publish use + else if (delta >= margin) decision = "keep"; // failure rate dropped → helped + else if (after.failureRate - before.failureRate >= margin) decision = "revert"; // got measurably worse + else decision = "inconclusive"; // no clear signal + return { before, after, delta, decision }; +} + +/** Full gate: measure before/after a publish timestamp and decide. */ +export async function gateEdit( + query: QueryFn, + sessionsTable: string, + name: string, + author: string, + publishIso: string, + opts: { windowDays?: number; nowIso?: string; margin?: number; minAfter?: number } & MeasureOpts = {}, +): Promise { + const windowDays = opts.windowDays ?? 14; + const beforeSince = new Date(Date.parse(publishIso) - windowDays * 24 * 60 * 60 * 1000).toISOString(); + const shared = { limit: opts.limit, window: opts.window, judge: opts.judge }; + const before = await measureSkillFailureRate(query, sessionsTable, name, author, { ...shared, sinceIso: beforeSince, untilIso: publishIso }); + const after = await measureSkillFailureRate(query, sessionsTable, name, author, { ...shared, sinceIso: publishIso, untilIso: opts.nowIso }); + return gateEditOutcome(before, after, opts); +} diff --git a/src/skillify/skill-edits.ts b/src/skillify/skill-edits.ts new file mode 100644 index 00000000..115c1aa9 --- /dev/null +++ b/src/skillify/skill-edits.ts @@ -0,0 +1,109 @@ +/** + * Structured, bounded edits over a markdown SKILL.md — the paper's edit operations + * (append / insert_after / replace / delete) and "textual learning rate" (edit + * budget). Port of SkillOpt's skillopt/optimizer/skill.py. + * + * A protected region — between and + * — holds longitudinal guidance that fast per-edit changes must NOT touch (the + * paper's slow-update). Edits targeting it are skipped, and `append` lands above it. + * + * Pure + deterministic — no I/O, fully unit-testable. + */ +export type EditOp = "append" | "insert_after" | "replace" | "delete"; +export interface Edit { + op: EditOp; + target?: string; // anchor text for insert_after / replace / delete + content?: string; // new text for append / insert_after / replace +} + +export const SU_START = ""; +export const SU_END = ""; + +function protectedRange(skill: string): [number, number] | null { + const a = skill.indexOf(SU_START); + const b = skill.indexOf(SU_END); + if (a === -1 || b === -1 || b < a) return null; + return [a, b + SU_END.length]; +} + +function targetsProtected(skill: string, target: string): boolean { + const r = protectedRange(skill); + if (!r || !target) return false; + const idx = skill.indexOf(target); + if (idx === -1) return false; + // Reject if the target RANGE [idx, idx+len) overlaps the protected range at all — + // not just if it starts inside it (a target that begins just before SLOW_UPDATE_START + // and spans into the block must not be allowed to delete protected guidance). + return idx < r[1] && idx + target.length > r[0]; +} + +/** Enforce the edit budget ("textual learning rate"): keep at most `budget` edits. */ +export function selectEdits(edits: Edit[], budget: number): Edit[] { + return edits.slice(0, Math.max(0, budget)); +} + +export interface ApplyResult { + skill: string; + report: string[]; + applied: number; // how many edits actually changed the doc +} + +/** Apply bounded structured edits; protected-region targets are skipped. */ +export function applyEdits(skill: string, edits: Edit[]): ApplyResult { + let s = skill; + const report: string[] = []; + let applied = 0; + const ok = (msg: string) => { applied++; report.push(`OK ${msg}`); }; + + for (const e of edits) { + if (e.target && targetsProtected(s, e.target)) { + report.push(`SKIP ${e.op}: targets protected slow-update region`); + continue; + } + switch (e.op) { + case "append": { + const content = (e.content ?? "").trim(); + if (!content) { report.push("SKIP append: empty content"); break; } + const r = protectedRange(s); + if (r) s = s.slice(0, r[0]) + content + "\n\n" + s.slice(r[0]); + else s = s.replace(/\s*$/, "") + "\n\n" + content + "\n"; + ok(`append (+${content.length} chars)`); + break; + } + case "insert_after": { + const target = e.target ?? ""; + const content = (e.content ?? "").trim(); + if (!target || !content) { report.push("SKIP insert_after: missing target/content"); break; } + const idx = s.indexOf(target); + if (idx === -1) { report.push("SKIP insert_after: target not found"); break; } + const lineEnd = s.indexOf("\n", idx + target.length); + const at = lineEnd === -1 ? s.length : lineEnd; + s = s.slice(0, at) + "\n" + content + s.slice(at); + ok("insert_after"); + break; + } + case "replace": { + const target = e.target ?? ""; + const content = e.content ?? ""; + if (!target) { report.push("SKIP replace: missing target"); break; } + const idx = s.indexOf(target); + if (idx === -1) { report.push("SKIP replace: target not found"); break; } + s = s.slice(0, idx) + content + s.slice(idx + target.length); + ok("replace"); + break; + } + case "delete": { + const target = e.target ?? ""; + if (!target) { report.push("SKIP delete: missing target"); break; } + const idx = s.indexOf(target); + if (idx === -1) { report.push("SKIP delete: target not found"); break; } + s = s.slice(0, idx) + s.slice(idx + target.length); + ok("delete"); + break; + } + default: + report.push(`SKIP unknown op: ${(e as Edit).op}`); + } + } + return { skill: s, report, applied }; +} diff --git a/src/skillify/skill-invocations.ts b/src/skillify/skill-invocations.ts new file mode 100644 index 00000000..ae9ed9b7 --- /dev/null +++ b/src/skillify/skill-invocations.ts @@ -0,0 +1,192 @@ +/** + * Read side of skill *invocation* attribution — the basis for deficiency detection. + * + * A skill can only help or hurt if the agent actually INVOKED it. Claude Code + * records each invocation as a `Skill` tool_use, which capture.ts persists as a + * tool_call row: `message.tool_name === "Skill"`, `message.tool_input` a JSON + * string `{ skill: "--", args? }`. We key on these real invocations + * rather than availability (the dropped skills_active) because: + * - it's accurate — availability-without-invocation is pure noise, and + * - it pins the exact turn, so we can window the judge tightly around it. + * + * Org skills only: the invoked `skill` is `--`. Plugin-namespaced + * (`hivemind:...`) and bare skills are not org-mined skills and are skipped. + * + * Every query is injected (QueryFn), so this is unit-testable with no live Deeplake. + */ +import { sqlStr } from "../utils/sql.js"; + +export type QueryFn = (sql: string) => Promise>>; + +export interface SkillInvocation { + sessionId: string; + name: string; + author: string; + ts: string; // invocation timestamp (message.timestamp, else the row's last_update_date) +} + +interface ParsedMsg { + type?: string; + tool_name?: string; + tool_input?: unknown; + content?: unknown; + session_id?: unknown; + timestamp?: unknown; +} + +function parseMessage(m: unknown): ParsedMsg | null { + if (m == null) return null; + if (typeof m === "string") { + try { return JSON.parse(m) as ParsedMsg; } catch { return null; } + } + if (typeof m === "object") return m as ParsedMsg; + return null; +} + +/** The skill ref invoked by a tool_call message (e.g. "name--author"), else null. */ +export function invokedSkillRef(msg: ParsedMsg): string | null { + if (msg.type !== "tool_call" || msg.tool_name !== "Skill") return null; + let input: unknown = msg.tool_input; + if (typeof input === "string") { try { input = JSON.parse(input); } catch { return null; } } + const skill = (input as { skill?: unknown })?.skill; + return typeof skill === "string" && skill.length > 0 ? skill : null; +} + +/** Split "--" → parts. null for plugin-namespaced / bare / malformed refs. */ +export function splitOrgSkill(skill: string): { name: string; author: string } | null { + if (skill.includes(":")) return null; // plugin-namespaced (e.g. hivemind:hivemind-memory) + // name/author are used to build filesystem paths (skills dir, proposals dir), so a + // captured tool_input must not smuggle path separators / traversal — same untrusted + // treatment the pull path applies to these segments. + if (skill.includes("/") || skill.includes("\\") || skill.includes("..")) return null; + const i = skill.lastIndexOf("--"); + if (i <= 0 || i + 2 >= skill.length) return null; // bare or malformed + return { name: skill.slice(0, i), author: skill.slice(i + 2) }; +} + +/** + * Org-skill invocations across captured sessions, newest first. Coarse prefilter + * on `"Skill"` (robust to JSONB colon-spacing) then a precise in-code check, so a + * stray "Skill" in prose can't slip through as a real invocation. + */ +export async function listSkillInvocations( + query: QueryFn, + sessionsTable: string, + opts: { sinceIso?: string; untilIso?: string; limit?: number } = {}, +): Promise { + const where = [`CAST(message AS TEXT) LIKE '%"Skill"%'`]; + if (opts.sinceIso) where.push(`last_update_date >= '${sqlStr(opts.sinceIso)}'`); + if (opts.untilIso) where.push(`last_update_date < '${sqlStr(opts.untilIso)}'`); + const limit = opts.limit && opts.limit > 0 ? ` LIMIT ${Math.floor(opts.limit)}` : ""; + const rows = await query( + `SELECT message, last_update_date FROM "${sessionsTable}" WHERE ${where.join(" AND ")} ORDER BY last_update_date DESC${limit}`, + ); + const out: SkillInvocation[] = []; + for (const r of rows) { + const m = parseMessage(r.message); + if (!m) continue; + const ref = invokedSkillRef(m); + if (!ref) continue; + const parts = splitOrgSkill(ref); + if (!parts) continue; + const sessionId = typeof m.session_id === "string" ? m.session_id : ""; + if (!sessionId) continue; + out.push({ + sessionId, + name: parts.name, + author: parts.author, + ts: typeof m.timestamp === "string" ? m.timestamp + : (typeof r.last_update_date === "string" ? r.last_update_date : ""), + }); + } + return out; +} + +export interface Turn { role: "USER" | "ASSISTANT"; text: string } + +/** + * Reconstruct the transcript turns of a session, and mark where (between which two + * turns) the given invocation happened — so callers can window around it. + */ +/** Escape SQL LIKE wildcards (% _ \) so a session id with those chars matches literally. */ +function likeEscape(s: string): string { + return s.replace(/([\\%_])/g, "\\$1"); +} + +async function sessionTurns( + query: QueryFn, sessionsTable: string, inv: SkillInvocation, +): Promise<{ turns: Turn[]; invIndex: number }> { + const sid = sqlStr(likeEscape(inv.sessionId)); + const rows = await query( + `SELECT message FROM "${sessionsTable}" WHERE path LIKE '/sessions/%${sid}%' ESCAPE '\\' ORDER BY creation_date ASC`, + ); + const turns: Turn[] = []; + let invIndex = -1; + for (const r of rows) { + const j = parseMessage(r.message); + if (!j) continue; + // Exact session match: `path LIKE %sid%` can match a substring/wildcard collision, + // so drop any row whose recorded session_id isn't this exact session. + if (typeof j.session_id === "string" && j.session_id !== inv.sessionId) continue; + // The invocation itself is a tool_call (not a turn): mark its position then skip. + const ref = invokedSkillRef(j); + if (ref) { + const p = splitOrgSkill(ref); + if (invIndex < 0 && p && p.name === inv.name && p.author === inv.author + && (typeof j.timestamp !== "string" || !inv.ts || j.timestamp === inv.ts)) { + invIndex = turns.length; + } + continue; + } + const text = typeof j.content === "string" ? j.content.trim() : ""; + if (!text) continue; + if (j.type === "user_message") turns.push({ role: "USER", text }); + else if (j.type === "assistant_message") turns.push({ role: "ASSISTANT", text }); + } + if (invIndex < 0) invIndex = turns.length; // invocation not located → treat as session end + return { turns, invIndex }; +} + +/** + * The transcript window around an invocation: `before` turns before it and `after` + * turns after — where the help-or-harm signal lives — head+tail elided to maxChars. + * `before`/`after` are tunable; defaults chosen as a small starting point. + */ +/** A windowed slice plus `pivot` = the index in `turns` of the first POST-invocation + * turn (turns before it are the pre-invocation context — kept for the judge, but the + * anchor must not scan them, or a prior correction gets misattributed to this skill). */ +export interface WindowSlice { + turns: Turn[]; + pivot: number; +} + +export async function windowedTurns( + query: QueryFn, + sessionsTable: string, + inv: SkillInvocation, + opts: { before?: number; after?: number } = {}, +): Promise { + const before = opts.before ?? 3; + const after = opts.after ?? 6; + const { turns, invIndex } = await sessionTurns(query, sessionsTable, inv); + const start = Math.max(0, invIndex - before); + return { turns: turns.slice(start, invIndex + after), pivot: invIndex - start }; +} + +/** Head+tail elide a string to maxChars (so a pasted log/diff can't blow a prompt). */ +export function elide(text: string, maxChars: number): string { + if (text.length <= maxChars) return text; + const head = text.slice(0, Math.floor(maxChars * 0.55)); + const tail = text.slice(text.length - Math.floor(maxChars * 0.45)); + return `${head}\n\n…[${text.length - maxChars} chars elided]…\n\n${tail}`; +} + +export async function windowAroundInvocation( + query: QueryFn, + sessionsTable: string, + inv: SkillInvocation, + opts: { before?: number; after?: number; maxChars?: number } = {}, +): Promise { + const { turns } = await windowedTurns(query, sessionsTable, inv, opts); + return elide(turns.map((t) => `${t.role}: ${t.text}`).join("\n\n"), opts.maxChars ?? 4000); +} diff --git a/src/skillify/skill-proposer.ts b/src/skillify/skill-proposer.ts new file mode 100644 index 00000000..1032b1bd --- /dev/null +++ b/src/skillify/skill-proposer.ts @@ -0,0 +1,91 @@ +/** + * Proposer — the engine's "reflect → edit" step (the paper's backward pass). + * Given a deficient skill's body + the concrete failures the detector confirmed, + * the optimizer diagnoses the single recurring weakness and proposes a SMALL set + * of structured edits, bounded by the edit budget ("textual learning rate"). The + * protected slow-update region is off-limits. Edits are applied locally to produce + * the candidate body — NOTHING is published here (publish is a separate, gated step). + * + * Runs on the user's agent via an injected ModelCall (default = claude sonnet), + * so the reflect logic is unit-testable with zero real LLM calls. + */ +import { applyEdits, selectEdits, SU_START, SU_END, type Edit, type EditOp } from "./skill-edits.js"; +import { claudeModel, type ModelCall } from "./claude-model.js"; + +export interface Proposal { + edits: Edit[]; // edits kept after the budget + editedBody: string; // skill body after applying them + report: string[]; // per-edit OK/SKIP log + changed: boolean; // did anything actually change? +} + +export interface ProposeConfig { + editBudget?: number; // max edits to keep (default 3) + model?: ModelCall; // injected; default = claude sonnet + priorEdits?: string[]; // meta-skill: edits already tried for this skill (don't repeat) +} + +const SYSTEM = + "You improve an engineering SKILL document that has been producing repeated, " + + "confirmed failures. Diagnose the SINGLE recurring weakness behind the failures " + + "and propose a SMALL set of structured edits that fix it. Do NOT rewrite the " + + `whole doc, and do NOT touch anything between ${SU_START} and ${SU_END}. Reply ` + + 'with ONLY a JSON array of edits, each: {"op":"append|insert_after|replace|' + + 'delete","target":"","content":""}. Prefer the smallest change that fixes the weakness.'; + +function buildUserPrompt(body: string, failures: string[], priorEdits: string[]): string { + const cases = failures.slice(0, 8).map((f, i) => `${i + 1}. ${f}`).join("\n"); + const prior = priorEdits.length + ? `\n\nALREADY TRIED for this skill on earlier runs (do NOT repeat these — propose something different, or nothing):\n${priorEdits.slice(0, 12).map((p) => `- ${p}`).join("\n")}` + : ""; + return `CURRENT SKILL:\n${body}\n\nCONFIRMED FAILURES it produced (user pushed back AND a judge confirmed the task was not accomplished):\n${cases}${prior}\n\nPropose the bounded edits. JSON array only.`; +} + +const OPS = new Set(["append", "insert_after", "replace", "delete"]); + +/** Tolerant parse of a JSON array of edits (handles ```fences / surrounding prose). */ +export function parseEdits(raw: string): Edit[] { + let s = raw.trim(); + const fence = s.match(/```(?:json)?\s*([\s\S]*?)```/); + if (fence) s = fence[1].trim(); + const a = s.indexOf("["); + const b = s.lastIndexOf("]"); + if (a === -1 || b <= a) return []; + let arr: unknown; + try { arr = JSON.parse(s.slice(a, b + 1)); } catch { return []; } + if (!Array.isArray(arr)) return []; + const out: Edit[] = []; + for (const e of arr) { + if (!e || typeof e !== "object") continue; + const op = (e as { op?: unknown }).op; + if (typeof op !== "string" || !OPS.has(op as EditOp)) continue; + const target = (e as { target?: unknown }).target; + const content = (e as { content?: unknown }).content; + out.push({ + op: op as EditOp, + ...(typeof target === "string" ? { target } : {}), + ...(typeof content === "string" ? { content } : {}), + }); + } + return out; +} + +export async function proposeSkillEdit( + skillBody: string, + failures: string[], + cfg: ProposeConfig = {}, +): Promise { + const budget = cfg.editBudget ?? 3; + const model = cfg.model ?? claudeModel("sonnet"); + let raw: string; + try { + raw = await model(SYSTEM, buildUserPrompt(skillBody, failures, cfg.priorEdits ?? [])); + } catch { + return { edits: [], editedBody: skillBody, report: ["proposer model call failed"], changed: false }; + } + const edits = selectEdits(parseEdits(raw), budget); + const { skill, report, applied } = applyEdits(skillBody, edits); + return { edits, editedBody: skill, report, changed: applied > 0 }; +} diff --git a/src/skillify/skill-publisher.ts b/src/skillify/skill-publisher.ts new file mode 100644 index 00000000..a647dcb5 --- /dev/null +++ b/src/skillify/skill-publisher.ts @@ -0,0 +1,64 @@ +/** + * Publish mechanism: write an accepted skill edit to the LIVE SKILL.md via the + * native skills dir (the only legitimate channel — never the model's prompt + * context; see PR #223). Bumps the frontmatter version (enables v1-vs-v2) and + * keeps a backup so a bad edit is one `cp` from revert. + * + * This is the mechanism only. The worker does NOT call it on an unvalidated edit + * (the offline gate isn't trustworthy — see the spike findings); it writes a + * review proposal instead, and live publish is reserved for an edit that has + * passed the real-usage A/B gate (deferred). Pure fs; testable against a tmp dir. + */ +import fs from "node:fs"; +import path from "node:path"; + +export interface PublishResult { + path: string; + oldVersion: number; + newVersion: number; + backupPath: string; +} + +/** Split a SKILL.md into its frontmatter block (incl. fences) and the body. */ +export function splitFrontmatter(md: string): { frontmatter: string; body: string } { + const m = md.match(/^(---\n[\s\S]*?\n---\n)([\s\S]*)$/); + if (m) return { frontmatter: m[1], body: m[2] }; + return { frontmatter: "", body: md }; +} + +/** Bump `version: N` in a frontmatter block (absent → treat as 1 → 2). */ +export function bumpVersion(frontmatter: string): { frontmatter: string; oldVersion: number; newVersion: number } { + const m = frontmatter.match(/^version:\s*(\d+)\s*$/m); + const oldVersion = m ? parseInt(m[1], 10) : 1; + const newVersion = oldVersion + 1; + let next: string; + if (m) { + next = frontmatter.replace(/^version:\s*\d+\s*$/m, `version: ${newVersion}`); // has a version line + } else if (/\n---\n$/.test(frontmatter)) { + next = frontmatter.replace(/\n---\n$/, `\nversion: ${newVersion}\n---\n`); // frontmatter, no version + } else { + next = `---\nversion: ${newVersion}\n---\n`; // no frontmatter → create one + } + return { frontmatter: next, oldVersion, newVersion }; +} + +/** + * Write `editedBody` to the skill's live SKILL.md, version bumped, original backed + * up to SKILL.v.bak.md. Throws if the skill dir / file isn't present. + */ +export function publishSkillEdit( + skillsRoot: string, + name: string, + author: string, + editedBody: string, +): PublishResult { + const dir = path.join(skillsRoot, `${name}--${author}`); + const file = path.join(dir, "SKILL.md"); + const existing = fs.readFileSync(file, "utf8"); + const { frontmatter } = splitFrontmatter(existing); + const { frontmatter: bumped, oldVersion, newVersion } = bumpVersion(frontmatter); + const backupPath = path.join(dir, `SKILL.v${oldVersion}.bak.md`); + fs.writeFileSync(backupPath, existing); + fs.writeFileSync(file, `${bumped}${editedBody.trimEnd()}\n`); + return { path: file, oldVersion, newVersion, backupPath }; +} diff --git a/src/skillify/skillopt-engine.ts b/src/skillify/skillopt-engine.ts new file mode 100644 index 00000000..4d1a5f8f --- /dev/null +++ b/src/skillify/skillopt-engine.ts @@ -0,0 +1,131 @@ +/** + * The weekly SkillOpt cycle, wired end to end and fully injectable: + * + * detect deficient skills → ≥5 fire gate → for each: read body, propose a + * bounded edit, write a REVIEW PROPOSAL (not a live publish). + * + * Why proposals, not auto-publish: the offline gate isn't trustworthy (spike + * finding), so we never auto-overwrite a live skill. The engine surfaces concrete, + * evidence-backed edit proposals; turning one live is gated on the real-usage A/B + * (deferred) or a human. Everything is injected (query, judge/proposer models, the + * skill reader, the proposal writer), so this orchestration is unit-tested with no + * Deeplake / LLM / fs. + */ +import fs from "node:fs"; +import path from "node:path"; +import { detectDeficientSkills, type DetectorConfig } from "./deficiency-detector.js"; +import { proposeSkillEdit, type ProposeConfig } from "./skill-proposer.js"; +import { splitFrontmatter } from "./skill-publisher.js"; +import type { QueryFn } from "./skill-invocations.js"; +import type { Edit } from "./skill-edits.js"; +import type { PulledManifest } from "./manifest.js"; + +export interface ProposalRecord { + name: string; + author: string; + invocations: number; + confirmedFailures: number; + failureRate: number; + examples: string[]; + edits: Edit[]; + report: string[]; + candidateBody: string; + createdAt: string; +} + +export interface CycleDeps { + query: QueryFn; + sessionsTable: string; + readSkillBody: (name: string, author: string) => string | null; // null when not installed locally + writeProposal: (rec: ProposalRecord) => void; + detector?: DetectorConfig; + proposer?: ProposeConfig; + fireThreshold?: number; // deficient-skill count to fire (default 5) + maxProposals?: number; // cap edits proposed per cycle (default 10) + now: string; // ISO timestamp (injected — Date is awkward in workers) + meta?: { // optimizer cross-run memory (skillopt-meta); optional + prior: (name: string, author: string) => string[]; + has: (name: string, author: string, edits: Edit[]) => boolean; + record: (name: string, author: string, edits: Edit[]) => void; + }; +} + +export interface CycleResult { + deficientCount: number; + fired: boolean; + proposals: Array<{ name: string; author: string; changed: boolean; failureRate: number }>; +} + +export async function runSkillOptCycle(deps: CycleDeps): Promise { + const fireThreshold = deps.fireThreshold ?? 5; + const { skills, deficientCount } = await detectDeficientSkills(deps.query, deps.sessionsTable, deps.detector); + + // The ≥N gate: only act on a real PATTERN of deficiency, not one or two noisy skills. + if (deficientCount < fireThreshold) { + return { deficientCount, fired: false, proposals: [] }; + } + + const targets = skills.filter((s) => s.deficient).slice(0, deps.maxProposals ?? 10); + const proposals: CycleResult["proposals"] = []; + for (const s of targets) { + const body = deps.readSkillBody(s.name, s.author); + if (!body) continue; // not installed locally → nothing to edit + const priorEdits = deps.meta?.prior(s.name, s.author) ?? []; + const p = await proposeSkillEdit(body, s.examples, { ...deps.proposer, priorEdits }); + // dedup against the meta memory — don't re-write an edit already tried for this skill. + const isDup = p.changed && (deps.meta?.has(s.name, s.author, p.edits) ?? false); + if (p.changed && !isDup) { + deps.writeProposal({ + name: s.name, author: s.author, + invocations: s.invocations, confirmedFailures: s.confirmedFailures, failureRate: s.failureRate, + examples: s.examples, edits: p.edits, report: p.report, + candidateBody: p.editedBody, createdAt: deps.now, + }); + deps.meta?.record(s.name, s.author, p.edits); + } + proposals.push({ name: s.name, author: s.author, changed: p.changed && !isDup, failureRate: s.failureRate }); + } + return { deficientCount, fired: true, proposals }; +} + +/** Default proposal writer: /--/{proposal.json,candidate.md}. */ +export function writeProposalToDisk(proposalsRoot: string, rec: ProposalRecord): string { + const dir = path.join(proposalsRoot, `${rec.name}--${rec.author}`); + fs.mkdirSync(dir, { recursive: true }); + fs.writeFileSync(path.join(dir, "candidate.md"), rec.candidateBody.trimEnd() + "\n"); + fs.writeFileSync(path.join(dir, "proposal.json"), JSON.stringify(rec, null, 2) + "\n"); + return dir; +} + +/** Read a skill's SKILL.md body (frontmatter stripped) from a skills root; null if absent. */ +export function readSkillBodyFromDisk(skillsRoot: string, name: string, author: string): string | null { + try { + const md = fs.readFileSync(path.join(skillsRoot, `${name}--${author}`, "SKILL.md"), "utf8"); + return splitFrontmatter(md).body.trim(); + } catch { + return null; + } +} + +/** + * Resolve a skill's body from its ACTUAL install location via the pull manifest, + * trying every recorded installRoot, then a fallback root. Authoritative — handles + * skills pulled with `--to project` into any cwd (invocations come from all + * projects, so the worker can't assume its own cwd), and avoids editing a + * same-named skill that happens to sit in the current cwd. + */ +export function readSkillBodyViaManifest( + name: string, + author: string, + manifest: PulledManifest, + fallbackRoot?: string, +): string | null { + const dirName = `${name}--${author}`; + const roots = manifest.entries.filter((e) => e.dirName === dirName).map((e) => e.installRoot); + if (fallbackRoot) roots.push(fallbackRoot); + for (const root of roots) { + const body = readSkillBodyFromDisk(root, name, author); + if (body) return body; + } + return null; +} diff --git a/src/skillify/skillopt-meta.ts b/src/skillify/skillopt-meta.ts new file mode 100644 index 00000000..f86473f6 --- /dev/null +++ b/src/skillify/skillopt-meta.ts @@ -0,0 +1,84 @@ +/** + * Meta-skill — the optimizer's cross-run memory (the paper's meta-skill). Records + * every edit proposed for a skill so later runs (a) don't re-propose an edit that + * was already tried, and (b) feed "what's been tried" to the proposer. When the A/B + * gate lands, the recorded `status` (proposed → applied/reverted) closes the loop so + * the optimizer learns which kinds of edits actually help a given skill. + * + * Append-only JSONL at /skillopt/meta.jsonl. Pure helpers + injected path, + * so it's unit-tested with a tmp file. + */ +import fs from "node:fs"; +import path from "node:path"; +import type { Edit } from "./skill-edits.js"; + +export type MetaStatus = "proposed" | "applied" | "reverted"; + +export interface MetaEntry { + skill: string; // "--" + ops: string[]; // short per-edit summaries (op + anchor/preview) + fingerprint: string; // stable hash of the edits, for dedup + proposedAt: string; + status: MetaStatus; +} + +export const skillRef = (name: string, author: string) => `${name}--${author}`; + +/** Short human summary of one edit. */ +function summarizeEdit(e: Edit): string { + const anchor = e.target ? ` @"${e.target.slice(0, 40)}"` : ""; + const preview = e.content ? `: ${e.content.slice(0, 60).replace(/\s+/g, " ")}` : ""; + return `${e.op}${anchor}${preview}`; +} + +/** Order-independent fingerprint of an edit set (so the same edits dedup). */ +export function fingerprintEdits(edits: Edit[]): string { + return edits + .map((e) => `${e.op}|${e.target ?? ""}|${e.content ?? ""}`) + .sort() + .join("\n"); +} + +export function loadMeta(file: string): MetaEntry[] { + let raw: string; + try { raw = fs.readFileSync(file, "utf8"); } catch { return []; } + const out: MetaEntry[] = []; + for (const line of raw.split("\n")) { + const t = line.trim(); + if (!t) continue; + try { + const e = JSON.parse(t) as MetaEntry; + if (e && typeof e.skill === "string" && typeof e.fingerprint === "string") out.push(e); + } catch { /* skip malformed line */ } + } + return out; +} + +export function appendMeta(file: string, entry: MetaEntry): void { + fs.mkdirSync(path.dirname(file), { recursive: true }); + fs.appendFileSync(file, JSON.stringify(entry) + "\n"); +} + +/** Has this exact edit set already been proposed for this skill? (avoid churn) */ +export function alreadyProposed(meta: MetaEntry[], name: string, author: string, edits: Edit[]): boolean { + const ref = skillRef(name, author); + const fp = fingerprintEdits(edits); + return meta.some((m) => m.skill === ref && m.fingerprint === fp); +} + +/** Summaries of edits previously tried for this skill — context for the proposer. */ +export function priorEditSummaries(meta: MetaEntry[], name: string, author: string): string[] { + const ref = skillRef(name, author); + return meta.filter((m) => m.skill === ref).flatMap((m) => m.ops); +} + +/** Build a meta entry for a freshly-proposed edit set. */ +export function metaEntryFor(name: string, author: string, edits: Edit[], now: string): MetaEntry { + return { + skill: skillRef(name, author), + ops: edits.map(summarizeEdit), + fingerprint: fingerprintEdits(edits), + proposedAt: now, + status: "proposed", + }; +} diff --git a/src/skillify/skillopt-trigger.ts b/src/skillify/skillopt-trigger.ts index f96a757a..cc9e7d9e 100644 --- a/src/skillify/skillopt-trigger.ts +++ b/src/skillify/skillopt-trigger.ts @@ -16,9 +16,19 @@ import { fileURLToPath } from "node:url"; import { log as _log } from "../utils/debug.js"; import { getStateDir } from "./state-dir.js"; import { tryAcquireWorkerLock, releaseWorkerLock } from "./state.js"; +import { loadConfig } from "../config.js"; const log = (m: string) => _log("skillopt-trigger", m); +/** + * Fire-gate: the worker queries Deeplake via loadConfig(), which accepts both the + * credentials file AND env creds (HIVEMIND_TOKEN/HIVEMIND_ORG_ID). Use the SAME check + * here so we neither skip env-cred users forever nor stamp on a malformed token-only file. + */ +function defaultHasCreds(): boolean { + try { return Boolean(loadConfig()?.token); } catch { return false; } +} + export const WEEK_MS = 7 * 24 * 60 * 60 * 1000; /** Cross-process lock key arbitrating the weekly fire (see runWeeklySkillOpt). */ const LOCK_KEY = "skillopt-weekly"; @@ -81,11 +91,12 @@ export interface FireDeps { tryLock?: () => boolean; // cross-process arbiter; default: real worker lock releaseLock?: () => void; // default: release the real worker lock reload?: () => SkillOptState; // fresh state re-read INSIDE the lock; default: loadState + canFire?: () => boolean; // gate before stamping; default: creds present } export interface FireResult { fired: boolean; - reason?: "disabled" | "in-worker" | "throttled" | "locked" | "spawned"; + reason?: "disabled" | "in-worker" | "throttled" | "locked" | "no-creds" | "spawned"; } /** @@ -102,6 +113,10 @@ export function runWeeklySkillOpt(deps: FireDeps = {}): FireResult { // Cheap pre-lock check: skip the lock entirely when clearly throttled. if (!shouldFire(state.lastRun, now)) return { fired: false, reason: "throttled" }; + // Don't burn the weekly throttle when logged out — stamping lastRun before the + // worker confirms creds would skip a user who logs in shortly after until next week. + if (!(deps.canFire ?? defaultHasCreds)()) return { fired: false, reason: "no-creds" }; + // Cross-process arbiter: two SessionStart hooks racing at the weekly boundary // could both pass the throttle and spawn duplicate workers (doubling user-side // cost once the worker does real LLM work). An atomic openSync(wx) worker-lock diff --git a/src/skillify/skillopt-worker.ts b/src/skillify/skillopt-worker.ts index 77d815bc..d9f1ecd7 100644 --- a/src/skillify/skillopt-worker.ts +++ b/src/skillify/skillopt-worker.ts @@ -1,32 +1,74 @@ #!/usr/bin/env node /** - * Detached weekly SkillOpt worker (spawned by skillopt-trigger). Runs the loop ONCE: - * 1. detect a deficient skill (behavioral: sessions that loaded it still scored low) - * 2. optimizer proposes a bounded edit (v2) - * 3. real-rollout gate: keep v2 only if it measurably beats v1 - * 4. silent canary publish + post-publish monitor / auto-revert + * Detached weekly SkillOpt worker (spawned by skillopt-trigger). Runs the cycle ONCE: + * 1. detect deficient skills from real invocations (anchor + judge, windowed) + * 2. ≥5 fire gate (act on a pattern, not noise) + * 3. propose a bounded edit per deficient skill and write a REVIEW PROPOSAL * - * Uses the user's own agent (claude -p / codex), so no org API key. Runs in the background; the - * user never notices. HIVEMIND_SKILLOPT_WORKER=1 is set by the trigger as a recursion guard. - * - * STATUS: scaffold. Steps 1/3/4 depend on prerequisites not yet shipped (deployed attribution data - * for detection + monitoring, and a local rollout sandbox). The loop ENGINE (rollout->optimize->gate) - * is prototyped in experiments/skillopt-spike (skillopt-loop.ts, validated both directions). This - * entry exists so the trigger has a real, spawnable target and the wiring is testable end to end. + * It does NOT auto-publish: the offline gate isn't trustworthy (spike finding), so + * live publish is reserved for the real-usage A/B (deferred). Runs on the user's own + * agent (claude -p) — no org key, cost lands on the user — in the background, weekly. + * HIVEMIND_SKILLOPT_WORKER=1 is set by the trigger as a recursion guard. */ +import os from "node:os"; +import path from "node:path"; import { log as _log } from "../utils/debug.js"; +import { loadConfig } from "../config.js"; +import { DeeplakeApi } from "../deeplake-api.js"; +import { getStateDir } from "./state-dir.js"; +import { runSkillOptCycle, writeProposalToDisk, readSkillBodyViaManifest } from "./skillopt-engine.js"; +import { loadMeta, appendMeta, priorEditSummaries, alreadyProposed, metaEntryFor } from "./skillopt-meta.js"; +import { loadManifest } from "./manifest.js"; const log = (m: string) => _log("skillopt-worker", m); async function main(): Promise { log("skillopt worker started (detached, weekly)"); - // TODO(skillopt): wire the validated loop engine here once prerequisites land: - // const skill = await detectDeficientSkill(); // needs deployed attribution + satisfaction - // if (!skill) { log("no deficient skill found"); return; } - // const v2 = await optimize(skill); // optimizer proposes a bounded edit - // const gain = await gateViaRealRollout(skill, v2); // keep only if v2 beats v1 (validated) - // if (gain > THRESHOLD) await canaryPublish(skill, v2); // silent; monitor + auto-revert - log("skillopt worker: loop body not yet enabled (prerequisites pending) — exiting cleanly"); + const config = loadConfig(); + if (!config?.token) { log("no config/credentials — exiting"); return; } + + const api = new DeeplakeApi(config.token, config.apiUrl, config.orgId, config.workspaceId, config.tableName); + const query = (sql: string) => api.query(sql) as Promise>>; + // Resolve skill bodies via the pull manifest's recorded installRoot (authoritative) + // — invocations come from ALL projects, so we can't assume the worker's own cwd. + // The global ~/.claude/skills is a fallback for skills not in the manifest. + const manifest = loadManifest(); + const skillsRoot = path.join(os.homedir(), ".claude", "skills"); + const proposalsRoot = path.join(getStateDir(), "skillopt", "proposals"); + const metaFile = path.join(getStateDir(), "skillopt", "meta.jsonl"); + const metaCache = loadMeta(metaFile); + // Lookback + thresholds are env-tunable (defaults: 30-day window, the detector's + // own min-n, and a ≥5-deficient fire gate). A positive override wins; anything + // non-numeric/≤0 falls back to the default. + const envNum = (k: string): number | undefined => { const n = Number(process.env[k]); return Number.isFinite(n) && n > 0 ? n : undefined; }; + const lookbackDays = envNum("HIVEMIND_SKILLOPT_LOOKBACK_DAYS") ?? 30; + const sinceIso = new Date(Date.now() - lookbackDays * 24 * 60 * 60 * 1000).toISOString(); + + const res = await runSkillOptCycle({ + query, + sessionsTable: config.sessionsTableName, + readSkillBody: (name, author) => readSkillBodyViaManifest(name, author, manifest, skillsRoot), + writeProposal: (rec) => writeProposalToDisk(proposalsRoot, rec), + meta: { + prior: (n, a) => priorEditSummaries(metaCache, n, a), + has: (n, a, edits) => alreadyProposed(metaCache, n, a, edits), + record: (n, a, edits) => { const e = metaEntryFor(n, a, edits, new Date().toISOString()); appendMeta(metaFile, e); metaCache.push(e); }, + }, + detector: { + sinceIso, limit: 5000, + minInvocations: envNum("HIVEMIND_SKILLOPT_MIN_INVOCATIONS"), + failureRateThreshold: envNum("HIVEMIND_SKILLOPT_FAILURE_RATE"), + }, + fireThreshold: envNum("HIVEMIND_SKILLOPT_FIRE_THRESHOLD"), + now: new Date().toISOString(), + }); + + if (!res.fired) { + log(`skillopt: ${res.deficientCount} deficient skill(s) — below the fire gate, no action`); + } else { + const changed = res.proposals.filter((p) => p.changed).length; + log(`skillopt: fired — ${res.deficientCount} deficient, ${changed} edit proposal(s) written to ${proposalsRoot}`); + } } main().catch((e) => { log(`fatal (swallowed): ${(e as Error)?.message ?? e}`); process.exit(0); }); diff --git a/src/skillify/skills-active.ts b/src/skillify/skills-active.ts deleted file mode 100644 index d9b3c46d..00000000 Binary files a/src/skillify/skills-active.ts and /dev/null differ diff --git a/src/skillify/success-judge.ts b/src/skillify/success-judge.ts new file mode 100644 index 00000000..c2d65626 --- /dev/null +++ b/src/skillify/success-judge.ts @@ -0,0 +1,68 @@ +/** + * Success-judge — reward level 2. Given a windowed transcript slice, asks the + * model the ONE question that resists sycophancy: was the user's task actually + * accomplished CORRECTLY? (Ignore whether the user seemed happy — a praised-but- + * wrong answer is a failure.) Returns success 0|1 + confidence + reason. + * + * Runs on the USER's own agent (claude -p) — cost lands on the user, so the + * default model is cheap and this is only ever called on anchor-flagged windows + * (level 1), never on every session. + * + * The model call is injected (ModelCall) so the judging logic is unit-tested with + * zero real LLM calls; the default shells out to `claude -p`, all tools denied. + * + * Conservative on failure: an unparseable/errored/empty judgment returns + * success=1 (do NOT count as a failure), so a flaky judge can never manufacture + * deficiency — it can only fail to detect (which the next run catches). + */ +import { claudeModel, type ModelCall } from "./claude-model.js"; + +export type { ModelCall }; + +export interface SuccessVerdict { + success: 0 | 1; + confidence: number; // 0..1 + reason: string; +} + +const SYSTEM = + "You are a strict engineering reviewer. Judge ONLY whether the user's task was " + + "actually accomplished CORRECTLY in this session slice. Ignore whether the user " + + "seemed happy or polite — a praised-but-wrong answer is a FAILURE. Reply with " + + 'ONLY a JSON object: {"success": 0 or 1, "confidence": 0.0-1.0, "reason": ' + + '"<=200 chars citing concrete evidence"}.'; + +function buildUserPrompt(window: string): string { + return `Session slice (USER/ASSISTANT turns around a skill invocation):\n\n${window}\n\n` + + "Did the user's task get accomplished correctly? JSON only."; +} + +function extractJson(raw: string): Record | null { + let s = raw.trim(); + const fence = s.match(/```(?:json)?\s*([\s\S]*?)```/); + if (fence) s = fence[1].trim(); + const a = s.indexOf("{"); + const b = s.lastIndexOf("}"); + if (a === -1 || b <= a) return null; + try { return JSON.parse(s.slice(a, b + 1)) as Record; } catch { return null; } +} + +/** Parse a model response into a verdict; unparseable → conservative success=1. */ +export function parseVerdict(raw: string): SuccessVerdict { + const j = extractJson(raw); + if (!j) return { success: 1, confidence: 0, reason: "unparseable judge output" }; + const fail = j.success === 0 || j.success === "0" || j.success === false; + const confidence = typeof j.confidence === "number" ? Math.max(0, Math.min(1, j.confidence)) : 0.5; + const reason = typeof j.reason === "string" ? j.reason.slice(0, 240) : ""; + return { success: fail ? 0 : 1, confidence, reason }; +} + +export async function judgeSuccess(window: string, opts: { model?: ModelCall } = {}): Promise { + if (!window.trim()) return { success: 1, confidence: 0, reason: "empty window" }; + const model = opts.model ?? claudeModel("haiku"); // cheap default; judge only runs on anchored windows + try { + return parseVerdict(await model(SYSTEM, buildUserPrompt(window))); + } catch (e: unknown) { + return { success: 1, confidence: 0, reason: `judge failed: ${(e as Error)?.message ?? String(e)}` }; + } +} diff --git a/tests/claude-code/session-start-hook.test.ts b/tests/claude-code/session-start-hook.test.ts index de5fa30a..c1639878 100644 --- a/tests/claude-code/session-start-hook.test.ts +++ b/tests/claude-code/session-start-hook.test.ts @@ -102,7 +102,6 @@ const stdoutSpy = vi.spyOn(process.stdout, "write"); async function runHook(env: Record = {}): Promise { delete process.env.HIVEMIND_WIKI_WORKER; delete process.env.HIVEMIND_CAPTURE; - delete process.env.HIVEMIND_SKILL_ATTRIBUTION; for (const [k, v] of Object.entries(env)) { if (v === undefined) delete process.env[k]; else process.env[k] = v; @@ -235,28 +234,21 @@ describe("session-start hook — placeholder branching", () => { expect(ensureTableMock).toHaveBeenCalled(); expect(ensureSessionsTableMock).toHaveBeenCalledWith("sessions"); // 1 SELECT (existing-summary check) + 1 INSERT (placeholder) - // + 1 INSERT (skills_active attribution) + 2 renderer SELECTs - // (listRules + listOpenGoals) = 5 queries. - expect(queryMock).toHaveBeenCalledTimes(5); + // + 2 renderer SELECTs (listRules + listOpenGoals) = 4 queries. + expect(queryMock).toHaveBeenCalledTimes(4); expect(queryMock.mock.calls[0][0]).toMatch(/^SELECT path FROM/); expect(queryMock.mock.calls[1][0]).toMatch(/^INSERT INTO/); - // skills_active attribution row — shape, not just count (asserts the new - // write is the attribution INSERT, so a second stray mutation can't sneak in). - expect(queryMock.mock.calls[2][0]).toMatch(/^INSERT INTO "sessions"/); - expect(queryMock.mock.calls[2][0]).toContain("skills_active"); - expect(queryMock.mock.calls[3][0]).toMatch(/^SELECT .* FROM "hivemind_rules"/); - expect(queryMock.mock.calls[4][0]).toMatch(/^SELECT .* FROM "hivemind_goals"/); + expect(queryMock.mock.calls[2][0]).toMatch(/^SELECT .* FROM "hivemind_rules"/); + expect(queryMock.mock.calls[3][0]).toMatch(/^SELECT .* FROM "hivemind_goals"/); expect(debugLogMock).toHaveBeenCalledWith("placeholder created"); }); it("skips placeholder INSERT when summary already exists (resumed session)", async () => { queryMock.mockResolvedValueOnce([{ path: "/summaries/alice/sid-1.md" }]); await runHook(); - // 1 placeholder SELECT (returns row, no INSERT) + 1 skills_active - // attribution INSERT (runs regardless of placeholder branch) + 2 renderer - // SELECTs (rules + goals) = 4 queries. - expect(queryMock).toHaveBeenCalledTimes(4); - expect(queryMock.mock.calls[1][0]).toContain("skills_active"); + // 1 placeholder SELECT (returns row, no INSERT) + 2 renderer SELECTs + // (rules + goals) = 3 queries. + expect(queryMock).toHaveBeenCalledTimes(3); }); it("non-empty rules block is appended to additionalContext", async () => { @@ -272,7 +264,6 @@ describe("session-start hook — placeholder branching", () => { }; queryMock.mockResolvedValueOnce([]); // placeholder SELECT queryMock.mockResolvedValueOnce([]); // placeholder INSERT - queryMock.mockResolvedValueOnce([]); // skills_active attribution INSERT queryMock.mockResolvedValueOnce([rule]); // renderer rules queryMock.mockResolvedValueOnce([]); // renderer goals (empty) const out = await runHook(); @@ -296,13 +287,12 @@ describe("session-start hook — placeholder branching", () => { }; queryMock.mockResolvedValueOnce([]); // placeholder SELECT queryMock.mockResolvedValueOnce([]); // placeholder INSERT - queryMock.mockResolvedValueOnce([]); // skills_active attribution INSERT queryMock.mockResolvedValueOnce([rule]); // renderer rules queryMock.mockResolvedValueOnce([]); // renderer goals (empty) const out = await runHook(); const parsed = JSON.parse(out!); expect(parsed.hookSpecificOutput.additionalContext).toContain("no DROP TABLE on prod"); - expect(queryMock).toHaveBeenCalledTimes(5); + expect(queryMock).toHaveBeenCalledTimes(4); }); it("skips the renderer SELECTs when the trusted table list omits rules + goals", async () => { @@ -310,12 +300,11 @@ describe("session-start hook — placeholder branching", () => { // SELECT. Only the placeholder SELECT + INSERT run. knownTablesMock.mockResolvedValue([]); await runHook(); - // placeholder SELECT + placeholder INSERT + skills_active attribution - // INSERT = 3 (renderer fires no SELECT when no tables are trusted). - expect(queryMock).toHaveBeenCalledTimes(3); + // placeholder SELECT + placeholder INSERT = 2 (renderer fires no SELECT + // when no tables are trusted). + expect(queryMock).toHaveBeenCalledTimes(2); expect(queryMock.mock.calls[0][0]).toMatch(/^SELECT path FROM/); expect(queryMock.mock.calls[1][0]).toMatch(/^INSERT INTO/); - expect(queryMock.mock.calls[2][0]).toContain("skills_active"); }); it("HIVEMIND_CAPTURE=false: no placeholder, no DDL (ensure), but renderer still runs", async () => { @@ -336,29 +325,6 @@ describe("session-start hook — placeholder branching", () => { ); }); - it("HIVEMIND_SKILL_ATTRIBUTION=0: skips the skills_active write entirely", async () => { - await runHook({ HIVEMIND_SKILL_ATTRIBUTION: "0" }); - // placeholder SELECT + INSERT + 2 renderer SELECTs = 4 (NO attribution row). - expect(queryMock).toHaveBeenCalledTimes(4); - // negative assertion: the attribution INSERT must not be present at all. - for (const call of queryMock.mock.calls) { - expect(call[0]).not.toContain("skills_active"); - } - }); - - it("swallows a failed skills_active attribution write (never breaks SessionStart)", async () => { - // Content-based (not position-based) so it's robust to query ordering: the - // attribution INSERT is the only query carrying the skills_active marker. - queryMock.mockImplementation((sql: string) => - sql.includes("skills_active") ? Promise.reject(new Error("attr boom")) : Promise.resolve([]), - ); - const out = await runHook(); - expect(out).toBeTruthy(); // hook still completes and emits context - expect(debugLogMock).toHaveBeenCalledWith( - expect.stringContaining("skills_active attribution failed (swallowed): attr boom"), - ); - }); - it("logs the SkillOpt fired branch when the weekly trigger spawns a worker", async () => { runWeeklySkillOptMock.mockReturnValue({ fired: true, reason: "spawned" }); await runHook(); diff --git a/tests/shared/deficiency-detector.test.ts b/tests/shared/deficiency-detector.test.ts new file mode 100644 index 00000000..5c9ee877 --- /dev/null +++ b/tests/shared/deficiency-detector.test.ts @@ -0,0 +1,90 @@ +import { describe, it, expect, vi } from "vitest"; +import { detectDeficientSkills } from "../../src/skillify/deficiency-detector.js"; + +const TABLE = "sessions"; + +const invRow = (skill: string, sid: string) => ({ + message: { type: "tool_call", tool_name: "Skill", tool_input: JSON.stringify({ skill }), session_id: sid, timestamp: sid }, + last_update_date: sid, +}); +const transcript = (skill: string, sid: string, pushback: boolean) => [ + { message: { type: "user_message", content: "do it" } }, + { message: { type: "tool_call", tool_name: "Skill", tool_input: JSON.stringify({ skill }), timestamp: sid } }, + { message: { type: "assistant_message", content: "done" } }, + { message: { type: "user_message", content: pushback ? "no that's wrong, it mocks the client" : "thanks, perfect" } }, +]; + +function world() { + const invs: Array> = []; + const transcripts = new Map>>(); + const add = (skill: string, sid: string, pushback: boolean) => { + invs.push(invRow(skill, sid)); + transcripts.set(sid, transcript(skill, sid, pushback)); + }; + for (let i = 0; i < 10; i++) add("bad--auth", `bad${i}`, i < 5); // 5/10 pushback → deficient + for (let i = 0; i < 10; i++) add("good--auth", `good${i}`, false); // 0 pushback → healthy + for (let i = 0; i < 3; i++) add("sparse--auth", `sparse${i}`, true); // all fail but too few (min-n) + return { invs, transcripts }; +} + +describe("detectDeficientSkills", () => { + it("flags only skills with enough invocations AND a high confirmed-failure rate", async () => { + const { invs, transcripts } = world(); + const judge = vi.fn(async (_s: string, _u: string) => '{"success":0,"confidence":0.9,"reason":"mocks the client"}'); + const query = vi.fn(async (sql: string) => { + if (sql.includes('"Skill"') && sql.includes("ORDER BY last_update_date")) return invs; // the invocation list + const m = sql.match(/\/sessions\/%([^%]+)%/); // a window query + return m ? (transcripts.get(m[1]) ?? []) : []; + }); + + const { skills, deficientCount } = await detectDeficientSkills(query, TABLE, { judge }); + const bad = skills.find((s) => s.name === "bad")!; + const good = skills.find((s) => s.name === "good")!; + const sparse = skills.find((s) => s.name === "sparse")!; + + expect(bad).toMatchObject({ invocations: 10, anchored: 5, confirmedFailures: 5, deficient: true }); + expect(bad.failureRate).toBeCloseTo(0.5); + expect(good).toMatchObject({ invocations: 10, anchored: 0, confirmedFailures: 0, deficient: false }); + expect(sparse).toMatchObject({ invocations: 3, confirmedFailures: 3, deficient: false }); // min-n blocks it + expect(deficientCount).toBe(1); + + // token discipline: judge runs ONLY on anchored windows (5 bad + 3 sparse = 8), never the 10 good + expect(judge).toHaveBeenCalledTimes(8); + }); + + it("caps the judged window at maxChars (a pasted log can't blow the judge call)", async () => { + const huge = "L".repeat(5000); + const skill = "bigskill--x", sid = "S1"; + const transcripts = new Map>>([[sid, [ + { message: { type: "user_message", content: "do it" } }, + { message: { type: "tool_call", tool_name: "Skill", tool_input: JSON.stringify({ skill }), timestamp: sid } }, + { message: { type: "assistant_message", content: huge } }, // pasted log + { message: { type: "user_message", content: "no that's wrong" } }, + ]]]); + let judgedLen = 0; + const judge = vi.fn(async (_s: string, user: string) => { judgedLen = user.length; return '{"success":0,"confidence":0.9,"reason":"x"}'; }); + const query = vi.fn(async (sql: string) => { + if (sql.includes('"Skill"') && sql.includes("ORDER BY last_update_date")) return [invRow(skill, sid)]; + const m = sql.match(/\/sessions\/%([^%]+)%/); + return m ? (transcripts.get(m[1]) ?? []) : []; + }); + await detectDeficientSkills(query, TABLE, { judge, minInvocations: 1, window: { maxChars: 300 } }); + expect(judgedLen).toBeGreaterThan(0); // judge was called (anchored) + expect(judgedLen).toBeLessThan(800); // capped — not the ~5000-char paste + }); + + it("respects a custom threshold + min-n", async () => { + const { invs, transcripts } = world(); + const judge = vi.fn(async (_s: string, _u: string) => '{"success":0,"confidence":0.9,"reason":"x"}'); + const query = vi.fn(async (sql: string) => { + if (sql.includes('"Skill"') && sql.includes("ORDER BY last_update_date")) return invs; + const m = sql.match(/\/sessions\/%([^%]+)%/); + return m ? (transcripts.get(m[1]) ?? []) : []; + }); + // minInvocations 3, threshold 0.9 → only "sparse" (rate 1.0, 3 inv) qualifies; "bad" (0.5) doesn't + const { deficientCount, skills } = await detectDeficientSkills(query, TABLE, { judge, minInvocations: 3, failureRateThreshold: 0.9 }); + expect(skills.find((s) => s.name === "sparse")!.deficient).toBe(true); + expect(skills.find((s) => s.name === "bad")!.deficient).toBe(false); + expect(deficientCount).toBe(1); + }); +}); diff --git a/tests/shared/session-anchor.test.ts b/tests/shared/session-anchor.test.ts new file mode 100644 index 00000000..942462c7 --- /dev/null +++ b/tests/shared/session-anchor.test.ts @@ -0,0 +1,54 @@ +import { describe, it, expect } from "vitest"; +import { detectAnchor } from "../../src/skillify/session-anchor.js"; +import type { Turn } from "../../src/skillify/skill-invocations.js"; + +const u = (text: string): Turn => ({ role: "USER", text }); +const a = (text: string): Turn => ({ role: "ASSISTANT", text }); + +describe("detectAnchor", () => { + it("fires on user pushback right after an assistant turn", () => { + const r = detectAnchor([u("add a smoke test"), a("done, here it is"), u("no that's wrong, it mocks the client")]); + expect(r.anchored).toBe(true); + expect(r.kind).toBe("correction"); + expect(r.evidence).toContain("wrong"); + }); + + it("does NOT fire on the opening request (no preceding assistant turn)", () => { + const r = detectAnchor([u("this won't work without a flush — add a smoke test")]); + expect(r.anchored).toBe(false); + }); + + it("does NOT fire on a user turn that follows another user turn", () => { + const r = detectAnchor([u("first"), u("that didn't work")]); // no assistant in between + expect(r.anchored).toBe(false); + }); + + it("suppresses clear benign negatives (no problem / works now / thanks)", () => { + expect(detectAnchor([a("fixed it"), u("no problem, thanks!")]).anchored).toBe(false); + expect(detectAnchor([a("try this"), u("works now, perfect")]).anchored).toBe(false); + }); + + it("catches several real correction phrasings", () => { + for (const p of ["that doesn't work", "still failing", "that's incorrect", "try again", "nope", "you broke the build"]) { + expect(detectAnchor([a("here"), u(p)]).anchored, p).toBe(true); + } + }); + + it("fires on polite-but-failing corrections (strong pushback overrides benign words)", () => { + expect(detectAnchor([a("here"), u("thanks, but this is still failing")]).anchored).toBe(true); + expect(detectAnchor([a("done"), u("perfect start, but that's still wrong")]).anchored).toBe(true); + }); + + it("ignores PRE-invocation pushback (fromIndex) — no misattribution to a repair-attempt skill", () => { + // turns: [a, USER pushback (pre-invocation), a (skill output), USER ok] — pivot=2 + const turns = [a("attempt 1"), u("no that's wrong"), a("retried with the skill"), u("looks good")]; + expect(detectAnchor(turns, 2).anchored).toBe(false); // the pre-invocation correction is not scanned + // a genuine POST-invocation pushback still fires + expect(detectAnchor([a("attempt 1"), u("no wrong"), a("fixed"), u("still failing")], 2).anchored).toBe(true); + }); + + it("returns none when the user is satisfied / silent", () => { + expect(detectAnchor([u("do X"), a("done")]).anchored).toBe(false); + expect(detectAnchor([]).anchored).toBe(false); + }); +}); diff --git a/tests/shared/skill-edit-gate.test.ts b/tests/shared/skill-edit-gate.test.ts new file mode 100644 index 00000000..4cbb760e --- /dev/null +++ b/tests/shared/skill-edit-gate.test.ts @@ -0,0 +1,58 @@ +import { describe, it, expect, vi } from "vitest"; +import { gateEditOutcome, gateEdit, type WindowStats } from "../../src/skillify/skill-edit-gate.js"; + +const stats = (invocations: number, failureRate: number): WindowStats => + ({ invocations, anchored: Math.round(invocations * failureRate), confirmed: Math.round(invocations * failureRate), failureRate }); + +describe("gateEditOutcome", () => { + it("KEEP when the failure rate dropped by >= margin", () => { + expect(gateEditOutcome(stats(10, 0.6), stats(10, 0.1)).decision).toBe("keep"); + }); + it("REVERT when it got measurably worse", () => { + expect(gateEditOutcome(stats(10, 0.1), stats(10, 0.5)).decision).toBe("revert"); + }); + it("INCONCLUSIVE when there's too little post-publish use", () => { + expect(gateEditOutcome(stats(10, 0.6), stats(3, 0.0)).decision).toBe("inconclusive"); + }); + it("INCONCLUSIVE when the change is within the margin (noise)", () => { + expect(gateEditOutcome(stats(10, 0.30), stats(10, 0.25)).decision).toBe("inconclusive"); + }); +}); + +const invRow = (skill: string, sid: string) => ({ + message: { type: "tool_call", tool_name: "Skill", tool_input: JSON.stringify({ skill }), session_id: sid, timestamp: sid }, + last_update_date: sid, +}); +const transcript = (skill: string, sid: string, pushback: boolean) => [ + { message: { type: "user_message", content: "do it" } }, + { message: { type: "tool_call", tool_name: "Skill", tool_input: JSON.stringify({ skill }), timestamp: sid } }, + { message: { type: "assistant_message", content: "done (mocked)" } }, + { message: { type: "user_message", content: pushback ? "no that's wrong, it mocks the client" : "looks good thanks" } }, +]; + +describe("gateEdit (longitudinal before/after)", () => { + it("keeps an edit whose failure rate dropped after publish", async () => { + const PUB = "2026-06-05T00:00:00.000Z"; + const transcripts = new Map>>(); + const beforeInvs: Array> = []; + const afterInvs: Array> = []; + for (let i = 0; i < 8; i++) { + const b = `bef${i}`, a = `aft${i}`; + beforeInvs.push(invRow("x--a", b)); transcripts.set(b, transcript("x--a", b, true)); // before: all pushback + afterInvs.push(invRow("x--a", a)); transcripts.set(a, transcript("x--a", a, false)); // after: none + } + const judge = vi.fn(async (_s: string, _u: string) => '{"success":0,"confidence":0.9,"reason":"mocks"}'); + const query = vi.fn(async (sql: string) => { + if (sql.includes('"Skill"') && sql.includes("ORDER BY last_update_date")) { + return sql.includes(`< '${PUB}'`) ? beforeInvs : afterInvs; // before window has the untilIso bound + } + const m = sql.match(/\/sessions\/%([^%]+)%/); + return m ? (transcripts.get(m[1]) ?? []) : []; + }); + + const res = await gateEdit(query, "sessions", "x", "a", PUB, { windowDays: 14, nowIso: "2026-06-12T00:00:00.000Z", judge, minAfter: 5 }); + expect(res.before.failureRate).toBeCloseTo(1.0); + expect(res.after.failureRate).toBeCloseTo(0.0); + expect(res.decision).toBe("keep"); + }); +}); diff --git a/tests/shared/skill-edits.test.ts b/tests/shared/skill-edits.test.ts new file mode 100644 index 00000000..9143270c --- /dev/null +++ b/tests/shared/skill-edits.test.ts @@ -0,0 +1,64 @@ +import { describe, it, expect } from "vitest"; +import { applyEdits, selectEdits, SU_START, SU_END } from "../../src/skillify/skill-edits.js"; + +describe("applyEdits", () => { + const base = "## Rules\n1. mock the client\n2. skip flush"; + + it("append adds content at the end", () => { + const r = applyEdits(base, [{ op: "append", content: "3. verify via the API" }]); + expect(r.skill).toContain("3. verify via the API"); + expect(r.applied).toBe(1); + }); + + it("insert_after inserts on the line after the target", () => { + const r = applyEdits(base, [{ op: "insert_after", target: "1. mock the client", content: "(NEVER mock — it hides failures)" }]); + expect(r.skill).toMatch(/1\. mock the client\n\(NEVER mock — it hides failures\)\n2\. skip flush/); + }); + + it("replace swaps the target text", () => { + const r = applyEdits(base, [{ op: "replace", target: "skip flush", content: "ALWAYS flush" }]); + expect(r.skill).toContain("2. ALWAYS flush"); + expect(r.skill).not.toContain("skip flush"); + }); + + it("delete removes the target text", () => { + const r = applyEdits(base, [{ op: "delete", target: "\n2. skip flush" }]); + expect(r.skill).toBe("## Rules\n1. mock the client"); + }); + + it("skips edits whose target isn't found (and counts only applied)", () => { + const r = applyEdits(base, [{ op: "replace", target: "nonexistent", content: "x" }, { op: "append", content: "added" }]); + expect(r.applied).toBe(1); + expect(r.report.some((l) => l.includes("SKIP replace: target not found"))).toBe(true); + }); + + it("protects the slow-update region: skips edits targeting it, appends ABOVE it", () => { + const doc = `## Rules\n1. a\n\n${SU_START}\nLongitudinal: prefer X over Y.\n${SU_END}`; + const r = applyEdits(doc, [ + { op: "delete", target: "prefer X over Y" }, // targets protected → skipped + { op: "append", content: "2. b" }, // lands above the region + ]); + expect(r.skill).toContain("prefer X over Y"); // protected content untouched + expect(r.report.some((l) => l.includes("protected slow-update region"))).toBe(true); + // appended content sits before the protected block + expect(r.skill.indexOf("2. b")).toBeLessThan(r.skill.indexOf(SU_START)); + }); + + it("rejects a target that SPANS INTO the protected region (not just one starting inside)", () => { + const doc = `## Rules\n1. a\n${SU_START}\nLongitudinal guidance.\n${SU_END}`; + // target begins before SLOW_UPDATE_START but extends into the protected block + const r = applyEdits(doc, [{ op: "delete", target: `1. a\n${SU_START}\nLongitudinal` }]); + expect(r.applied).toBe(0); + expect(r.skill).toContain("Longitudinal guidance."); + expect(r.report.some((l) => l.includes("protected slow-update region"))).toBe(true); + }); +}); + +describe("selectEdits (edit budget)", () => { + const edits = [1, 2, 3, 4].map((i) => ({ op: "append" as const, content: `${i}` })); + it("keeps at most `budget` edits", () => { + expect(selectEdits(edits, 2)).toHaveLength(2); + expect(selectEdits(edits, 0)).toHaveLength(0); + expect(selectEdits(edits, 99)).toHaveLength(4); + }); +}); diff --git a/tests/shared/skill-invocations.test.ts b/tests/shared/skill-invocations.test.ts new file mode 100644 index 00000000..8e8e1bfd --- /dev/null +++ b/tests/shared/skill-invocations.test.ts @@ -0,0 +1,122 @@ +import { describe, it, expect, vi } from "vitest"; +import { + invokedSkillRef, + splitOrgSkill, + listSkillInvocations, + windowAroundInvocation, + type SkillInvocation, +} from "../../src/skillify/skill-invocations.js"; + +const TABLE = "sessions"; +function mockQuery(rows: Array>) { + const calls: string[] = []; + return { fn: vi.fn(async (sql: string) => { calls.push(sql); return rows; }), calls }; +} +const toolCall = (skill: string, sessionId = "S1", ts = "t", asString = false) => { + const msg = { type: "tool_call", tool_name: "Skill", tool_input: JSON.stringify({ skill }), session_id: sessionId, timestamp: ts }; + return { message: asString ? JSON.stringify(msg) : msg, last_update_date: ts }; +}; + +describe("invokedSkillRef", () => { + it("returns the skill ref for a Skill tool_call (object or stringified input)", () => { + expect(invokedSkillRef({ type: "tool_call", tool_name: "Skill", tool_input: JSON.stringify({ skill: "a--b" }) })).toBe("a--b"); + expect(invokedSkillRef({ type: "tool_call", tool_name: "Skill", tool_input: { skill: "a--b" } as unknown })).toBe("a--b"); + }); + it("returns null for non-Skill tools and non-tool_call messages", () => { + expect(invokedSkillRef({ type: "tool_call", tool_name: "Bash", tool_input: "{}" })).toBeNull(); + expect(invokedSkillRef({ type: "assistant_message", content: "use the Skill tool" })).toBeNull(); + expect(invokedSkillRef({ type: "tool_call", tool_name: "Skill", tool_input: "not json" })).toBeNull(); + }); +}); + +describe("splitOrgSkill", () => { + it("splits --, last -- wins", () => { + expect(splitOrgSkill("posthog-smoke--kamo.aghbalyan")).toEqual({ name: "posthog-smoke", author: "kamo.aghbalyan" }); + expect(splitOrgSkill("some-skill--first-last")).toEqual({ name: "some-skill", author: "first-last" }); + }); + it("rejects plugin-namespaced, bare, and malformed refs", () => { + expect(splitOrgSkill("hivemind:hivemind-memory")).toBeNull(); // plugin + expect(splitOrgSkill("update-config")).toBeNull(); // bare + expect(splitOrgSkill("baz--")).toBeNull(); // empty author + }); + it("rejects refs with path separators / traversal (no path escape)", () => { + expect(splitOrgSkill("../../etc--x")).toBeNull(); + expect(splitOrgSkill("ok--..%2f")).toBeNull(); // contains .. + expect(splitOrgSkill("a/b--c")).toBeNull(); // separator + expect(splitOrgSkill("a--b/c")).toBeNull(); // separator in author + }); +}); + +describe("listSkillInvocations", () => { + it("coarse-prefilters on \"Skill\" then keeps only org-skill tool_calls", async () => { + const { fn, calls } = mockQuery([ + toolCall("posthog-smoke--kamo"), // org → kept + toolCall("hivemind:hivemind-memory"), // plugin → dropped + toolCall("update-config"), // bare → dropped + { message: { type: "assistant_message", content: "mentions Skill" }, last_update_date: "t" }, // prose → dropped + toolCall("pg-debug--sasun", "S2", "t2", true), // org, stringified message → kept + ]); + const got = await listSkillInvocations(fn, TABLE, { sinceIso: "2026-06-01", limit: 100 }); + expect(calls[0]).toContain(`CAST(message AS TEXT) LIKE '%"Skill"%'`); + expect(calls[0]).toContain("last_update_date >= '2026-06-01'"); + expect(calls[0]).toContain("LIMIT 100"); + expect(got).toEqual([ + { sessionId: "S1", name: "posthog-smoke", author: "kamo", ts: "t" }, + { sessionId: "S2", name: "pg-debug", author: "sasun", ts: "t2" }, + ]); + }); +}); + +describe("windowAroundInvocation", () => { + const inv: SkillInvocation = { sessionId: "S1", name: "posthog-smoke", author: "kamo", ts: "t5" }; + // turns: u1, a1, [skill invoked here], u2(pushback), a2 → window before=1/after=2 ⇒ a1..a2 + const rows = [ + { message: { type: "user_message", content: "first" } }, + { message: { type: "assistant_message", content: "ack" } }, + { message: { type: "tool_call", tool_name: "Skill", tool_input: JSON.stringify({ skill: "posthog-smoke--kamo" }), timestamp: "t5" } }, + { message: { type: "tool_call", tool_name: "Bash", tool_input: "{}" } }, // non-skill tool → ignored + { message: { type: "user_message", content: "no that's wrong" } }, + { message: { type: "assistant_message", content: "fixing" } }, + ]; + + it("windows `before` turns before and `after` after the invocation", async () => { + const { fn, calls } = mockQuery(rows); + const out = await windowAroundInvocation(fn, TABLE, inv, { before: 1, after: 2 }); + expect(calls[0]).toContain("path LIKE '/sessions/%S1%'"); + // invIndex = 2 (two turns before the skill tool_call). before 1 → from turn 1; after 2 → turns 2,3. + expect(out).toBe("ASSISTANT: ack\n\nUSER: no that's wrong\n\nASSISTANT: fixing"); + }); + + it("falls back to session end when the invocation can't be located", async () => { + const { fn } = mockQuery([ + { message: { type: "user_message", content: "hi" } }, + { message: { type: "assistant_message", content: "bye" } }, + ]); + const out = await windowAroundInvocation(fn, TABLE, inv, { before: 5, after: 5 }); + expect(out).toBe("USER: hi\n\nASSISTANT: bye"); // whole (short) transcript + }); + + it("elides a window longer than maxChars", async () => { + const big = "x".repeat(400); + const { fn } = mockQuery([ + { message: { type: "user_message", content: big } }, + { message: { type: "assistant_message", content: big } }, + ]); + const out = await windowAroundInvocation(fn, TABLE, inv, { before: 5, after: 5, maxChars: 150 }); + expect(out).toContain("chars elided"); + expect(out.length).toBeLessThan(300); + }); + + it("drops rows from a different session + escapes LIKE wildcards (exact match)", async () => { + const { fn, calls } = mockQuery([ + { message: { type: "user_message", content: "first", session_id: "S1" } }, + { message: { type: "tool_call", tool_name: "Skill", tool_input: JSON.stringify({ skill: "posthog-smoke--kamo" }), session_id: "S1", timestamp: "t5" } }, + { message: { type: "assistant_message", content: "did X", session_id: "S1" } }, + { message: { type: "user_message", content: "LEAK from other session", session_id: "S2" } }, // collision → dropped + ]); + const out = await windowAroundInvocation(fn, TABLE, inv, { before: 5, after: 5 }); + expect(calls[0]).toContain("ESCAPE '\\'"); + expect(out).toContain("did X"); + expect(out).not.toContain("LEAK from other session"); + }); +}); diff --git a/tests/shared/skill-proposer.test.ts b/tests/shared/skill-proposer.test.ts new file mode 100644 index 00000000..cb1a1c00 --- /dev/null +++ b/tests/shared/skill-proposer.test.ts @@ -0,0 +1,51 @@ +import { describe, it, expect, vi } from "vitest"; +import { parseEdits, proposeSkillEdit } from "../../src/skillify/skill-proposer.js"; + +describe("parseEdits", () => { + it("parses a JSON array, tolerating fences/prose, dropping invalid ops + non-objects", () => { + const raw = "Sure:\n```json\n[" + + '{"op":"replace","target":"mock the client","content":"NEVER mock"},' + + '{"op":"bogus","target":"x"},' + // invalid op → dropped + '"nope",' + // non-object → dropped + '{"op":"append","content":"verify via API"}' + + "]\n```"; + const edits = parseEdits(raw); + expect(edits).toEqual([ + { op: "replace", target: "mock the client", content: "NEVER mock" }, + { op: "append", content: "verify via API" }, + ]); + }); + it("returns [] when there's no array", () => { + expect(parseEdits("the model refused")).toEqual([]); + }); +}); + +describe("proposeSkillEdit", () => { + const body = "## Rules\n1. mock the client\n2. skip flush"; + const failures = ["mocked the client so the test passes even when the event never sends"]; + + it("applies the proposed edits to produce a candidate body", async () => { + const model = vi.fn(async (_s: string, _u: string) => + '[{"op":"replace","target":"mock the client","content":"NEVER mock — assert on the real client"}]'); + const p = await proposeSkillEdit(body, failures, { model }); + expect(p.changed).toBe(true); + expect(p.editedBody).toContain("NEVER mock — assert on the real client"); + // the optimizer is told to diagnose the recurring weakness + emit JSON edits + expect(model.mock.calls[0][0]).toMatch(/recurring weakness/i); + expect(model.mock.calls[0][1]).toContain("CONFIRMED FAILURES"); + }); + + it("enforces the edit budget", async () => { + const model = vi.fn(async (_s: string, _u: string) => + '[{"op":"append","content":"a"},{"op":"append","content":"b"},{"op":"append","content":"c"}]'); + const p = await proposeSkillEdit(body, failures, { model, editBudget: 1 }); + expect(p.edits).toHaveLength(1); + expect(p.editedBody).toContain("\na"); + expect(p.editedBody).not.toContain("\nb"); + }); + + it("is a no-op when the model fails or proposes nothing", async () => { + expect((await proposeSkillEdit(body, failures, { model: vi.fn(async () => { throw new Error("x"); }) })).changed).toBe(false); + expect((await proposeSkillEdit(body, failures, { model: vi.fn(async (_s: string, _u: string) => "no edits") })).changed).toBe(false); + }); +}); diff --git a/tests/shared/skill-publisher.test.ts b/tests/shared/skill-publisher.test.ts new file mode 100644 index 00000000..62916504 --- /dev/null +++ b/tests/shared/skill-publisher.test.ts @@ -0,0 +1,62 @@ +import { describe, it, expect, beforeEach, afterEach } from "vitest"; +import fs from "node:fs"; +import os from "node:os"; +import path from "node:path"; +import { splitFrontmatter, bumpVersion, publishSkillEdit } from "../../src/skillify/skill-publisher.js"; + +describe("splitFrontmatter", () => { + it("splits frontmatter from body", () => { + const { frontmatter, body } = splitFrontmatter("---\nname: x\nversion: 3\n---\n## Body\nhi"); + expect(frontmatter).toBe("---\nname: x\nversion: 3\n---\n"); + expect(body).toBe("## Body\nhi"); + }); + it("handles a doc with no frontmatter", () => { + expect(splitFrontmatter("just body")).toEqual({ frontmatter: "", body: "just body" }); + }); +}); + +describe("bumpVersion", () => { + it("increments an existing version", () => { + const r = bumpVersion("---\nname: x\nversion: 4\n---\n"); + expect(r.oldVersion).toBe(4); + expect(r.newVersion).toBe(5); + expect(r.frontmatter).toContain("version: 5"); + }); + it("inserts version 2 when absent (original treated as 1)", () => { + const r = bumpVersion("---\nname: x\n---\n"); + expect(r).toMatchObject({ oldVersion: 1, newVersion: 2 }); + expect(r.frontmatter).toMatch(/version: 2\n---\n$/); + }); + it("creates a frontmatter block when the doc has none", () => { + const r = bumpVersion(""); + expect(r).toMatchObject({ oldVersion: 1, newVersion: 2 }); + expect(r.frontmatter).toBe("---\nversion: 2\n---\n"); + }); +}); + +describe("publishSkillEdit", () => { + let root: string; + beforeEach(() => { root = fs.mkdtempSync(path.join(os.tmpdir(), "pub-")); }); + afterEach(() => { fs.rmSync(root, { recursive: true, force: true }); }); + + it("writes the bumped body and backs up the original", () => { + const dir = path.join(root, "posthog--kamo"); + fs.mkdirSync(dir); + fs.writeFileSync(path.join(dir, "SKILL.md"), "---\nname: posthog\nauthor: kamo\nversion: 2\n---\n## Rules\n1. mock the client\n"); + + const res = publishSkillEdit(root, "posthog", "kamo", "## Rules\n1. NEVER mock — assert on the real client"); + + expect(res).toMatchObject({ oldVersion: 2, newVersion: 3 }); + const written = fs.readFileSync(res.path, "utf8"); + expect(written).toContain("version: 3"); + expect(written).toContain("NEVER mock — assert on the real client"); + expect(written).not.toContain("1. mock the client\n"); + // backup preserves the prior version verbatim + expect(fs.readFileSync(res.backupPath, "utf8")).toContain("version: 2"); + expect(fs.readFileSync(res.backupPath, "utf8")).toContain("1. mock the client"); + }); + + it("throws when the skill isn't installed (caller decides what to do)", () => { + expect(() => publishSkillEdit(root, "missing", "x", "body")).toThrow(); + }); +}); diff --git a/tests/shared/skillopt-engine.test.ts b/tests/shared/skillopt-engine.test.ts new file mode 100644 index 00000000..b06fda27 --- /dev/null +++ b/tests/shared/skillopt-engine.test.ts @@ -0,0 +1,128 @@ +import { describe, it, expect, vi } from "vitest"; +import fs from "node:fs"; +import os from "node:os"; +import path from "node:path"; +import { runSkillOptCycle, readSkillBodyViaManifest, type ProposalRecord } from "../../src/skillify/skillopt-engine.js"; +import type { PulledManifest } from "../../src/skillify/manifest.js"; + +const invRow = (skill: string, sid: string) => ({ + message: { type: "tool_call", tool_name: "Skill", tool_input: JSON.stringify({ skill }), session_id: sid, timestamp: sid }, + last_update_date: sid, +}); +const transcript = (skill: string, sid: string, pushback: boolean) => [ + { message: { type: "user_message", content: "do it" } }, + { message: { type: "tool_call", tool_name: "Skill", tool_input: JSON.stringify({ skill }), timestamp: sid } }, + { message: { type: "assistant_message", content: "done" } }, + { message: { type: "user_message", content: pushback ? "no that's wrong, it mocks the client" : "thanks, perfect" } }, +]; + +/** nBad skills, each 10 invocations with 5 pushback → each deficient. */ +function world(nBad: number) { + const invs: Array> = []; + const transcripts = new Map>>(); + for (let b = 0; b < nBad; b++) { + for (let i = 0; i < 10; i++) { + const sid = `b${b}s${i}`; + invs.push(invRow(`bad${b}--auth`, sid)); + transcripts.set(sid, transcript(`bad${b}--auth`, sid, i < 5)); + } + } + const query = vi.fn(async (sql: string) => { + if (sql.includes('"Skill"') && sql.includes("ORDER BY last_update_date")) return invs; + const m = sql.match(/\/sessions\/%([^%]+)%/); + return m ? (transcripts.get(m[1]) ?? []) : []; + }); + return query; +} + +const judge = () => vi.fn(async (_s: string, _u: string) => '{"success":0,"confidence":0.9,"reason":"mocks the client"}'); +const proposerModel = () => vi.fn(async (_s: string, _u: string) => '[{"op":"append","content":"Always verify via the PostHog API."}]'); + +describe("runSkillOptCycle", () => { + it("fires when >=5 skills are deficient and writes a proposal per editable skill", async () => { + const written: ProposalRecord[] = []; + const res = await runSkillOptCycle({ + query: world(6), sessionsTable: "sessions", now: "2026-06-05T00:00:00Z", + readSkillBody: () => "## Rules\n1. mock the client", + writeProposal: (r) => written.push(r), + detector: { judge: judge() }, proposer: { model: proposerModel() }, + }); + expect(res.fired).toBe(true); + expect(res.deficientCount).toBe(6); + expect(written).toHaveLength(6); + expect(written[0].candidateBody).toContain("Always verify via the PostHog API."); + expect(written[0]).toMatchObject({ invocations: 10, confirmedFailures: 5 }); + }); + + it("does NOT fire below the threshold (no proposals, even though detection ran)", async () => { + const writeProposal = vi.fn(); + const res = await runSkillOptCycle({ + query: world(4), sessionsTable: "sessions", now: "t", + readSkillBody: () => "## Rules", writeProposal, + detector: { judge: judge() }, proposer: { model: proposerModel() }, + }); + expect(res).toMatchObject({ fired: false, deficientCount: 4 }); + expect(res.proposals).toHaveLength(0); + expect(writeProposal).not.toHaveBeenCalled(); + }); + + it("skips a deficient skill that isn't installed locally (no body to edit)", async () => { + const written: ProposalRecord[] = []; + const res = await runSkillOptCycle({ + query: world(6), sessionsTable: "sessions", now: "t", + readSkillBody: (name) => (name === "bad0" ? null : "## Rules\n1. mock the client"), + writeProposal: (r) => written.push(r), + detector: { judge: judge() }, proposer: { model: proposerModel() }, + }); + expect(res.fired).toBe(true); + expect(written).toHaveLength(5); // bad0 skipped + expect(written.some((w) => w.name === "bad0")).toBe(false); + }); + + it("dedups against meta memory: a skill whose edit was already proposed isn't re-written", async () => { + const written: ProposalRecord[] = []; + const recorded: string[] = []; + const res = await runSkillOptCycle({ + query: world(6), sessionsTable: "sessions", now: "t", + readSkillBody: () => "## Rules\n1. mock the client", + writeProposal: (r) => written.push(r), + detector: { judge: judge() }, proposer: { model: proposerModel() }, + meta: { + prior: () => ["append: earlier idea"], // fed to the proposer as context + has: (name) => name === "bad0", // bad0 already tried → dedup + record: (name) => recorded.push(name), + }, + }); + expect(res.fired).toBe(true); + expect(written).toHaveLength(5); // bad0 deduped + expect(written.some((w) => w.name === "bad0")).toBe(false); + expect(recorded).not.toContain("bad0"); // not recorded again + expect(res.proposals.find((p) => p.name === "bad0")!.changed).toBe(false); + }); + + it("reads a project-pulled skill body via the manifest's installRoot (not the cwd)", () => { + const projRoot = fs.mkdtempSync(path.join(os.tmpdir(), "proj-")); + try { + fs.mkdirSync(path.join(projRoot, "x--a"), { recursive: true }); + fs.writeFileSync(path.join(projRoot, "x--a", "SKILL.md"), "---\nname: x\nauthor: a\n---\n## Body\nproject body"); + const manifest = { + version: 1, + entries: [{ dirName: "x--a", name: "x", author: "a", installRoot: projRoot, projectKey: "", remoteVersion: 1, install: "project", installedAtVersion: 1, pulledAt: "", symlinks: [] }], + } as unknown as PulledManifest; + expect(readSkillBodyViaManifest("x", "a", manifest, "/nonexistent-global")).toBe("## Body\nproject body"); + // no manifest entry + no fallback body → null (not a silent wrong-skill edit) + expect(readSkillBodyViaManifest("y", "b", manifest, "/nonexistent-global")).toBeNull(); + } finally { + fs.rmSync(projRoot, { recursive: true, force: true }); + } + }); + + it("honors a custom fireThreshold", async () => { + const res = await runSkillOptCycle({ + query: world(3), sessionsTable: "sessions", now: "t", fireThreshold: 3, + readSkillBody: () => "## Rules", writeProposal: vi.fn(), + detector: { judge: judge() }, proposer: { model: proposerModel() }, + }); + expect(res.fired).toBe(true); + }); +}); diff --git a/tests/shared/skillopt-meta.test.ts b/tests/shared/skillopt-meta.test.ts new file mode 100644 index 00000000..f92d8516 --- /dev/null +++ b/tests/shared/skillopt-meta.test.ts @@ -0,0 +1,54 @@ +import { describe, it, expect, beforeEach, afterEach } from "vitest"; +import fs from "node:fs"; +import os from "node:os"; +import path from "node:path"; +import { + fingerprintEdits, alreadyProposed, priorEditSummaries, metaEntryFor, loadMeta, appendMeta, +} from "../../src/skillify/skillopt-meta.js"; +import type { Edit } from "../../src/skillify/skill-edits.js"; + +const edits: Edit[] = [{ op: "append", content: "always flush" }, { op: "replace", target: "mock", content: "do not mock" }]; + +describe("fingerprintEdits", () => { + it("is order-independent (same set → same fingerprint)", () => { + expect(fingerprintEdits(edits)).toBe(fingerprintEdits([...edits].reverse())); + }); + it("differs for different content", () => { + expect(fingerprintEdits(edits)).not.toBe(fingerprintEdits([{ op: "append", content: "other" }])); + }); +}); + +describe("alreadyProposed / priorEditSummaries", () => { + const meta = [metaEntryFor("posthog", "kamo", edits, "t1"), metaEntryFor("other", "x", [{ op: "append", content: "z" }], "t2")]; + it("matches a prior proposal by skill + fingerprint", () => { + expect(alreadyProposed(meta, "posthog", "kamo", [...edits].reverse())).toBe(true); + expect(alreadyProposed(meta, "posthog", "kamo", [{ op: "append", content: "new" }])).toBe(false); + expect(alreadyProposed(meta, "nope", "kamo", edits)).toBe(false); // different skill + }); + it("surfaces prior edit summaries only for the given skill", () => { + const prior = priorEditSummaries(meta, "posthog", "kamo"); + expect(prior.length).toBe(2); + expect(prior.join(" ")).toContain("append"); + expect(priorEditSummaries(meta, "posthog", "kamo").join(" ")).not.toContain('append: z'); // other skill's edit + }); +}); + +describe("loadMeta / appendMeta", () => { + let file: string; + beforeEach(() => { file = path.join(fs.mkdtempSync(path.join(os.tmpdir(), "meta-")), "meta.jsonl"); }); + afterEach(() => { fs.rmSync(path.dirname(file), { recursive: true, force: true }); }); + + it("round-trips entries and skips malformed lines", () => { + appendMeta(file, metaEntryFor("a", "b", edits, "t1")); + appendMeta(file, metaEntryFor("c", "d", [{ op: "append", content: "x" }], "t2")); + fs.appendFileSync(file, "{ not json }\n\n"); + const loaded = loadMeta(file); + expect(loaded).toHaveLength(2); + expect(loaded[0].skill).toBe("a--b"); + expect(loaded[0].status).toBe("proposed"); + }); + + it("returns [] for a missing file", () => { + expect(loadMeta(path.join(os.tmpdir(), "does-not-exist-xyz.jsonl"))).toEqual([]); + }); +}); diff --git a/tests/shared/skillopt-trigger.test.ts b/tests/shared/skillopt-trigger.test.ts index 7049e5da..ff3c3751 100644 --- a/tests/shared/skillopt-trigger.test.ts +++ b/tests/shared/skillopt-trigger.test.ts @@ -34,6 +34,7 @@ describe("runWeeklySkillOpt (auto-fire decision)", () => { tryLock: () => true, // injected so the unit test touches no real lock file releaseLock: release, reload: () => over.state ?? {}, // in-lock re-read mirrors the pre-lock state by default + canFire: () => true, // injected so the unit test doesn't read real credentials ...over, }); return { res, saved, spawn, release }; @@ -47,6 +48,13 @@ describe("runWeeklySkillOpt (auto-fire decision)", () => { expect(release).toHaveBeenCalledTimes(1); // lock released after firing }); + it("does NOT fire or stamp when logged out (preserves the throttle for a fresh login)", () => { + const { res, saved, spawn } = harness({ state: {}, canFire: () => false }); + expect(res).toEqual({ fired: false, reason: "no-creds" }); + expect(spawn).not.toHaveBeenCalled(); + expect(saved).toEqual([]); // no stamp → next session after login fires + }); + it("does NOT fire when another process holds the weekly lock (cross-process race)", () => { const { res, saved, spawn } = harness({ state: {}, tryLock: () => false }); expect(res).toEqual({ fired: false, reason: "locked" }); diff --git a/tests/shared/skills-active.test.ts b/tests/shared/skills-active.test.ts deleted file mode 100644 index a7d1c295..00000000 --- a/tests/shared/skills-active.test.ts +++ /dev/null @@ -1,211 +0,0 @@ -import { describe, it, expect, beforeEach, afterEach } from "vitest"; -import fs from "node:fs"; -import os from "node:os"; -import path from "node:path"; -import { - listActiveOrgSkills, - sessionBucket, - buildSkillsActiveInsert, - buildSkillsActivePath, - skillRootsForCwd, - defaultSkillsRoot, -} from "../../src/skillify/skills-active.js"; -import type { PulledManifest } from "../../src/skillify/manifest.js"; - -/** Build a pull manifest from `(dirName, name, author)` triples (fills the rest with defaults). */ -function manifestOf(...rows: Array<{ dirName: string; name: string; author: string }>): PulledManifest { - return { - version: 1, - entries: rows.map(r => ({ - dirName: r.dirName, - name: r.name, - author: r.author, - projectKey: "pk", - remoteVersion: 1, - install: "global" as const, - installRoot: "/install/root", - pulledAt: "2026-01-01T00:00:00.000Z", - symlinks: [], - })), - }; -} - -describe("listActiveOrgSkills", () => { - let root: string; - beforeEach(() => { - root = fs.mkdtempSync(path.join(os.tmpdir(), "skills-active-")); - }); - afterEach(() => { - fs.rmSync(root, { recursive: true, force: true }); - }); - - it("returns only manifest-recorded (pull-managed) dirs; excludes local-only + files", () => { - fs.mkdirSync(path.join(root, "posthog-event-smoke-testing--kamo.aghbalyan")); - fs.mkdirSync(path.join(root, "pg-deeplake-test-crash-debugging--sasun")); - fs.mkdirSync(path.join(root, "deploy--blue-green")); // local-only `--` dir, NOT pulled — excluded - fs.mkdirSync(path.join(root, "plan-confirm-then-execute")); // bare local — excluded - fs.writeFileSync(path.join(root, "notes--x.txt"), "x"); // file, not dir — excluded - const manifest = manifestOf( - { dirName: "posthog-event-smoke-testing--kamo.aghbalyan", name: "posthog-event-smoke-testing", author: "kamo.aghbalyan" }, - { dirName: "pg-deeplake-test-crash-debugging--sasun", name: "pg-deeplake-test-crash-debugging", author: "sasun" }, - ); - - const got = listActiveOrgSkills([root], manifest); - expect(got).toEqual([ - { name: "pg-deeplake-test-crash-debugging", author: "sasun", version: 1 }, - { name: "posthog-event-smoke-testing", author: "kamo.aghbalyan", version: 1 }, - ]); // sorted by name; exactly the 2 manifest-recorded skills; version defaults to 1 (no SKILL.md) - expect(got).toHaveLength(2); // local `deploy--blue-green` + bare + file all dropped - }); - - it("excludes a local-only dir whose name contains `--` when the manifest is empty (no false positive)", () => { - fs.mkdirSync(path.join(root, "deploy--blue-green")); // org-shaped name, but never pulled - expect(listActiveOrgSkills([root], manifestOf())).toEqual([]); - }); - - it("takes name/author from the manifest, not a dirname split (multi-`--` dir stays correct)", () => { - fs.mkdirSync(path.join(root, "some--weird--dirname")); - const manifest = manifestOf({ dirName: "some--weird--dirname", name: "some-skill", author: "first-last" }); - expect(listActiveOrgSkills([root], manifest)).toEqual([{ name: "some-skill", author: "first-last", version: 1 }]); - }); - - it("returns [] for a missing skills root (never throws)", () => { - expect(listActiveOrgSkills([path.join(root, "does-not-exist")], manifestOf())).toEqual([]); - }); - - it("scans project + global roots and dedups a skill present in both (P2: --to project)", () => { - const projectRoot = fs.mkdtempSync(path.join(os.tmpdir(), "skills-active-proj-")); - try { - fs.mkdirSync(path.join(root, "a-skill--alice")); // global-pulled - fs.mkdirSync(path.join(projectRoot, "a-skill--alice")); // also project-pulled (dup) - fs.mkdirSync(path.join(projectRoot, "b-skill--bob")); // project-only org skill - fs.mkdirSync(path.join(projectRoot, "local-only--x")); // not in manifest → excluded - const manifest = manifestOf( - { dirName: "a-skill--alice", name: "a-skill", author: "alice" }, - { dirName: "b-skill--bob", name: "b-skill", author: "bob" }, - ); - const got = listActiveOrgSkills([root, projectRoot], manifest); - expect(got).toEqual([ - { name: "a-skill", author: "alice", version: 1 }, // counted once despite two roots - { name: "b-skill", author: "bob", version: 1 }, // picked up from the project root - ]); - expect(got).toHaveLength(2); - } finally { - fs.rmSync(projectRoot, { recursive: true, force: true }); - } - }); - - it("reads the skill version from the installed SKILL.md frontmatter (enables v1-vs-v2)", () => { - fs.mkdirSync(path.join(root, "evolving-skill--sasun")); - fs.writeFileSync( - path.join(root, "evolving-skill--sasun", "SKILL.md"), - "---\nname: evolving-skill\nversion: 5\n---\nbody", - ); - const manifest = manifestOf({ dirName: "evolving-skill--sasun", name: "evolving-skill", author: "sasun" }); - expect(listActiveOrgSkills([root], manifest)).toEqual([{ name: "evolving-skill", author: "sasun", version: 5 }]); - }); -}); - -describe("skillRootsForCwd", () => { - it("returns only the global root when no cwd is given", () => { - expect(skillRootsForCwd()).toEqual([defaultSkillsRoot()]); - }); - it("adds the project-scoped /.claude/skills root when cwd is given", () => { - expect(skillRootsForCwd("/home/u/proj")).toEqual([ - defaultSkillsRoot(), - path.join("/home/u/proj", ".claude", "skills"), - ]); - }); -}); - -describe("sessionBucket", () => { - it("is deterministic for the same session id", () => { - expect(sessionBucket("abc-123")).toBe(sessionBucket("abc-123")); - }); - it("stays within [0, buckets)", () => { - for (const id of ["a", "b", "c", "xyz", "1874a6b2"]) { - const b = sessionBucket(id, 2); - expect(b).toBeGreaterThanOrEqual(0); - expect(b).toBeLessThan(2); - } - }); - it("assigns both buckets across many ids (not constant)", () => { - const seen = new Set(); - for (let i = 0; i < 200; i++) seen.add(sessionBucket(`session-${i}`)); - expect(seen).toEqual(new Set([0, 1])); // both arms populated → real randomization - }); -}); - -describe("buildSkillsActivePath", () => { - const config = { userName: "kamo", orgName: "activeloop", workspaceId: "default" }; - - it("namespaces under /skills_active/, NOT /sessions/ (so summary readers exclude it)", () => { - const p = buildSkillsActivePath(config, "S1"); - expect(p.startsWith("/skills_active/")).toBe(true); - expect(p.startsWith("/sessions/")).toBe(false); - // The exact filter the summary / raw-transcript readers use must NOT match this path. - expect(p.includes("/sessions/")).toBe(false); - }); - - it("embeds the full {user, org, workspace, session} tuple", () => { - expect(buildSkillsActivePath(config, "S1")).toBe( - "/skills_active/kamo/kamo_activeloop_default_S1.json", - ); - }); - - it("falls back to `default` workspace when workspaceId is absent", () => { - // covers the `?? \"default\"` branch (mirrors buildSessionPath) - const p = buildSkillsActivePath( - { userName: "kamo", orgName: "activeloop", workspaceId: undefined as unknown as string }, - "S1", - ); - expect(p).toBe("/skills_active/kamo/kamo_activeloop_default_S1.json"); - }); -}); - -describe("buildSkillsActiveInsert", () => { - const base = { - sessionsTable: "sessions", - sessionPath: "/sessions/kamo/kamo_activeloop_hivemind_S1.jsonl", - filename: "kamo_activeloop_hivemind_S1.jsonl", - userName: "kamo", - projectName: "hivemind", - pluginVersion: "0.7.99", - sessionId: "S1", - cwd: "/home/kamo/proj", - skills: [{ name: "pg-deeplake-test-crash-debugging", author: "sasun", version: 3 }], - bucket: 1, - ts: "2026-06-03T00:00:00.000Z", - }; - - it("emits exactly ONE insert into the sessions table (no second mutation)", () => { - const sql = buildSkillsActiveInsert(base); - expect((sql.match(/INSERT INTO/g) ?? []).length).toBe(1); - expect((sql.match(/UPDATE /g) ?? []).length).toBe(0); - expect(sql).toContain('INSERT INTO "sessions"'); - }); - - it("writes a skills_active message with the skills, count, and bucket", () => { - const sql = buildSkillsActiveInsert(base); - const m = sql.match(/'(\{.*\})'::jsonb/s); - expect(m).toBeTruthy(); - const entry = JSON.parse(m![1]); - expect(entry.type).toBe("skills_active"); - expect(entry.session_id).toBe("S1"); - expect(entry.skills).toEqual([{ name: "pg-deeplake-test-crash-debugging", author: "sasun", version: 3 }]); - expect(entry.skills_count).toBe(1); - expect(entry.ab_bucket).toBe(1); - }); - - it("leaves message_embedding NULL (no daemon round-trip at SessionStart)", () => { - const sql = buildSkillsActiveInsert(base); - expect(sql).toMatch(/::jsonb,\s*NULL,/); - }); - - it("does NOT masquerade as a captured turn type", () => { - const sql = buildSkillsActiveInsert(base); - expect(sql).not.toContain('"type":"user_message"'); - expect(sql).not.toContain('"type":"tool_call"'); - expect(sql).not.toContain('"type":"assistant_message"'); - }); -}); diff --git a/tests/shared/success-judge.test.ts b/tests/shared/success-judge.test.ts new file mode 100644 index 00000000..7386c65b --- /dev/null +++ b/tests/shared/success-judge.test.ts @@ -0,0 +1,44 @@ +import { describe, it, expect, vi } from "vitest"; +import { parseVerdict, judgeSuccess } from "../../src/skillify/success-judge.js"; + +describe("parseVerdict", () => { + it("parses a clean JSON verdict", () => { + expect(parseVerdict('{"success":0,"confidence":0.9,"reason":"mocks the client"}')) + .toEqual({ success: 0, confidence: 0.9, reason: "mocks the client" }); + }); + it("tolerates ```json fences and surrounding prose", () => { + const raw = "Here is my judgment:\n```json\n{\"success\": 1, \"confidence\": 0.8, \"reason\": \"ok\"}\n```\nDone."; + expect(parseVerdict(raw)).toEqual({ success: 1, confidence: 0.8, reason: "ok" }); + }); + it("treats success false/\"0\" as failure and clamps confidence", () => { + expect(parseVerdict('{"success":false,"confidence":2,"reason":"x"}')).toMatchObject({ success: 0, confidence: 1 }); + expect(parseVerdict('{"success":"0","confidence":-1,"reason":"x"}')).toMatchObject({ success: 0, confidence: 0 }); + }); + it("is conservative (success=1) on unparseable output", () => { + expect(parseVerdict("the model rambled with no json")).toMatchObject({ success: 1, confidence: 0 }); + }); +}); + +describe("judgeSuccess", () => { + it("returns the judged verdict from the injected model", async () => { + const model = vi.fn(async (_system: string, _user: string) => '{"success":0,"confidence":0.95,"reason":"no flush, event never sends"}'); + const v = await judgeSuccess("USER: do X\n\nASSISTANT: mocked it", { model }); + expect(v.success).toBe(0); + expect(model).toHaveBeenCalledOnce(); + // the judge must be told to ignore mood (anti-sycophancy) + asked for JSON + expect(model.mock.calls[0][0]).toMatch(/praised-but-wrong|Ignore whether the user/i); + }); + + it("is conservative (success=1) when the model call throws — a flaky judge can't manufacture failure", async () => { + const v = await judgeSuccess("USER: x\n\nASSISTANT: y", { model: vi.fn(async () => { throw new Error("boom"); }) }); + expect(v.success).toBe(1); + expect(v.reason).toContain("judge failed"); + }); + + it("short-circuits an empty window without calling the model", async () => { + const model = vi.fn(async () => "{}"); + const v = await judgeSuccess(" ", { model }); + expect(v.success).toBe(1); + expect(model).not.toHaveBeenCalled(); + }); +});