Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
2608355
feat(skillopt): read-side skill cohorts + session reconstruction
kaghni Jun 5, 2026
338909a
feat(skillopt): skill-invocation data layer (Skill tool_use → windows)
kaghni Jun 5, 2026
1929c96
refactor(skillopt): drop skills_active availability attribution
kaghni Jun 5, 2026
72dca56
feat(skillopt): heuristic correction anchor (reward level 1)
kaghni Jun 5, 2026
8398a2f
feat(skillopt): success-judge (reward level 2)
kaghni Jun 5, 2026
f4efb8a
feat(skillopt): deficiency detector (invocation → anchor → judge → flag)
kaghni Jun 5, 2026
93149a0
feat(skillopt): structured edits + budget + slow-update region
kaghni Jun 5, 2026
a93ed3d
feat(skillopt): proposer (reflect -> structured edits) + shared claud…
kaghni Jun 5, 2026
5fc4940
feat(skillopt): live publish mechanism (version bump + backup)
kaghni Jun 5, 2026
de5bac2
feat(skillopt): weekly cycle orchestration + >=5 fire gate
kaghni Jun 5, 2026
3b7fa7a
feat(skillopt): wire the real cycle into the weekly worker
kaghni Jun 5, 2026
e243190
feat(skillopt): meta-skill — optimizer cross-run memory
kaghni Jun 5, 2026
b03c10a
feat(skillopt): edit-outcome gate (longitudinal before/after validation)
kaghni Jun 5, 2026
a1905fd
review: remove committed node_modules symlink (coderabbit)
kaghni Jun 5, 2026
67ee35c
review: cap the judged window at maxChars (codex P2)
kaghni Jun 5, 2026
f4047fc
review: deny all write-capable Claude tools in the judge/proposer (co…
kaghni Jun 5, 2026
fc3825a
review: worker reads project-scoped skill root too (codex P2)
kaghni Jun 5, 2026
a3e3186
review: run judge/proposer with capture disabled (codex P2)
kaghni Jun 5, 2026
a118d0a
review: strong pushback overrides benign in the anchor (codex P2)
kaghni Jun 5, 2026
5091365
review: reject protected-region overlaps, not just inside-starts (cod…
kaghni Jun 5, 2026
b934458
review: anchor only post-invocation turns + validate skill refs as pa…
kaghni Jun 5, 2026
b3db22e
review: no-tools allow-list for judge/proposer, not a deny-list (code…
kaghni Jun 5, 2026
ba78b28
review: resolve skill body via pull manifest, not the worker cwd (cod…
kaghni Jun 5, 2026
fe1b913
review: skip this package's SessionStart hook on internal claude -p c…
kaghni Jun 5, 2026
62907c6
review: match session rows exactly when reconstructing windows (codex…
kaghni Jun 5, 2026
78d68a8
review: create frontmatter when bumping a SKILL.md without one (codex…
kaghni Jun 5, 2026
3335238
review: resolve the claude binary via findAgentBin, not PATH (codex P2)
kaghni Jun 5, 2026
f8427c0
review: don't burn the weekly throttle when logged out (codex P2)
kaghni Jun 5, 2026
ae278af
review: strict MCP config on model calls + match worker's config gate…
kaghni Jun 5, 2026
06ca4cb
feat(skillopt): env-configurable worker thresholds (defaults unchanged)
kaghni Jun 5, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
node_modules/
node_modules
dist/
tmp/
*.js.map
Expand Down
42 changes: 0 additions & 42 deletions src/hooks/session-start.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ import { loadConfig } from "../config.js";
import { DeeplakeApi } from "../deeplake-api.js";
import { sqlStr } from "../utils/sql.js";
import { projectNameFromCwd } from "../utils/project-name.js";
import { listActiveOrgSkills, sessionBucket, buildSkillsActiveInsert, buildSkillsActivePath, skillRootsForCwd } from "../skillify/skills-active.js";
import { readStdin } from "../utils/stdin.js";
import { log as _log } from "../utils/debug.js";
import { getInstalledVersion } from "../utils/version-check.js";
Expand Down Expand Up @@ -223,11 +222,6 @@ async function main(): Promise<void> {
// freezes SessionStart. Hard opt-out via HIVEMIND_AUTOPULL_DISABLED=1.
// All failures swallowed inside autoPullSkills (documented as
// never-rejecting), so no try/catch needed here.
//
// Runs BEFORE the skill-attribution snapshot below so that a skill pulled
// (or upgraded) during THIS SessionStart is reflected in the recorded
// skills_active set — otherwise the row would capture a stale/empty set
// while the session can already use the freshly-pulled skill.
const pullResult = await autoPullSkills();
log(`autopull: pulled=${pullResult.pulled} skipped=${pullResult.skipped}`);

Expand All @@ -244,42 +238,6 @@ async function main(): Promise<void> {
await api.ensureSessionsTable(sessionsTable);
await createPlaceholder(api, table, input.session_id, input.cwd ?? "", config.userName, config.orgName, config.workspaceId, pluginVersion);
log("placeholder created");

// Skill attribution (measurement): record which org-shared skills were in
// context this session + a deterministic A/B bucket. This is the label that
// makes skill value measurable (sessions with vs without skill X / v1 vs v2).
// Org skills are identified via the pull manifest (authoritative), not the
// `--` dirname pattern. Snapshot runs after auto-pull (above) so it reflects
// freshly-pulled skills. Opt-out: HIVEMIND_SKILL_ATTRIBUTION=0.
// Swallowed — must never fail SessionStart.
if (process.env.HIVEMIND_SKILL_ATTRIBUTION !== "0") {
try {
// Scan global + project-scoped (<cwd>/.claude/skills) roots so
// skills pulled with `--to project` are attributed too.
const skills = listActiveOrgSkills(skillRootsForCwd(input.cwd));
// Distinct `/skills_active/` namespace (NOT `/sessions/`) so the summary /
// raw-transcript readers never mistake this attribution row for a transcript.
const attrSessionPath = buildSkillsActivePath(config, input.session_id);
const attrFilename = attrSessionPath.slice(attrSessionPath.lastIndexOf("/") + 1);
const sql = buildSkillsActiveInsert({
sessionsTable,
sessionPath: attrSessionPath,
filename: attrFilename,
userName: config.userName,
projectName: projectNameFromCwd(input.cwd),
pluginVersion,
sessionId: input.session_id,
cwd: input.cwd,
skills,
bucket: sessionBucket(input.session_id),
ts: new Date().toISOString(),
});
await api.query(sql);
log(`skills_active recorded: ${skills.length} org skills, bucket ${sessionBucket(input.session_id)}`);
} catch (e: any) {
log(`skills_active attribution failed (swallowed): ${e?.message ?? e}`);
}
}
} else {
const reason = process.env.HIVEMIND_CAPTURE === "false"
? "HIVEMIND_CAPTURE=false"
Expand Down
53 changes: 53 additions & 0 deletions src/skillify/claude-model.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
/**
* Shared `claude -p` backend for the engine's LLM steps (success-judge, proposer).
* All tools denied → pure-text generation. Runs on the USER's own agent, so cost
* lands on the user. Returned as an injectable ModelCall so every LLM step is
* unit-testable with zero real calls.
*/
import { spawn } from "node:child_process";
import { findAgentBin } from "./gate-runner.js";

/**
 * Contract for a single LLM call: (systemPrompt, userPrompt) -> raw model text.
 * It is the injection point for the LLM steps, so callers can substitute a fake
 * implementation and run without any real model call.
 */
export type ModelCall = (systemPrompt: string, userPrompt: string) => Promise<string>;

/**
 * Build a ModelCall backed by a one-shot `claude -p` child process.
 *
 * Every call spawns the binary resolved by findAgentBin("claude_code") with JSON
 * output, no tools, and the user's MCP config ignored. It resolves with the `result`
 * field of the JSON printed on stdout ("" when the field is missing), or with the raw
 * stdout when that output cannot be read as a JSON object.
 *
 * @param model - value passed to `--model`.
 * @param opts - `timeoutMs`: per-call limit after which the child is SIGKILLed
 *   (default 120_000).
 * @returns a ModelCall whose promise rejects on spawn error, non-zero exit, or timeout.
 */
export function claudeModel(model: string, opts: { timeoutMs?: number } = {}): ModelCall {
  const timeoutMs = opts.timeoutMs ?? 120_000;
  return (system, user) => new Promise<string>((resolve, reject) => {
    const args = [
      "-p", user, "--model", model, "--no-session-persistence",
      "--output-format", "json", "--system-prompt", system,
      // Empty allow-list = NO tools available. Authoritative: it covers built-ins AND
      // any MCP/configured tools (a deny-list can't enumerate those), so prompt-injected
      // transcript text in the judge/proposer prompt can never trigger tool use.
      "--tools", "",
      // --strict-mcp-config ignores the user's MCP config entirely (--tools only denies
      // USE, not LOADING) — a broken/oversized user MCP schema would otherwise fail every
      // judge/proposer call before it returns JSON, silently stopping proposals.
      "--strict-mcp-config",
    ];
    // HIVEMIND_CAPTURE=false so these calls aren't captured as real sessions, AND
    // HIVEMIND_WIKI_WORKER=1 so the spawned claude -p skips this package's SessionStart
    // hook entirely (no Deeplake-context injection into the prompt, no auto-pull/graph
    // work) — one child per anchored invocation would otherwise contaminate the judge
    // prompt and pile up background work. Same guard the other internal runners use.
    // Resolve the claude binary the same way the rest of skillify does — a detached
    // hook worker may not have it on PATH (e.g. ~/.claude/local/claude), and a bare
    // "claude" would ENOENT and the callers would swallow it as no-change.
    const child = spawn(findAgentBin("claude_code"), args, {
      stdio: ["ignore", "pipe", "pipe"],
      env: { ...process.env, HIVEMIND_CAPTURE: "false", HIVEMIND_WIKI_WORKER: "1" },
    });
    // Decode at the stream level. Converting each Buffer chunk on its own would turn a
    // multi-byte UTF-8 character that straddles two chunks into replacement characters,
    // corrupting the JSON payload / model text. setEncoding buffers partial sequences.
    child.stdout.setEncoding("utf8");
    child.stderr.setEncoding("utf8");
    let out = "";
    let err = "";
    // On timeout the child is killed and the promise rejected; the later "close" event
    // then calls reject again, which is a no-op on an already-settled promise.
    const timer = setTimeout(() => { child.kill("SIGKILL"); reject(new Error("claude timed out")); }, timeoutMs);
    child.stdout.on("data", (d: string) => { out += d; });
    child.stderr.on("data", (d: string) => { err += d; });
    child.on("error", (e) => { clearTimeout(timer); reject(e); });
    child.on("close", (code) => {
      clearTimeout(timer);
      if (code !== 0) return reject(new Error(`claude exit ${code}: ${err.slice(0, 200)}`));
      // Prefer the `result` field of the JSON envelope; fall back to raw stdout when
      // the output is not a JSON object.
      try { resolve(String((JSON.parse(out) as { result?: unknown }).result ?? "")); }
      catch { resolve(out); }
    });
  });
}
118 changes: 118 additions & 0 deletions src/skillify/deficiency-detector.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
/**
* Deficiency detector — the core of the engine's "which skills are bad" step.
*
* For each org-skill invocation: window the transcript around it, run the FREE
* level-1 anchor (user pushback?), and only if anchored spend a level-2 judge
* call (was the task actually accomplished?). A "confirmed failure" requires BOTH
* — high precision, so we never churn a good skill. Aggregate per skill: a skill
* is deficient if it has enough invocations AND a high confirmed-failure rate.
*
* Token discipline: the judge runs ONLY on anchored windows (a fraction), on a
* windowed slice (not whole sessions). Everything injectable (query + judge model)
* so the whole orchestration is unit-tested with zero live Deeplake / LLM.
*
* The ≥5 fire gate lives with the caller (worker): we just return deficientCount.
*/
import {
listSkillInvocations, windowedTurns, elide, type QueryFn, type SkillInvocation,
} from "./skill-invocations.js";
import { detectAnchor } from "./session-anchor.js";
import { judgeSuccess, type ModelCall } from "./success-judge.js";

/**
 * Per-skill aggregate produced by detectDeficientSkills — one entry per distinct
 * (name, author) pair found among the listed invocations.
 */
export interface SkillDeficiency {
name: string;
author: string;
invocations: number; // invocations of this skill that were examined
anchored: number; // invocations whose window had a level-1 anchor → these were judged
confirmedFailures: number; // anchored invocations where the judge also returned success === 0
failureRate: number; // confirmedFailures / invocations (0 when invocations is 0)
deficient: boolean; // invocations >= minInvocations AND failureRate >= failureRateThreshold
examples: string[]; // up to 3 failure reasons (judge reason, else anchor evidence) for the proposer
}

/** Tuning knobs for detectDeficientSkills; every field is optional. */
export interface DetectorConfig {
minInvocations?: number; // min-n per skill before we trust the rate (default 8)
failureRateThreshold?: number; // confirmed-failure rate at or above which a skill is flagged (default 0.4)
window?: { before?: number; after?: number; maxChars?: number }; // handed to windowedTurns; maxChars also caps the judged text (default 4000)
judge?: ModelCall; // injected model, forwarded to judgeSuccess; presumably falls back to the real claude judge when omitted — confirm in success-judge
sinceIso?: string; // lookback bound, forwarded to listSkillInvocations
limit?: number; // cap on invocation rows pulled, forwarded to listSkillInvocations
}

/** Grouping key for one skill: its name and author joined by a double dash. */
const skillKey = (name: string, author: string): string => {
  return name + "--" + author;
};

/** Options accepted by scoreInvocations; both fields are optional. */
export interface ScoreConfig {
window?: { before?: number; after?: number; maxChars?: number }; // handed to windowedTurns; maxChars (default 4000) caps the text sent to the judge
judge?: ModelCall; // forwarded to judgeSuccess as its `model` option
}

/**
 * Run the two-level outcome check over a batch of invocations.
 *
 * For every invocation the transcript window is fetched and the free anchor check is
 * applied to the post-invocation turns. Only anchored windows are sent to the judge,
 * and an invocation counts as a confirmed failure only when the judge returns
 * success === 0. Invocations are processed one at a time, in order.
 *
 * Used by the detector (per-skill deficiency) and by the edit gate (a skill's failure
 * rate inside a time window).
 *
 * @param query - query function handed to windowedTurns.
 * @param sessionsTable - sessions table name handed to windowedTurns.
 * @param invocations - the invocations to score.
 * @param cfg - window options and the judge model.
 * @returns how many invocations were anchored, how many were confirmed failures, and
 *   up to 3 failure reasons (the judge's reason, or the anchor evidence when empty).
 */
export async function scoreInvocations(
  query: QueryFn,
  sessionsTable: string,
  invocations: SkillInvocation[],
  cfg: ScoreConfig = {},
): Promise<{ anchored: number; confirmed: number; examples: string[] }> {
  const MAX_EXAMPLES = 3;
  const examples: string[] = [];
  let anchored = 0;
  let confirmed = 0;

  for (const invocation of invocations) {
    const slice = await windowedTurns(query, sessionsTable, invocation, cfg.window);
    // Level 1 (free): only reactions after the invocation pivot count.
    const hit = detectAnchor(slice.turns, slice.pivot);
    if (hit.anchored) {
      anchored += 1;
      // Level 2 (paid): judge the windowed transcript, capped at maxChars.
      const transcript = slice.turns
        .map((turn) => `${turn.role}: ${turn.text}`)
        .join("\n\n");
      const judged = await judgeSuccess(
        elide(transcript, cfg.window?.maxChars ?? 4000),
        { model: cfg.judge },
      );
      if (judged.success === 0) {
        confirmed += 1;
        if (examples.length < MAX_EXAMPLES) {
          examples.push(judged.reason || hit.evidence);
        }
      }
    }
  }

  return { anchored, confirmed, examples };
}

/** Result of detectDeficientSkills. */
export interface DetectionResult {
skills: SkillDeficiency[]; // every examined skill, sorted by failureRate (desc), then invocations (desc)
deficientCount: number; // number of entries in `skills` with deficient === true
}

/**
 * List skill invocations, group them per (name, author), score each group, and flag
 * the groups that are deficient.
 *
 * A skill is deficient when it has at least `minInvocations` invocations (default 8)
 * and its confirmed-failure rate is at least `failureRateThreshold` (default 0.4).
 * Groups are scored one after another.
 *
 * @param query - query function used for listing and windowing.
 * @param sessionsTable - sessions table name.
 * @param cfg - thresholds, lookback/limit for the listing, window and judge options.
 * @returns all examined skills (worst failure rate first, ties broken by more
 *   invocations first) plus the count of deficient ones.
 */
export async function detectDeficientSkills(
  query: QueryFn,
  sessionsTable: string,
  cfg: DetectorConfig = {},
): Promise<DetectionResult> {
  const minUses = cfg.minInvocations ?? 8;
  const rateCutoff = cfg.failureRateThreshold ?? 0.4;

  const rows = await listSkillInvocations(query, sessionsTable, {
    sinceIso: cfg.sinceIso,
    limit: cfg.limit,
  });

  // Bucket invocations per skill; Map keeps first-seen order.
  const bySkill = new Map<string, SkillInvocation[]>();
  for (const row of rows) {
    const key = skillKey(row.name, row.author);
    const bucket = bySkill.get(key);
    if (bucket === undefined) {
      bySkill.set(key, [row]);
    } else {
      bucket.push(row);
    }
  }

  const skills: SkillDeficiency[] = [];
  for (const [, uses] of bySkill) {
    const scored = await scoreInvocations(query, sessionsTable, uses, cfg);
    const total = uses.length;
    const failureRate = total ? scored.confirmed / total : 0;
    const deficient = total >= minUses && failureRate >= rateCutoff;
    skills.push({
      name: uses[0].name,
      author: uses[0].author,
      invocations: total,
      anchored: scored.anchored,
      confirmedFailures: scored.confirmed,
      failureRate,
      deficient,
      examples: scored.examples,
    });
  }

  skills.sort((left, right) => {
    const byRate = right.failureRate - left.failureRate;
    return byRate || right.invocations - left.invocations;
  });

  let deficientCount = 0;
  for (const skill of skills) {
    if (skill.deficient) deficientCount += 1;
  }
  return { skills, deficientCount };
}
50 changes: 50 additions & 0 deletions src/skillify/session-anchor.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
/**
* Heuristic "anchor" — a HARD, observable signal in the transcript that a session
* went badly, independent of any LLM judgment: the user pushed back on / corrected
* what the assistant just did. Pure + free (no LLM, no I/O).
*
* It's the level-1 filter in the outcome pipeline: only windows with an anchor go
* to the (paid) success-judge, and a session is labelled a failure only when the
* anchor AND the judge agree. So this is deliberately tuned for RECALL over
* precision — a false positive just costs one judge call (which then drops it),
* but a false negative under-detects (conservative — it never churns a good skill).
* Patterns are meant to be tuned against real sessions; this is a starting set.
*/
import type { Turn } from "./skill-invocations.js";

/** Kind of anchor found; "none" means no pushback was detected. */
export type AnchorKind = "correction" | "none";
/** Outcome of detectAnchor for one window of turns. */
export interface Anchor {
anchored: boolean; // true when a qualifying user pushback turn was found
kind: AnchorKind; // "correction" when anchored, otherwise "none"
evidence: string; // the user turn that triggered it (first 200 characters); "" when not anchored
}

// Explicit correction wording. A match here is always an anchor, polite framing or
// not, so it takes precedence over BENIGN ("thanks, but this is still failing").
const STRONG = /\b(wrong|incorrect|not what|that'?s not|does ?n'?t work|did ?n'?t work|do ?n'?t work|wo ?n'?t work|is ?n'?t|broke|broken|still (failing|broken|not working|wrong|the same)|try again|undo|revert that|that fail|not right)/i;

// A bare "no"/"nope" can be pushback ("no, that's off") or harmless ("no problem"),
// so it anchors only when the turn does not also contain a clearly benign phrase.
const AMBIGUOUS = /\b(no|nope)\b/i;
const BENIGN = /\b(no (problem|worries|need|biggie)|no,? thanks|all good|works? (now|great|fine|perfectly)|that works|perfect|looks good)\b/i;

/**
 * Look for a correction anchor in a window of turns.
 *
 * A candidate is a USER turn that directly follows an ASSISTANT turn, where both turns
 * sit at index >= fromIndex. That keeps a correction made BEFORE the skill ran (e.g.
 * the skill was itself a repair attempt) from being attributed to the skill. The first
 * candidate whose text matches STRONG, or matches AMBIGUOUS without matching BENIGN,
 * wins.
 *
 * @param turns - the windowed turns, in transcript order.
 * @param fromIndex - index of the first post-invocation turn; defaults to 0 (scan all).
 * @returns the anchor with up to 200 characters of the triggering user turn, or an
 *   un-anchored result with kind "none" and empty evidence.
 */
export function detectAnchor(turns: Turn[], fromIndex = 0): Anchor {
  const isPushback = (text: string): boolean => {
    if (STRONG.test(text)) return true;
    return AMBIGUOUS.test(text) && !BENIGN.test(text);
  };

  // Index 0 has no preceding turn to react to, so never start below 1.
  const start = Math.max(1, fromIndex);
  for (let idx = start; idx < turns.length; idx += 1) {
    const reply = turns[idx];
    const previous = turns[idx - 1];
    const isUserReaction = reply.role === "USER" && previous.role === "ASSISTANT";
    // The assistant turn being reacted to must itself be post-invocation.
    const previousIsPostInvocation = idx - 1 >= fromIndex;
    if (isUserReaction && previousIsPostInvocation && isPushback(reply.text)) {
      return { anchored: true, kind: "correction", evidence: reply.text.slice(0, 200) };
    }
  }
  return { anchored: false, kind: "none", evidence: "" };
}
89 changes: 89 additions & 0 deletions src/skillify/skill-edit-gate.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
/**
* Edit-outcome gate — the validation organ (the paper's gate, adapted).
*
* A randomized A/B is the ideal, but it needs the skill VERSION recorded at
* invocation time (a capture change we don't have yet — the Skill tool_use only
* carries the skill name). So the feasible gate is LONGITUDINAL: after an edit is
* published, compare the skill's confirmed-failure rate in the window AFTER publish
* vs BEFORE. A real drop = the edit helped → keep; a real rise = it hurt → revert
* (one `cp` from the SKILL.v<old>.bak backup). No clear signal / too few post-publish
* uses → inconclusive (wait, or revert when stale).
*
* It's OBSERVATIONAL (confounded — the population shifts week to week), so it needs
* a margin + a minimum sample. Randomized A/B is the clean upgrade once invocation-
* version capture lands. Reuses scoreInvocations, so the same anchor+judge that
* detects deficiency also validates the fix. Injected query/judge → unit-testable.
*/
import { listSkillInvocations, type QueryFn } from "./skill-invocations.js";
import { scoreInvocations } from "./deficiency-detector.js";
import type { ModelCall } from "./claude-model.js";

/** Outcome counts for one skill inside one time window (see measureSkillFailureRate). */
export interface WindowStats {
invocations: number; // invocations of the skill found in the window
anchored: number; // invocations that had a level-1 anchor (and were therefore judged)
confirmed: number; // anchored invocations the judge also marked as failures
failureRate: number; // confirmed / invocations (0 when invocations is 0)
}

/** Verdict of the edit-outcome gate together with the stats it was derived from. */
export interface GateDecision {
before: WindowStats; // stats for the window preceding the publish
after: WindowStats; // stats for the window following the publish
delta: number; // before.failureRate - after.failureRate (positive = improved)
decision: "keep" | "revert" | "inconclusive"; // computed by gateEditOutcome
}

/** Options for measureSkillFailureRate; every field is optional. */
interface MeasureOpts {
sinceIso?: string; // start of the time window, forwarded to listSkillInvocations
untilIso?: string; // end of the time window, forwarded to listSkillInvocations
limit?: number; // row cap forwarded to listSkillInvocations — applied BEFORE the per-skill filter
window?: { before?: number; after?: number; maxChars?: number }; // forwarded to scoreInvocations
judge?: ModelCall; // forwarded to scoreInvocations
}

/**
 * Compute the confirmed-failure rate of one skill inside a time window.
 *
 * Lists invocations between `opts.sinceIso` and `opts.untilIso` (capped by
 * `opts.limit`), keeps those whose name and author both match, and scores them with
 * scoreInvocations.
 *
 * @param query - query function used for listing and windowing.
 * @param sessionsTable - sessions table name.
 * @param name - skill name to match exactly.
 * @param author - skill author to match exactly.
 * @param opts - time bounds, row limit, window and judge options.
 * @returns counts for the window; failureRate is 0 when the skill has no invocations.
 */
export async function measureSkillFailureRate(
  query: QueryFn,
  sessionsTable: string,
  name: string,
  author: string,
  opts: MeasureOpts = {},
): Promise<WindowStats> {
  const listed = await listSkillInvocations(query, sessionsTable, {
    sinceIso: opts.sinceIso,
    untilIso: opts.untilIso,
    limit: opts.limit,
  });

  const matching: typeof listed = [];
  for (const candidate of listed) {
    if (candidate.name === name && candidate.author === author) {
      matching.push(candidate);
    }
  }

  const scored = await scoreInvocations(query, sessionsTable, matching, {
    window: opts.window,
    judge: opts.judge,
  });

  const total = matching.length;
  return {
    invocations: total,
    anchored: scored.anchored,
    confirmed: scored.confirmed,
    failureRate: total > 0 ? scored.confirmed / total : 0,
  };
}

/**
 * Pure decision from before/after stats.
 *
 * Rules, checked in order:
 *  1. Too few post-publish invocations (`after.invocations < minAfter`) or no usable
 *     baseline (`before.invocations < minBefore`) → "inconclusive".
 *  2. Failure rate dropped by at least `margin` → "keep".
 *  3. Failure rate rose by at least `margin` → "revert".
 *  4. Otherwise → "inconclusive".
 *
 * The baseline check matters because an empty BEFORE window carries failureRate 0 by
 * convention, not by measurement. Without it any post-publish rate >= margin would
 * trigger a revert against a baseline that was never observed.
 *
 * @param before - stats for the window preceding the publish.
 * @param after - stats for the window following the publish.
 * @param opts - `margin`: minimum absolute change in failure rate that counts as a
 *   signal (default 0.2); `minAfter`: minimum post-publish invocations (default 5);
 *   `minBefore`: minimum pre-publish invocations (default 1; pass 0 to disable).
 * @returns both stats, `delta` (before − after, positive = improved) and the decision.
 */
export function gateEditOutcome(
  before: WindowStats,
  after: WindowStats,
  opts: { margin?: number; minAfter?: number; minBefore?: number } = {},
): GateDecision {
  const margin = opts.margin ?? 0.2;
  const minAfter = opts.minAfter ?? 5;
  const minBefore = opts.minBefore ?? 1;
  const delta = before.failureRate - after.failureRate;
  let decision: GateDecision["decision"];
  if (after.invocations < minAfter) decision = "inconclusive"; // not enough post-publish use
  else if (before.invocations < minBefore) decision = "inconclusive"; // no baseline to compare against
  else if (delta >= margin) decision = "keep"; // failure rate dropped → helped
  else if (after.failureRate - before.failureRate >= margin) decision = "revert"; // got measurably worse
  else decision = "inconclusive"; // no clear signal
  return { before, after, delta, decision };
}

/**
 * Full gate: measure a skill's confirmed-failure rate before and after a publish
 * timestamp, then decide whether the edit should be kept.
 *
 * The BEFORE window runs from `windowDays` days before `publishIso` (default 14) up to
 * `publishIso`; the AFTER window runs from `publishIso` to `opts.nowIso`. The two
 * windows are measured one after the other and handed to gateEditOutcome together
 * with `opts`.
 *
 * @param query - query function used for listing and windowing.
 * @param sessionsTable - sessions table name.
 * @param name - skill name.
 * @param author - skill author.
 * @param publishIso - timestamp of the publish being evaluated.
 * @param opts - window length, upper bound of the AFTER window, decision thresholds,
 *   and the measurement options (limit, window, judge).
 * @returns the gate decision with both window stats.
 */
export async function gateEdit(
  query: QueryFn,
  sessionsTable: string,
  name: string,
  author: string,
  publishIso: string,
  opts: { windowDays?: number; nowIso?: string; margin?: number; minAfter?: number } & MeasureOpts = {},
): Promise<GateDecision> {
  const lookbackDays = opts.windowDays ?? 14;
  const lookbackMs = lookbackDays * 24 * 60 * 60 * 1000;
  const baselineStartIso = new Date(Date.parse(publishIso) - lookbackMs).toISOString();

  // Same limit/window/judge for both windows so the two rates are comparable.
  const common = { limit: opts.limit, window: opts.window, judge: opts.judge };
  const measure = (sinceIso: string, untilIso: string | undefined): Promise<WindowStats> =>
    measureSkillFailureRate(query, sessionsTable, name, author, { ...common, sinceIso, untilIso });

  const before = await measure(baselineStartIso, publishIso);
  const after = await measure(publishIso, opts.nowIso);
  return gateEditOutcome(before, after, opts);
}
Loading
Loading