activeloopai · kaghni · Jun 5, 2026 · Jun 5, 2026 · Jun 5, 2026 · Jun 5, 2026
diff --git a/.gitignore b/.gitignore
@@ -1,4 +1,4 @@
-node_modules/
+node_modules
 dist/
 tmp/
 *.js.map

diff --git a/src/hooks/session-start.ts b/src/hooks/session-start.ts
@@ -14,7 +14,6 @@ import { loadConfig } from "../config.js";
 import { DeeplakeApi } from "../deeplake-api.js";
 import { sqlStr } from "../utils/sql.js";
 import { projectNameFromCwd } from "../utils/project-name.js";
-import { listActiveOrgSkills, sessionBucket, buildSkillsActiveInsert, buildSkillsActivePath, skillRootsForCwd } from "../skillify/skills-active.js";
 import { readStdin } from "../utils/stdin.js";
 import { log as _log } from "../utils/debug.js";
 import { getInstalledVersion } from "../utils/version-check.js";
@@ -223,11 +222,6 @@ async function main(): Promise<void> {
   // freezes SessionStart. Hard opt-out via HIVEMIND_AUTOPULL_DISABLED=1.
   // All failures swallowed inside autoPullSkills (documented as
   // never-rejecting), so no try/catch needed here.
-  //
-  // Runs BEFORE the skill-attribution snapshot below so that a skill pulled
-  // (or upgraded) during THIS SessionStart is reflected in the recorded
-  // skills_active set — otherwise the row would capture a stale/empty set
-  // while the session can already use the freshly-pulled skill.
   const pullResult = await autoPullSkills();
   log(`autopull: pulled=${pullResult.pulled} skipped=${pullResult.skipped}`);
 
@@ -244,42 +238,6 @@ async function main(): Promise<void> {
           await api.ensureSessionsTable(sessionsTable);
           await createPlaceholder(api, table, input.session_id, input.cwd ?? "", config.userName, config.orgName, config.workspaceId, pluginVersion);
           log("placeholder created");
-
-          // Skill attribution (measurement): record which org-shared skills were in
-          // context this session + a deterministic A/B bucket. This is the label that
-          // makes skill value measurable (sessions with vs without skill X / v1 vs v2).
-          // Org skills are identified via the pull manifest (authoritative), not the
-          // `--` dirname pattern. Snapshot runs after auto-pull (above) so it reflects
-          // freshly-pulled skills. Opt-out: HIVEMIND_SKILL_ATTRIBUTION=0.
-          // Swallowed — must never fail SessionStart.
-          if (process.env.HIVEMIND_SKILL_ATTRIBUTION !== "0") {
-            try {
-              // Scan global + project-scoped (<cwd>/.claude/skills) roots so
-              // skills pulled with `--to project` are attributed too.
-              const skills = listActiveOrgSkills(skillRootsForCwd(input.cwd));
-              // Distinct `/skills_active/` namespace (NOT `/sessions/`) so the summary /
-              // raw-transcript readers never mistake this attribution row for a transcript.
-              const attrSessionPath = buildSkillsActivePath(config, input.session_id);
-              const attrFilename = attrSessionPath.slice(attrSessionPath.lastIndexOf("/") + 1);
-              const sql = buildSkillsActiveInsert({
-                sessionsTable,
-                sessionPath: attrSessionPath,
-                filename: attrFilename,
-                userName: config.userName,
-                projectName: projectNameFromCwd(input.cwd),
-                pluginVersion,
-                sessionId: input.session_id,
-                cwd: input.cwd,
-                skills,
-                bucket: sessionBucket(input.session_id),
-                ts: new Date().toISOString(),
-              });
-              await api.query(sql);
-              log(`skills_active recorded: ${skills.length} org skills, bucket ${sessionBucket(input.session_id)}`);
-            } catch (e: any) {
-              log(`skills_active attribution failed (swallowed): ${e?.message ?? e}`);
-            }
-          }
         } else {
           const reason = process.env.HIVEMIND_CAPTURE === "false"
             ? "HIVEMIND_CAPTURE=false"

diff --git a/src/skillify/claude-model.ts b/src/skillify/claude-model.ts
@@ -0,0 +1,53 @@
+/**
+ * Shared `claude -p` backend for the engine's LLM steps (success-judge, proposer).
+ * All tools denied → pure-text generation. Runs on the USER's own agent, so cost
+ * lands on the user. Returned as an injectable ModelCall so every LLM step is
+ * unit-testable with zero real calls.
+ */
+import { spawn } from "node:child_process";
+import { findAgentBin } from "./gate-runner.js";
+
+/**
+ * Injectable LLM backend: (systemPrompt, userPrompt) -> raw model text.
+ * Being a plain function type, a stub can be passed wherever the real
+ * `claude -p` backend would be used.
+ */
+export type ModelCall = (systemPrompt: string, userPrompt: string) => Promise<string>;
+
+/**
+ * Build a `ModelCall` backed by the local `claude` CLI in print mode (`-p`).
+ *
+ * Each call spawns one child process with an empty tool allow-list and
+ * `--strict-mcp-config`, and asks for JSON output.
+ *
+ * @param model - Model identifier forwarded to `--model`.
+ * @param opts - `timeoutMs`: how long to wait before killing the child (default 120 000 ms).
+ * @returns A `ModelCall`. It resolves to the `result` field of the CLI's JSON output
+ *   (stringified, `""` when absent), or to the raw stdout when that output cannot be
+ *   read as JSON. It rejects when the process fails to spawn, exits non-zero, or
+ *   exceeds the timeout.
+ */
+export function claudeModel(model: string, opts: { timeoutMs?: number } = {}): ModelCall {
+  const timeoutMs = opts.timeoutMs ?? 120_000;
+  return (system, user) => new Promise<string>((resolve, reject) => {
+    const args = [
+      "-p", user, "--model", model, "--no-session-persistence",
+      "--output-format", "json", "--system-prompt", system,
+      // Empty allow-list = NO tools available. Authoritative: it covers built-ins AND
+      // any MCP/configured tools (a deny-list can't enumerate those), so prompt-injected
+      // transcript text in the judge/proposer prompt can never trigger tool use.
+      "--tools", "",
+      // --strict-mcp-config ignores the user's MCP config entirely (--tools only denies
+      // USE, not LOADING) — a broken/oversized user MCP schema would otherwise fail every
+      // judge/proposer call before it returns JSON, silently stopping proposals.
+      "--strict-mcp-config",
+    ];
+    // HIVEMIND_CAPTURE=false so these calls aren't captured as real sessions, AND
+    // HIVEMIND_WIKI_WORKER=1 so the spawned claude -p skips this package's SessionStart
+    // hook entirely (no Deeplake-context injection into the prompt, no auto-pull/graph
+    // work) — one child per anchored invocation would otherwise contaminate the judge
+    // prompt and pile up background work. Same guard the other internal runners use.
+    // Resolve the claude binary the same way the rest of skillify does — a detached
+    // hook worker may not have it on PATH (e.g. ~/.claude/local/claude), and a bare
+    // "claude" would ENOENT and the callers would swallow it as no-change.
+    const child = spawn(findAgentBin("claude_code"), args, {
+      stdio: ["ignore", "pipe", "pipe"],
+      env: { ...process.env, HIVEMIND_CAPTURE: "false", HIVEMIND_WIKI_WORKER: "1" },
+    });
+    // Decode at the stream level: converting each Buffer chunk on its own would
+    // corrupt any multi-byte UTF-8 character that straddles two chunks.
+    child.stdout.setEncoding("utf8");
+    child.stderr.setEncoding("utf8");
+    let out = "";
+    let err = "";
+    // On timeout the child is killed and the promise rejected; the later "close"
+    // event then only clears the (already fired) timer — its reject is a no-op.
+    const timer = setTimeout(() => { child.kill("SIGKILL"); reject(new Error("claude timed out")); }, timeoutMs);
+    child.stdout.on("data", (chunk: string) => { out += chunk; });
+    child.stderr.on("data", (chunk: string) => { err += chunk; });
+    child.on("error", (e) => { clearTimeout(timer); reject(e); });
+    child.on("close", (code) => {
+      clearTimeout(timer);
+      if (code !== 0) return reject(new Error(`claude exit ${code}: ${err.slice(0, 200)}`));
+      // Prefer the JSON envelope's `result`; fall back to raw stdout when the
+      // output is not usable JSON.
+      try { resolve(String((JSON.parse(out) as { result?: unknown }).result ?? "")); }
+      catch { resolve(out); }
+    });
+  });
+}
diff --git a/src/skillify/deficiency-detector.ts b/src/skillify/deficiency-detector.ts
@@ -0,0 +1,118 @@
+/**
+ * Deficiency detector — the core of the engine's "which skills are bad" step.
+ *
+ * For each org-skill invocation: window the transcript around it, run the FREE
+ * level-1 anchor (user pushback?), and only if anchored spend a level-2 judge
+ * call (was the task actually accomplished?). A "confirmed failure" requires BOTH
+ * — high precision, so we never churn a good skill. Aggregate per skill: a skill
+ * is deficient if it has enough invocations AND a high confirmed-failure rate.
+ *
+ * Token discipline: the judge runs ONLY on anchored windows (a fraction), on a
+ * windowed slice (not whole sessions). Everything injectable (query + judge model)
+ * so the whole orchestration is unit-tested with zero live Deeplake / LLM.
+ *
+ * The ≥5 fire gate lives with the caller (worker): we just return deficientCount.
+ */
+import {
+  listSkillInvocations, windowedTurns, elide, type QueryFn, type SkillInvocation,
+} from "./skill-invocations.js";
+import { detectAnchor } from "./session-anchor.js";
+import { judgeSuccess, type ModelCall } from "./success-judge.js";
+
+/** Per-skill aggregate reported by the detector: counts, failure rate and verdict. */
+export interface SkillDeficiency {
+  name: string;
+  author: string;
+  invocations: number;        // org-skill invocations examined
+  anchored: number;           // had a level-1 anchor → judged
+  confirmedFailures: number;  // anchor AND judge said success=0
+  failureRate: number;        // confirmedFailures / invocations
+  deficient: boolean;         // failureRate >= threshold AND invocations >= minInvocations
+  examples: string[];         // a few failure reasons (for the proposer)
+}
+
+/** Tuning knobs for `detectDeficientSkills`; every field is optional. */
+export interface DetectorConfig {
+  minInvocations?: number;       // min-n per skill before we trust the rate (default 8)
+  failureRateThreshold?: number; // confirmed-failure rate to flag deficient (default 0.4)
+  window?: { before?: number; after?: number; maxChars?: number };
+  judge?: ModelCall;             // injected; default = real claude judge
+  sinceIso?: string;             // lookback bound
+  limit?: number;                // cap invocation rows pulled
+}
+
+/**
+ * Grouping key for one (name, author) pair. Encoded as a JSON array rather than
+ * joined with a separator, so a name or author that itself contains the separator
+ * can never make two different pairs share a key.
+ */
+const skillKey = (name: string, author: string): string => JSON.stringify([name, author]);
+
+/** Options for `scoreInvocations`. */
+export interface ScoreConfig {
+  // Handed to windowedTurns; maxChars also caps the text sent to the judge (4000 when unset).
+  window?: { before?: number; after?: number; maxChars?: number };
+  // Passed to judgeSuccess as its `model` option.
+  judge?: ModelCall;
+}
+
+/**
+ * Tally anchors and confirmed failures over a list of invocations.
+ *
+ * Every invocation is windowed and run through the free anchor check; the paid
+ * judge is consulted only for windows that anchored. Used both for per-skill
+ * deficiency detection and for the edit gate's per-window failure rate.
+ *
+ * @param query - Query function handed to `windowedTurns`.
+ * @param sessionsTable - Sessions table name handed to `windowedTurns`.
+ * @param invocations - Invocations to score, processed one at a time in order.
+ * @param cfg - Window settings and judge backend.
+ * @returns `anchored` (windows that reached the judge), `confirmed` (judge returned
+ *   success 0) and up to three `examples` (judge reason, else the anchor evidence).
+ */
+export async function scoreInvocations(
+  query: QueryFn,
+  sessionsTable: string,
+  invocations: SkillInvocation[],
+  cfg: ScoreConfig = {},
+): Promise<{ anchored: number; confirmed: number; examples: string[] }> {
+  const maxChars = cfg.window?.maxChars ?? 4000;
+  const examples: string[] = [];
+  let anchoredCount = 0;
+  let confirmedCount = 0;
+
+  for (const invocation of invocations) {
+    const slice = await windowedTurns(query, sessionsTable, invocation, cfg.window);
+    // Only the reaction after the invocation (from the pivot on) can anchor.
+    const hit = detectAnchor(slice.turns, slice.pivot);
+    if (hit.anchored) {
+      anchoredCount += 1;
+      const rendered = slice.turns.map((turn) => `${turn.role}: ${turn.text}`).join("\n\n");
+      const verdict = await judgeSuccess(elide(rendered, maxChars), { model: cfg.judge });
+      if (verdict.success === 0) {
+        confirmedCount += 1;
+        if (examples.length < 3) {
+          examples.push(verdict.reason || hit.evidence);
+        }
+      }
+    }
+  }
+
+  return { anchored: anchoredCount, confirmed: confirmedCount, examples };
+}
+
+/** Output of `detectDeficientSkills`. */
+export interface DetectionResult {
+  skills: SkillDeficiency[]; // one entry per (name, author) group, highest failure rate first
+  deficientCount: number;    // number of entries with deficient === true
+}
+
+/**
+ * Group recent skill invocations by (name, author), score each group, and flag
+ * the groups whose confirmed-failure rate is high enough to call deficient.
+ *
+ * @param query - Query function used to list invocations and load transcript windows.
+ * @param sessionsTable - Sessions table name.
+ * @param cfg - Thresholds, lookback/limit for the listing, window and judge settings.
+ * @returns Per-skill results sorted by failure rate (then invocation count), both
+ *   descending, plus the number of skills flagged deficient.
+ */
+export async function detectDeficientSkills(
+  query: QueryFn,
+  sessionsTable: string,
+  cfg: DetectorConfig = {},
+): Promise<DetectionResult> {
+  const threshold = cfg.failureRateThreshold ?? 0.4;
+  const minInvocations = cfg.minInvocations ?? 8;
+
+  const rows = await listSkillInvocations(query, sessionsTable, { sinceIso: cfg.sinceIso, limit: cfg.limit });
+
+  // Bucket invocations per skill, keeping first-seen order of both skills and rows.
+  const bySkill = new Map<string, SkillInvocation[]>();
+  for (const row of rows) {
+    const key = skillKey(row.name, row.author);
+    const bucket = bySkill.get(key);
+    if (bucket === undefined) {
+      bySkill.set(key, [row]);
+    } else {
+      bucket.push(row);
+    }
+  }
+
+  const skills: SkillDeficiency[] = [];
+  for (const bucket of bySkill.values()) {
+    const scored = await scoreInvocations(query, sessionsTable, bucket, cfg);
+    const total = bucket.length;
+    const failureRate = total ? scored.confirmed / total : 0;
+    const enoughData = total >= minInvocations;
+    skills.push({
+      name: bucket[0].name,
+      author: bucket[0].author,
+      invocations: total,
+      anchored: scored.anchored,
+      confirmedFailures: scored.confirmed,
+      failureRate,
+      deficient: enoughData && failureRate >= threshold,
+      examples: scored.examples,
+    });
+  }
+
+  skills.sort((a, b) => {
+    const rateGap = b.failureRate - a.failureRate;
+    return rateGap || b.invocations - a.invocations;
+  });
+
+  let deficientCount = 0;
+  for (const skill of skills) {
+    if (skill.deficient) deficientCount += 1;
+  }
+  return { skills, deficientCount };
+}
diff --git a/src/skillify/session-anchor.ts b/src/skillify/session-anchor.ts
@@ -0,0 +1,50 @@
+/**
+ * Heuristic "anchor" — a HARD, observable signal in the transcript that a session
+ * went badly, independent of any LLM judgment: the user pushed back on / corrected
+ * what the assistant just did. Pure + free (no LLM, no I/O).
+ *
+ * It's the level-1 filter in the outcome pipeline: only windows with an anchor go
+ * to the (paid) success-judge, and a session is labelled a failure only when the
+ * anchor AND the judge agree. So this is deliberately tuned for RECALL over
+ * precision — a false positive just costs one judge call (which then drops it),
+ * but a false negative under-detects (conservative — it never churns a good skill).
+ * Patterns are meant to be tuned against real sessions; this is a starting set.
+ */
+import type { Turn } from "./skill-invocations.js";
+
+/** Kind of anchor found; "none" when no user pushback was detected. */
+export type AnchorKind = "correction" | "none";
+/** Result of the anchor scan over one transcript window. */
+export interface Anchor {
+  anchored: boolean;
+  kind: AnchorKind;
+  evidence: string; // the user turn that triggered it (truncated)
+}
+
+// Unambiguous correction — ALWAYS an anchor, even amid polite words. This must
+// win over BENIGN so "thanks, but this is still failing" still fires.
+// Negations are accepted contracted ("doesn't"), with a typographic apostrophe
+// ("doesn\u2019t"), with no apostrophe ("doesnt") and spelled out ("does not"), so
+// the same complaint anchors however the user typed it.
+const STRONG = /\b(wrong|incorrect|not what|that(?:['\u2019]?s| is) not|(?:does|did|do) ?n[o'\u2019]?t work|wo ?n['\u2019]?t work|will not work|is ?n['\u2019]?t|is not\b|broke|broken|still (failing|broken|not working|wrong|the same)|try again|undo|revert that|that fail|not right)/i;
+
+// Ambiguous negation: "no" is pushback ("no, that's off") but also benign
+// ("no problem"), so it only anchors when the turn isn't a clear benign phrase.
+const AMBIGUOUS = /\b(no|nope)\b/i;
+const BENIGN = /\b(no (problem|worries|need|biggie)|no,? thanks|all good|works? (now|great|fine|perfectly)|that works|perfect|looks good)\b/i;
+
+/**
+ * Detect a correction anchor in a windowed slice of turns. A pushback is a USER turn
+ * reacting to an ASSISTANT turn — and BOTH must be POST-invocation (index ≥ fromIndex),
+ * so a correction that happened BEFORE the skill ran (e.g. the skill was a repair
+ * attempt) isn't misattributed to this skill.
+ * Recall-oriented: a strong correction fires regardless of polite framing; only the
+ * bare "no" is benign-gated.
+ *
+ * @param turns - Transcript window, in order.
+ * @param fromIndex - Index of the first post-invocation turn; defaults to 0 (scan all).
+ * @returns The first matching anchor, with `evidence` set to the first 200 characters
+ *   of the user turn, or a `kind: "none"` result when nothing matched.
+ */
+export function detectAnchor(turns: Turn[], fromIndex = 0): Anchor {
+  // The assistant turn being answered sits at i - 1 and must itself be at or after
+  // fromIndex, so the first user turn worth checking is fromIndex + 1 (never below 1).
+  for (let i = Math.max(1, fromIndex + 1); i < turns.length; i++) {
+    const reply = turns[i];
+    if (reply.role !== "USER" || turns[i - 1].role !== "ASSISTANT") continue;
+    const isCorrection = STRONG.test(reply.text) || (AMBIGUOUS.test(reply.text) && !BENIGN.test(reply.text));
+    if (isCorrection) {
+      return { anchored: true, kind: "correction", evidence: reply.text.slice(0, 200) };
+    }
+  }
+  return { anchored: false, kind: "none", evidence: "" };
+}
diff --git a/src/skillify/skill-edit-gate.ts b/src/skillify/skill-edit-gate.ts
@@ -0,0 +1,89 @@
+/**
+ * Edit-outcome gate — the validation organ (the paper's gate, adapted).
+ *
+ * A randomized A/B is the ideal, but it needs the skill VERSION recorded at
+ * invocation time (a capture change we don't have yet — the Skill tool_use only
+ * carries the skill name). So the feasible gate is LONGITUDINAL: after an edit is
+ * published, compare the skill's confirmed-failure rate in the window AFTER publish
+ * vs BEFORE. A real drop = the edit helped → keep; a real rise = it hurt → revert
+ * (one `cp` from the SKILL.v<old>.bak backup). No clear signal / too few post-publish
+ * uses → inconclusive (wait, or revert when stale).
+ *
+ * It's OBSERVATIONAL (confounded — the population shifts week to week), so it needs
+ * a margin + a minimum sample. Randomized A/B is the clean upgrade once invocation-
+ * version capture lands. Reuses scoreInvocations, so the same anchor+judge that
+ * detects deficiency also validates the fix. Injected query/judge → unit-testable.
+ */
+import { listSkillInvocations, type QueryFn } from "./skill-invocations.js";
+import { scoreInvocations } from "./deficiency-detector.js";
+import type { ModelCall } from "./claude-model.js";
+
+/** Scoring totals for one skill over one time window. */
+export interface WindowStats {
+  invocations: number; // invocations of the skill found in the window
+  anchored: number;    // anchored count reported by scoreInvocations
+  confirmed: number;   // confirmed-failure count reported by scoreInvocations
+  failureRate: number; // confirmed / invocations
+}
+
+/** Outcome of the edit gate: the two measured windows plus the verdict. */
+export interface GateDecision {
+  before: WindowStats;
+  after: WindowStats;
+  delta: number; // before.failureRate - after.failureRate (positive = improved)
+  decision: "keep" | "revert" | "inconclusive";
+}
+
+/** Options for measuring one window: time bounds, row cap, window and judge settings. */
+interface MeasureOpts {
+  sinceIso?: string; // forwarded to listSkillInvocations
+  untilIso?: string; // forwarded to listSkillInvocations
+  limit?: number;    // forwarded to listSkillInvocations
+  window?: { before?: number; after?: number; maxChars?: number };
+  judge?: ModelCall;
+}
+
+/**
+ * Confirmed-failure rate for one skill over a time window.
+ *
+ * Lists invocations in the window, keeps those matching `name` and `author`,
+ * and scores them with the shared anchor + judge pipeline.
+ *
+ * NOTE(review): `limit` is applied by the listing before the name/author filter
+ * runs here, so it appears to cap rows across all skills — confirm that is intended.
+ *
+ * @returns Counts for the matching invocations; `failureRate` is 0 when there are none.
+ */
+export async function measureSkillFailureRate(
+  query: QueryFn,
+  sessionsTable: string,
+  name: string,
+  author: string,
+  opts: MeasureOpts = {},
+): Promise<WindowStats> {
+  const { sinceIso, untilIso, limit, window, judge } = opts;
+  const listed = await listSkillInvocations(query, sessionsTable, { sinceIso, untilIso, limit });
+  const matching = listed.filter((row) => row.name === name && row.author === author);
+  const score = await scoreInvocations(query, sessionsTable, matching, { window, judge });
+  const invocations = matching.length;
+  const failureRate = invocations ? score.confirmed / invocations : 0;
+  return { invocations, anchored: score.anchored, confirmed: score.confirmed, failureRate };
+}
+
+/**
+ * Pure decision from before/after stats.
+ *
+ * @param before - Stats for the window before the edit was published.
+ * @param after - Stats for the window after the edit was published.
+ * @param opts - `margin`: minimum failure-rate change that counts as a signal
+ *   (default 0.2). `minAfter`: minimum post-publish invocations (default 5).
+ * @returns The inputs, `delta` (before rate minus after rate) and the decision:
+ *   "keep" when the rate dropped by at least `margin`, "revert" when it rose by at
+ *   least `margin`, otherwise "inconclusive".
+ *
+ * NOTE(review): with `before.invocations === 0` the baseline rate is 0, so any
+ * post-publish rate at or above `margin` reads as "revert" — confirm that is intended.
+ */
+export function gateEditOutcome(
+  before: WindowStats,
+  after: WindowStats,
+  opts: { margin?: number; minAfter?: number } = {},
+): GateDecision {
+  const margin = opts.margin ?? 0.2;
+  const minAfter = opts.minAfter ?? 5;
+  // Rates are quotients such as 3/5, so a difference that is mathematically equal
+  // to the margin can land just below it in floating point
+  // (0.6 - 0.4 === 0.19999999999999996). Compare with a small tolerance.
+  const tolerance = 1e-9;
+  const delta = before.failureRate - after.failureRate;
+  let decision: GateDecision["decision"];
+  if (after.invocations < minAfter) decision = "inconclusive";   // not enough post-publish use
+  else if (delta >= margin - tolerance) decision = "keep";       // failure rate dropped → helped
+  else if (-delta >= margin - tolerance) decision = "revert";    // got measurably worse
+  else decision = "inconclusive";                                // no clear signal
+  return { before, after, delta, decision };
+}
+
+/**
+ * Full gate: measure before/after a publish timestamp and decide.
+ *
+ * The "before" window spans `windowDays` (default 14) up to `publishIso`; the
+ * "after" window runs from `publishIso` to `opts.nowIso` (left unset when absent).
+ * The two windows are measured one after the other.
+ *
+ * @param publishIso - Publish timestamp; must be parseable by `Date.parse`.
+ * @param opts - Window length, decision thresholds, and measurement options.
+ * @returns The decision from `gateEditOutcome` for the two measured windows.
+ * @throws RangeError when `publishIso` is not a parseable date.
+ */
+export async function gateEdit(
+  query: QueryFn,
+  sessionsTable: string,
+  name: string,
+  author: string,
+  publishIso: string,
+  opts: { windowDays?: number; nowIso?: string; margin?: number; minAfter?: number } & MeasureOpts = {},
+): Promise<GateDecision> {
+  const windowDays = opts.windowDays ?? 14;
+  const publishMs = Date.parse(publishIso);
+  // Fail with a message that names the bad input instead of the generic
+  // "Invalid time value" that toISOString() raises for an invalid date.
+  if (Number.isNaN(publishMs)) {
+    throw new RangeError(`gateEdit: publishIso is not a valid date: ${JSON.stringify(publishIso)}`);
+  }
+  const msPerDay = 24 * 60 * 60 * 1000;
+  const beforeSince = new Date(publishMs - windowDays * msPerDay).toISOString();
+  const shared = { limit: opts.limit, window: opts.window, judge: opts.judge };
+  const before = await measureSkillFailureRate(query, sessionsTable, name, author, { ...shared, sinceIso: beforeSince, untilIso: publishIso });
+  const after = await measureSkillFailureRate(query, sessionsTable, name, author, { ...shared, sinceIso: publishIso, untilIso: opts.nowIso });
+  return gateEditOutcome(before, after, opts);
+}