From 26083557d34b106e69776e72ac2ca10f37eaa77e Mon Sep 17 00:00:00 2001
From: kaghni <kamo.aghbalyan@activeloop.ai>
Date: Fri, 5 Jun 2026 03:50:52 +0000
Subject: [PATCH 01/30] feat(skillopt): read-side skill cohorts + session
 reconstruction

Read side of the skill attribution written by the foundation's skills-active.ts.
listSkillSessions reads the skills_active rows (filtered on
description='skills_active', deduped to the newest row per session) into
per-session {skills, bucket, ts}; cohortsForSkill partitions sessions into a
skill's treatment/control cohorts; reconstructSession rebuilds the
USER/ASSISTANT transcript from the session's captured rows (tool rows dropped,
head+tail elided to maxChars). All queries are injected (unit-testable); nothing
touches the model channel. First piece of the deficiency detector. 10 unit tests.
---
 src/skillify/skill-cohorts.ts      | 153 +++++++++++++++++++++++++++++
 tests/shared/skill-cohorts.test.ts | 130 ++++++++++++++++++++++++
 2 files changed, 283 insertions(+)
 create mode 100644 src/skillify/skill-cohorts.ts
 create mode 100644 tests/shared/skill-cohorts.test.ts
diff --git a/src/skillify/skill-cohorts.ts b/src/skillify/skill-cohorts.ts
new file mode 100644
index 00000000..e436a8e2
--- /dev/null
+++ b/src/skillify/skill-cohorts.ts
@@ -0,0 +1,153 @@
+/**
+ * Read side of skill attribution. The foundation's skills-active.ts WRITES a
+ * `skills_active` row per session (which org skills were active + an A/B bucket);
+ * this module READS those rows + the captured transcript rows and turns them into
+ * the inputs the deficiency detector needs:
+ *
+ *   - per-session attribution (which skills were active, the bucket),
+ *   - treatment/control cohorts for a given skill,
+ *   - raw-session reconstruction for the outcome scorer.
+ *
+ * Scope discipline (see design notes): we only ever touch sessions that have a
+ * skills_active row — never the whole table — so the detector scores a small,
+ * relevant cohort, not every session.
+ *
+ * Every query goes through an injected `QueryFn` (DeeplakeApi.query bound), so the
+ * cohort + reconstruction logic is unit-testable with zero live Deeplake.
+ */
+import type { ActiveSkill } from "./skills-active.js";
+import { sqlStr } from "../utils/sql.js";
+
+export type QueryFn = (sql: string) => Promise<Array<Record<string, unknown>>>;
+
+export interface SessionAttribution {
+  sessionId: string;
+  skills: ActiveSkill[];
+  bucket: number;
+  ts: string; // last_update_date of the skills_active row
+}
+
+/** Stable identity for an org skill (matches the `<name>--<author>` dir convention). */
+export function skillKey(name: string, author: string): string {
+  return `${name}--${author}`;
+}
+
+interface ParsedMsg {
+  type?: string;
+  content?: unknown;
+  session_id?: unknown;
+  skills?: unknown;
+  ab_bucket?: unknown;
+}
+
+/** Deeplake may hand `message` back as a JSON string or an already-parsed object. */
+function parseMessage(m: unknown): ParsedMsg | null {
+  if (m == null) return null;
+  if (typeof m === "string") {
+    try { return JSON.parse(m) as ParsedMsg; } catch { return null; }
+  }
+  if (typeof m === "object") return m as ParsedMsg;
+  return null;
+}
+
+function asActiveSkills(v: unknown): ActiveSkill[] {
+  if (!Array.isArray(v)) return [];
+  const out: ActiveSkill[] = [];
+  for (const s of v) {
+    if (s && typeof s === "object"
+      && typeof (s as ActiveSkill).name === "string"
+      && typeof (s as ActiveSkill).author === "string") {
+      const sk = s as ActiveSkill;
+      out.push({ name: sk.name, author: sk.author, version: typeof sk.version === "number" ? sk.version : 1 });
+    }
+  }
+  return out;
+}
+
+/**
+ * Every session that has a skills_active attribution row, newest first.
+ * `sinceIso` bounds the lookback window; `limit` caps the rows pulled.
+ * The `description = 'skills_active'` column filter is the index — it's the value
+ * skills-active.ts writes into the row's `description`, so this never scans
+ * transcript rows.
+ */
+export async function listSkillSessions(
+  query: QueryFn,
+  sessionsTable: string,
+  opts: { sinceIso?: string; limit?: number } = {},
+): Promise<SessionAttribution[]> {
+  const where = ["description = 'skills_active'"];
+  if (opts.sinceIso) where.push(`last_update_date >= '${sqlStr(opts.sinceIso)}'`);
+  const limit = opts.limit && opts.limit > 0 ? ` LIMIT ${Math.floor(opts.limit)}` : "";
+  const rows = await query(
+    `SELECT message, last_update_date FROM "${sessionsTable}" WHERE ${where.join(" AND ")} ORDER BY last_update_date DESC${limit}`,
+  );
+  const out: SessionAttribution[] = [];
+  const seen = new Set<string>(); // a session can have >1 skills_active row (one per start); keep the newest
+  for (const r of rows) {
+    const m = parseMessage(r.message);
+    if (!m || m.type !== "skills_active" || typeof m.session_id !== "string") continue;
+    if (seen.has(m.session_id)) continue;
+    seen.add(m.session_id);
+    out.push({
+      sessionId: m.session_id,
+      skills: asActiveSkills(m.skills),
+      bucket: typeof m.ab_bucket === "number" ? m.ab_bucket : 0,
+      ts: typeof r.last_update_date === "string" ? r.last_update_date : "",
+    });
+  }
+  return out;
+}
+
+/**
+ * Partition sessions into treatment (the skill was active) vs control (it wasn't).
+ * NOTE: this is OBSERVATIONAL (the foundation records availability, it does not yet
+ * randomize withholding), so control is not a clean counterfactual — the detector
+ * treats treatment's ABSOLUTE outcome as the primary signal and uses control only
+ * as weak context until a real withholding arm lands.
+ */
+export function cohortsForSkill(
+  sessions: SessionAttribution[],
+  name: string,
+  author: string,
+): { treatment: SessionAttribution[]; control: SessionAttribution[] } {
+  const key = skillKey(name, author);
+  const treatment: SessionAttribution[] = [];
+  const control: SessionAttribution[] = [];
+  for (const s of sessions) {
+    const has = s.skills.some((sk) => skillKey(sk.name, sk.author) === key);
+    (has ? treatment : control).push(s);
+  }
+  return { treatment, control };
+}
+
+/**
+ * Reconstruct a session's transcript (USER/ASSISTANT turns, tool noise dropped)
+ * from its captured rows, oldest-first. Long transcripts are head+tail elided to
+ * `maxChars` so a giant session can't blow the judge's context.
+ */
+export async function reconstructSession(
+  query: QueryFn,
+  sessionsTable: string,
+  sessionId: string,
+  maxChars = 14_000,
+): Promise<string> {
+  const sid = sqlStr(sessionId);
+  const rows = await query(
+    `SELECT message FROM "${sessionsTable}" WHERE path LIKE '/sessions/%${sid}%' ORDER BY creation_date ASC`,
+  );
+  const parts: string[] = [];
+  for (const r of rows) {
+    const j = parseMessage(r.message);
+    if (!j) continue;
+    const text = typeof j.content === "string" ? j.content.trim() : "";
+    if (!text) continue;
+    if (j.type === "user_message") parts.push(`USER: ${text}`);
+    else if (j.type === "assistant_message") parts.push(`ASSISTANT: ${text}`);
+  }
+  const joined = parts.join("\n\n");
+  if (joined.length <= maxChars) return joined;
+  const head = joined.slice(0, Math.floor(maxChars * 0.55));
+  const tail = joined.slice(joined.length - Math.floor(maxChars * 0.45));
+  return `${head}\n\n…[${joined.length - maxChars} chars elided]…\n\n${tail}`;
+}
diff --git a/tests/shared/skill-cohorts.test.ts b/tests/shared/skill-cohorts.test.ts
new file mode 100644
index 00000000..329ec964
--- /dev/null
+++ b/tests/shared/skill-cohorts.test.ts
@@ -0,0 +1,130 @@
+import { describe, it, expect, vi } from "vitest";
+import {
+  listSkillSessions,
+  cohortsForSkill,
+  reconstructSession,
+  skillKey,
+  type SessionAttribution,
+} from "../../src/skillify/skill-cohorts.js";
+
+const TABLE = "sessions";
+
+/** A query mock that returns canned rows and records the SQL it was asked. */
+function mockQuery(rows: Array<Record<string, unknown>>) {
+  const calls: string[] = [];
+  const fn = vi.fn(async (sql: string) => { calls.push(sql); return rows; });
+  return { fn, calls };
+}
+
+const activeRow = (sessionId: string, skills: unknown, bucket: number, ts: string, asString = false) => {
+  const msg = { type: "skills_active", session_id: sessionId, skills, ab_bucket: bucket };
+  return { message: asString ? JSON.stringify(msg) : msg, last_update_date: ts };
+};
+
+describe("listSkillSessions", () => {
+  it("filters on description='skills_active' and orders newest-first with the limit", async () => {
+    const { fn, calls } = mockQuery([]);
+    await listSkillSessions(fn, TABLE, { sinceIso: "2026-06-01T00:00:00Z", limit: 50 });
+    expect(calls[0]).toContain(`FROM "sessions"`);
+    expect(calls[0]).toContain("description = 'skills_active'");
+    expect(calls[0]).toContain("last_update_date >= '2026-06-01T00:00:00Z'");
+    expect(calls[0]).toContain("ORDER BY last_update_date DESC");
+    expect(calls[0]).toContain("LIMIT 50");
+  });
+
+  it("parses both JSON-string and object message payloads", async () => {
+    const { fn } = mockQuery([
+      activeRow("S1", [{ name: "a", author: "x", version: 2 }], 1, "t2", false), // object
+      activeRow("S2", [{ name: "b", author: "y", version: 3 }], 0, "t1", true),  // JSON string
+    ]);
+    const got = await listSkillSessions(fn, TABLE);
+    expect(got).toEqual([
+      { sessionId: "S1", skills: [{ name: "a", author: "x", version: 2 }], bucket: 1, ts: "t2" },
+      { sessionId: "S2", skills: [{ name: "b", author: "y", version: 3 }], bucket: 0, ts: "t1" },
+    ]);
+  });
+
+  it("dedups a session to its newest row (rows arrive newest-first) and drops malformed", async () => {
+    const { fn } = mockQuery([
+      activeRow("S1", [{ name: "a", author: "x", version: 1 }], 1, "newer"),
+      activeRow("S1", [{ name: "a", author: "x", version: 1 }], 1, "older"), // same session → skipped
+      { message: "not json", last_update_date: "t" },                        // unparseable → skipped
+      { message: { type: "user_message", content: "hi" }, last_update_date: "t" }, // wrong type → skipped
+      { message: { type: "skills_active", skills: [] }, last_update_date: "t" },   // no session_id → skipped
+    ]);
+    const got = await listSkillSessions(fn, TABLE);
+    expect(got).toHaveLength(1);
+    expect(got[0]).toMatchObject({ sessionId: "S1", ts: "newer" });
+  });
+
+  it("coerces missing/garbage skill fields safely (defaults version 1, drops non-objects)", async () => {
+    const { fn } = mockQuery([
+      activeRow("S1", [{ name: "a", author: "x" }, "garbage", { name: "b" /* no author */ }], 0, "t"),
+    ]);
+    const got = await listSkillSessions(fn, TABLE);
+    expect(got[0].skills).toEqual([{ name: "a", author: "x", version: 1 }]);
+  });
+
+  it("omits the LIMIT clause when no limit is given", async () => {
+    const { fn, calls } = mockQuery([]);
+    await listSkillSessions(fn, TABLE);
+    expect(calls[0]).not.toContain("LIMIT");
+  });
+});
+
+describe("cohortsForSkill", () => {
+  const S = (id: string, skills: Array<[string, string]>): SessionAttribution => ({
+    sessionId: id, bucket: 0, ts: "t",
+    skills: skills.map(([name, author]) => ({ name, author, version: 1 })),
+  });
+
+  it("splits sessions into treatment (skill present) and control (absent)", () => {
+    const sessions = [
+      S("s1", [["posthog", "kamo"], ["other", "z"]]), // treatment
+      S("s2", [["other", "z"]]),                        // control
+      S("s3", [["posthog", "kamo"]]),                   // treatment
+      S("s4", []),                                       // control (no skills)
+      S("s5", [["posthog", "DIFFERENT"]]),              // control (same name, other author)
+    ];
+    const { treatment, control } = cohortsForSkill(sessions, "posthog", "kamo");
+    expect(treatment.map((s) => s.sessionId)).toEqual(["s1", "s3"]);
+    expect(control.map((s) => s.sessionId)).toEqual(["s2", "s4", "s5"]); // s5: name matches, author doesn't
+  });
+
+  it("skillKey is name--author", () => {
+    expect(skillKey("posthog", "kamo")).toBe("posthog--kamo");
+  });
+});
+
+describe("reconstructSession", () => {
+  it("orders by creation_date, keeps user/assistant turns, drops tool noise + empty", async () => {
+    const { fn, calls } = mockQuery([
+      { message: { type: "user_message", content: "do X" } },
+      { message: { type: "tool_call", tool_input: "{}", tool_response: "{}" } }, // dropped (no content)
+      { message: { type: "assistant_message", content: "did X" } },
+      { message: { type: "assistant_message", content: "   " } },                 // dropped (blank)
+      { message: JSON.stringify({ type: "user_message", content: "thanks" }) },   // string payload
+    ]);
+    const out = await reconstructSession(fn, TABLE, "abc-123");
+    expect(calls[0]).toContain("path LIKE '/sessions/%abc-123%'");
+    expect(calls[0]).toContain("ORDER BY creation_date ASC");
+    expect(out).toBe("USER: do X\n\nASSISTANT: did X\n\nUSER: thanks");
+  });
+
+  it("head+tail elides a transcript longer than maxChars", async () => {
+    const big = "x".repeat(500);
+    const { fn } = mockQuery([
+      { message: { type: "user_message", content: big } },
+      { message: { type: "assistant_message", content: big } },
+    ]);
+    const out = await reconstructSession(fn, TABLE, "s", 200);
+    expect(out).toContain("chars elided");
+    expect(out.length).toBeLessThan(400); // ~maxChars + the elision marker, far below the ~1000 raw
+  });
+
+  it("escapes single quotes in the session id (no SQL break)", async () => {
+    const { fn, calls } = mockQuery([]);
+    await reconstructSession(fn, TABLE, "a'b");
+    expect(calls[0]).toContain("/sessions/%a''b%");
+  });
+});

From 338909acbe795215a7951f882ffd2053f5e8df15 Mon Sep 17 00:00:00 2001
From: kaghni <kamo.aghbalyan@activeloop.ai>
Date: Fri, 5 Jun 2026 05:14:05 +0000
Subject: [PATCH 02/30] =?UTF-8?q?feat(skillopt):=20skill-invocation=20data?=
 =?UTF-8?q?=20layer=20(Skill=20tool=5Fuse=20=E2=86=92=20windows)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Read side keyed on actual invocations: capture.ts persists each Skill tool_use
as a tool_call row (tool_name=Skill, tool_input={skill}). listSkillInvocations
pulls org-skill (<name>--<author>) invocations, skipping plugin-namespaced and
bare skill refs; windowAroundInvocation slices the transcript turns before/after
the invocation for tight, cheap judging. Supersedes availability-based
attribution: skill-cohorts.ts, skills-active.ts and their tests are removed.
All queries are injected; 8 unit tests.
---
 src/skillify/skill-cohorts.ts          | 153 ------------------
 src/skillify/skill-invocations.ts      | 158 ++++++++++++++++++
 src/skillify/skills-active.ts          | Bin 7404 -> 0 bytes
 tests/shared/skill-cohorts.test.ts     | 130 ---------------
 tests/shared/skill-invocations.test.ts | 103 ++++++++++++
 tests/shared/skills-active.test.ts     | 211 -------------------------
 6 files changed, 261 insertions(+), 494 deletions(-)
 delete mode 100644 src/skillify/skill-cohorts.ts
 create mode 100644 src/skillify/skill-invocations.ts
 delete mode 100644 src/skillify/skills-active.ts
 delete mode 100644 tests/shared/skill-cohorts.test.ts
 create mode 100644 tests/shared/skill-invocations.test.ts
 delete mode 100644 tests/shared/skills-active.test.ts

diff --git a/src/skillify/skill-cohorts.ts b/src/skillify/skill-cohorts.ts
deleted file mode 100644
index e436a8e2..00000000
--- a/src/skillify/skill-cohorts.ts
+++ /dev/null
@@ -1,153 +0,0 @@
-/**
- * Read side of skill attribution. The foundation's skills-active.ts WRITES a
- * `skills_active` row per session (which org skills were active + an A/B bucket);
- * this module READS those rows + the captured transcript rows and turns them into
- * the inputs the deficiency detector needs:
- *
- *   - per-session attribution (which skills were active, the bucket),
- *   - treatment/control cohorts for a given skill,
- *   - raw-session reconstruction for the outcome scorer.
- *
- * Scope discipline (see design notes): we only ever touch sessions that have a
- * skills_active row — never the whole table — so the detector scores a small,
- * relevant cohort, not every session.
- *
- * Every query goes through an injected `QueryFn` (DeeplakeApi.query bound), so the
- * cohort + reconstruction logic is unit-testable with zero live Deeplake.
- */
-import type { ActiveSkill } from "./skills-active.js";
-import { sqlStr } from "../utils/sql.js";
-
-export type QueryFn = (sql: string) => Promise<Array<Record<string, unknown>>>;
-
-export interface SessionAttribution {
-  sessionId: string;
-  skills: ActiveSkill[];
-  bucket: number;
-  ts: string; // last_update_date of the skills_active row
-}
-
-/** Stable identity for an org skill (matches the `<name>--<author>` dir convention). */
-export function skillKey(name: string, author: string): string {
-  return `${name}--${author}`;
-}
-
-interface ParsedMsg {
-  type?: string;
-  content?: unknown;
-  session_id?: unknown;
-  skills?: unknown;
-  ab_bucket?: unknown;
-}
-
-/** Deeplake may hand `message` back as a JSON string or an already-parsed object. */
-function parseMessage(m: unknown): ParsedMsg | null {
-  if (m == null) return null;
-  if (typeof m === "string") {
-    try { return JSON.parse(m) as ParsedMsg; } catch { return null; }
-  }
-  if (typeof m === "object") return m as ParsedMsg;
-  return null;
-}
-
-function asActiveSkills(v: unknown): ActiveSkill[] {
-  if (!Array.isArray(v)) return [];
-  const out: ActiveSkill[] = [];
-  for (const s of v) {
-    if (s && typeof s === "object"
-      && typeof (s as ActiveSkill).name === "string"
-      && typeof (s as ActiveSkill).author === "string") {
-      const sk = s as ActiveSkill;
-      out.push({ name: sk.name, author: sk.author, version: typeof sk.version === "number" ? sk.version : 1 });
-    }
-  }
-  return out;
-}
-
-/**
- * Every session that has a skills_active attribution row, newest first.
- * `sinceIso` bounds the lookback window; `limit` caps the rows pulled.
- * The `description = 'skills_active'` column filter is the index — it's the value
- * skills-active.ts writes into the row's `description`, so this never scans
- * transcript rows.
- */
-export async function listSkillSessions(
-  query: QueryFn,
-  sessionsTable: string,
-  opts: { sinceIso?: string; limit?: number } = {},
-): Promise<SessionAttribution[]> {
-  const where = ["description = 'skills_active'"];
-  if (opts.sinceIso) where.push(`last_update_date >= '${sqlStr(opts.sinceIso)}'`);
-  const limit = opts.limit && opts.limit > 0 ? ` LIMIT ${Math.floor(opts.limit)}` : "";
-  const rows = await query(
-    `SELECT message, last_update_date FROM "${sessionsTable}" WHERE ${where.join(" AND ")} ORDER BY last_update_date DESC${limit}`,
-  );
-  const out: SessionAttribution[] = [];
-  const seen = new Set<string>(); // a session can have >1 skills_active row (one per start); keep the newest
-  for (const r of rows) {
-    const m = parseMessage(r.message);
-    if (!m || m.type !== "skills_active" || typeof m.session_id !== "string") continue;
-    if (seen.has(m.session_id)) continue;
-    seen.add(m.session_id);
-    out.push({
-      sessionId: m.session_id,
-      skills: asActiveSkills(m.skills),
-      bucket: typeof m.ab_bucket === "number" ? m.ab_bucket : 0,
-      ts: typeof r.last_update_date === "string" ? r.last_update_date : "",
-    });
-  }
-  return out;
-}
-
-/**
- * Partition sessions into treatment (the skill was active) vs control (it wasn't).
- * NOTE: this is OBSERVATIONAL (the foundation records availability, it does not yet
- * randomize withholding), so control is not a clean counterfactual — the detector
- * treats treatment's ABSOLUTE outcome as the primary signal and uses control only
- * as weak context until a real withholding arm lands.
- */
-export function cohortsForSkill(
-  sessions: SessionAttribution[],
-  name: string,
-  author: string,
-): { treatment: SessionAttribution[]; control: SessionAttribution[] } {
-  const key = skillKey(name, author);
-  const treatment: SessionAttribution[] = [];
-  const control: SessionAttribution[] = [];
-  for (const s of sessions) {
-    const has = s.skills.some((sk) => skillKey(sk.name, sk.author) === key);
-    (has ? treatment : control).push(s);
-  }
-  return { treatment, control };
-}
-
-/**
- * Reconstruct a session's transcript (USER/ASSISTANT turns, tool noise dropped)
- * from its captured rows, oldest-first. Long transcripts are head+tail elided to
- * `maxChars` so a giant session can't blow the judge's context.
- */
-export async function reconstructSession(
-  query: QueryFn,
-  sessionsTable: string,
-  sessionId: string,
-  maxChars = 14_000,
-): Promise<string> {
-  const sid = sqlStr(sessionId);
-  const rows = await query(
-    `SELECT message FROM "${sessionsTable}" WHERE path LIKE '/sessions/%${sid}%' ORDER BY creation_date ASC`,
-  );
-  const parts: string[] = [];
-  for (const r of rows) {
-    const j = parseMessage(r.message);
-    if (!j) continue;
-    const text = typeof j.content === "string" ? j.content.trim() : "";
-    if (!text) continue;
-    if (j.type === "user_message") parts.push(`USER: ${text}`);
-    else if (j.type === "assistant_message") parts.push(`ASSISTANT: ${text}`);
-  }
-  const joined = parts.join("\n\n");
-  if (joined.length <= maxChars) return joined;
-  const head = joined.slice(0, Math.floor(maxChars * 0.55));
-  const tail = joined.slice(joined.length - Math.floor(maxChars * 0.45));
-  return `${head}\n\n…[${joined.length - maxChars} chars elided]…\n\n${tail}`;
-}
diff --git a/src/skillify/skill-invocations.ts b/src/skillify/skill-invocations.ts
new file mode 100644
index 00000000..3cb0fd31
--- /dev/null
+++ b/src/skillify/skill-invocations.ts
@@ -0,0 +1,158 @@
+/**
+ * Read side of skill *invocation* attribution — the basis for deficiency detection.
+ *
+ * A skill can only help or hurt if the agent actually INVOKED it. Claude Code
+ * records each invocation as a `Skill` tool_use, which capture.ts persists as a
+ * tool_call row: `message.tool_name === "Skill"`, `message.tool_input` a JSON
+ * string `{ skill: "<name>--<author>", args? }`. We key on these real invocations
+ * rather than availability (the dropped skills_active) because:
+ *   - it's accurate — availability-without-invocation is pure noise, and
+ *   - it pins the exact turn, so we can window the judge tightly around it.
+ *
+ * Org skills only: the invoked `skill` is `<name>--<author>`. Plugin-namespaced
+ * (`hivemind:...`) and bare skills are not org-mined skills and are skipped.
+ *
+ * Every query is injected (QueryFn), so this is unit-testable with no live Deeplake.
+ */
+import { sqlStr } from "../utils/sql.js";
+
+export type QueryFn = (sql: string) => Promise<Array<Record<string, unknown>>>;
+
+export interface SkillInvocation {
+  sessionId: string;
+  name: string;
+  author: string;
+  ts: string; // invocation timestamp (message.timestamp, else the row's last_update_date)
+}
+
+interface ParsedMsg {
+  type?: string;
+  tool_name?: string;
+  tool_input?: unknown;
+  content?: unknown;
+  session_id?: unknown;
+  timestamp?: unknown;
+}
+
+function parseMessage(m: unknown): ParsedMsg | null {
+  if (m == null) return null;
+  if (typeof m === "string") {
+    try { return JSON.parse(m) as ParsedMsg; } catch { return null; }
+  }
+  if (typeof m === "object") return m as ParsedMsg;
+  return null;
+}
+
+/** The skill ref invoked by a tool_call message (e.g. "name--author"), else null. */
+export function invokedSkillRef(msg: ParsedMsg): string | null {
+  if (msg.type !== "tool_call" || msg.tool_name !== "Skill") return null;
+  let input: unknown = msg.tool_input;
+  if (typeof input === "string") { try { input = JSON.parse(input); } catch { return null; } }
+  const skill = (input as { skill?: unknown })?.skill;
+  return typeof skill === "string" && skill.length > 0 ? skill : null;
+}
+
+/** Split "<name>--<author>" → parts. null for plugin-namespaced / bare / malformed refs. */
+export function splitOrgSkill(skill: string): { name: string; author: string } | null {
+  if (skill.includes(":")) return null; // plugin-namespaced (e.g. hivemind:hivemind-memory)
+  const i = skill.lastIndexOf("--");
+  if (i <= 0 || i + 2 >= skill.length) return null; // bare or malformed
+  return { name: skill.slice(0, i), author: skill.slice(i + 2) };
+}
+
+/**
+ * Org-skill invocations across captured sessions, newest first. Coarse prefilter
+ * on `"Skill"` (robust to JSONB colon-spacing) then a precise in-code check, so a
+ * stray "Skill" in prose can't slip through as a real invocation.
+ */
+export async function listSkillInvocations(
+  query: QueryFn,
+  sessionsTable: string,
+  opts: { sinceIso?: string; limit?: number } = {},
+): Promise<SkillInvocation[]> {
+  const where = [`CAST(message AS TEXT) LIKE '%"Skill"%'`];
+  if (opts.sinceIso) where.push(`last_update_date >= '${sqlStr(opts.sinceIso)}'`);
+  const limit = opts.limit && opts.limit > 0 ? ` LIMIT ${Math.floor(opts.limit)}` : "";
+  const rows = await query(
+    `SELECT message, last_update_date FROM "${sessionsTable}" WHERE ${where.join(" AND ")} ORDER BY last_update_date DESC${limit}`,
+  );
+  const out: SkillInvocation[] = [];
+  for (const r of rows) {
+    const m = parseMessage(r.message);
+    if (!m) continue;
+    const ref = invokedSkillRef(m);
+    if (!ref) continue;
+    const parts = splitOrgSkill(ref);
+    if (!parts) continue;
+    const sessionId = typeof m.session_id === "string" ? m.session_id : "";
+    if (!sessionId) continue;
+    out.push({
+      sessionId,
+      name: parts.name,
+      author: parts.author,
+      ts: typeof m.timestamp === "string" ? m.timestamp
+        : (typeof r.last_update_date === "string" ? r.last_update_date : ""),
+    });
+  }
+  return out;
+}
+
+interface Turn { role: "USER" | "ASSISTANT"; text: string }
+
+/**
+ * Reconstruct the transcript turns of a session, and mark where (between which two
+ * turns) the given invocation happened — so callers can window around it.
+ */
+async function sessionTurns(
+  query: QueryFn, sessionsTable: string, inv: SkillInvocation,
+): Promise<{ turns: Turn[]; invIndex: number }> {
+  const sid = sqlStr(inv.sessionId);
+  const rows = await query(
+    `SELECT message FROM "${sessionsTable}" WHERE path LIKE '/sessions/%${sid}%' ORDER BY creation_date ASC`,
+  );
+  const turns: Turn[] = [];
+  let invIndex = -1;
+  for (const r of rows) {
+    const j = parseMessage(r.message);
+    if (!j) continue;
+    // The invocation itself is a tool_call (not a turn): mark its position then skip.
+    const ref = invokedSkillRef(j);
+    if (ref) {
+      const p = splitOrgSkill(ref);
+      if (invIndex < 0 && p && p.name === inv.name && p.author === inv.author
+        && (typeof j.timestamp !== "string" || !inv.ts || j.timestamp === inv.ts)) {
+        invIndex = turns.length;
+      }
+      continue;
+    }
+    const text = typeof j.content === "string" ? j.content.trim() : "";
+    if (!text) continue;
+    if (j.type === "user_message") turns.push({ role: "USER", text });
+    else if (j.type === "assistant_message") turns.push({ role: "ASSISTANT", text });
+  }
+  if (invIndex < 0) invIndex = turns.length; // invocation not located → treat as session end
+  return { turns, invIndex };
+}
+
+/**
+ * The transcript window around an invocation: `before` turns before it and `after`
+ * turns after — where the help-or-harm signal lives — head+tail elided to maxChars.
+ * `before`/`after` are tunable; defaults chosen as a small starting point.
+ */
+export async function windowAroundInvocation(
+  query: QueryFn,
+  sessionsTable: string,
+  inv: SkillInvocation,
+  opts: { before?: number; after?: number; maxChars?: number } = {},
+): Promise<string> {
+  const before = opts.before ?? 3;
+  const after = opts.after ?? 6;
+  const maxChars = opts.maxChars ?? 4000;
+  const { turns, invIndex } = await sessionTurns(query, sessionsTable, inv);
+  const slice = turns.slice(Math.max(0, invIndex - before), invIndex + after);
+  const joined = slice.map((t) => `${t.role}: ${t.text}`).join("\n\n");
+  if (joined.length <= maxChars) return joined;
+  const head = joined.slice(0, Math.floor(maxChars * 0.55));
+  const tail = joined.slice(joined.length - Math.floor(maxChars * 0.45));
+  return `${head}\n\n…[${joined.length - maxChars} chars elided]…\n\n${tail}`;
+}
diff --git a/src/skillify/skills-active.ts b/src/skillify/skills-active.ts
deleted file mode 100644
index d9b3c46d26b7f47f7ce2ecccaa210c04a94f91fa..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 7404
zcma)B?Q+}3742_5#VyAfftCPD>2xNcEqmnHZq>+gYTE9Pe25^i6cK>{g9RirGCTd%
z2k7((^CUgz?k)++&P+XHNn9-M-p_N-UJf2U+>?hgy3UJ2(x$2NN!#RADYL4Ug-LB&
zn}sQxq10wt)!CseO)b_~i%o}eJI|+cdHv=u@#wFoZ(ckXOF@<*Z%u6^y-D*TofJm$
zQl?eem=6t(k&zQHYSOxa$fC89N@f~UFY+?CO+J;=!84h((`(Zt5J!psn6HNJ0J%wv
z*2pw1r7DY+Obl1@ENx6ClNC-{EYmtKuY#slnmLZ0ry0&s9py%~Z040U?qQZ^<$fbk
zHL<3?N$G{Okb%@`nN^GYSBQpA{Y2hYS-O(4YUVt_`NcX{P)+9oXMe})-~RPqe{x>_
zY^LhuBEK<~u4(3G$0uS}wlND?*07{Gq*$D%PE3~#Vv-4dmR3}1wG+7<5*Nz~5rW^L
z?HmhfmeB=97CF|THG*P_;Mmc$TACrl7?h`16Zx(!3Y=8c@T0~Rua)p%BjbM#l4+5)
znHjhzgU4mMFi+z6aoWPC^^?JvN=MchSwc+G*#09as%cup8ZyKgqP^G@sWQQjm_l#>
z4#|-#rHcUqY(7j29G1~u%0&=iMPXuhniK5ZJ5Mc~CDR%%h+D+cXG>FsgbsFp2S>^Q
zL}ey6vl)UC&uS<<XX-{-Zm$ttq|AYg{(deN#R1Pv+PuhM3%yyI=C-P@<?Leg;>}yh
zHKnqugf%PO0CNi{05@pC#Fhz9U--&N3m&kNc~$rTU1<(KH)dI+*XDGYQ`x~DBU(2y
zvob?UNUy9iGo0Dpqwr*zHuLT?e%?O4!wK+gUdy_>sp<&06Z=asYPLI)WB@2A>;UV@
z&pSJdD$QP{Wj-?&<=d>5#^*{{+#Hh(7Ji=-v1dN0Aq;OlOQ%LoQ3^L^qyTicC&Cm!
znX9uO;j@cycjd;^M4zFQ?P6l;M|<nNy}`qW@`kY*ho@s_Xey&`&(6=2MaIu%vjE89
z9e~X&ZHop~QOU97Gf7eQWix0?Mud=+(C@RhbfwBwRr0+%J#wl0rmBFAPTNC)gTse@
zP)}%7Q?K;55Cm-_lL|pUK{yl2eVG?#v?`|&gO>bU<z+{@N<EZbFr(MUhqjr0)$7we
zLE-}9C^dnB(|I)bDfr<B`!M<;JLnG<et&HU9kP(8a-j~47T+KL*q5OkQ{Wn~3JG3a
zzu7&Zqka4~e(`=(*I0TZg5|zcXt@`#wah1CL^C{w^zf&1Cv9@kWR<@~iRrr35UfBM
zdI;MSK){vNP;2d~s3rjHPoQb6{Y5eKc5X_%HoXPDSh+5%Tl6p%GH{ld99tqGlIStE
zJh=yP<3wI>kr2IyDx&ic-tZ1Gx)PH^8tQ$lYUi_+K9xAeGk?ZMr{@^<6X$q(dWFoR
zG$w;Z7@PT_*j-d#v`8Dvj>pFi6z<1D4`mGO%c}n7Hj7}z(-4hKSib*JW9t&pIrc=p
z|4bZzWICWSD5md1kStp}@1!Z7d_IhtYd-dRMmMhK=<w_+v7H0-H(g-b)-@afxX)1|
zL{9Xc5=8|ZZp#dQb#B!(8wU_S%h)HHl|)1iV6_8`K1l$YtHPy*7-5nCM%p*k_1dlS
z2JQaFxDb%$NHqgOv(fDYa@^5uBbe=xZTRB#TX&jW7J1X4S5O`g8Q*V?V<H9etpY4i
zYj$ZK7RN|d1*nn<no41=@+L<~t8UoJ4rH4Z2o)=!(@$@}chV9m;u>{ilB-0<8QMs-
zisK3BQhbHpRx<BwMa}YZW@>F|y-kPj(SI^aCv^@Kfd74e-53kK)y-+aE{BxP+NsgD
zH%kjJDew*`Oje_W>$Yl(42wn4Hbx9EX|=;u>grY^y2vtiNPk;tS&iG0Wi?i~bTExg
zo9Hn=P%4ZPUk*ws8AAUK``Bnhn!F}Y3|`Nn-?BCY{7r_PUOa~a489h#3J!vV1jS{u
z$xsy!yPk(oD^b`QyFSv_R|0Ked_hrN=U`-6=$q7Z!3Y^2aGn5ga~$8;_~uLOpB2a#
zvdpyEM`lc$76rmep^HeXgP|i}g!Qgw<#EecOK>ca$H;>KMgj1G=k6m&Uttp-X_v}<
zRxZB;jHy#}2chJMS$;<s6^U2@!ZlZ>RSTV2-BUxo2cNW+0ubSt0Wf!@IUTW^5ZnTy
z0myy_xS;hW35NHn$q9nDr7!41dw|9c3-S0k7%*Kfn3uo(`#-)AJW$9TivR$w=mAQF
zT6D=xvt#4}AUzZKs9wQ7-}JppPCy%P<yE?T9B9McF6r@NgPF2^5-FVWl*qxhVd{sJ
zOLz`ay6;<2Voihk_u3yy5O0#M)<ZFgukJO(Z?vph5Sm?gz{}bT+}WV?B^*y9MyByb
zP(`83m?~tyJnDxde?Y2a+(i!ubqvuAg)JJ`z9$`pY1DVHX-JOr9AH^CPKPd+lCWz_
zrC_u&Daxbew`j<uS8u^lA*`u!ZKSy*PP<G;qXg7$jT+)FN*z9^I%+PUN|vBy6|ajk
zTyUK<{n;^1W3&kkxe))cpZ?J#xusFs0ir&870QW6`(4h{3)7@9&E-vUrBS6{z5ZSp
zcXcE%3cM@sWMb6m6azHV+7yKMzM2zIYUk{0LxMVfa7Xc)IO{+74?gS1L0Ou_q)2e{
zitt|79%>UVIic#cnSjWcv@UXVb;in{tzvLlRWU(6GScym;KOi9*r>*OcmJGjL)`SB
zwfnGr7$~vAuY$^ZPv%{^xB@|ggwk6I5@4GsU`pN$>;{<qxtol_=QaN0{SmKgz7CXQ
zf_N%gD6_-0v3sp14<f#%S~k3T;x@{hce31zsBc_mbIdt@Ma%2XO0*Ny<|?f-C})Wc
zGvvIVAut73I1L1fSma3=oob<D6s3nU>C4FzabZq0Rm?X!LNU=qW8asLAMxBN^2rXj
z+(%mao7nTNb}$(9%BW@-9>t!tcLqEXy}WoIAE#2m*MZbu<{4%!pJ@&L{bxEG3ODPs
zEWjnK(I*z^`{nK}jJPt#^3jK{j*q8bXVX73#k6GkDCeU``1LsGaF!*7DX*G2Ru2vW
zJk8~&lg^GQg7*!s<EKrO`^w}YuXvZs7i|%N%pZL?IzBo&I{x}kx`R35<DlEX8KSv-
zl%u|U5!4UPBs=&HvNSd9l2>MG+?~~dMcG82n<B?`n66$a8TFib;2mZMAOO=D`>h*d
zjNI*s!jW2)G9KG57Aew&W3St|!R*UU>wMX0$)H%2>AI;OQVcg~N*nWbC+BD1zL5Rx
zX<zK)JkEb^@|tWVZdg0<FD*zQm>B@n4>@8$JVzoVAtl==VA`a5Ap%O=GgL6$>8M5E
zM1m*sl0LyW5PYz{%`FBx7@UEwI2$``lnO5}S_$*E7r2DgtsB8P>rr1dX`#-kFyyoH
zx6$j1N6ziQ)_Q?!W)Pe=q9m7M1J5NV?dZrJq|OCv12vA7WC<BgBCob1R-tFeOf70<
zO7KusVF1X4V|k#f(OAbVC*j6?>4f&k>k0dS97aP?1|wC@QZd0G+nwbfNIf~>`aI{-
z$(G|<uKzPL$hzR<S$;J{JLNd-0v#w6+asdb?g9s5nx@;5lr=Hfdu+R`#H%wZa7}OX
zK5EDvoNSiPGI{z`dj783WA*wn8v8k<cgNttoqsoIy#CD6<z|<+VvUu}Vf?Vsd5!T^
zRifo>M<Ck~R@Y}|C0x{;)>l@^j2B?vGWR!L+j@g8?k*Fz+2Xrv!G!LbA13U$9#Sz$
zzsgJB)w=sPhIALZ;{p0o6B8_HhyC$^L)0+Q;>wnVcX8?&XQK{B(9Up4^@C9btZ;kx
zAsXMWY!pLY*A*~k>So*kg?+s)1SdKKVQYqqF%w4W7b|=T!TK8Gvd==fG?)oy8IB*z
z#k=!!j0GymQnNro*SL0M0JVIHvsqjthYll7WpQKUY<pWWL1OYd1^0gzkxPgu9l9@B
ztaI)*O*V6x&Q!*=>=RFibM&@py;?R^;=YS`_wMXD?=7S}-Z$u$Lp113nB7AJn5&%9
z=;nD<U0(uW%jVK0S+DDXp4)_Z1ID`PV#!y5)A}ZTFuadB?or4ZoUZq@?&_s-KGjU0
z`F85Iq?5~#SZU(^;)6Bw%V9x*F9`qxgx|yg1x9L-dPn=H+r`QR{EvPqu$Ql3sEX<N
z)I@{*!4>9H`}=NI>al{5CNlQ9d3N#kwe%j`ZJg(7b={LF=hs~drlBkkK`b@RHh*mM
zEnEiyz33p9lT~BT<b0vB+qq7v-<#>gPrvv=xn6pPd59>Z?_t9wzsWkpw=K50Y?u6?
zD}5Qufm8nd>G``CBZ>9_VV|bPdjC-H#0~VLjb#Lz{{3gcwD4f-&Gr6p$WN0)t)yL%
z0hIK$jam=xp0%W|39Ua@>J%{|2m(gdF=p25T`?UTa|*Z|@)$ug+}BJ<q|Ds2Y6t9s
S`b=4GqWiOtFQP=L>Hh#j^3+=Z

diff --git a/tests/shared/skill-cohorts.test.ts b/tests/shared/skill-cohorts.test.ts
deleted file mode 100644
index 329ec964..00000000
--- a/tests/shared/skill-cohorts.test.ts
+++ /dev/null
@@ -1,130 +0,0 @@
-import { describe, it, expect, vi } from "vitest";
-import {
-  listSkillSessions,
-  cohortsForSkill,
-  reconstructSession,
-  skillKey,
-  type SessionAttribution,
-} from "../../src/skillify/skill-cohorts.js";
-
-const TABLE = "sessions";
-
-/** A query mock that returns canned rows and records the SQL it was asked. */
-function mockQuery(rows: Array<Record<string, unknown>>) {
-  const calls: string[] = [];
-  const fn = vi.fn(async (sql: string) => { calls.push(sql); return rows; });
-  return { fn, calls };
-}
-
-const activeRow = (sessionId: string, skills: unknown, bucket: number, ts: string, asString = false) => {
-  const msg = { type: "skills_active", session_id: sessionId, skills, ab_bucket: bucket };
-  return { message: asString ? JSON.stringify(msg) : msg, last_update_date: ts };
-};
-
-describe("listSkillSessions", () => {
-  it("filters on description='skills_active' and orders newest-first with the limit", async () => {
-    const { fn, calls } = mockQuery([]);
-    await listSkillSessions(fn, TABLE, { sinceIso: "2026-06-01T00:00:00Z", limit: 50 });
-    expect(calls[0]).toContain(`FROM "sessions"`);
-    expect(calls[0]).toContain("description = 'skills_active'");
-    expect(calls[0]).toContain("last_update_date >= '2026-06-01T00:00:00Z'");
-    expect(calls[0]).toContain("ORDER BY last_update_date DESC");
-    expect(calls[0]).toContain("LIMIT 50");
-  });
-
-  it("parses both JSON-string and object message payloads", async () => {
-    const { fn } = mockQuery([
-      activeRow("S1", [{ name: "a", author: "x", version: 2 }], 1, "t2", false), // object
-      activeRow("S2", [{ name: "b", author: "y", version: 3 }], 0, "t1", true),  // JSON string
-    ]);
-    const got = await listSkillSessions(fn, TABLE);
-    expect(got).toEqual([
-      { sessionId: "S1", skills: [{ name: "a", author: "x", version: 2 }], bucket: 1, ts: "t2" },
-      { sessionId: "S2", skills: [{ name: "b", author: "y", version: 3 }], bucket: 0, ts: "t1" },
-    ]);
-  });
-
-  it("dedups a session to its newest row (rows arrive newest-first) and drops malformed", async () => {
-    const { fn } = mockQuery([
-      activeRow("S1", [{ name: "a", author: "x", version: 1 }], 1, "newer"),
-      activeRow("S1", [{ name: "a", author: "x", version: 1 }], 1, "older"), // same session → skipped
-      { message: "not json", last_update_date: "t" },                        // unparseable → skipped
-      { message: { type: "user_message", content: "hi" }, last_update_date: "t" }, // wrong type → skipped
-      { message: { type: "skills_active", skills: [] }, last_update_date: "t" },   // no session_id → skipped
-    ]);
-    const got = await listSkillSessions(fn, TABLE);
-    expect(got).toHaveLength(1);
-    expect(got[0]).toMatchObject({ sessionId: "S1", ts: "newer" });
-  });
-
-  it("coerces missing/garbage skill fields safely (defaults version 1, drops non-objects)", async () => {
-    const { fn } = mockQuery([
-      activeRow("S1", [{ name: "a", author: "x" }, "garbage", { name: "b" /* no author */ }], 0, "t"),
-    ]);
-    const got = await listSkillSessions(fn, TABLE);
-    expect(got[0].skills).toEqual([{ name: "a", author: "x", version: 1 }]);
-  });
-
-  it("omits the LIMIT clause when no limit is given", async () => {
-    const { fn, calls } = mockQuery([]);
-    await listSkillSessions(fn, TABLE);
-    expect(calls[0]).not.toContain("LIMIT");
-  });
-});
-
-describe("cohortsForSkill", () => {
-  const S = (id: string, skills: Array<[string, string]>): SessionAttribution => ({
-    sessionId: id, bucket: 0, ts: "t",
-    skills: skills.map(([name, author]) => ({ name, author, version: 1 })),
-  });
-
-  it("splits sessions into treatment (skill present) and control (absent)", () => {
-    const sessions = [
-      S("s1", [["posthog", "kamo"], ["other", "z"]]), // treatment
-      S("s2", [["other", "z"]]),                        // control
-      S("s3", [["posthog", "kamo"]]),                   // treatment
-      S("s4", []),                                       // control (no skills)
-      S("s5", [["posthog", "DIFFERENT"]]),              // control (same name, other author)
-    ];
-    const { treatment, control } = cohortsForSkill(sessions, "posthog", "kamo");
-    expect(treatment.map((s) => s.sessionId)).toEqual(["s1", "s3"]);
-    expect(control.map((s) => s.sessionId)).toEqual(["s2", "s4", "s5"]); // s5: name matches, author doesn't
-  });
-
-  it("skillKey is name--author", () => {
-    expect(skillKey("posthog", "kamo")).toBe("posthog--kamo");
-  });
-});
-
-describe("reconstructSession", () => {
-  it("orders by creation_date, keeps user/assistant turns, drops tool noise + empty", async () => {
-    const { fn, calls } = mockQuery([
-      { message: { type: "user_message", content: "do X" } },
-      { message: { type: "tool_call", tool_input: "{}", tool_response: "{}" } }, // dropped (no content)
-      { message: { type: "assistant_message", content: "did X" } },
-      { message: { type: "assistant_message", content: "   " } },                 // dropped (blank)
-      { message: JSON.stringify({ type: "user_message", content: "thanks" }) },   // string payload
-    ]);
-    const out = await reconstructSession(fn, TABLE, "abc-123");
-    expect(calls[0]).toContain("path LIKE '/sessions/%abc-123%'");
-    expect(calls[0]).toContain("ORDER BY creation_date ASC");
-    expect(out).toBe("USER: do X\n\nASSISTANT: did X\n\nUSER: thanks");
-  });
-
-  it("head+tail elides a transcript longer than maxChars", async () => {
-    const big = "x".repeat(500);
-    const { fn } = mockQuery([
-      { message: { type: "user_message", content: big } },
-      { message: { type: "assistant_message", content: big } },
-    ]);
-    const out = await reconstructSession(fn, TABLE, "s", 200);
-    expect(out).toContain("chars elided");
-    expect(out.length).toBeLessThan(400); // ~maxChars + the elision marker, far below the ~1000 raw
-  });
-
-  it("escapes single quotes in the session id (no SQL break)", async () => {
-    const { fn, calls } = mockQuery([]);
-    await reconstructSession(fn, TABLE, "a'b");
-    expect(calls[0]).toContain("/sessions/%a''b%");
-  });
-});
diff --git a/tests/shared/skill-invocations.test.ts b/tests/shared/skill-invocations.test.ts
new file mode 100644
index 00000000..ecdfadbb
--- /dev/null
+++ b/tests/shared/skill-invocations.test.ts
@@ -0,0 +1,103 @@
+import { describe, it, expect, vi } from "vitest";
+import {
+  invokedSkillRef,
+  splitOrgSkill,
+  listSkillInvocations,
+  windowAroundInvocation,
+  type SkillInvocation,
+} from "../../src/skillify/skill-invocations.js";
+
+const TABLE = "sessions";
+function mockQuery(rows: Array<Record<string, unknown>>) {
+  const calls: string[] = [];
+  return { fn: vi.fn(async (sql: string) => { calls.push(sql); return rows; }), calls };
+}
+const toolCall = (skill: string, sessionId = "S1", ts = "t", asString = false) => {
+  const msg = { type: "tool_call", tool_name: "Skill", tool_input: JSON.stringify({ skill }), session_id: sessionId, timestamp: ts };
+  return { message: asString ? JSON.stringify(msg) : msg, last_update_date: ts };
+};
+
+describe("invokedSkillRef", () => {
+  it("returns the skill ref for a Skill tool_call (object or stringified input)", () => {
+    expect(invokedSkillRef({ type: "tool_call", tool_name: "Skill", tool_input: JSON.stringify({ skill: "a--b" }) })).toBe("a--b");
+    expect(invokedSkillRef({ type: "tool_call", tool_name: "Skill", tool_input: { skill: "a--b" } as unknown })).toBe("a--b");
+  });
+  it("returns null for non-Skill tools and non-tool_call messages", () => {
+    expect(invokedSkillRef({ type: "tool_call", tool_name: "Bash", tool_input: "{}" })).toBeNull();
+    expect(invokedSkillRef({ type: "assistant_message", content: "use the Skill tool" })).toBeNull();
+    expect(invokedSkillRef({ type: "tool_call", tool_name: "Skill", tool_input: "not json" })).toBeNull();
+  });
+});
+
+describe("splitOrgSkill", () => {
+  it("splits <name>--<author>, last -- wins", () => {
+    expect(splitOrgSkill("posthog-smoke--kamo.aghbalyan")).toEqual({ name: "posthog-smoke", author: "kamo.aghbalyan" });
+    expect(splitOrgSkill("some-skill--first-last")).toEqual({ name: "some-skill", author: "first-last" });
+  });
+  it("rejects plugin-namespaced, bare, and malformed refs", () => {
+    expect(splitOrgSkill("hivemind:hivemind-memory")).toBeNull(); // plugin
+    expect(splitOrgSkill("update-config")).toBeNull();            // bare
+    expect(splitOrgSkill("baz--")).toBeNull();                    // empty author
+  });
+});
+
+describe("listSkillInvocations", () => {
+  it("coarse-prefilters on \"Skill\" then keeps only org-skill tool_calls", async () => {
+    const { fn, calls } = mockQuery([
+      toolCall("posthog-smoke--kamo"),                 // org → kept
+      toolCall("hivemind:hivemind-memory"),            // plugin → dropped
+      toolCall("update-config"),                       // bare → dropped
+      { message: { type: "assistant_message", content: "mentions Skill" }, last_update_date: "t" }, // prose → dropped
+      toolCall("pg-debug--sasun", "S2", "t2", true),   // org, stringified message → kept
+    ]);
+    const got = await listSkillInvocations(fn, TABLE, { sinceIso: "2026-06-01", limit: 100 });
+    expect(calls[0]).toContain(`CAST(message AS TEXT) LIKE '%"Skill"%'`);
+    expect(calls[0]).toContain("last_update_date >= '2026-06-01'");
+    expect(calls[0]).toContain("LIMIT 100");
+    expect(got).toEqual([
+      { sessionId: "S1", name: "posthog-smoke", author: "kamo", ts: "t" },
+      { sessionId: "S2", name: "pg-debug", author: "sasun", ts: "t2" },
+    ]);
+  });
+});
+
+describe("windowAroundInvocation", () => {
+  const inv: SkillInvocation = { sessionId: "S1", name: "posthog-smoke", author: "kamo", ts: "t5" };
+  // turns: u1, a1, [skill invoked here], u2(pushback), a2  → window before=1/after=2 ⇒ a1..a2
+  const rows = [
+    { message: { type: "user_message", content: "first" } },
+    { message: { type: "assistant_message", content: "ack" } },
+    { message: { type: "tool_call", tool_name: "Skill", tool_input: JSON.stringify({ skill: "posthog-smoke--kamo" }), timestamp: "t5" } },
+    { message: { type: "tool_call", tool_name: "Bash", tool_input: "{}" } }, // non-skill tool → ignored
+    { message: { type: "user_message", content: "no that's wrong" } },
+    { message: { type: "assistant_message", content: "fixing" } },
+  ];
+
+  it("windows `before` turns before and `after` after the invocation", async () => {
+    const { fn, calls } = mockQuery(rows);
+    const out = await windowAroundInvocation(fn, TABLE, inv, { before: 1, after: 2 });
+    expect(calls[0]).toContain("path LIKE '/sessions/%S1%'");
+    // invIndex = 2 (two turns before the skill tool_call). before 1 → from turn 1; after 2 → turns 2,3.
+    expect(out).toBe("ASSISTANT: ack\n\nUSER: no that's wrong\n\nASSISTANT: fixing");
+  });
+
+  it("falls back to session end when the invocation can't be located", async () => {
+    const { fn } = mockQuery([
+      { message: { type: "user_message", content: "hi" } },
+      { message: { type: "assistant_message", content: "bye" } },
+    ]);
+    const out = await windowAroundInvocation(fn, TABLE, inv, { before: 5, after: 5 });
+    expect(out).toBe("USER: hi\n\nASSISTANT: bye"); // whole (short) transcript
+  });
+
+  it("elides a window longer than maxChars", async () => {
+    const big = "x".repeat(400);
+    const { fn } = mockQuery([
+      { message: { type: "user_message", content: big } },
+      { message: { type: "assistant_message", content: big } },
+    ]);
+    const out = await windowAroundInvocation(fn, TABLE, inv, { before: 5, after: 5, maxChars: 150 });
+    expect(out).toContain("chars elided");
+    expect(out.length).toBeLessThan(300);
+  });
+});
diff --git a/tests/shared/skills-active.test.ts b/tests/shared/skills-active.test.ts
deleted file mode 100644
index a7d1c295..00000000
--- a/tests/shared/skills-active.test.ts
+++ /dev/null
@@ -1,211 +0,0 @@
-import { describe, it, expect, beforeEach, afterEach } from "vitest";
-import fs from "node:fs";
-import os from "node:os";
-import path from "node:path";
-import {
-  listActiveOrgSkills,
-  sessionBucket,
-  buildSkillsActiveInsert,
-  buildSkillsActivePath,
-  skillRootsForCwd,
-  defaultSkillsRoot,
-} from "../../src/skillify/skills-active.js";
-import type { PulledManifest } from "../../src/skillify/manifest.js";
-
-/** Build a pull manifest from `(dirName, name, author)` triples (fills the rest with defaults). */
-function manifestOf(...rows: Array<{ dirName: string; name: string; author: string }>): PulledManifest {
-  return {
-    version: 1,
-    entries: rows.map(r => ({
-      dirName: r.dirName,
-      name: r.name,
-      author: r.author,
-      projectKey: "pk",
-      remoteVersion: 1,
-      install: "global" as const,
-      installRoot: "/install/root",
-      pulledAt: "2026-01-01T00:00:00.000Z",
-      symlinks: [],
-    })),
-  };
-}
-
-describe("listActiveOrgSkills", () => {
-  let root: string;
-  beforeEach(() => {
-    root = fs.mkdtempSync(path.join(os.tmpdir(), "skills-active-"));
-  });
-  afterEach(() => {
-    fs.rmSync(root, { recursive: true, force: true });
-  });
-
-  it("returns only manifest-recorded (pull-managed) dirs; excludes local-only + files", () => {
-    fs.mkdirSync(path.join(root, "posthog-event-smoke-testing--kamo.aghbalyan"));
-    fs.mkdirSync(path.join(root, "pg-deeplake-test-crash-debugging--sasun"));
-    fs.mkdirSync(path.join(root, "deploy--blue-green"));        // local-only `--` dir, NOT pulled — excluded
-    fs.mkdirSync(path.join(root, "plan-confirm-then-execute")); // bare local — excluded
-    fs.writeFileSync(path.join(root, "notes--x.txt"), "x");     // file, not dir — excluded
-    const manifest = manifestOf(
-      { dirName: "posthog-event-smoke-testing--kamo.aghbalyan", name: "posthog-event-smoke-testing", author: "kamo.aghbalyan" },
-      { dirName: "pg-deeplake-test-crash-debugging--sasun", name: "pg-deeplake-test-crash-debugging", author: "sasun" },
-    );
-
-    const got = listActiveOrgSkills([root], manifest);
-    expect(got).toEqual([
-      { name: "pg-deeplake-test-crash-debugging", author: "sasun", version: 1 },
-      { name: "posthog-event-smoke-testing", author: "kamo.aghbalyan", version: 1 },
-    ]); // sorted by name; exactly the 2 manifest-recorded skills; version defaults to 1 (no SKILL.md)
-    expect(got).toHaveLength(2); // local `deploy--blue-green` + bare + file all dropped
-  });
-
-  it("excludes a local-only dir whose name contains `--` when the manifest is empty (no false positive)", () => {
-    fs.mkdirSync(path.join(root, "deploy--blue-green")); // org-shaped name, but never pulled
-    expect(listActiveOrgSkills([root], manifestOf())).toEqual([]);
-  });
-
-  it("takes name/author from the manifest, not a dirname split (multi-`--` dir stays correct)", () => {
-    fs.mkdirSync(path.join(root, "some--weird--dirname"));
-    const manifest = manifestOf({ dirName: "some--weird--dirname", name: "some-skill", author: "first-last" });
-    expect(listActiveOrgSkills([root], manifest)).toEqual([{ name: "some-skill", author: "first-last", version: 1 }]);
-  });
-
-  it("returns [] for a missing skills root (never throws)", () => {
-    expect(listActiveOrgSkills([path.join(root, "does-not-exist")], manifestOf())).toEqual([]);
-  });
-
-  it("scans project + global roots and dedups a skill present in both (P2: --to project)", () => {
-    const projectRoot = fs.mkdtempSync(path.join(os.tmpdir(), "skills-active-proj-"));
-    try {
-      fs.mkdirSync(path.join(root, "a-skill--alice"));         // global-pulled
-      fs.mkdirSync(path.join(projectRoot, "a-skill--alice"));  // also project-pulled (dup)
-      fs.mkdirSync(path.join(projectRoot, "b-skill--bob"));    // project-only org skill
-      fs.mkdirSync(path.join(projectRoot, "local-only--x"));   // not in manifest → excluded
-      const manifest = manifestOf(
-        { dirName: "a-skill--alice", name: "a-skill", author: "alice" },
-        { dirName: "b-skill--bob", name: "b-skill", author: "bob" },
-      );
-      const got = listActiveOrgSkills([root, projectRoot], manifest);
-      expect(got).toEqual([
-        { name: "a-skill", author: "alice", version: 1 }, // counted once despite two roots
-        { name: "b-skill", author: "bob", version: 1 },   // picked up from the project root
-      ]);
-      expect(got).toHaveLength(2);
-    } finally {
-      fs.rmSync(projectRoot, { recursive: true, force: true });
-    }
-  });
-
-  it("reads the skill version from the installed SKILL.md frontmatter (enables v1-vs-v2)", () => {
-    fs.mkdirSync(path.join(root, "evolving-skill--sasun"));
-    fs.writeFileSync(
-      path.join(root, "evolving-skill--sasun", "SKILL.md"),
-      "---\nname: evolving-skill\nversion: 5\n---\nbody",
-    );
-    const manifest = manifestOf({ dirName: "evolving-skill--sasun", name: "evolving-skill", author: "sasun" });
-    expect(listActiveOrgSkills([root], manifest)).toEqual([{ name: "evolving-skill", author: "sasun", version: 5 }]);
-  });
-});
-
-describe("skillRootsForCwd", () => {
-  it("returns only the global root when no cwd is given", () => {
-    expect(skillRootsForCwd()).toEqual([defaultSkillsRoot()]);
-  });
-  it("adds the project-scoped <cwd>/.claude/skills root when cwd is given", () => {
-    expect(skillRootsForCwd("/home/u/proj")).toEqual([
-      defaultSkillsRoot(),
-      path.join("/home/u/proj", ".claude", "skills"),
-    ]);
-  });
-});
-
-describe("sessionBucket", () => {
-  it("is deterministic for the same session id", () => {
-    expect(sessionBucket("abc-123")).toBe(sessionBucket("abc-123"));
-  });
-  it("stays within [0, buckets)", () => {
-    for (const id of ["a", "b", "c", "xyz", "1874a6b2"]) {
-      const b = sessionBucket(id, 2);
-      expect(b).toBeGreaterThanOrEqual(0);
-      expect(b).toBeLessThan(2);
-    }
-  });
-  it("assigns both buckets across many ids (not constant)", () => {
-    const seen = new Set<number>();
-    for (let i = 0; i < 200; i++) seen.add(sessionBucket(`session-${i}`));
-    expect(seen).toEqual(new Set([0, 1])); // both arms populated → real randomization
-  });
-});
-
-describe("buildSkillsActivePath", () => {
-  const config = { userName: "kamo", orgName: "activeloop", workspaceId: "default" };
-
-  it("namespaces under /skills_active/, NOT /sessions/ (so summary readers exclude it)", () => {
-    const p = buildSkillsActivePath(config, "S1");
-    expect(p.startsWith("/skills_active/")).toBe(true);
-    expect(p.startsWith("/sessions/")).toBe(false);
-    // The exact filter the summary / raw-transcript readers use must NOT match this path.
-    expect(p.includes("/sessions/")).toBe(false);
-  });
-
-  it("embeds the full {user, org, workspace, session} tuple", () => {
-    expect(buildSkillsActivePath(config, "S1")).toBe(
-      "/skills_active/kamo/kamo_activeloop_default_S1.json",
-    );
-  });
-
-  it("falls back to `default` workspace when workspaceId is absent", () => {
-    // covers the `?? \"default\"` branch (mirrors buildSessionPath)
-    const p = buildSkillsActivePath(
-      { userName: "kamo", orgName: "activeloop", workspaceId: undefined as unknown as string },
-      "S1",
-    );
-    expect(p).toBe("/skills_active/kamo/kamo_activeloop_default_S1.json");
-  });
-});
-
-describe("buildSkillsActiveInsert", () => {
-  const base = {
-    sessionsTable: "sessions",
-    sessionPath: "/sessions/kamo/kamo_activeloop_hivemind_S1.jsonl",
-    filename: "kamo_activeloop_hivemind_S1.jsonl",
-    userName: "kamo",
-    projectName: "hivemind",
-    pluginVersion: "0.7.99",
-    sessionId: "S1",
-    cwd: "/home/kamo/proj",
-    skills: [{ name: "pg-deeplake-test-crash-debugging", author: "sasun", version: 3 }],
-    bucket: 1,
-    ts: "2026-06-03T00:00:00.000Z",
-  };
-
-  it("emits exactly ONE insert into the sessions table (no second mutation)", () => {
-    const sql = buildSkillsActiveInsert(base);
-    expect((sql.match(/INSERT INTO/g) ?? []).length).toBe(1);
-    expect((sql.match(/UPDATE /g) ?? []).length).toBe(0);
-    expect(sql).toContain('INSERT INTO "sessions"');
-  });
-
-  it("writes a skills_active message with the skills, count, and bucket", () => {
-    const sql = buildSkillsActiveInsert(base);
-    const m = sql.match(/'(\{.*\})'::jsonb/s);
-    expect(m).toBeTruthy();
-    const entry = JSON.parse(m![1]);
-    expect(entry.type).toBe("skills_active");
-    expect(entry.session_id).toBe("S1");
-    expect(entry.skills).toEqual([{ name: "pg-deeplake-test-crash-debugging", author: "sasun", version: 3 }]);
-    expect(entry.skills_count).toBe(1);
-    expect(entry.ab_bucket).toBe(1);
-  });
-
-  it("leaves message_embedding NULL (no daemon round-trip at SessionStart)", () => {
-    const sql = buildSkillsActiveInsert(base);
-    expect(sql).toMatch(/::jsonb,\s*NULL,/);
-  });
-
-  it("does NOT masquerade as a captured turn type", () => {
-    const sql = buildSkillsActiveInsert(base);
-    expect(sql).not.toContain('"type":"user_message"');
-    expect(sql).not.toContain('"type":"tool_call"');
-    expect(sql).not.toContain('"type":"assistant_message"');
-  });
-});

From 1929c96e2264c5ef3e9ffc6c92d409edb86b04af Mon Sep 17 00:00:00 2001
From: kaghni <kamo.aghbalyan@activeloop.ai>
Date: Fri, 5 Jun 2026 05:14:05 +0000
Subject: [PATCH 03/30] refactor(skillopt): drop skills_active availability
 attribution

Skill *invocation* (Skill tool_use, already captured) is a more accurate signal
than availability and pins the turn for windowing, so the SessionStart
skills_active write + helper + the availability-based cohorts module are removed.
The weekly trigger/worker stay. session-start-hook tests revert to the
pre-attribution query sequence; trigger tests unchanged.
---
 node_modules                                 |  1 +
 src/hooks/session-start.ts                   | 42 ---------------
 tests/claude-code/session-start-hook.test.ts | 56 ++++----------------
 3 files changed, 12 insertions(+), 87 deletions(-)
 create mode 120000 node_modules

diff --git a/node_modules b/node_modules
new file mode 120000
index 00000000..1dd8c0a6
--- /dev/null
+++ b/node_modules
@@ -0,0 +1 @@
+/home/ubuntu/al-projects/hivemind/node_modules
\ No newline at end of file
diff --git a/src/hooks/session-start.ts b/src/hooks/session-start.ts
index af944953..0e63390b 100644
--- a/src/hooks/session-start.ts
+++ b/src/hooks/session-start.ts
@@ -14,7 +14,6 @@ import { loadConfig } from "../config.js";
 import { DeeplakeApi } from "../deeplake-api.js";
 import { sqlStr } from "../utils/sql.js";
 import { projectNameFromCwd } from "../utils/project-name.js";
-import { listActiveOrgSkills, sessionBucket, buildSkillsActiveInsert, buildSkillsActivePath, skillRootsForCwd } from "../skillify/skills-active.js";
 import { readStdin } from "../utils/stdin.js";
 import { log as _log } from "../utils/debug.js";
 import { getInstalledVersion } from "../utils/version-check.js";
@@ -223,11 +222,6 @@ async function main(): Promise<void> {
   // freezes SessionStart. Hard opt-out via HIVEMIND_AUTOPULL_DISABLED=1.
   // All failures swallowed inside autoPullSkills (documented as
   // never-rejecting), so no try/catch needed here.
-  //
-  // Runs BEFORE the skill-attribution snapshot below so that a skill pulled
-  // (or upgraded) during THIS SessionStart is reflected in the recorded
-  // skills_active set — otherwise the row would capture a stale/empty set
-  // while the session can already use the freshly-pulled skill.
   const pullResult = await autoPullSkills();
   log(`autopull: pulled=${pullResult.pulled} skipped=${pullResult.skipped}`);
 
@@ -244,42 +238,6 @@ async function main(): Promise<void> {
           await api.ensureSessionsTable(sessionsTable);
           await createPlaceholder(api, table, input.session_id, input.cwd ?? "", config.userName, config.orgName, config.workspaceId, pluginVersion);
           log("placeholder created");
-
-          // Skill attribution (measurement): record which org-shared skills were in
-          // context this session + a deterministic A/B bucket. This is the label that
-          // makes skill value measurable (sessions with vs without skill X / v1 vs v2).
-          // Org skills are identified via the pull manifest (authoritative), not the
-          // `--` dirname pattern. Snapshot runs after auto-pull (above) so it reflects
-          // freshly-pulled skills. Opt-out: HIVEMIND_SKILL_ATTRIBUTION=0.
-          // Swallowed — must never fail SessionStart.
-          if (process.env.HIVEMIND_SKILL_ATTRIBUTION !== "0") {
-            try {
-              // Scan global + project-scoped (<cwd>/.claude/skills) roots so
-              // skills pulled with `--to project` are attributed too.
-              const skills = listActiveOrgSkills(skillRootsForCwd(input.cwd));
-              // Distinct `/skills_active/` namespace (NOT `/sessions/`) so the summary /
-              // raw-transcript readers never mistake this attribution row for a transcript.
-              const attrSessionPath = buildSkillsActivePath(config, input.session_id);
-              const attrFilename = attrSessionPath.slice(attrSessionPath.lastIndexOf("/") + 1);
-              const sql = buildSkillsActiveInsert({
-                sessionsTable,
-                sessionPath: attrSessionPath,
-                filename: attrFilename,
-                userName: config.userName,
-                projectName: projectNameFromCwd(input.cwd),
-                pluginVersion,
-                sessionId: input.session_id,
-                cwd: input.cwd,
-                skills,
-                bucket: sessionBucket(input.session_id),
-                ts: new Date().toISOString(),
-              });
-              await api.query(sql);
-              log(`skills_active recorded: ${skills.length} org skills, bucket ${sessionBucket(input.session_id)}`);
-            } catch (e: any) {
-              log(`skills_active attribution failed (swallowed): ${e?.message ?? e}`);
-            }
-          }
         } else {
           const reason = process.env.HIVEMIND_CAPTURE === "false"
             ? "HIVEMIND_CAPTURE=false"
diff --git a/tests/claude-code/session-start-hook.test.ts b/tests/claude-code/session-start-hook.test.ts
index de5fa30a..c1639878 100644
--- a/tests/claude-code/session-start-hook.test.ts
+++ b/tests/claude-code/session-start-hook.test.ts
@@ -102,7 +102,6 @@ const stdoutSpy = vi.spyOn(process.stdout, "write");
 async function runHook(env: Record<string, string | undefined> = {}): Promise<string | null> {
   delete process.env.HIVEMIND_WIKI_WORKER;
   delete process.env.HIVEMIND_CAPTURE;
-  delete process.env.HIVEMIND_SKILL_ATTRIBUTION;
   for (const [k, v] of Object.entries(env)) {
     if (v === undefined) delete process.env[k];
     else process.env[k] = v;
@@ -235,28 +234,21 @@ describe("session-start hook — placeholder branching", () => {
     expect(ensureTableMock).toHaveBeenCalled();
     expect(ensureSessionsTableMock).toHaveBeenCalledWith("sessions");
     // 1 SELECT (existing-summary check) + 1 INSERT (placeholder)
-    // + 1 INSERT (skills_active attribution) + 2 renderer SELECTs
-    // (listRules + listOpenGoals) = 5 queries.
-    expect(queryMock).toHaveBeenCalledTimes(5);
+    // + 2 renderer SELECTs (listRules + listOpenGoals) = 4 queries.
+    expect(queryMock).toHaveBeenCalledTimes(4);
     expect(queryMock.mock.calls[0][0]).toMatch(/^SELECT path FROM/);
     expect(queryMock.mock.calls[1][0]).toMatch(/^INSERT INTO/);
-    // skills_active attribution row — shape, not just count (asserts the new
-    // write is the attribution INSERT, so a second stray mutation can't sneak in).
-    expect(queryMock.mock.calls[2][0]).toMatch(/^INSERT INTO "sessions"/);
-    expect(queryMock.mock.calls[2][0]).toContain("skills_active");
-    expect(queryMock.mock.calls[3][0]).toMatch(/^SELECT .* FROM "hivemind_rules"/);
-    expect(queryMock.mock.calls[4][0]).toMatch(/^SELECT .* FROM "hivemind_goals"/);
+    expect(queryMock.mock.calls[2][0]).toMatch(/^SELECT .* FROM "hivemind_rules"/);
+    expect(queryMock.mock.calls[3][0]).toMatch(/^SELECT .* FROM "hivemind_goals"/);
     expect(debugLogMock).toHaveBeenCalledWith("placeholder created");
   });
 
   it("skips placeholder INSERT when summary already exists (resumed session)", async () => {
     queryMock.mockResolvedValueOnce([{ path: "/summaries/alice/sid-1.md" }]);
     await runHook();
-    // 1 placeholder SELECT (returns row, no INSERT) + 1 skills_active
-    // attribution INSERT (runs regardless of placeholder branch) + 2 renderer
-    // SELECTs (rules + goals) = 4 queries.
-    expect(queryMock).toHaveBeenCalledTimes(4);
-    expect(queryMock.mock.calls[1][0]).toContain("skills_active");
+    // 1 placeholder SELECT (returns row, no INSERT) + 2 renderer SELECTs
+    // (rules + goals) = 3 queries.
+    expect(queryMock).toHaveBeenCalledTimes(3);
   });
 
   it("non-empty rules block is appended to additionalContext", async () => {
@@ -272,7 +264,6 @@ describe("session-start hook — placeholder branching", () => {
     };
     queryMock.mockResolvedValueOnce([]);     // placeholder SELECT
     queryMock.mockResolvedValueOnce([]);     // placeholder INSERT
-    queryMock.mockResolvedValueOnce([]);     // skills_active attribution INSERT
     queryMock.mockResolvedValueOnce([rule]); // renderer rules
     queryMock.mockResolvedValueOnce([]);     // renderer goals (empty)
     const out = await runHook();
@@ -296,13 +287,12 @@ describe("session-start hook — placeholder branching", () => {
     };
     queryMock.mockResolvedValueOnce([]);     // placeholder SELECT
     queryMock.mockResolvedValueOnce([]);     // placeholder INSERT
-    queryMock.mockResolvedValueOnce([]);     // skills_active attribution INSERT
     queryMock.mockResolvedValueOnce([rule]); // renderer rules
     queryMock.mockResolvedValueOnce([]);     // renderer goals (empty)
     const out = await runHook();
     const parsed = JSON.parse(out!);
     expect(parsed.hookSpecificOutput.additionalContext).toContain("no DROP TABLE on prod");
-    expect(queryMock).toHaveBeenCalledTimes(5);
+    expect(queryMock).toHaveBeenCalledTimes(4);
   });
 
   it("skips the renderer SELECTs when the trusted table list omits rules + goals", async () => {
@@ -310,12 +300,11 @@ describe("session-start hook — placeholder branching", () => {
     // SELECT. Only the placeholder SELECT + INSERT run.
     knownTablesMock.mockResolvedValue([]);
     await runHook();
-    // placeholder SELECT + placeholder INSERT + skills_active attribution
-    // INSERT = 3 (renderer fires no SELECT when no tables are trusted).
-    expect(queryMock).toHaveBeenCalledTimes(3);
+    // placeholder SELECT + placeholder INSERT = 2 (renderer fires no SELECT
+    // when no tables are trusted).
+    expect(queryMock).toHaveBeenCalledTimes(2);
     expect(queryMock.mock.calls[0][0]).toMatch(/^SELECT path FROM/);
     expect(queryMock.mock.calls[1][0]).toMatch(/^INSERT INTO/);
-    expect(queryMock.mock.calls[2][0]).toContain("skills_active");
   });
 
   it("HIVEMIND_CAPTURE=false: no placeholder, no DDL (ensure), but renderer still runs", async () => {
@@ -336,29 +325,6 @@ describe("session-start hook — placeholder branching", () => {
     );
   });
 
-  it("HIVEMIND_SKILL_ATTRIBUTION=0: skips the skills_active write entirely", async () => {
-    await runHook({ HIVEMIND_SKILL_ATTRIBUTION: "0" });
-    // placeholder SELECT + INSERT + 2 renderer SELECTs = 4 (NO attribution row).
-    expect(queryMock).toHaveBeenCalledTimes(4);
-    // negative assertion: the attribution INSERT must not be present at all.
-    for (const call of queryMock.mock.calls) {
-      expect(call[0]).not.toContain("skills_active");
-    }
-  });
-
-  it("swallows a failed skills_active attribution write (never breaks SessionStart)", async () => {
-    // Content-based (not position-based) so it's robust to query ordering: the
-    // attribution INSERT is the only query carrying the skills_active marker.
-    queryMock.mockImplementation((sql: string) =>
-      sql.includes("skills_active") ? Promise.reject(new Error("attr boom")) : Promise.resolve([]),
-    );
-    const out = await runHook();
-    expect(out).toBeTruthy(); // hook still completes and emits context
-    expect(debugLogMock).toHaveBeenCalledWith(
-      expect.stringContaining("skills_active attribution failed (swallowed): attr boom"),
-    );
-  });
-
   it("logs the SkillOpt fired branch when the weekly trigger spawns a worker", async () => {
     runWeeklySkillOptMock.mockReturnValue({ fired: true, reason: "spawned" });
     await runHook();

From 72dca56ea10acb7d2349b87345506c753b6971ad Mon Sep 17 00:00:00 2001
From: kaghni <kamo.aghbalyan@activeloop.ai>
Date: Fri, 5 Jun 2026 05:16:56 +0000
Subject: [PATCH 04/30] feat(skillopt): heuristic correction anchor (reward
 level 1)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Pure, free pushback detector: a USER turn right after an ASSISTANT turn matching
correction patterns = a hard, observable failure signal, independent of any LLM.
Tuned for recall (false positives just cost a judge call; false negatives only
under-detect — never churns a good skill). Exposes windowedTurns() from
skill-invocations so the anchor reads the same windowed slice the judge will.
6 anchor tests + patterns to tune on real data.
---
 src/skillify/session-anchor.ts      | 43 +++++++++++++++++++++++++++++
 src/skillify/skill-invocations.ts   | 19 +++++++++----
 tests/shared/session-anchor.test.ts | 41 +++++++++++++++++++++++++++
 3 files changed, 98 insertions(+), 5 deletions(-)
 create mode 100644 src/skillify/session-anchor.ts
 create mode 100644 tests/shared/session-anchor.test.ts

diff --git a/src/skillify/session-anchor.ts b/src/skillify/session-anchor.ts
new file mode 100644
index 00000000..677b8598
--- /dev/null
+++ b/src/skillify/session-anchor.ts
@@ -0,0 +1,43 @@
+/**
+ * Heuristic "anchor" — a HARD, observable signal in the transcript that a session
+ * went badly, independent of any LLM judgment: the user pushed back on / corrected
+ * what the assistant just did. Pure + free (no LLM, no I/O).
+ *
+ * It's the level-1 filter in the outcome pipeline: only windows with an anchor go
+ * to the (paid) success-judge, and a session is labelled a failure only when the
+ * anchor AND the judge agree. So this is deliberately tuned for RECALL over
+ * precision — a false positive just costs one judge call (which then drops it),
+ * but a false negative under-detects (conservative — it never churns a good skill).
+ * Patterns are meant to be tuned against real sessions; this is a starting set.
+ */
+import type { Turn } from "./skill-invocations.js";
+
+export type AnchorKind = "correction" | "none";
+export interface Anchor {
+  anchored: boolean;
+  kind: AnchorKind;
+  evidence: string; // the user turn that triggered it (truncated)
+}
+
+// User pushback: rejection / correction of what the assistant just produced. "no" is whole-word so "now"/"note"/"node" don't fire; common "not …" pushback is listed explicitly.
+const PUSHBACK = /\b(no\b|nope|not (working|right|correct)|wrong|incorrect|not what|that'?s not|does ?n'?t work|did ?n'?t work|do ?n'?t work|wo ?n'?t work|is ?n'?t|that'?s wrong|broke|broken|still (failing|broken|not working|wrong|the same)|try again|undo|revert that|that fail)/i;
+
+// Clear benign negatives we don't want to fire on (keeps obvious false positives
+// out of the judge to save tokens). Intentionally narrow — when in doubt, fire.
+const BENIGN = /\b(no (problem|worries|need|biggie)|no,? thanks|all good|works? (now|great|fine|perfectly)|that works|perfect|looks good|thank)/i;
+
+/**
+ * Detect a correction anchor in a windowed slice of turns. Only a USER turn that
+ * immediately follows an ASSISTANT turn can be pushback (the first user turn is
+ * the request, not a reaction).
+ */
+export function detectAnchor(turns: Turn[]): Anchor {
+  for (let i = 1; i < turns.length; i++) {
+    const t = turns[i];
+    if (t.role !== "USER" || turns[i - 1].role !== "ASSISTANT") continue;
+    if (PUSHBACK.test(t.text) && !BENIGN.test(t.text)) {
+      return { anchored: true, kind: "correction", evidence: t.text.slice(0, 200) };
+    }
+  }
+  return { anchored: false, kind: "none", evidence: "" };
+}
diff --git a/src/skillify/skill-invocations.ts b/src/skillify/skill-invocations.ts
index 3cb0fd31..e5ef664f 100644
--- a/src/skillify/skill-invocations.ts
+++ b/src/skillify/skill-invocations.ts
@@ -97,7 +97,7 @@ export async function listSkillInvocations(
   return out;
 }
 
-interface Turn { role: "USER" | "ASSISTANT"; text: string }
+export interface Turn { role: "USER" | "ASSISTANT"; text: string }
 
 /**
  * Reconstruct the transcript turns of a session, and mark where (between which two
@@ -139,17 +139,26 @@ async function sessionTurns(
  * turns after — where the help-or-harm signal lives — head+tail elided to maxChars.
  * `before`/`after` are tunable; defaults chosen as a small starting point.
  */
+export async function windowedTurns(
+  query: QueryFn,
+  sessionsTable: string,
+  inv: SkillInvocation,
+  opts: { before?: number; after?: number } = {},
+): Promise<Turn[]> {
+  const before = opts.before ?? 3;
+  const after = opts.after ?? 6;
+  const { turns, invIndex } = await sessionTurns(query, sessionsTable, inv);
+  return turns.slice(Math.max(0, invIndex - before), invIndex + after);
+}
+
 export async function windowAroundInvocation(
   query: QueryFn,
   sessionsTable: string,
   inv: SkillInvocation,
   opts: { before?: number; after?: number; maxChars?: number } = {},
 ): Promise<string> {
-  const before = opts.before ?? 3;
-  const after = opts.after ?? 6;
   const maxChars = opts.maxChars ?? 4000;
-  const { turns, invIndex } = await sessionTurns(query, sessionsTable, inv);
-  const slice = turns.slice(Math.max(0, invIndex - before), invIndex + after);
+  const slice = await windowedTurns(query, sessionsTable, inv, opts);
   const joined = slice.map((t) => `${t.role}: ${t.text}`).join("\n\n");
   if (joined.length <= maxChars) return joined;
   const head = joined.slice(0, Math.floor(maxChars * 0.55));
diff --git a/tests/shared/session-anchor.test.ts b/tests/shared/session-anchor.test.ts
new file mode 100644
index 00000000..1780584f
--- /dev/null
+++ b/tests/shared/session-anchor.test.ts
@@ -0,0 +1,41 @@
+import { describe, it, expect } from "vitest";
+import { detectAnchor } from "../../src/skillify/session-anchor.js";
+import type { Turn } from "../../src/skillify/skill-invocations.js";
+
+const u = (text: string): Turn => ({ role: "USER", text });
+const a = (text: string): Turn => ({ role: "ASSISTANT", text });
+
+describe("detectAnchor", () => {
+  it("fires on user pushback right after an assistant turn", () => {
+    const r = detectAnchor([u("add a smoke test"), a("done, here it is"), u("no that's wrong, it mocks the client")]);
+    expect(r.anchored).toBe(true);
+    expect(r.kind).toBe("correction");
+    expect(r.evidence).toContain("wrong");
+  });
+
+  it("does NOT fire on the opening request (no preceding assistant turn)", () => {
+    const r = detectAnchor([u("this won't work without a flush — add a smoke test")]);
+    expect(r.anchored).toBe(false);
+  });
+
+  it("does NOT fire on a user turn that follows another user turn", () => {
+    const r = detectAnchor([u("first"), u("that didn't work")]); // no assistant in between
+    expect(r.anchored).toBe(false);
+  });
+
+  it("suppresses clear benign negatives (no problem / works now / thanks)", () => {
+    expect(detectAnchor([a("fixed it"), u("no problem, thanks!")]).anchored).toBe(false);
+    expect(detectAnchor([a("try this"), u("works now, perfect")]).anchored).toBe(false);
+  });
+
+  it("catches several real correction phrasings", () => {
+    for (const p of ["that doesn't work", "still failing", "that's incorrect", "try again", "nope", "you broke the build"]) {
+      expect(detectAnchor([a("here"), u(p)]).anchored, p).toBe(true);
+    }
+  });
+
+  it("returns none when the user is satisfied / silent", () => {
+    expect(detectAnchor([u("do X"), a("done")]).anchored).toBe(false);
+    expect(detectAnchor([]).anchored).toBe(false);
+  });
+});

From 8398a2f27466bc62e90612018e6f87f487a70d19 Mon Sep 17 00:00:00 2001
From: kaghni <kamo.aghbalyan@activeloop.ai>
Date: Fri, 5 Jun 2026 05:19:28 +0000
Subject: [PATCH 05/30] feat(skillopt): success-judge (reward level 2)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

LLM judge of the windowed slice: was the task accomplished CORRECTLY? Prompted to
ignore user mood (a praised-but-wrong answer is a FAILURE) — the anti-sycophancy
axis the research validated. Runs on the user's claude -p (cheap default model,
all tools denied), injected for tests. Conservative: unparseable/errored/empty
judgments return success=1 so a flaky judge can only under-detect, never
manufacture deficiency. 7 unit tests.
---
 src/skillify/success-judge.ts      | 93 ++++++++++++++++++++++++++++++
 tests/shared/success-judge.test.ts | 44 ++++++++++++++
 2 files changed, 137 insertions(+)
 create mode 100644 src/skillify/success-judge.ts
 create mode 100644 tests/shared/success-judge.test.ts

diff --git a/src/skillify/success-judge.ts b/src/skillify/success-judge.ts
new file mode 100644
index 00000000..3ad672a1
--- /dev/null
+++ b/src/skillify/success-judge.ts
@@ -0,0 +1,93 @@
+/**
+ * Success-judge — reward level 2. Given a windowed transcript slice, asks the
+ * model the ONE question that resists sycophancy: was the user's task actually
+ * accomplished CORRECTLY? (Ignore whether the user seemed happy — a praised-but-
+ * wrong answer is a failure.) Returns success 0|1 + confidence + reason.
+ *
+ * Runs on the USER's own agent (claude -p) — cost lands on the user, so the
+ * default model is cheap and this is only ever called on anchor-flagged windows
+ * (level 1), never on every session.
+ *
+ * The model call is injected (ModelCall) so the judging logic is unit-tested with
+ * zero real LLM calls; the default shells out to `claude -p`, all tools denied.
+ *
+ * Conservative on failure: an unparseable/errored/empty judgment returns
+ * success=1 (do NOT count as a failure), so a flaky judge can never manufacture
+ * deficiency — it can only fail to detect (which the next run catches).
+ */
+import { spawn } from "node:child_process";
+
+export interface SuccessVerdict {
+  success: 0 | 1;
+  confidence: number; // 0..1
+  reason: string;
+}
+
+/** (systemPrompt, userPrompt) -> raw model text. Injected for tests. */
+export type ModelCall = (systemPrompt: string, userPrompt: string) => Promise<string>;
+
+const SYSTEM =
+  "You are a strict engineering reviewer. Judge ONLY whether the user's task was " +
+  "actually accomplished CORRECTLY in this session slice. Ignore whether the user " +
+  "seemed happy or polite — a praised-but-wrong answer is a FAILURE. Reply with " +
+  'ONLY a JSON object: {"success": 0 or 1, "confidence": 0.0-1.0, "reason": ' +
+  '"<=200 chars citing concrete evidence"}.';
+
+function buildUserPrompt(window: string): string {
+  return `Session slice (USER/ASSISTANT turns around a skill invocation):\n\n${window}\n\n` +
+    "Did the user's task get accomplished correctly? JSON only.";
+}
+
+function extractJson(raw: string): Record<string, unknown> | null {
+  let s = raw.trim();
+  const fence = s.match(/```(?:json)?\s*([\s\S]*?)```/);
+  if (fence) s = fence[1].trim();
+  const a = s.indexOf("{");
+  const b = s.lastIndexOf("}");
+  if (a === -1 || b <= a) return null;
+  try { return JSON.parse(s.slice(a, b + 1)) as Record<string, unknown>; } catch { return null; }
+}
+
+/** Parse a model response into a verdict; unparseable → conservative success=1. */
+export function parseVerdict(raw: string): SuccessVerdict {
+  const j = extractJson(raw);
+  if (!j) return { success: 1, confidence: 0, reason: "unparseable judge output" };
+  const fail = j.success === 0 || j.success === "0" || j.success === false;
+  const confidence = typeof j.confidence === "number" ? Math.max(0, Math.min(1, j.confidence)) : 0.5;
+  const reason = typeof j.reason === "string" ? j.reason.slice(0, 240) : "";
+  return { success: fail ? 0 : 1, confidence, reason };
+}
+
+/** Default backend: spawns `claude -p` (default model "haiku"), all listed tools denied; resolves with the CLI's `result` text, rejects on spawn error / non-zero exit / 120s timeout. */
+function claudeJudge(model = "haiku"): ModelCall {
+  return (system, user) => new Promise<string>((resolve, reject) => {
+    const args = [
+      "-p", user, "--model", model, "--no-session-persistence",
+      "--output-format", "json", "--system-prompt", system,
+      "--disallowed-tools", "Bash", "Edit", "Write", "Read", "Glob", "Grep", "WebFetch", "WebSearch", "Task",
+    ];
+    const child = spawn("claude", args, { stdio: ["ignore", "pipe", "pipe"] });
+    let out = "";
+    let err = "";
+    const timer = setTimeout(() => { child.kill("SIGKILL"); reject(new Error("judge timed out")); }, 120_000);
+    child.stdout.setEncoding("utf8").on("data", (chunk: string) => { out += chunk; }); // decode as a stream so a multi-byte char split across chunks isn't corrupted
+    child.stderr.setEncoding("utf8").on("data", (chunk: string) => { err += chunk; });
+    child.on("error", (e) => { clearTimeout(timer); reject(e); });
+    child.on("close", (code) => {
+      clearTimeout(timer);
+      if (code !== 0) return reject(new Error(`claude exit ${code}: ${err.slice(0, 200)}`));
+      try { resolve(String((JSON.parse(out) as { result?: unknown }).result ?? "")); } // unwrap the --output-format json envelope
+      catch { resolve(out); } // stdout wasn't the JSON envelope: hand back the raw text
+    });
+  });
+}
+
+export async function judgeSuccess(window: string, opts: { model?: ModelCall } = {}): Promise<SuccessVerdict> {
+  if (!window.trim()) return { success: 1, confidence: 0, reason: "empty window" };
+  const model = opts.model ?? claudeJudge();
+  try {
+    return parseVerdict(await model(SYSTEM, buildUserPrompt(window)));
+  } catch (e: unknown) {
+    return { success: 1, confidence: 0, reason: `judge failed: ${(e as Error)?.message ?? String(e)}` };
+  }
+}
diff --git a/tests/shared/success-judge.test.ts b/tests/shared/success-judge.test.ts
new file mode 100644
index 00000000..7386c65b
--- /dev/null
+++ b/tests/shared/success-judge.test.ts
@@ -0,0 +1,44 @@
+import { describe, it, expect, vi } from "vitest";
+import { parseVerdict, judgeSuccess } from "../../src/skillify/success-judge.js";
+
+describe("parseVerdict", () => {
+  it("parses a clean JSON verdict", () => {
+    expect(parseVerdict('{"success":0,"confidence":0.9,"reason":"mocks the client"}'))
+      .toEqual({ success: 0, confidence: 0.9, reason: "mocks the client" });
+  });
+  it("tolerates ```json fences and surrounding prose", () => {
+    const raw = "Here is my judgment:\n```json\n{\"success\": 1, \"confidence\": 0.8, \"reason\": \"ok\"}\n```\nDone.";
+    expect(parseVerdict(raw)).toEqual({ success: 1, confidence: 0.8, reason: "ok" });
+  });
+  it("treats success false/\"0\" as failure and clamps confidence", () => {
+    expect(parseVerdict('{"success":false,"confidence":2,"reason":"x"}')).toMatchObject({ success: 0, confidence: 1 });
+    expect(parseVerdict('{"success":"0","confidence":-1,"reason":"x"}')).toMatchObject({ success: 0, confidence: 0 });
+  });
+  it("is conservative (success=1) on unparseable output", () => {
+    expect(parseVerdict("the model rambled with no json")).toMatchObject({ success: 1, confidence: 0 });
+  });
+});
+
+describe("judgeSuccess", () => {
+  it("returns the judged verdict from the injected model", async () => {
+    const model = vi.fn(async (_system: string, _user: string) => '{"success":0,"confidence":0.95,"reason":"no flush, event never sends"}');
+    const v = await judgeSuccess("USER: do X\n\nASSISTANT: mocked it", { model });
+    expect(v.success).toBe(0);
+    expect(model).toHaveBeenCalledOnce();
+    // the judge must be told to ignore mood (anti-sycophancy) + asked for JSON
+    expect(model.mock.calls[0][0]).toMatch(/praised-but-wrong|Ignore whether the user/i);
+  });
+
+  it("is conservative (success=1) when the model call throws — a flaky judge can't manufacture failure", async () => {
+    const v = await judgeSuccess("USER: x\n\nASSISTANT: y", { model: vi.fn(async () => { throw new Error("boom"); }) });
+    expect(v.success).toBe(1);
+    expect(v.reason).toContain("judge failed");
+  });
+
+  it("short-circuits an empty window without calling the model", async () => {
+    const model = vi.fn(async () => "{}");
+    const v = await judgeSuccess("   ", { model });
+    expect(v.success).toBe(1);
+    expect(model).not.toHaveBeenCalled();
+  });
+});

From f4efb8ae3ae43110714304b21ce5d2e5a769c6c4 Mon Sep 17 00:00:00 2001
From: kaghni <kamo.aghbalyan@activeloop.ai>
Date: Fri, 5 Jun 2026 05:22:02 +0000
Subject: [PATCH 06/30] =?UTF-8?q?feat(skillopt):=20deficiency=20detector?=
 =?UTF-8?q?=20(invocation=20=E2=86=92=20anchor=20=E2=86=92=20judge=20?=
 =?UTF-8?q?=E2=86=92=20flag)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Orchestrates the engine's detection step: group org-skill invocations, window each,
run the free anchor, and judge ONLY anchored windows. A confirmed failure needs
anchor AND judge agreement (precision). A skill is deficient at >=minInvocations
AND >=failureRateThreshold confirmed-failure rate. Returns per-skill stats +
deficientCount (the caller's >=5 fire gate). All injected; 2 tests assert the
min-n gate and that the judge runs only on anchored windows.
---
 src/skillify/deficiency-detector.ts      | 98 ++++++++++++++++++++++++
 tests/shared/deficiency-detector.test.ts | 69 +++++++++++++++++
 2 files changed, 167 insertions(+)
 create mode 100644 src/skillify/deficiency-detector.ts
 create mode 100644 tests/shared/deficiency-detector.test.ts

diff --git a/src/skillify/deficiency-detector.ts b/src/skillify/deficiency-detector.ts
new file mode 100644
index 00000000..70152597
--- /dev/null
+++ b/src/skillify/deficiency-detector.ts
@@ -0,0 +1,98 @@
+/**
+ * Deficiency detector — the core of the engine's "which skills are bad" step.
+ *
+ * For each org-skill invocation: window the transcript around it, run the FREE
+ * level-1 anchor (user pushback?), and only if anchored spend a level-2 judge
+ * call (was the task actually accomplished?). A "confirmed failure" requires BOTH
+ * — high precision, so we never churn a good skill. Aggregate per skill: a skill
+ * is deficient if it has enough invocations AND a high confirmed-failure rate.
+ *
+ * Token discipline: the judge runs ONLY on anchored windows (a fraction), on a
+ * windowed slice (not whole sessions). Everything injectable (query + judge model)
+ * so the whole orchestration is unit-tested with zero live Deeplake / LLM.
+ *
+ * The ≥5 fire gate lives with the caller (worker): we just return deficientCount.
+ */
+import {
+  listSkillInvocations, windowedTurns, type QueryFn, type SkillInvocation,
+} from "./skill-invocations.js";
+import { detectAnchor } from "./session-anchor.js";
+import { judgeSuccess, type ModelCall } from "./success-judge.js";
+
+export interface SkillDeficiency {
+  name: string;
+  author: string;
+  invocations: number;        // org-skill invocations examined
+  anchored: number;           // had a level-1 anchor → judged
+  confirmedFailures: number;  // anchor AND judge said success=0
+  failureRate: number;        // confirmedFailures / invocations
+  deficient: boolean;         // failureRate >= threshold AND invocations >= minInvocations
+  examples: string[];         // a few failure reasons (for the proposer)
+}
+
+export interface DetectorConfig {
+  minInvocations?: number;       // min-n per skill before we trust the rate (default 8)
+  failureRateThreshold?: number; // confirmed-failure rate to flag deficient (default 0.4)
+  window?: { before?: number; after?: number; maxChars?: number }; // passed to windowedTurns; NOTE(review): maxChars is not applied on this path (windowedTurns reads only before/after)
+  judge?: ModelCall;             // injected; default = real claude judge
+  sinceIso?: string;             // lookback bound
+  limit?: number;                // cap invocation rows pulled
+}
+
+const skillKey = (name: string, author: string) => `${name}--${author}`;
+
+export interface DetectionResult {
+  skills: SkillDeficiency[];
+  deficientCount: number;
+}
+
+export async function detectDeficientSkills(
+  query: QueryFn,
+  sessionsTable: string,
+  cfg: DetectorConfig = {},
+): Promise<DetectionResult> {
+  const minInvocations = cfg.minInvocations ?? 8;
+  const threshold = cfg.failureRateThreshold ?? 0.4;
+
+  const invocations = await listSkillInvocations(query, sessionsTable, { sinceIso: cfg.sinceIso, limit: cfg.limit });
+
+  const groups = new Map<string, SkillInvocation[]>();
+  for (const inv of invocations) {
+    const k = skillKey(inv.name, inv.author);
+    const arr = groups.get(k);
+    if (arr) arr.push(inv); else groups.set(k, [inv]);
+  }
+
+  const skills: SkillDeficiency[] = [];
+  for (const list of groups.values()) {
+    let anchored = 0;
+    let confirmed = 0;
+    const examples: string[] = [];
+    for (const inv of list) {
+      const turns = await windowedTurns(query, sessionsTable, inv, cfg.window);
+      const anchor = detectAnchor(turns);
+      if (!anchor.anchored) continue;          // free filter — no judge call
+      anchored++;
+      const window = turns.map((t) => `${t.role}: ${t.text}`).join("\n\n");
+      const verdict = await judgeSuccess(window, { model: cfg.judge });
+      if (verdict.success === 0) {             // confirmed: anchor AND judge agree
+        confirmed++;
+        if (examples.length < 3) examples.push(verdict.reason || anchor.evidence);
+      }
+    }
+    const failureRate = list.length ? confirmed / list.length : 0;
+    skills.push({
+      name: list[0].name,
+      author: list[0].author,
+      invocations: list.length,
+      anchored,
+      confirmedFailures: confirmed,
+      failureRate,
+      deficient: list.length >= minInvocations && failureRate >= threshold,
+      examples,
+    });
+  }
+
+  skills.sort((a, b) => b.failureRate - a.failureRate || b.invocations - a.invocations);
+  return { skills, deficientCount: skills.filter((s) => s.deficient).length };
+}
diff --git a/tests/shared/deficiency-detector.test.ts b/tests/shared/deficiency-detector.test.ts
new file mode 100644
index 00000000..6e00b3d2
--- /dev/null
+++ b/tests/shared/deficiency-detector.test.ts
@@ -0,0 +1,69 @@
+import { describe, it, expect, vi } from "vitest";
+import { detectDeficientSkills } from "../../src/skillify/deficiency-detector.js";
+
+const TABLE = "sessions";
+
+const invRow = (skill: string, sid: string) => ({
+  message: { type: "tool_call", tool_name: "Skill", tool_input: JSON.stringify({ skill }), session_id: sid, timestamp: sid },
+  last_update_date: sid,
+});
+const transcript = (skill: string, sid: string, pushback: boolean) => [
+  { message: { type: "user_message", content: "do it" } },
+  { message: { type: "assistant_message", content: "done" } },
+  { message: { type: "tool_call", tool_name: "Skill", tool_input: JSON.stringify({ skill }), timestamp: sid } },
+  { message: { type: "user_message", content: pushback ? "no that's wrong, it mocks the client" : "thanks, perfect" } },
+];
+
+function world() {
+  const invs: Array<Record<string, unknown>> = [];
+  const transcripts = new Map<string, Array<Record<string, unknown>>>();
+  const add = (skill: string, sid: string, pushback: boolean) => {
+    invs.push(invRow(skill, sid));
+    transcripts.set(sid, transcript(skill, sid, pushback));
+  };
+  for (let i = 0; i < 10; i++) add("bad--auth", `bad${i}`, i < 5);     // 5/10 pushback → deficient
+  for (let i = 0; i < 10; i++) add("good--auth", `good${i}`, false);   // 0 pushback → healthy
+  for (let i = 0; i < 3; i++) add("sparse--auth", `sparse${i}`, true); // all fail but too few (min-n)
+  return { invs, transcripts };
+}
+
+describe("detectDeficientSkills", () => {
+  it("flags only skills with enough invocations AND a high confirmed-failure rate", async () => {
+    const { invs, transcripts } = world();
+    const judge = vi.fn(async (_s: string, _u: string) => '{"success":0,"confidence":0.9,"reason":"mocks the client"}');
+    const query = vi.fn(async (sql: string) => {
+      if (sql.includes('"Skill"') && sql.includes("ORDER BY last_update_date")) return invs; // the invocation list
+      const m = sql.match(/\/sessions\/%([^%]+)%/);                                           // a window query
+      return m ? (transcripts.get(m[1]) ?? []) : [];
+    });
+
+    const { skills, deficientCount } = await detectDeficientSkills(query, TABLE, { judge });
+    const bad = skills.find((s) => s.name === "bad")!;
+    const good = skills.find((s) => s.name === "good")!;
+    const sparse = skills.find((s) => s.name === "sparse")!;
+
+    expect(bad).toMatchObject({ invocations: 10, anchored: 5, confirmedFailures: 5, deficient: true });
+    expect(bad.failureRate).toBeCloseTo(0.5);
+    expect(good).toMatchObject({ invocations: 10, anchored: 0, confirmedFailures: 0, deficient: false });
+    expect(sparse).toMatchObject({ invocations: 3, confirmedFailures: 3, deficient: false }); // min-n blocks it
+    expect(deficientCount).toBe(1);
+
+    // token discipline: judge runs ONLY on anchored windows (5 bad + 3 sparse = 8), never the 10 good
+    expect(judge).toHaveBeenCalledTimes(8);
+  });
+
+  it("respects a custom threshold + min-n", async () => {
+    const { invs, transcripts } = world();
+    const judge = vi.fn(async (_s: string, _u: string) => '{"success":0,"confidence":0.9,"reason":"x"}');
+    const query = vi.fn(async (sql: string) => {
+      if (sql.includes('"Skill"') && sql.includes("ORDER BY last_update_date")) return invs;
+      const m = sql.match(/\/sessions\/%([^%]+)%/);
+      return m ? (transcripts.get(m[1]) ?? []) : [];
+    });
+    // minInvocations 3, threshold 0.9 → only "sparse" (rate 1.0, 3 inv) qualifies; "bad" (0.5) doesn't
+    const { deficientCount, skills } = await detectDeficientSkills(query, TABLE, { judge, minInvocations: 3, failureRateThreshold: 0.9 });
+    expect(skills.find((s) => s.name === "sparse")!.deficient).toBe(true);
+    expect(skills.find((s) => s.name === "bad")!.deficient).toBe(false);
+    expect(deficientCount).toBe(1);
+  });
+});

From 93149a0d7109853a8f7effae462162aab8c908f1 Mon Sep 17 00:00:00 2001
From: kaghni <kamo.aghbalyan@activeloop.ai>
Date: Fri, 5 Jun 2026 05:26:26 +0000
Subject: [PATCH 07/30] feat(skillopt): structured edits + budget + slow-update
 region

Port of SkillOpt's edit-application: append/insert_after/replace/delete ops, the
edit budget (textual learning rate), and the protected
<!-- SLOW_UPDATE_START/END --> region (the paper's slow-update) that fast edits
must not touch. Pure + deterministic; 7 tests cover each op, the budget, and the
protection.
---
 src/skillify/skill-edits.ts      | 105 +++++++++++++++++++++++++++++++
 tests/shared/skill-edits.test.ts |  55 ++++++++++++++++
 2 files changed, 160 insertions(+)
 create mode 100644 src/skillify/skill-edits.ts
 create mode 100644 tests/shared/skill-edits.test.ts

diff --git a/src/skillify/skill-edits.ts b/src/skillify/skill-edits.ts
new file mode 100644
index 00000000..218ba6a0
--- /dev/null
+++ b/src/skillify/skill-edits.ts
@@ -0,0 +1,105 @@
+/**
+ * Structured, bounded edits over a markdown SKILL.md — the paper's edit operations
+ * (append / insert_after / replace / delete) and "textual learning rate" (edit
+ * budget). Port of SkillOpt's skillopt/optimizer/skill.py.
+ *
+ * A protected region — between <!-- SLOW_UPDATE_START --> and <!-- SLOW_UPDATE_END -->
+ * — holds longitudinal guidance that fast per-edit changes must NOT touch (the
+ * paper's slow-update). Edits targeting it are skipped, and `append` lands above it.
+ *
+ * Pure + deterministic — no I/O, fully unit-testable.
+ */
+export type EditOp = "append" | "insert_after" | "replace" | "delete";
+export interface Edit {
+  op: EditOp;
+  target?: string;  // anchor text for insert_after / replace / delete
+  content?: string; // new text for append / insert_after / replace
+}
+
+export const SU_START = "<!-- SLOW_UPDATE_START -->";
+export const SU_END = "<!-- SLOW_UPDATE_END -->";
+
+function protectedRange(skill: string): [number, number] | null { // [start, end) of the slow-update region incl. both markers, or null
+  const a = skill.indexOf(SU_START);
+  const b = a === -1 ? -1 : skill.indexOf(SU_END, a + SU_START.length); // first END *after* START, so a stray earlier END can't disable protection
+  if (a === -1 || b === -1) return null; // no complete START…END pair → nothing protected
+  return [a, b + SU_END.length];
+}
+
+function targetsProtected(skill: string, target: string): boolean { // true when the first occurrence of `target` overlaps the protected region
+  const r = protectedRange(skill);
+  if (!r || !target) return false;
+  const idx = skill.indexOf(target); // first occurrence — the same one applyEdits acts on
+  return idx !== -1 && idx < r[1] && idx + target.length > r[0]; // any overlap counts, not just a start inside the region
+}
+
+/** Enforce the edit budget ("textual learning rate"): keep at most `budget` edits. */
+export function selectEdits(edits: Edit[], budget: number): Edit[] {
+  return edits.slice(0, Math.max(0, budget));
+}
+
+export interface ApplyResult {
+  skill: string;
+  report: string[];
+  applied: number; // how many edits actually changed the doc
+}
+
+/** Apply bounded structured edits; protected-region targets are skipped. */
+export function applyEdits(skill: string, edits: Edit[]): ApplyResult {
+  let s = skill;
+  const report: string[] = [];
+  let applied = 0;
+  const ok = (msg: string) => { applied++; report.push(`OK ${msg}`); };
+
+  for (const e of edits) {
+    if (e.target && targetsProtected(s, e.target)) {
+      report.push(`SKIP ${e.op}: targets protected slow-update region`);
+      continue;
+    }
+    switch (e.op) {
+      case "append": {
+        const content = (e.content ?? "").trim();
+        if (!content) { report.push("SKIP append: empty content"); break; }
+        const r = protectedRange(s);
+        if (r) s = s.slice(0, r[0]) + content + "\n\n" + s.slice(r[0]);
+        else s = s.replace(/\s*$/, "") + "\n\n" + content + "\n";
+        ok(`append (+${content.length} chars)`);
+        break;
+      }
+      case "insert_after": {
+        const target = e.target ?? "";
+        const content = (e.content ?? "").trim();
+        if (!target || !content) { report.push("SKIP insert_after: missing target/content"); break; }
+        const idx = s.indexOf(target);
+        if (idx === -1) { report.push("SKIP insert_after: target not found"); break; }
+        const lineEnd = s.indexOf("\n", idx + target.length);
+        const at = lineEnd === -1 ? s.length : lineEnd;
+        s = s.slice(0, at) + "\n" + content + s.slice(at);
+        ok("insert_after");
+        break;
+      }
+      case "replace": {
+        const target = e.target ?? "";
+        const content = e.content ?? "";
+        if (!target) { report.push("SKIP replace: missing target"); break; }
+        const idx = s.indexOf(target);
+        if (idx === -1) { report.push("SKIP replace: target not found"); break; }
+        s = s.slice(0, idx) + content + s.slice(idx + target.length);
+        ok("replace");
+        break;
+      }
+      case "delete": {
+        const target = e.target ?? "";
+        if (!target) { report.push("SKIP delete: missing target"); break; }
+        const idx = s.indexOf(target);
+        if (idx === -1) { report.push("SKIP delete: target not found"); break; }
+        s = s.slice(0, idx) + s.slice(idx + target.length);
+        ok("delete");
+        break;
+      }
+      default:
+        report.push(`SKIP unknown op: ${(e as Edit).op}`);
+    }
+  }
+  return { skill: s, report, applied };
+}
diff --git a/tests/shared/skill-edits.test.ts b/tests/shared/skill-edits.test.ts
new file mode 100644
index 00000000..3322cb48
--- /dev/null
+++ b/tests/shared/skill-edits.test.ts
@@ -0,0 +1,55 @@
+import { describe, it, expect } from "vitest";
+import { applyEdits, selectEdits, SU_START, SU_END } from "../../src/skillify/skill-edits.js";
+
+describe("applyEdits", () => {
+  const base = "## Rules\n1. mock the client\n2. skip flush";
+
+  it("append adds content at the end", () => {
+    const r = applyEdits(base, [{ op: "append", content: "3. verify via the API" }]);
+    expect(r.skill).toContain("3. verify via the API");
+    expect(r.applied).toBe(1);
+  });
+
+  it("insert_after inserts on the line after the target", () => {
+    const r = applyEdits(base, [{ op: "insert_after", target: "1. mock the client", content: "(NEVER mock — it hides failures)" }]);
+    expect(r.skill).toMatch(/1\. mock the client\n\(NEVER mock — it hides failures\)\n2\. skip flush/);
+  });
+
+  it("replace swaps the target text", () => {
+    const r = applyEdits(base, [{ op: "replace", target: "skip flush", content: "ALWAYS flush" }]);
+    expect(r.skill).toContain("2. ALWAYS flush");
+    expect(r.skill).not.toContain("skip flush");
+  });
+
+  it("delete removes the target text", () => {
+    const r = applyEdits(base, [{ op: "delete", target: "\n2. skip flush" }]);
+    expect(r.skill).toBe("## Rules\n1. mock the client");
+  });
+
+  it("skips edits whose target isn't found (and counts only applied)", () => {
+    const r = applyEdits(base, [{ op: "replace", target: "nonexistent", content: "x" }, { op: "append", content: "added" }]);
+    expect(r.applied).toBe(1);
+    expect(r.report.some((l) => l.includes("SKIP replace: target not found"))).toBe(true);
+  });
+
+  it("protects the slow-update region: skips edits targeting it, appends ABOVE it", () => {
+    const doc = `## Rules\n1. a\n\n${SU_START}\nLongitudinal: prefer X over Y.\n${SU_END}`;
+    const r = applyEdits(doc, [
+      { op: "delete", target: "prefer X over Y" },      // targets protected → skipped
+      { op: "append", content: "2. b" },                // lands above the region
+    ]);
+    expect(r.skill).toContain("prefer X over Y");        // protected content untouched
+    expect(r.report.some((l) => l.includes("protected slow-update region"))).toBe(true);
+    // appended content sits before the protected block
+    expect(r.skill.indexOf("2. b")).toBeLessThan(r.skill.indexOf(SU_START));
+  });
+});
+
+describe("selectEdits (edit budget)", () => {
+  const edits = [1, 2, 3, 4].map((i) => ({ op: "append" as const, content: `${i}` }));
+  it("keeps at most `budget` edits", () => {
+    expect(selectEdits(edits, 2)).toHaveLength(2);
+    expect(selectEdits(edits, 0)).toHaveLength(0);
+    expect(selectEdits(edits, 99)).toHaveLength(4);
+  });
+});

From a93ed3d4b416472f36ace19271d3ac2d76411950 Mon Sep 17 00:00:00 2001
From: kaghni <kamo.aghbalyan@activeloop.ai>
Date: Fri, 5 Jun 2026 05:26:26 +0000
Subject: [PATCH 08/30] feat(skillopt): proposer (reflect -> structured edits)
 + shared claude backend
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The paper's backward pass: from a deficient skill's body + its confirmed failures,
the optimizer diagnoses the single recurring weakness and emits bounded structured
edits (slow-update region off-limits), applied locally to a candidate body —
nothing published. Extracts the shared claude -p backend (claude-model.ts) used by
both judge and proposer; success-judge refactored onto it. Injected model; tolerant
edit parse; 5 proposer tests.
---
 src/skillify/claude-model.ts        | 35 ++++++++++++
 src/skillify/skill-proposer.ts      | 87 +++++++++++++++++++++++++++++
 src/skillify/success-judge.ts       | 33 ++---------
 tests/shared/skill-proposer.test.ts | 51 +++++++++++++++++
 4 files changed, 177 insertions(+), 29 deletions(-)
 create mode 100644 src/skillify/claude-model.ts
 create mode 100644 src/skillify/skill-proposer.ts
 create mode 100644 tests/shared/skill-proposer.test.ts

diff --git a/src/skillify/claude-model.ts b/src/skillify/claude-model.ts
new file mode 100644
index 00000000..47b5e8f0
--- /dev/null
+++ b/src/skillify/claude-model.ts
@@ -0,0 +1,35 @@
+/**
+ * Shared `claude -p` backend for the engine's LLM steps (success-judge, proposer).
+ * All tools denied → pure-text generation. Runs on the USER's own agent, so cost
+ * lands on the user. Returned as an injectable ModelCall so every LLM step is
+ * unit-testable with zero real calls.
+ */
+import { spawn } from "node:child_process";
+
+/** (systemPrompt, userPrompt) -> raw model text. */
+export type ModelCall = (systemPrompt: string, userPrompt: string) => Promise<string>;
+
+const DENY = ["Bash", "Edit", "Write", "Read", "Glob", "Grep", "WebFetch", "WebSearch", "Task"];
+
+/** Build a ModelCall that runs `claude -p` with all tools denied; rejects on spawn error, non-zero exit, or timeout. */
+export function claudeModel(model: string, opts: { timeoutMs?: number } = {}): ModelCall {
+  const timeoutMs = opts.timeoutMs ?? 120_000;
+  return (system, user) => new Promise<string>((resolve, reject) => {
+    const args = [
+      "-p", user, "--model", model, "--no-session-persistence",
+      "--output-format", "json", "--system-prompt", system, "--disallowed-tools", ...DENY,
+    ];
+    const child = spawn("claude", args, { stdio: ["ignore", "pipe", "pipe"] });
+    let out = "", err = "";
+    const timer = setTimeout(() => { child.kill("SIGKILL"); reject(new Error("claude timed out")); }, timeoutMs);
+    // Decode at the stream: String(chunk) per Buffer corrupts a multibyte char split across two chunks.
+    child.stdout.setEncoding("utf8").on("data", (d: string) => { out += d; });
+    child.stderr.setEncoding("utf8").on("data", (d: string) => { err += d; });
+    child.on("error", (e) => { clearTimeout(timer); reject(e); });
+    child.on("close", (code) => {
+      clearTimeout(timer); // after a timeout the promise is already rejected; the calls below are then no-ops
+      if (code !== 0) return reject(new Error(`claude exit ${code}: ${err.slice(0, 200)}`));
+      // Resolve with the JSON envelope's `result` field; stdout that isn't JSON is returned raw.
+      try { resolve(String((JSON.parse(out) as { result?: unknown }).result ?? "")); } catch { resolve(out); }
+    });
+  });
+}
diff --git a/src/skillify/skill-proposer.ts b/src/skillify/skill-proposer.ts
new file mode 100644
index 00000000..aa9c0159
--- /dev/null
+++ b/src/skillify/skill-proposer.ts
@@ -0,0 +1,87 @@
+/**
+ * Proposer — the engine's "reflect → edit" step (the paper's backward pass).
+ * Given a deficient skill's body + the concrete failures the detector confirmed,
+ * the optimizer diagnoses the single recurring weakness and proposes a SMALL set
+ * of structured edits, bounded by the edit budget ("textual learning rate"). The
+ * protected slow-update region is off-limits. Edits are applied locally to produce
+ * the candidate body — NOTHING is published here (publish is a separate, gated step).
+ *
+ * Runs on the user's agent via an injected ModelCall (default = claude sonnet),
+ * so the reflect logic is unit-testable with zero real LLM calls.
+ */
+import { applyEdits, selectEdits, SU_START, SU_END, type Edit, type EditOp } from "./skill-edits.js";
+import { claudeModel, type ModelCall } from "./claude-model.js";
+
+export interface Proposal {
+  edits: Edit[];       // edits kept after the budget (including any that were then skipped on apply)
+  editedBody: string;  // skill body after applying them (the input body when the model call failed)
+  report: string[];    // per-edit OK/SKIP log, or a single "proposer model call failed" line
+  changed: boolean;    // did anything actually change?
+}
+
+export interface ProposeConfig {
+  editBudget?: number; // max edits to keep (default 3)
+  model?: ModelCall;   // injected; default = claude sonnet
+}
+
+const SYSTEM =
+  "You improve an engineering SKILL document that has been producing repeated, " +
+  "confirmed failures. Diagnose the SINGLE recurring weakness behind the failures " +
+  "and propose a SMALL set of structured edits that fix it. Do NOT rewrite the " +
+  `whole doc, and do NOT touch anything between ${SU_START} and ${SU_END}. Reply ` +
+  'with ONLY a JSON array of edits, each: {"op":"append|insert_after|replace|' +
+  'delete","target":"<exact existing text to anchor on; required for ' +
+  'insert_after/replace/delete>","content":"<new text; required for ' +
+  'append/insert_after/replace>"}. Prefer the smallest change that fixes the weakness.';
+
+function buildUserPrompt(body: string, failures: string[]): string {
+  const numbered = failures.slice(0, 8).map((failure, idx) => `${idx + 1}. ${failure}`); // at most 8 cases, 1-based
+  return ["CURRENT SKILL:\n" + body, "CONFIRMED FAILURES it produced (user pushed back AND a judge confirmed the task was not accomplished):\n" + numbered.join("\n"), "Propose the bounded edits. JSON array only."].join("\n\n");
+}
+
+const OPS = new Set<EditOp>(["append", "insert_after", "replace", "delete"]);
+
+/** Tolerant parse of a JSON array of edits (handles ```fences / surrounding prose); [] when none parses. */
+export function parseEdits(raw: string): Edit[] {
+  // Try the whole reply first, then the fenced block: fence-first would truncate an
+  // array whose edit content itself contains ``` (skills are markdown with code fences).
+  const fenced = raw.match(/```(?:json)?\s*([\s\S]*?)```/)?.[1];
+  let arr: unknown;
+  for (const s of fenced === undefined ? [raw] : [raw, fenced]) {
+    const a = s.indexOf("["), b = s.lastIndexOf("]");
+    if (a === -1 || b <= a) continue;
+    try { arr = JSON.parse(s.slice(a, b + 1)); } catch { arr = undefined; }
+    if (Array.isArray(arr)) break;
+  }
+  if (!Array.isArray(arr)) return [];
+  const out: Edit[] = [];
+  for (const e of arr) {
+    if (!e || typeof e !== "object") continue; // drop non-object entries
+    const { op, target, content } = e as { op?: unknown; target?: unknown; content?: unknown };
+    if (typeof op !== "string" || !OPS.has(op as EditOp)) continue; // drop unknown ops
+    out.push({
+      op: op as EditOp,
+      ...(typeof target === "string" ? { target } : {}),
+      ...(typeof content === "string" ? { content } : {}),
+    });
+  }
+  return out;
+}
+
+/**
+ * Ask the model for bounded edits and apply them to `skillBody` (nothing is published).
+ * A failed model call, or a reply with no usable edits, returns the body untouched with
+ * `changed: false`. `editBudget` defaults to 3; the default model is claude sonnet.
+ */
+export async function proposeSkillEdit(skillBody: string, failures: string[], cfg: ProposeConfig = {}): Promise<Proposal> {
+  const budget = cfg.editBudget ?? 3;
+  const model = cfg.model ?? claudeModel("sonnet");
+  let raw: string;
+  try { raw = await model(SYSTEM, buildUserPrompt(skillBody, failures)); }
+  catch { return { edits: [], editedBody: skillBody, report: ["proposer model call failed"], changed: false }; }
+  const edits = selectEdits(parseEdits(raw), budget);
+  const { skill, report } = applyEdits(skillBody, edits);
+  // An edit can apply yet leave the text identical (e.g. replace X with X); only a real
+  // difference counts, so callers never write a "proposal" equal to the live body.
+  return { edits, editedBody: skill, report, changed: skill !== skillBody };
+}
diff --git a/src/skillify/success-judge.ts b/src/skillify/success-judge.ts
index 3ad672a1..c2d65626 100644
--- a/src/skillify/success-judge.ts
+++ b/src/skillify/success-judge.ts
@@ -15,7 +15,9 @@
  * success=1 (do NOT count as a failure), so a flaky judge can never manufacture
  * deficiency — it can only fail to detect (which the next run catches).
  */
-import { spawn } from "node:child_process";
+import { claudeModel, type ModelCall } from "./claude-model.js";
+
+export type { ModelCall };
 
 export interface SuccessVerdict {
   success: 0 | 1;
@@ -23,9 +25,6 @@ export interface SuccessVerdict {
   reason: string;
 }
 
-/** (systemPrompt, userPrompt) -> raw model text. Injected for tests. */
-export type ModelCall = (systemPrompt: string, userPrompt: string) => Promise<string>;
-
 const SYSTEM =
   "You are a strict engineering reviewer. Judge ONLY whether the user's task was " +
   "actually accomplished CORRECTLY in this session slice. Ignore whether the user " +
@@ -58,33 +57,9 @@ export function parseVerdict(raw: string): SuccessVerdict {
   return { success: fail ? 0 : 1, confidence, reason };
 }
 
-/** Default backend: claude -p, cheap model, all tools denied (pure-text judgment). */
-function claudeJudge(model = "haiku"): ModelCall {
-  return (system, user) => new Promise<string>((resolve, reject) => {
-    const args = [
-      "-p", user, "--model", model, "--no-session-persistence",
-      "--output-format", "json", "--system-prompt", system,
-      "--disallowed-tools", "Bash", "Edit", "Write", "Read", "Glob", "Grep", "WebFetch", "WebSearch", "Task",
-    ];
-    const child = spawn("claude", args, { stdio: ["ignore", "pipe", "pipe"] });
-    let out = "";
-    let err = "";
-    const timer = setTimeout(() => { child.kill("SIGKILL"); reject(new Error("judge timed out")); }, 120_000);
-    child.stdout.on("data", (d) => { out += String(d); });
-    child.stderr.on("data", (d) => { err += String(d); });
-    child.on("error", (e) => { clearTimeout(timer); reject(e); });
-    child.on("close", (code) => {
-      clearTimeout(timer);
-      if (code !== 0) return reject(new Error(`claude exit ${code}: ${err.slice(0, 200)}`));
-      try { resolve(String((JSON.parse(out) as { result?: unknown }).result ?? "")); }
-      catch { resolve(out); }
-    });
-  });
-}
-
 export async function judgeSuccess(window: string, opts: { model?: ModelCall } = {}): Promise<SuccessVerdict> {
   if (!window.trim()) return { success: 1, confidence: 0, reason: "empty window" };
-  const model = opts.model ?? claudeJudge();
+  const model = opts.model ?? claudeModel("haiku"); // cheap default; judge only runs on anchored windows
   try {
     return parseVerdict(await model(SYSTEM, buildUserPrompt(window)));
   } catch (e: unknown) {
diff --git a/tests/shared/skill-proposer.test.ts b/tests/shared/skill-proposer.test.ts
new file mode 100644
index 00000000..cb1a1c00
--- /dev/null
+++ b/tests/shared/skill-proposer.test.ts
@@ -0,0 +1,51 @@
+import { describe, it, expect, vi } from "vitest";
+import { parseEdits, proposeSkillEdit } from "../../src/skillify/skill-proposer.js";
+
+describe("parseEdits", () => {
+  it("parses a JSON array, tolerating fences/prose, dropping invalid ops + non-objects", () => {
+    const raw = "Sure:\n```json\n[" +
+      '{"op":"replace","target":"mock the client","content":"NEVER mock"},' +
+      '{"op":"bogus","target":"x"},' +              // invalid op → dropped
+      '"nope",' +                                    // non-object → dropped
+      '{"op":"append","content":"verify via API"}' +
+      "]\n```";
+    const edits = parseEdits(raw);
+    expect(edits).toEqual([
+      { op: "replace", target: "mock the client", content: "NEVER mock" },
+      { op: "append", content: "verify via API" },
+    ]);
+  });
+  it("returns [] when there's no array", () => {
+    expect(parseEdits("the model refused")).toEqual([]);
+  });
+});
+
+describe("proposeSkillEdit", () => {
+  const body = "## Rules\n1. mock the client\n2. skip flush";
+  const failures = ["mocked the client so the test passes even when the event never sends"];
+
+  it("applies the proposed edits to produce a candidate body", async () => {
+    const model = vi.fn(async (_s: string, _u: string) =>
+      '[{"op":"replace","target":"mock the client","content":"NEVER mock — assert on the real client"}]');
+    const p = await proposeSkillEdit(body, failures, { model });
+    expect(p.changed).toBe(true);
+    expect(p.editedBody).toContain("NEVER mock — assert on the real client");
+    // the optimizer is told to diagnose the recurring weakness + emit JSON edits
+    expect(model.mock.calls[0][0]).toMatch(/recurring weakness/i);
+    expect(model.mock.calls[0][1]).toContain("CONFIRMED FAILURES");
+  });
+
+  it("enforces the edit budget", async () => {
+    const model = vi.fn(async (_s: string, _u: string) =>
+      '[{"op":"append","content":"a"},{"op":"append","content":"b"},{"op":"append","content":"c"}]');
+    const p = await proposeSkillEdit(body, failures, { model, editBudget: 1 });
+    expect(p.edits).toHaveLength(1);
+    expect(p.editedBody).toContain("\na");
+    expect(p.editedBody).not.toContain("\nb");
+  });
+
+  it("is a no-op when the model fails or proposes nothing", async () => {
+    expect((await proposeSkillEdit(body, failures, { model: vi.fn(async () => { throw new Error("x"); }) })).changed).toBe(false);
+    expect((await proposeSkillEdit(body, failures, { model: vi.fn(async (_s: string, _u: string) => "no edits") })).changed).toBe(false);
+  });
+});

From 5fc4940a87488fe7bba82b27f612b1b98fd5f0cb Mon Sep 17 00:00:00 2001
From: kaghni <kamo.aghbalyan@activeloop.ai>
Date: Fri, 5 Jun 2026 05:28:21 +0000
Subject: [PATCH 09/30] feat(skillopt): live publish mechanism (version bump +
 backup)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Writes an accepted edit to the live SKILL.md via the native skills dir (never the
model channel — PR #223), bumps the frontmatter version (v1-vs-v2), and backs up
the prior version for one-command revert. Mechanism only: the worker won't call it
on an unvalidated edit (offline gate isn't trustworthy) — reserved for the
real-usage A/B gate. 6 fs tests.
---
 src/skillify/skill-publisher.ts      | 59 ++++++++++++++++++++++++++++
 tests/shared/skill-publisher.test.ts | 57 +++++++++++++++++++++++++++
 2 files changed, 116 insertions(+)
 create mode 100644 src/skillify/skill-publisher.ts
 create mode 100644 tests/shared/skill-publisher.test.ts

diff --git a/src/skillify/skill-publisher.ts b/src/skillify/skill-publisher.ts
new file mode 100644
index 00000000..68a3f52d
--- /dev/null
+++ b/src/skillify/skill-publisher.ts
@@ -0,0 +1,59 @@
+/**
+ * Publish mechanism: write an accepted skill edit to the LIVE SKILL.md via the
+ * native skills dir (the only legitimate channel — never the model's prompt
+ * context; see PR #223). Bumps the frontmatter version (enables v1-vs-v2) and
+ * keeps a backup so a bad edit is one `cp` from revert.
+ *
+ * This is the mechanism only. The worker does NOT call it on an unvalidated edit
+ * (the offline gate isn't trustworthy — see the spike findings); it writes a
+ * review proposal instead, and live publish is reserved for an edit that has
+ * passed the real-usage A/B gate (deferred). Pure fs; testable against a tmp dir.
+ */
+import fs from "node:fs";
+import path from "node:path";
+
+export interface PublishResult {
+  path: string;        // the live SKILL.md that was rewritten
+  oldVersion: number;  // version found in the existing frontmatter (1 when none was declared)
+  newVersion: number;  // oldVersion + 1
+  backupPath: string;  // SKILL.v<oldVersion>.bak.md, holding the pre-publish file verbatim
+}
+
+/** Split a SKILL.md into its frontmatter block (incl. fences) and the body; no frontmatter → frontmatter "". */
+export function splitFrontmatter(md: string): { frontmatter: string; body: string } {
+  // \r?\n: an LF-only pattern misses CRLF files, so a publish would write them back without frontmatter.
+  const m = /^(---\r?\n[\s\S]*?\r?\n---\r?\n)([\s\S]*)$/.exec(md);
+  return m ? { frontmatter: m[1], body: m[2] } : { frontmatter: "", body: md };
+}
+
+/** Bump `version: N` in a frontmatter block (absent → treat as 1 → 2; no block at all → one is created). */
+export function bumpVersion(frontmatter: string): { frontmatter: string; oldVersion: number; newVersion: number } {
+  const m = /^version:[ \t]*(\d+)[ \t]*$/m.exec(frontmatter); // [ \t], not \s: the line's own \r / \n must survive
+  const oldVersion = m ? parseInt(m[1], 10) : 1, newVersion = oldVersion + 1;
+  const eol = frontmatter.includes("\r\n") ? "\r\n" : "\n", stamp = `version: ${newVersion}`;
+  const next = m ? frontmatter.slice(0, m.index) + stamp + frontmatter.slice(m.index + m[0].length) // bump in place
+    : frontmatter ? frontmatter.replace(/\r?\n---\r?\n$/, `${eol}${stamp}${eol}---${eol}`) // add before the closing fence
+    : `---\n${stamp}\n---\n`; // else the bump is never persisted and every publish reuses the v1 backup name
+  return { frontmatter: next, oldVersion, newVersion };
+}
+
+/**
+ * Publish `editedBody` as the skill's live SKILL.md. The existing file is first copied
+ * verbatim to SKILL.v<old>.bak.md, then rewritten as bumped frontmatter + the new body
+ * (ending in exactly one newline). Throws if the skill dir / file isn't present.
+ */
+export function publishSkillEdit(
+  skillsRoot: string,
+  name: string,
+  author: string,
+  editedBody: string,
+): PublishResult {
+  const skillDir = path.join(skillsRoot, `${name}--${author}`);
+  const livePath = path.join(skillDir, "SKILL.md");
+  const previous = fs.readFileSync(livePath, "utf8");
+  const bump = bumpVersion(splitFrontmatter(previous).frontmatter);
+  const backupPath = path.join(skillDir, `SKILL.v${bump.oldVersion}.bak.md`);
+  fs.writeFileSync(backupPath, previous);
+  fs.writeFileSync(livePath, bump.frontmatter + editedBody.trimEnd() + "\n");
+  return { path: livePath, oldVersion: bump.oldVersion, newVersion: bump.newVersion, backupPath };
+}
diff --git a/tests/shared/skill-publisher.test.ts b/tests/shared/skill-publisher.test.ts
new file mode 100644
index 00000000..065b5929
--- /dev/null
+++ b/tests/shared/skill-publisher.test.ts
@@ -0,0 +1,57 @@
+import { describe, it, expect, beforeEach, afterEach } from "vitest";
+import fs from "node:fs";
+import os from "node:os";
+import path from "node:path";
+import { splitFrontmatter, bumpVersion, publishSkillEdit } from "../../src/skillify/skill-publisher.js";
+
+describe("splitFrontmatter", () => {
+  it("splits frontmatter from body", () => {
+    const { frontmatter, body } = splitFrontmatter("---\nname: x\nversion: 3\n---\n## Body\nhi");
+    expect(frontmatter).toBe("---\nname: x\nversion: 3\n---\n");
+    expect(body).toBe("## Body\nhi");
+  });
+  it("handles a doc with no frontmatter", () => {
+    expect(splitFrontmatter("just body")).toEqual({ frontmatter: "", body: "just body" });
+  });
+});
+
+describe("bumpVersion", () => {
+  it("increments an existing version", () => {
+    const r = bumpVersion("---\nname: x\nversion: 4\n---\n");
+    expect(r.oldVersion).toBe(4);
+    expect(r.newVersion).toBe(5);
+    expect(r.frontmatter).toContain("version: 5");
+  });
+  it("inserts version 2 when absent (original treated as 1)", () => {
+    const r = bumpVersion("---\nname: x\n---\n");
+    expect(r).toMatchObject({ oldVersion: 1, newVersion: 2 });
+    expect(r.frontmatter).toMatch(/version: 2\n---\n$/);
+  });
+});
+
+describe("publishSkillEdit", () => {
+  let root: string;
+  beforeEach(() => { root = fs.mkdtempSync(path.join(os.tmpdir(), "pub-")); });
+  afterEach(() => { fs.rmSync(root, { recursive: true, force: true }); });
+
+  it("writes the bumped body and backs up the original", () => {
+    const dir = path.join(root, "posthog--kamo");
+    fs.mkdirSync(dir);
+    fs.writeFileSync(path.join(dir, "SKILL.md"), "---\nname: posthog\nauthor: kamo\nversion: 2\n---\n## Rules\n1. mock the client\n");
+
+    const res = publishSkillEdit(root, "posthog", "kamo", "## Rules\n1. NEVER mock — assert on the real client");
+
+    expect(res).toMatchObject({ oldVersion: 2, newVersion: 3 });
+    const written = fs.readFileSync(res.path, "utf8");
+    expect(written).toContain("version: 3");
+    expect(written).toContain("NEVER mock — assert on the real client");
+    expect(written).not.toContain("1. mock the client\n");
+    // backup preserves the prior version verbatim
+    expect(fs.readFileSync(res.backupPath, "utf8")).toContain("version: 2");
+    expect(fs.readFileSync(res.backupPath, "utf8")).toContain("1. mock the client");
+  });
+
+  it("throws when the skill isn't installed (caller decides what to do)", () => {
+    expect(() => publishSkillEdit(root, "missing", "x", "body")).toThrow();
+  });
+});

From de5bac2f298ebc65652ebef9f83228d3c225f64d Mon Sep 17 00:00:00 2001
From: kaghni <kamo.aghbalyan@activeloop.ai>
Date: Fri, 5 Jun 2026 05:30:28 +0000
Subject: [PATCH 10/30] feat(skillopt): weekly cycle orchestration + >=5 fire
 gate
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

runSkillOptCycle ties it together: detect deficient skills, fire only at
>=fireThreshold (default 5 — act on a pattern, not noise), then for each: read the
body, propose a bounded edit, write a REVIEW PROPOSAL (candidate.md + proposal.json
with full evidence). Never auto-overwrites a live skill — live publish is gated on
the deferred A/B. All I/O injected; 4 tests cover the gate, proposals, skip-when-
not-installed, and a custom threshold.
---
 src/skillify/skillopt-engine.ts      | 98 ++++++++++++++++++++++++++++
 tests/shared/skillopt-engine.test.ts | 86 ++++++++++++++++++++++++
 2 files changed, 184 insertions(+)
 create mode 100644 src/skillify/skillopt-engine.ts
 create mode 100644 tests/shared/skillopt-engine.test.ts

diff --git a/src/skillify/skillopt-engine.ts b/src/skillify/skillopt-engine.ts
new file mode 100644
index 00000000..8af765d0
--- /dev/null
+++ b/src/skillify/skillopt-engine.ts
@@ -0,0 +1,98 @@
+/**
+ * The weekly SkillOpt cycle, wired end to end and fully injectable:
+ *
+ *   detect deficient skills  →  ≥5 fire gate  →  for each: read body, propose a
+ *   bounded edit, write a REVIEW PROPOSAL (not a live publish).
+ *
+ * Why proposals, not auto-publish: the offline gate isn't trustworthy (spike
+ * finding), so we never auto-overwrite a live skill. The engine surfaces concrete,
+ * evidence-backed edit proposals; turning one live is gated on the real-usage A/B
+ * (deferred) or a human. Everything is injected (query, judge/proposer models, the
+ * skill reader, the proposal writer), so this orchestration is unit-tested with no
+ * Deeplake / LLM / fs.
+ */
+import fs from "node:fs";
+import path from "node:path";
+import { detectDeficientSkills, type DetectorConfig } from "./deficiency-detector.js";
+import { proposeSkillEdit, type ProposeConfig } from "./skill-proposer.js";
+import { splitFrontmatter } from "./skill-publisher.js";
+import type { QueryFn } from "./skill-invocations.js";
+import type { Edit } from "./skill-edits.js";
+
+export interface ProposalRecord {
+  name: string;               // skill name; with `author` forms the <name>--<author> directory
+  author: string;
+  invocations: number;        // copied from the detector's result for this skill
+  confirmedFailures: number;  // copied from the detector's result for this skill
+  failureRate: number;        // copied from the detector's result — NOTE(review): presumably confirmedFailures / invocations; confirm there
+  examples: string[];         // the failure examples that were handed to the proposer
+  edits: Edit[];              // edits kept after the budget
+  report: string[];           // per-edit OK/SKIP log from applying them
+  candidateBody: string;      // skill body after the edits — a candidate for review, not the live file
+  createdAt: string;          // the cycle's injected `now`
+}
+
+export interface CycleDeps {
+  query: QueryFn;
+  sessionsTable: string;
+  readSkillBody: (name: string, author: string) => string | null; // null when not installed locally
+  writeProposal: (rec: ProposalRecord) => void;
+  detector?: DetectorConfig;
+  proposer?: ProposeConfig;
+  fireThreshold?: number; // deficient-skill count to fire (default 5)
+  maxProposals?: number;  // cap edits proposed per cycle (default 10)
+  now: string;            // ISO timestamp (injected — Date is awkward in workers)
+}
+
+export interface CycleResult {
+  deficientCount: number; // as reported by the detector
+  fired: boolean;         // false when deficientCount was below the fire threshold (then `proposals` is empty)
+  proposals: Array<{ name: string; author: string; changed: boolean; failureRate: number }>; // one per target with a readable body; changed = a proposal was written
+}
+
+/**
+ * Run one cycle: detect, apply the fire gate, then propose an edit per deficient skill.
+ * A review proposal is written only for skills whose proposed edits changed the body.
+ */
+export async function runSkillOptCycle(deps: CycleDeps): Promise<CycleResult> {
+  const { skills, deficientCount } = await detectDeficientSkills(deps.query, deps.sessionsTable, deps.detector);
+
+  // The ≥N gate: only act on a real PATTERN of deficiency, not one or two noisy skills.
+  if (deficientCount < (deps.fireThreshold ?? 5)) return { deficientCount, fired: false, proposals: [] };
+  const proposals: CycleResult["proposals"] = [];
+  for (const skill of skills.filter((s) => s.deficient).slice(0, deps.maxProposals ?? 10)) {
+    const { name, author, failureRate, examples } = skill;
+    const body = deps.readSkillBody(name, author);
+    if (!body) continue; // not installed locally → nothing to edit
+    const proposal = await proposeSkillEdit(body, examples, deps.proposer);
+    if (proposal.changed) {
+      deps.writeProposal({
+        name, author,
+        invocations: skill.invocations, confirmedFailures: skill.confirmedFailures, failureRate,
+        examples, edits: proposal.edits, report: proposal.report,
+        candidateBody: proposal.editedBody, createdAt: deps.now,
+      });
+    }
+    proposals.push({ name, author, changed: proposal.changed, failureRate });
+  }
+  return { deficientCount, fired: true, proposals };
+}
+
+/** Default proposal writer: <proposalsRoot>/<name>--<author>/{proposal.json,candidate.md}; returns that directory. */
+export function writeProposalToDisk(proposalsRoot: string, rec: ProposalRecord): string {
+  const dir = path.join(proposalsRoot, `${rec.name}--${rec.author}`);
+  fs.mkdirSync(dir, { recursive: true }); // creates missing parents; an existing dir is not an error
+  fs.writeFileSync(path.join(dir, "candidate.md"), rec.candidateBody.trimEnd() + "\n"); // body with exactly one trailing newline
+  fs.writeFileSync(path.join(dir, "proposal.json"), JSON.stringify(rec, null, 2) + "\n"); // the full record, 2-space indented
+  return dir;
+}
+
+/** Read a skill's SKILL.md body (frontmatter stripped, then trimmed) from a skills root; null if it can't be read. */
+export function readSkillBodyFromDisk(skillsRoot: string, name: string, author: string): string | null {
+  try {
+    const md = fs.readFileSync(path.join(skillsRoot, `${name}--${author}`, "SKILL.md"), "utf8");
+    return splitFrontmatter(md).body.trim();
+  } catch { // any failure in the try block (e.g. the file is missing) is reported as null
+    return null;
+  }
+}
diff --git a/tests/shared/skillopt-engine.test.ts b/tests/shared/skillopt-engine.test.ts
new file mode 100644
index 00000000..6ecfbbfb
--- /dev/null
+++ b/tests/shared/skillopt-engine.test.ts
@@ -0,0 +1,86 @@
+import { describe, it, expect, vi } from "vitest";
+import { runSkillOptCycle, type ProposalRecord } from "../../src/skillify/skillopt-engine.js";
+
+const invRow = (skill: string, sid: string) => ({
+  message: { type: "tool_call", tool_name: "Skill", tool_input: JSON.stringify({ skill }), session_id: sid, timestamp: sid },
+  last_update_date: sid,
+});
+const transcript = (skill: string, sid: string, pushback: boolean) => [
+  { message: { type: "user_message", content: "do it" } },
+  { message: { type: "assistant_message", content: "done" } },
+  { message: { type: "tool_call", tool_name: "Skill", tool_input: JSON.stringify({ skill }), timestamp: sid } },
+  { message: { type: "user_message", content: pushback ? "no that's wrong, it mocks the client" : "thanks, perfect" } },
+];
+
+/** Query mock for nBad skills, each 10 invocations with 5 pushback → each deficient. */
+function world(nBad: number) {
+  const invs: Array<Record<string, unknown>> = [];                       // every Skill invocation row
+  const transcripts = new Map<string, Array<Record<string, unknown>>>(); // session id → transcript rows
+  for (let b = 0; b < nBad; b++) {
+    for (let i = 0; i < 10; i++) {
+      const sid = `b${b}s${i}`; // unique per (skill, session); invRow also uses it as the timestamp
+      invs.push(invRow(`bad${b}--auth`, sid));
+      transcripts.set(sid, transcript(`bad${b}--auth`, sid, i < 5)); // first 5 sessions end in pushback
+    }
+  }
+  const query = vi.fn(async (sql: string) => {
+    if (sql.includes('"Skill"') && sql.includes("ORDER BY last_update_date")) return invs; // the invocation scan
+    const m = sql.match(/\/sessions\/%([^%]+)%/); // otherwise a per-session lookup: pull the session id from the path pattern
+    return m ? (transcripts.get(m[1]) ?? []) : [];
+  });
+  return query;
+}
+
+const judge = () => vi.fn(async (_s: string, _u: string) => '{"success":0,"confidence":0.9,"reason":"mocks the client"}');
+const proposerModel = () => vi.fn(async (_s: string, _u: string) => '[{"op":"append","content":"Always verify via the PostHog API."}]');
+
+describe("runSkillOptCycle", () => {
+  it("fires when >=5 skills are deficient and writes a proposal per editable skill", async () => {
+    const written: ProposalRecord[] = [];
+    const res = await runSkillOptCycle({
+      query: world(6), sessionsTable: "sessions", now: "2026-06-05T00:00:00Z",
+      readSkillBody: () => "## Rules\n1. mock the client",
+      writeProposal: (r) => written.push(r),
+      detector: { judge: judge() }, proposer: { model: proposerModel() },
+    });
+    expect(res.fired).toBe(true);
+    expect(res.deficientCount).toBe(6);
+    expect(written).toHaveLength(6);
+    expect(written[0].candidateBody).toContain("Always verify via the PostHog API.");
+    expect(written[0]).toMatchObject({ invocations: 10, confirmedFailures: 5 });
+  });
+
+  it("does NOT fire below the threshold (no proposals, even though detection ran)", async () => {
+    const writeProposal = vi.fn();
+    const res = await runSkillOptCycle({
+      query: world(4), sessionsTable: "sessions", now: "t",
+      readSkillBody: () => "## Rules", writeProposal,
+      detector: { judge: judge() }, proposer: { model: proposerModel() },
+    });
+    expect(res).toMatchObject({ fired: false, deficientCount: 4 });
+    expect(res.proposals).toHaveLength(0);
+    expect(writeProposal).not.toHaveBeenCalled();
+  });
+
+  it("skips a deficient skill that isn't installed locally (no body to edit)", async () => {
+    const written: ProposalRecord[] = [];
+    const res = await runSkillOptCycle({
+      query: world(6), sessionsTable: "sessions", now: "t",
+      readSkillBody: (name) => (name === "bad0" ? null : "## Rules\n1. mock the client"),
+      writeProposal: (r) => written.push(r),
+      detector: { judge: judge() }, proposer: { model: proposerModel() },
+    });
+    expect(res.fired).toBe(true);
+    expect(written).toHaveLength(5);                       // bad0 skipped
+    expect(written.some((w) => w.name === "bad0")).toBe(false);
+  });
+
+  it("honors a custom fireThreshold", async () => {
+    const res = await runSkillOptCycle({
+      query: world(3), sessionsTable: "sessions", now: "t", fireThreshold: 3,
+      readSkillBody: () => "## Rules", writeProposal: vi.fn(),
+      detector: { judge: judge() }, proposer: { model: proposerModel() },
+    });
+    expect(res.fired).toBe(true);
+  });
+});

From 3b7fa7adc20a00dc965b304ba2b69787898258a6 Mon Sep 17 00:00:00 2001
From: kaghni <kamo.aghbalyan@activeloop.ai>
Date: Fri, 5 Jun 2026 05:32:57 +0000
Subject: [PATCH 11/30] feat(skillopt): wire the real cycle into the weekly
 worker

Replaces the stub: the detached weekly worker now loads config, builds the query,
and runs runSkillOptCycle (detect -> >=5 gate -> propose -> write review proposals)
over a 30-day invocation lookback. Verified end-to-end against a real org: 40 Skill
invocations scanned, 0 anchored -> 0 judge calls -> 0 deficient -> exits at the gate
(~14s, $0). No auto-publish; proposals land under <stateDir>/skillopt/proposals.
---
 src/skillify/skillopt-worker.ts | 57 ++++++++++++++++++++++-----------
 1 file changed, 38 insertions(+), 19 deletions(-)

diff --git a/src/skillify/skillopt-worker.ts b/src/skillify/skillopt-worker.ts
index 77d815bc..10dc8040 100644
--- a/src/skillify/skillopt-worker.ts
+++ b/src/skillify/skillopt-worker.ts
@@ -1,32 +1,51 @@
 #!/usr/bin/env node
 /**
- * Detached weekly SkillOpt worker (spawned by skillopt-trigger). Runs the loop ONCE:
- *   1. detect a deficient skill (behavioral: sessions that loaded it still scored low)
- *   2. optimizer proposes a bounded edit (v2)
- *   3. real-rollout gate: keep v2 only if it measurably beats v1
- *   4. silent canary publish + post-publish monitor / auto-revert
+ * Detached weekly SkillOpt worker (spawned by skillopt-trigger). Runs the cycle ONCE:
+ *   1. detect deficient skills from real invocations (anchor + judge, windowed)
+ *   2. ≥5 fire gate (act on a pattern, not noise)
+ *   3. propose a bounded edit per deficient skill and write a REVIEW PROPOSAL
  *
- * Uses the user's own agent (claude -p / codex), so no org API key. Runs in the background; the
- * user never notices. HIVEMIND_SKILLOPT_WORKER=1 is set by the trigger as a recursion guard.
- *
- * STATUS: scaffold. Steps 1/3/4 depend on prerequisites not yet shipped (deployed attribution data
- * for detection + monitoring, and a local rollout sandbox). The loop ENGINE (rollout->optimize->gate)
- * is prototyped in experiments/skillopt-spike (skillopt-loop.ts, validated both directions). This
- * entry exists so the trigger has a real, spawnable target and the wiring is testable end to end.
+ * It does NOT auto-publish: the offline gate isn't trustworthy (spike finding), so
+ * live publish is reserved for the real-usage A/B (deferred). Runs on the user's own
+ * agent (claude -p) — no org key, cost lands on the user — in the background, weekly.
+ * HIVEMIND_SKILLOPT_WORKER=1 is set by the trigger as a recursion guard.
  */
+import os from "node:os";
+import path from "node:path";
 import { log as _log } from "../utils/debug.js";
+import { loadConfig } from "../config.js";
+import { DeeplakeApi } from "../deeplake-api.js";
+import { getStateDir } from "./state-dir.js";
+import { runSkillOptCycle, writeProposalToDisk, readSkillBodyFromDisk } from "./skillopt-engine.js";
 
 const log = (m: string) => _log("skillopt-worker", m);
 
 async function main(): Promise<void> {
   log("skillopt worker started (detached, weekly)");
-  // TODO(skillopt): wire the validated loop engine here once prerequisites land:
-  //   const skill = await detectDeficientSkill();        // needs deployed attribution + satisfaction
-  //   if (!skill) { log("no deficient skill found"); return; }
-  //   const v2 = await optimize(skill);                   // optimizer proposes a bounded edit
-  //   const gain = await gateViaRealRollout(skill, v2);   // keep only if v2 beats v1 (validated)
-  //   if (gain > THRESHOLD) await canaryPublish(skill, v2); // silent; monitor + auto-revert
-  log("skillopt worker: loop body not yet enabled (prerequisites pending) — exiting cleanly");
+  const config = loadConfig();
+  if (!config?.token) { log("no config/credentials — exiting"); return; }
+
+  const api = new DeeplakeApi(config.token, config.apiUrl, config.orgId, config.workspaceId, config.tableName);
+  const query = (sql: string) => api.query(sql) as Promise<Array<Record<string, unknown>>>;
+  const skillsRoot = path.join(os.homedir(), ".claude", "skills");
+  const proposalsRoot = path.join(getStateDir(), "skillopt", "proposals");
+  const sinceIso = new Date(Date.now() - 30 * 24 * 60 * 60 * 1000).toISOString(); // 30-day lookback
+
+  const res = await runSkillOptCycle({
+    query,
+    sessionsTable: config.sessionsTableName,
+    readSkillBody: (name, author) => readSkillBodyFromDisk(skillsRoot, name, author),
+    writeProposal: (rec) => writeProposalToDisk(proposalsRoot, rec),
+    detector: { sinceIso, limit: 5000 },
+    now: new Date().toISOString(),
+  });
+
+  if (!res.fired) {
+    log(`skillopt: ${res.deficientCount} deficient skill(s) — below the fire gate, no action`);
+  } else {
+    const changed = res.proposals.filter((p) => p.changed).length;
+    log(`skillopt: fired — ${res.deficientCount} deficient, ${changed} edit proposal(s) written to ${proposalsRoot}`);
+  }
 }
 
 main().catch((e) => { log(`fatal (swallowed): ${(e as Error)?.message ?? e}`); process.exit(0); });

From e24319018f2241561e0a43bd128660c728f223d8 Mon Sep 17 00:00:00 2001
From: kaghni <kamo.aghbalyan@activeloop.ai>
Date: Fri, 5 Jun 2026 05:37:03 +0000
Subject: [PATCH 12/30] =?UTF-8?q?feat(skillopt):=20meta-skill=20=E2=80=94?=
 =?UTF-8?q?=20optimizer=20cross-run=20memory?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The paper's meta-skill: an append-only JSONL recording every proposed edit per
skill (with an order-independent fingerprint). The cycle feeds prior-tried edits to
the proposer as context and dedups — never re-writing an edit already proposed for a
skill, so the optimizer doesn't churn. Status field (proposed→applied/reverted) is
ready to close the loop once the A/B gate records outcomes. Wired into the worker;
meta + dedup tests.
---
 src/skillify/skill-proposer.ts       | 14 +++--
 src/skillify/skillopt-engine.ts      | 15 ++++-
 src/skillify/skillopt-meta.ts        | 84 ++++++++++++++++++++++++++++
 src/skillify/skillopt-worker.ts      |  8 +++
 tests/shared/skillopt-engine.test.ts | 21 +++++++
 tests/shared/skillopt-meta.test.ts   | 54 ++++++++++++++++++
 6 files changed, 188 insertions(+), 8 deletions(-)
 create mode 100644 src/skillify/skillopt-meta.ts
 create mode 100644 tests/shared/skillopt-meta.test.ts

diff --git a/src/skillify/skill-proposer.ts b/src/skillify/skill-proposer.ts
index aa9c0159..1032b1bd 100644
--- a/src/skillify/skill-proposer.ts
+++ b/src/skillify/skill-proposer.ts
@@ -20,8 +20,9 @@ export interface Proposal {
 }
 
 export interface ProposeConfig {
-  editBudget?: number; // max edits to keep (default 3)
-  model?: ModelCall;   // injected; default = claude sonnet
+  editBudget?: number;     // max edits to keep (default 3)
+  model?: ModelCall;       // injected; default = claude sonnet
+  priorEdits?: string[];   // meta-skill: edits already tried for this skill (don't repeat)
 }
 
 const SYSTEM =
@@ -34,9 +35,12 @@ const SYSTEM =
   'insert_after/replace/delete>","content":"<new text; required for ' +
   'append/insert_after/replace>"}. Prefer the smallest change that fixes the weakness.';
 
-function buildUserPrompt(body: string, failures: string[]): string {
+function buildUserPrompt(body: string, failures: string[], priorEdits: string[]): string {
   const cases = failures.slice(0, 8).map((f, i) => `${i + 1}. ${f}`).join("\n");
-  return `CURRENT SKILL:\n${body}\n\nCONFIRMED FAILURES it produced (user pushed back AND a judge confirmed the task was not accomplished):\n${cases}\n\nPropose the bounded edits. JSON array only.`;
+  const prior = priorEdits.length
+    ? `\n\nALREADY TRIED for this skill on earlier runs (do NOT repeat these — propose something different, or nothing):\n${priorEdits.slice(0, 12).map((p) => `- ${p}`).join("\n")}`
+    : "";
+  return `CURRENT SKILL:\n${body}\n\nCONFIRMED FAILURES it produced (user pushed back AND a judge confirmed the task was not accomplished):\n${cases}${prior}\n\nPropose the bounded edits. JSON array only.`;
 }
 
 const OPS = new Set<EditOp>(["append", "insert_after", "replace", "delete"]);
@@ -77,7 +81,7 @@ export async function proposeSkillEdit(
   const model = cfg.model ?? claudeModel("sonnet");
   let raw: string;
   try {
-    raw = await model(SYSTEM, buildUserPrompt(skillBody, failures));
+    raw = await model(SYSTEM, buildUserPrompt(skillBody, failures, cfg.priorEdits ?? []));
   } catch {
     return { edits: [], editedBody: skillBody, report: ["proposer model call failed"], changed: false };
   }
diff --git a/src/skillify/skillopt-engine.ts b/src/skillify/skillopt-engine.ts
index 8af765d0..24d1c591 100644
--- a/src/skillify/skillopt-engine.ts
+++ b/src/skillify/skillopt-engine.ts
@@ -42,6 +42,11 @@ export interface CycleDeps {
   fireThreshold?: number; // deficient-skill count to fire (default 5)
   maxProposals?: number;  // cap edits proposed per cycle (default 10)
   now: string;            // ISO timestamp (injected — Date is awkward in workers)
+  meta?: {                // optimizer cross-run memory (skillopt-meta); optional
+    prior: (name: string, author: string) => string[];
+    has: (name: string, author: string, edits: Edit[]) => boolean;
+    record: (name: string, author: string, edits: Edit[]) => void;
+  };
 }
 
 export interface CycleResult {
@@ -64,16 +69,20 @@ export async function runSkillOptCycle(deps: CycleDeps): Promise<CycleResult> {
   for (const s of targets) {
     const body = deps.readSkillBody(s.name, s.author);
     if (!body) continue; // not installed locally → nothing to edit
-    const p = await proposeSkillEdit(body, s.examples, deps.proposer);
-    if (p.changed) {
+    const priorEdits = deps.meta?.prior(s.name, s.author) ?? [];
+    const p = await proposeSkillEdit(body, s.examples, { ...deps.proposer, priorEdits });
+    // dedup against the meta memory — don't re-write an edit already tried for this skill.
+    const isDup = p.changed && (deps.meta?.has(s.name, s.author, p.edits) ?? false);
+    if (p.changed && !isDup) {
       deps.writeProposal({
         name: s.name, author: s.author,
         invocations: s.invocations, confirmedFailures: s.confirmedFailures, failureRate: s.failureRate,
         examples: s.examples, edits: p.edits, report: p.report,
         candidateBody: p.editedBody, createdAt: deps.now,
       });
+      deps.meta?.record(s.name, s.author, p.edits);
     }
-    proposals.push({ name: s.name, author: s.author, changed: p.changed, failureRate: s.failureRate });
+    proposals.push({ name: s.name, author: s.author, changed: p.changed && !isDup, failureRate: s.failureRate });
   }
   return { deficientCount, fired: true, proposals };
 }
diff --git a/src/skillify/skillopt-meta.ts b/src/skillify/skillopt-meta.ts
new file mode 100644
index 00000000..f86473f6
--- /dev/null
+++ b/src/skillify/skillopt-meta.ts
@@ -0,0 +1,84 @@
+/**
+ * Meta-skill — the optimizer's cross-run memory (the paper's meta-skill). Records
+ * every edit proposed for a skill so later runs (a) don't re-propose an edit that
+ * was already tried, and (b) feed "what's been tried" to the proposer. When the A/B
+ * gate lands, the recorded `status` (proposed → applied/reverted) closes the loop so
+ * the optimizer learns which kinds of edits actually help a given skill.
+ *
+ * Append-only JSONL at <stateDir>/skillopt/meta.jsonl. Pure helpers + injected path,
+ * so it's unit-tested with a tmp file.
+ */
+import fs from "node:fs";
+import path from "node:path";
+import type { Edit } from "./skill-edits.js";
+
+export type MetaStatus = "proposed" | "applied" | "reverted";
+
+export interface MetaEntry {
+  skill: string;       // "<name>--<author>"
+  ops: string[];       // short per-edit summaries (op + anchor/preview)
+  fingerprint: string; // stable hash of the edits, for dedup
+  proposedAt: string;
+  status: MetaStatus;
+}
+
+export const skillRef = (name: string, author: string) => `${name}--${author}`;
+
+/** Short human summary of one edit. */
+function summarizeEdit(e: Edit): string {
+  const anchor = e.target ? ` @"${e.target.slice(0, 40)}"` : "";
+  const preview = e.content ? `: ${e.content.slice(0, 60).replace(/\s+/g, " ")}` : "";
+  return `${e.op}${anchor}${preview}`;
+}
+
+/** Order-independent fingerprint of an edit set; each edit is JSON-encoded so a `|` or newline inside a field can't collide. */
+export function fingerprintEdits(edits: Edit[]): string {
+  return edits
+    .map((e) => JSON.stringify([e.op, e.target ?? "", e.content ?? ""]))
+    .sort()
+    .join("\n");
+}
+
+/** Read the JSONL memory at `file`. A missing/unreadable file yields []; blank or malformed lines are skipped. */
+export function loadMeta(file: string): MetaEntry[] {
+  let raw: string;
+  try { raw = fs.readFileSync(file, "utf8"); } catch { return []; }
+  const out: MetaEntry[] = [];
+  for (const t of raw.split("\n").map((l) => l.trim()).filter(Boolean)) {
+    try {
+      const e = JSON.parse(t) as MetaEntry | null;
+      if (!e || typeof e.skill !== "string" || typeof e.fingerprint !== "string") continue;
+      out.push(Array.isArray(e.ops) ? e : { ...e, ops: [] }); // ops is flat-mapped into the proposer prompt — never leave it undefined
+    } catch { /* skip malformed line */ }
+  }
+  return out;
+}
+
+export function appendMeta(file: string, entry: MetaEntry): void {
+  fs.mkdirSync(path.dirname(file), { recursive: true });
+  fs.appendFileSync(file, JSON.stringify(entry) + "\n");
+}
+
+/** Has this exact edit set already been proposed for this skill? (avoid churn) */
+export function alreadyProposed(meta: MetaEntry[], name: string, author: string, edits: Edit[]): boolean {
+  const ref = skillRef(name, author);
+  const fp = fingerprintEdits(edits);
+  return meta.some((m) => m.skill === ref && m.fingerprint === fp);
+}
+
+/** Summaries of edits previously tried for this skill — context for the proposer. */
+export function priorEditSummaries(meta: MetaEntry[], name: string, author: string): string[] {
+  const ref = skillRef(name, author);
+  return meta.filter((m) => m.skill === ref).flatMap((m) => m.ops);
+}
+
+/** Build a meta entry for a freshly-proposed edit set. */
+export function metaEntryFor(name: string, author: string, edits: Edit[], now: string): MetaEntry {
+  return {
+    skill: skillRef(name, author),
+    ops: edits.map(summarizeEdit),
+    fingerprint: fingerprintEdits(edits),
+    proposedAt: now,
+    status: "proposed",
+  };
+}
diff --git a/src/skillify/skillopt-worker.ts b/src/skillify/skillopt-worker.ts
index 10dc8040..8027272a 100644
--- a/src/skillify/skillopt-worker.ts
+++ b/src/skillify/skillopt-worker.ts
@@ -17,6 +17,7 @@ import { loadConfig } from "../config.js";
 import { DeeplakeApi } from "../deeplake-api.js";
 import { getStateDir } from "./state-dir.js";
 import { runSkillOptCycle, writeProposalToDisk, readSkillBodyFromDisk } from "./skillopt-engine.js";
+import { loadMeta, appendMeta, priorEditSummaries, alreadyProposed, metaEntryFor } from "./skillopt-meta.js";
 
 const log = (m: string) => _log("skillopt-worker", m);
 
@@ -29,6 +30,8 @@ async function main(): Promise<void> {
   const query = (sql: string) => api.query(sql) as Promise<Array<Record<string, unknown>>>;
   const skillsRoot = path.join(os.homedir(), ".claude", "skills");
   const proposalsRoot = path.join(getStateDir(), "skillopt", "proposals");
+  const metaFile = path.join(getStateDir(), "skillopt", "meta.jsonl");
+  const metaCache = loadMeta(metaFile);
   const sinceIso = new Date(Date.now() - 30 * 24 * 60 * 60 * 1000).toISOString(); // 30-day lookback
 
   const res = await runSkillOptCycle({
@@ -36,6 +39,11 @@ async function main(): Promise<void> {
     sessionsTable: config.sessionsTableName,
     readSkillBody: (name, author) => readSkillBodyFromDisk(skillsRoot, name, author),
     writeProposal: (rec) => writeProposalToDisk(proposalsRoot, rec),
+    meta: {
+      prior: (n, a) => priorEditSummaries(metaCache, n, a),
+      has: (n, a, edits) => alreadyProposed(metaCache, n, a, edits),
+      record: (n, a, edits) => { const e = metaEntryFor(n, a, edits, new Date().toISOString()); appendMeta(metaFile, e); metaCache.push(e); },
+    },
     detector: { sinceIso, limit: 5000 },
     now: new Date().toISOString(),
   });
diff --git a/tests/shared/skillopt-engine.test.ts b/tests/shared/skillopt-engine.test.ts
index 6ecfbbfb..d297fda6 100644
--- a/tests/shared/skillopt-engine.test.ts
+++ b/tests/shared/skillopt-engine.test.ts
@@ -75,6 +75,27 @@ describe("runSkillOptCycle", () => {
     expect(written.some((w) => w.name === "bad0")).toBe(false);
   });
 
+  it("dedups against meta memory: a skill whose edit was already proposed isn't re-written", async () => {
+    const written: ProposalRecord[] = [];
+    const recorded: string[] = [];
+    const res = await runSkillOptCycle({
+      query: world(6), sessionsTable: "sessions", now: "t",
+      readSkillBody: () => "## Rules\n1. mock the client",
+      writeProposal: (r) => written.push(r),
+      detector: { judge: judge() }, proposer: { model: proposerModel() },
+      meta: {
+        prior: () => ["append: earlier idea"],          // fed to the proposer as context
+        has: (name) => name === "bad0",                  // bad0 already tried → dedup
+        record: (name) => recorded.push(name),
+      },
+    });
+    expect(res.fired).toBe(true);
+    expect(written).toHaveLength(5);                     // bad0 deduped
+    expect(written.some((w) => w.name === "bad0")).toBe(false);
+    expect(recorded).not.toContain("bad0");              // not recorded again
+    expect(res.proposals.find((p) => p.name === "bad0")!.changed).toBe(false);
+  });
+
   it("honors a custom fireThreshold", async () => {
     const res = await runSkillOptCycle({
       query: world(3), sessionsTable: "sessions", now: "t", fireThreshold: 3,
diff --git a/tests/shared/skillopt-meta.test.ts b/tests/shared/skillopt-meta.test.ts
new file mode 100644
index 00000000..f92d8516
--- /dev/null
+++ b/tests/shared/skillopt-meta.test.ts
@@ -0,0 +1,54 @@
+import { describe, it, expect, beforeEach, afterEach } from "vitest";
+import fs from "node:fs";
+import os from "node:os";
+import path from "node:path";
+import {
+  fingerprintEdits, alreadyProposed, priorEditSummaries, metaEntryFor, loadMeta, appendMeta,
+} from "../../src/skillify/skillopt-meta.js";
+import type { Edit } from "../../src/skillify/skill-edits.js";
+
+const edits: Edit[] = [{ op: "append", content: "always flush" }, { op: "replace", target: "mock", content: "do not mock" }];
+
+describe("fingerprintEdits", () => {
+  it("is order-independent (same set → same fingerprint)", () => {
+    expect(fingerprintEdits(edits)).toBe(fingerprintEdits([...edits].reverse()));
+  });
+  it("differs for different content", () => {
+    expect(fingerprintEdits(edits)).not.toBe(fingerprintEdits([{ op: "append", content: "other" }]));
+  });
+});
+
+describe("alreadyProposed / priorEditSummaries", () => {
+  const meta = [metaEntryFor("posthog", "kamo", edits, "t1"), metaEntryFor("other", "x", [{ op: "append", content: "z" }], "t2")];
+  it("matches a prior proposal by skill + fingerprint", () => {
+    expect(alreadyProposed(meta, "posthog", "kamo", [...edits].reverse())).toBe(true);
+    expect(alreadyProposed(meta, "posthog", "kamo", [{ op: "append", content: "new" }])).toBe(false);
+    expect(alreadyProposed(meta, "nope", "kamo", edits)).toBe(false); // different skill
+  });
+  it("surfaces prior edit summaries only for the given skill", () => {
+    const prior = priorEditSummaries(meta, "posthog", "kamo");
+    expect(prior.length).toBe(2);
+    expect(prior.join(" ")).toContain("append");
+    expect(priorEditSummaries(meta, "posthog", "kamo").join(" ")).not.toContain('append: z'); // other skill's edit
+  });
+});
+
+describe("loadMeta / appendMeta", () => {
+  let file: string;
+  beforeEach(() => { file = path.join(fs.mkdtempSync(path.join(os.tmpdir(), "meta-")), "meta.jsonl"); });
+  afterEach(() => { fs.rmSync(path.dirname(file), { recursive: true, force: true }); });
+
+  it("round-trips entries and skips malformed lines", () => {
+    appendMeta(file, metaEntryFor("a", "b", edits, "t1"));
+    appendMeta(file, metaEntryFor("c", "d", [{ op: "append", content: "x" }], "t2"));
+    fs.appendFileSync(file, "{ not json }\n\n");
+    const loaded = loadMeta(file);
+    expect(loaded).toHaveLength(2);
+    expect(loaded[0].skill).toBe("a--b");
+    expect(loaded[0].status).toBe("proposed");
+  });
+
+  it("returns [] for a missing file", () => {
+    expect(loadMeta(path.join(os.tmpdir(), "does-not-exist-xyz.jsonl"))).toEqual([]);
+  });
+});

From b03c10a9514abb705b219b6072eaf6b519d60b44 Mon Sep 17 00:00:00 2001
From: kaghni <kamo.aghbalyan@activeloop.ai>
Date: Fri, 5 Jun 2026 06:06:16 +0000
Subject: [PATCH 13/30] feat(skillopt): edit-outcome gate (longitudinal
 before/after validation)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The validation organ. A randomized A/B needs the skill version at invocation time
(a capture change we lack), so the feasible gate is longitudinal: after an edit
publishes, compare the skill's confirmed-failure rate after vs before — a real drop
keeps it, a real rise reverts (from the SKILL.v<old> backup), unclear/too-few =
inconclusive. Observational (needs margin + min sample); randomized A/B is the
upgrade once invocation-version capture lands. Extracts scoreInvocations (shared by
detector + gate); adds untilIso to listSkillInvocations. 7 tests.
---
 src/skillify/deficiency-detector.ts  | 50 +++++++++++-----
 src/skillify/skill-edit-gate.ts      | 89 ++++++++++++++++++++++++++++
 src/skillify/skill-invocations.ts    |  3 +-
 tests/shared/skill-edit-gate.test.ts | 58 ++++++++++++++++++
 4 files changed, 184 insertions(+), 16 deletions(-)
 create mode 100644 src/skillify/skill-edit-gate.ts
 create mode 100644 tests/shared/skill-edit-gate.test.ts

diff --git a/src/skillify/deficiency-detector.ts b/src/skillify/deficiency-detector.ts
index 70152597..e209df0b 100644
--- a/src/skillify/deficiency-detector.ts
+++ b/src/skillify/deficiency-detector.ts
@@ -41,6 +41,40 @@ export interface DetectorConfig {
 
 const skillKey = (name: string, author: string) => `${name}--${author}`;
 
+export interface ScoreConfig {
+  window?: { before?: number; after?: number; maxChars?: number };
+  judge?: ModelCall;
+}
+
+/**
+ * Score a set of invocations: window each, run the free anchor, and judge ONLY the
+ * anchored ones. Shared by the detector (per-skill deficiency) and the edit gate
+ * (a skill's failure rate in a time window).
+ */
+export async function scoreInvocations(
+  query: QueryFn,
+  sessionsTable: string,
+  invocations: SkillInvocation[],
+  cfg: ScoreConfig = {},
+): Promise<{ anchored: number; confirmed: number; examples: string[] }> {
+  let anchored = 0;
+  let confirmed = 0;
+  const examples: string[] = [];
+  for (const inv of invocations) {
+    const turns = await windowedTurns(query, sessionsTable, inv, cfg.window);
+    const anchor = detectAnchor(turns);
+    if (!anchor.anchored) continue; // free filter — no judge call
+    anchored++;
+    const window = turns.map((t) => `${t.role}: ${t.text}`).join("\n\n");
+    const verdict = await judgeSuccess(window, { model: cfg.judge });
+    if (verdict.success === 0) {
+      confirmed++;
+      if (examples.length < 3) examples.push(verdict.reason || anchor.evidence);
+    }
+  }
+  return { anchored, confirmed, examples };
+}
+
 export interface DetectionResult {
   skills: SkillDeficiency[];
   deficientCount: number;
@@ -65,21 +99,7 @@ export async function detectDeficientSkills(
 
   const skills: SkillDeficiency[] = [];
   for (const list of groups.values()) {
-    let anchored = 0;
-    let confirmed = 0;
-    const examples: string[] = [];
-    for (const inv of list) {
-      const turns = await windowedTurns(query, sessionsTable, inv, cfg.window);
-      const anchor = detectAnchor(turns);
-      if (!anchor.anchored) continue;          // free filter — no judge call
-      anchored++;
-      const window = turns.map((t) => `${t.role}: ${t.text}`).join("\n\n");
-      const verdict = await judgeSuccess(window, { model: cfg.judge });
-      if (verdict.success === 0) {             // confirmed: anchor AND judge agree
-        confirmed++;
-        if (examples.length < 3) examples.push(verdict.reason || anchor.evidence);
-      }
-    }
+    const { anchored, confirmed, examples } = await scoreInvocations(query, sessionsTable, list, cfg);
     const failureRate = list.length ? confirmed / list.length : 0;
     skills.push({
       name: list[0].name,
diff --git a/src/skillify/skill-edit-gate.ts b/src/skillify/skill-edit-gate.ts
new file mode 100644
index 00000000..50d01450
--- /dev/null
+++ b/src/skillify/skill-edit-gate.ts
@@ -0,0 +1,89 @@
+/**
+ * Edit-outcome gate — the validation organ (the paper's gate, adapted).
+ *
+ * A randomized A/B is the ideal, but it needs the skill VERSION recorded at
+ * invocation time (a capture change we don't have yet — the Skill tool_use only
+ * carries the skill name). So the feasible gate is LONGITUDINAL: after an edit is
+ * published, compare the skill's confirmed-failure rate in the window AFTER publish
+ * vs BEFORE. A real drop = the edit helped → keep; a real rise = it hurt → revert
+ * (one `cp` from the SKILL.v<old>.bak backup). No clear signal / too few post-publish
+ * uses → inconclusive (wait, or revert when stale).
+ *
+ * It's OBSERVATIONAL (confounded — the population shifts week to week), so it needs
+ * a margin + a minimum sample. Randomized A/B is the clean upgrade once invocation-
+ * version capture lands. Reuses scoreInvocations, so the same anchor+judge that
+ * detects deficiency also validates the fix. Injected query/judge → unit-testable.
+ */
+import { listSkillInvocations, type QueryFn } from "./skill-invocations.js";
+import { scoreInvocations } from "./deficiency-detector.js";
+import type { ModelCall } from "./claude-model.js";
+
+export interface WindowStats {
+  invocations: number;
+  anchored: number;
+  confirmed: number;
+  failureRate: number; // confirmed / invocations
+}
+
+export interface GateDecision {
+  before: WindowStats;
+  after: WindowStats;
+  delta: number; // before.failureRate - after.failureRate (positive = improved)
+  decision: "keep" | "revert" | "inconclusive";
+}
+
+interface MeasureOpts {
+  sinceIso?: string;
+  untilIso?: string;
+  limit?: number;
+  window?: { before?: number; after?: number; maxChars?: number };
+  judge?: ModelCall;
+}
+
+/** Confirmed-failure rate for one skill over a time window. */
+export async function measureSkillFailureRate(
+  query: QueryFn,
+  sessionsTable: string,
+  name: string,
+  author: string,
+  opts: MeasureOpts = {},
+): Promise<WindowStats> {
+  const all = await listSkillInvocations(query, sessionsTable, { sinceIso: opts.sinceIso, untilIso: opts.untilIso, limit: opts.limit });
+  const mine = all.filter((i) => i.name === name && i.author === author);
+  const { anchored, confirmed } = await scoreInvocations(query, sessionsTable, mine, { window: opts.window, judge: opts.judge });
+  return { invocations: mine.length, anchored, confirmed, failureRate: mine.length ? confirmed / mine.length : 0 };
+}
+
+/**
+ * Pure decision from before/after stats: "keep" if failureRate dropped by >= margin (default 0.2), "revert" if it rose by >= margin.
+ * "inconclusive" otherwise, and also when `after` has < minAfter (default 5) invocations or `before` is empty (no baseline).
+ */
+export function gateEditOutcome(before: WindowStats, after: WindowStats, opts: { margin?: number; minAfter?: number } = {}): GateDecision {
+  const margin = opts.margin ?? 0.2;
+  const minAfter = opts.minAfter ?? 5;
+  const delta = before.failureRate - after.failureRate;
+  let decision: GateDecision["decision"];
+  // An empty before window reads as a 0% failure rate, so any post-publish failures would look like a regression.
+  if (after.invocations < minAfter || before.invocations === 0) decision = "inconclusive"; // too little use / no baseline
+  else if (delta >= margin) decision = "keep";                              // failure rate dropped → helped
+  else if (after.failureRate - before.failureRate >= margin) decision = "revert"; // got measurably worse
+  else decision = "inconclusive";                                          // no clear signal
+  return { before, after, delta, decision };
+}
+
+/** Full gate: measure before/after a publish timestamp and decide. */
+export async function gateEdit(
+  query: QueryFn,
+  sessionsTable: string,
+  name: string,
+  author: string,
+  publishIso: string,
+  opts: { windowDays?: number; nowIso?: string; margin?: number; minAfter?: number } & MeasureOpts = {},
+): Promise<GateDecision> {
+  const windowDays = opts.windowDays ?? 14;
+  const beforeSince = new Date(Date.parse(publishIso) - windowDays * 24 * 60 * 60 * 1000).toISOString();
+  const shared = { limit: opts.limit, window: opts.window, judge: opts.judge };
+  const before = await measureSkillFailureRate(query, sessionsTable, name, author, { ...shared, sinceIso: beforeSince, untilIso: publishIso });
+  const after = await measureSkillFailureRate(query, sessionsTable, name, author, { ...shared, sinceIso: publishIso, untilIso: opts.nowIso });
+  return gateEditOutcome(before, after, opts);
+}
diff --git a/src/skillify/skill-invocations.ts b/src/skillify/skill-invocations.ts
index e5ef664f..544cd3a3 100644
--- a/src/skillify/skill-invocations.ts
+++ b/src/skillify/skill-invocations.ts
@@ -68,10 +68,11 @@ export function splitOrgSkill(skill: string): { name: string; author: string } |
 export async function listSkillInvocations(
   query: QueryFn,
   sessionsTable: string,
-  opts: { sinceIso?: string; limit?: number } = {},
+  opts: { sinceIso?: string; untilIso?: string; limit?: number } = {},
 ): Promise<SkillInvocation[]> {
   const where = [`CAST(message AS TEXT) LIKE '%"Skill"%'`];
   if (opts.sinceIso) where.push(`last_update_date >= '${sqlStr(opts.sinceIso)}'`);
+  if (opts.untilIso) where.push(`last_update_date < '${sqlStr(opts.untilIso)}'`);
   const limit = opts.limit && opts.limit > 0 ? ` LIMIT ${Math.floor(opts.limit)}` : "";
   const rows = await query(
     `SELECT message, last_update_date FROM "${sessionsTable}" WHERE ${where.join(" AND ")} ORDER BY last_update_date DESC${limit}`,
diff --git a/tests/shared/skill-edit-gate.test.ts b/tests/shared/skill-edit-gate.test.ts
new file mode 100644
index 00000000..283ff1fb
--- /dev/null
+++ b/tests/shared/skill-edit-gate.test.ts
@@ -0,0 +1,58 @@
+import { describe, it, expect, vi } from "vitest";
+import { gateEditOutcome, gateEdit, type WindowStats } from "../../src/skillify/skill-edit-gate.js";
+
+const stats = (invocations: number, failureRate: number): WindowStats =>
+  ({ invocations, anchored: Math.round(invocations * failureRate), confirmed: Math.round(invocations * failureRate), failureRate });
+
+describe("gateEditOutcome", () => {
+  it("KEEP when the failure rate dropped by >= margin", () => {
+    expect(gateEditOutcome(stats(10, 0.6), stats(10, 0.1)).decision).toBe("keep");
+  });
+  it("REVERT when it got measurably worse", () => {
+    expect(gateEditOutcome(stats(10, 0.1), stats(10, 0.5)).decision).toBe("revert");
+  });
+  it("INCONCLUSIVE when there's too little post-publish use", () => {
+    expect(gateEditOutcome(stats(10, 0.6), stats(3, 0.0)).decision).toBe("inconclusive");
+  });
+  it("INCONCLUSIVE when the change is within the margin (noise)", () => {
+    expect(gateEditOutcome(stats(10, 0.30), stats(10, 0.25)).decision).toBe("inconclusive");
+  });
+});
+
+const invRow = (skill: string, sid: string) => ({
+  message: { type: "tool_call", tool_name: "Skill", tool_input: JSON.stringify({ skill }), session_id: sid, timestamp: sid },
+  last_update_date: sid,
+});
+const transcript = (skill: string, sid: string, pushback: boolean) => [
+  { message: { type: "user_message", content: "do it" } },
+  { message: { type: "assistant_message", content: "done (mocked)" } },
+  { message: { type: "tool_call", tool_name: "Skill", tool_input: JSON.stringify({ skill }), timestamp: sid } },
+  { message: { type: "user_message", content: pushback ? "no that's wrong, it mocks the client" : "looks good thanks" } },
+];
+
+describe("gateEdit (longitudinal before/after)", () => {
+  it("keeps an edit whose failure rate dropped after publish", async () => {
+    const PUB = "2026-06-05T00:00:00.000Z";
+    const transcripts = new Map<string, Array<Record<string, unknown>>>();
+    const beforeInvs: Array<Record<string, unknown>> = [];
+    const afterInvs: Array<Record<string, unknown>> = [];
+    for (let i = 0; i < 8; i++) {
+      const b = `bef${i}`, a = `aft${i}`;
+      beforeInvs.push(invRow("x--a", b)); transcripts.set(b, transcript("x--a", b, true));   // before: all pushback
+      afterInvs.push(invRow("x--a", a)); transcripts.set(a, transcript("x--a", a, false));    // after: none
+    }
+    const judge = vi.fn(async (_s: string, _u: string) => '{"success":0,"confidence":0.9,"reason":"mocks"}');
+    const query = vi.fn(async (sql: string) => {
+      if (sql.includes('"Skill"') && sql.includes("ORDER BY last_update_date")) {
+        return sql.includes(`< '${PUB}'`) ? beforeInvs : afterInvs; // before window has the untilIso bound
+      }
+      const m = sql.match(/\/sessions\/%([^%]+)%/);
+      return m ? (transcripts.get(m[1]) ?? []) : [];
+    });
+
+    const res = await gateEdit(query, "sessions", "x", "a", PUB, { windowDays: 14, nowIso: "2026-06-12T00:00:00.000Z", judge, minAfter: 5 });
+    expect(res.before.failureRate).toBeCloseTo(1.0);
+    expect(res.after.failureRate).toBeCloseTo(0.0);
+    expect(res.decision).toBe("keep");
+  });
+});

From a1905fda0c7319e716a9dc6ec15f62442bd304f1 Mon Sep 17 00:00:00 2001
From: kaghni <kamo.aghbalyan@activeloop.ai>
Date: Fri, 5 Jun 2026 06:22:46 +0000
Subject: [PATCH 14/30] review: remove committed node_modules symlink
 (coderabbit)

A machine-specific absolute node_modules symlink got committed in 1929c96e because
.gitignore only had `node_modules/` (matches a directory, not the symlink file).
Untracked it and tightened the ignore to `node_modules` so the symlink can't be
re-added. The symlink stays locally (for builds); it's just no longer in git.
---
 .gitignore   | 2 +-
 node_modules | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)
 delete mode 120000 node_modules

diff --git a/.gitignore b/.gitignore
index a0b7da6d..d6754c7c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,4 @@
-node_modules/
+node_modules
 dist/
 tmp/
 *.js.map
diff --git a/node_modules b/node_modules
deleted file mode 120000
index 1dd8c0a6..00000000
--- a/node_modules
+++ /dev/null
@@ -1 +0,0 @@
-/home/ubuntu/al-projects/hivemind/node_modules
\ No newline at end of file

From 67ee35c0a0a52834c2830dea181f439b4bb098ab Mon Sep 17 00:00:00 2001
From: kaghni <kamo.aghbalyan@activeloop.ai>
Date: Fri, 5 Jun 2026 06:32:43 +0000
Subject: [PATCH 15/30] review: cap the judged window at maxChars (codex P2)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

scoreInvocations built the judge window from windowedTurns without applying the
maxChars cap, so a captured turn containing a pasted log/diff could be sent whole
to claude -p — slow, costly, or failing. Extracted elide() from
windowAroundInvocation and now apply it to the window before judging. Test asserts
a 5000-char paste reaches the judge elided (maxChars 300 -> prompt under 800 chars).
---
 src/skillify/deficiency-detector.ts      |  4 ++--
 src/skillify/skill-invocations.ts        | 15 +++++++++------
 tests/shared/deficiency-detector.test.ts | 21 +++++++++++++++++++++
 3 files changed, 32 insertions(+), 8 deletions(-)

diff --git a/src/skillify/deficiency-detector.ts b/src/skillify/deficiency-detector.ts
index e209df0b..6ed606ab 100644
--- a/src/skillify/deficiency-detector.ts
+++ b/src/skillify/deficiency-detector.ts
@@ -14,7 +14,7 @@
  * The ≥5 fire gate lives with the caller (worker): we just return deficientCount.
  */
 import {
-  listSkillInvocations, windowedTurns, type QueryFn, type SkillInvocation,
+  listSkillInvocations, windowedTurns, elide, type QueryFn, type SkillInvocation,
 } from "./skill-invocations.js";
 import { detectAnchor } from "./session-anchor.js";
 import { judgeSuccess, type ModelCall } from "./success-judge.js";
@@ -65,7 +65,7 @@ export async function scoreInvocations(
     const anchor = detectAnchor(turns);
     if (!anchor.anchored) continue; // free filter — no judge call
     anchored++;
-    const window = turns.map((t) => `${t.role}: ${t.text}`).join("\n\n");
+    const window = elide(turns.map((t) => `${t.role}: ${t.text}`).join("\n\n"), cfg.window?.maxChars ?? 4000);
     const verdict = await judgeSuccess(window, { model: cfg.judge });
     if (verdict.success === 0) {
       confirmed++;
diff --git a/src/skillify/skill-invocations.ts b/src/skillify/skill-invocations.ts
index 544cd3a3..1a6d5c64 100644
--- a/src/skillify/skill-invocations.ts
+++ b/src/skillify/skill-invocations.ts
@@ -152,17 +152,20 @@ export async function windowedTurns(
   return turns.slice(Math.max(0, invIndex - before), invIndex + after);
 }
 
+/** Head+tail elide a string to maxChars (so a pasted log/diff can't blow a prompt). */
+export function elide(text: string, maxChars: number): string {
+  if (text.length <= maxChars) return text;
+  const head = text.slice(0, Math.floor(maxChars * 0.55));
+  const tail = text.slice(text.length - Math.floor(maxChars * 0.45));
+  return `${head}\n\n…[${text.length - maxChars} chars elided]…\n\n${tail}`;
+}
+
 export async function windowAroundInvocation(
   query: QueryFn,
   sessionsTable: string,
   inv: SkillInvocation,
   opts: { before?: number; after?: number; maxChars?: number } = {},
 ): Promise<string> {
-  const maxChars = opts.maxChars ?? 4000;
   const slice = await windowedTurns(query, sessionsTable, inv, opts);
-  const joined = slice.map((t) => `${t.role}: ${t.text}`).join("\n\n");
-  if (joined.length <= maxChars) return joined;
-  const head = joined.slice(0, Math.floor(maxChars * 0.55));
-  const tail = joined.slice(joined.length - Math.floor(maxChars * 0.45));
-  return `${head}\n\n…[${joined.length - maxChars} chars elided]…\n\n${tail}`;
+  return elide(slice.map((t) => `${t.role}: ${t.text}`).join("\n\n"), opts.maxChars ?? 4000);
 }
diff --git a/tests/shared/deficiency-detector.test.ts b/tests/shared/deficiency-detector.test.ts
index 6e00b3d2..57972fa4 100644
--- a/tests/shared/deficiency-detector.test.ts
+++ b/tests/shared/deficiency-detector.test.ts
@@ -52,6 +52,27 @@ describe("detectDeficientSkills", () => {
     expect(judge).toHaveBeenCalledTimes(8);
   });
 
+  it("caps the judged window at maxChars (a pasted log can't blow the judge call)", async () => {
+    const huge = "L".repeat(5000);
+    const skill = "bigskill--x", sid = "S1";
+    const transcripts = new Map<string, Array<Record<string, unknown>>>([[sid, [
+      { message: { type: "user_message", content: "do it" } },
+      { message: { type: "assistant_message", content: huge } },                                  // pasted log
+      { message: { type: "tool_call", tool_name: "Skill", tool_input: JSON.stringify({ skill }), timestamp: sid } },
+      { message: { type: "user_message", content: "no that's wrong" } },
+    ]]]);
+    let judgedLen = 0;
+    const judge = vi.fn(async (_s: string, user: string) => { judgedLen = user.length; return '{"success":0,"confidence":0.9,"reason":"x"}'; });
+    const query = vi.fn(async (sql: string) => {
+      if (sql.includes('"Skill"') && sql.includes("ORDER BY last_update_date")) return [invRow(skill, sid)];
+      const m = sql.match(/\/sessions\/%([^%]+)%/);
+      return m ? (transcripts.get(m[1]) ?? []) : [];
+    });
+    await detectDeficientSkills(query, TABLE, { judge, minInvocations: 1, window: { maxChars: 300 } });
+    expect(judgedLen).toBeGreaterThan(0);  // judge was called (anchored)
+    expect(judgedLen).toBeLessThan(800);   // capped — not the ~5000-char paste
+  });
+
   it("respects a custom threshold + min-n", async () => {
     const { invs, transcripts } = world();
     const judge = vi.fn(async (_s: string, _u: string) => '{"success":0,"confidence":0.9,"reason":"x"}');

From f4047fc82a6fb2528b7ad59721f8ba4fafab5d26 Mon Sep 17 00:00:00 2001
From: kaghni <kamo.aghbalyan@activeloop.ai>
Date: Fri, 5 Jun 2026 06:32:43 +0000
Subject: [PATCH 16/30] review: deny all write-capable Claude tools in the
 judge/proposer (codex P2)

The judge & proposer get untrusted captured transcript text in their prompts, so
--disallowed-tools is the prompt-injection guard. The list missed write-capable
tools (MultiEdit, NotebookEdit, TodoWrite), leaving an escape hatch in installs
that expose them. Enumerated them so a malicious failure example can't act.
---
 src/skillify/claude-model.ts | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/skillify/claude-model.ts b/src/skillify/claude-model.ts
index 47b5e8f0..30be3701 100644
--- a/src/skillify/claude-model.ts
+++ b/src/skillify/claude-model.ts
@@ -9,7 +9,14 @@ import { spawn } from "node:child_process";
 /** (systemPrompt, userPrompt) -> raw model text. */
 export type ModelCall = (systemPrompt: string, userPrompt: string) => Promise<string>;
 
-const DENY = ["Bash", "Edit", "Write", "Read", "Glob", "Grep", "WebFetch", "WebSearch", "Task"];
+// Deny EVERY write/exec/network tool — the judge & proposer get untrusted captured
+// transcript text in their prompts, so a prompt-injected failure example must not be
+// able to act. Enumerate the write-capable ones (MultiEdit/NotebookEdit/TodoWrite)
+// too, not just the obvious Edit/Write.
+const DENY = [
+  "Bash", "Edit", "MultiEdit", "Write", "NotebookEdit", "Read", "Glob", "Grep",
+  "WebFetch", "WebSearch", "Task", "TodoWrite",
+];
 
 export function claudeModel(model: string, opts: { timeoutMs?: number } = {}): ModelCall {
   const timeoutMs = opts.timeoutMs ?? 120_000;

From fc3825a383f4d0a50d4af7d7e38744d9071aa8a5 Mon Sep 17 00:00:00 2001
From: kaghni <kamo.aghbalyan@activeloop.ai>
Date: Fri, 5 Jun 2026 06:32:43 +0000
Subject: [PATCH 17/30] review: worker reads project-scoped skill root too
 (codex P2)

The worker only read ~/.claude/skills, so a deficient skill pulled with
--to project (<cwd>/.claude/skills) had its invocation detected but no body to
edit -> the proposal was silently skipped. The detached worker inherits the
SessionStart cwd, so it now falls back to <cwd>/.claude/skills.
---
 src/skillify/skillopt-worker.ts | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/skillify/skillopt-worker.ts b/src/skillify/skillopt-worker.ts
index 8027272a..1e0596c2 100644
--- a/src/skillify/skillopt-worker.ts
+++ b/src/skillify/skillopt-worker.ts
@@ -28,7 +28,12 @@ async function main(): Promise<void> {
 
   const api = new DeeplakeApi(config.token, config.apiUrl, config.orgId, config.workspaceId, config.tableName);
   const query = (sql: string) => api.query(sql) as Promise<Array<Record<string, unknown>>>;
+  // Read both the global root and the project-scoped root (skills pulled with
+  // `--to project` live under <cwd>/.claude/skills; the detached worker inherits
+  // the SessionStart cwd). Without the project root, a deficient project-pulled
+  // skill would be silently skipped (readSkillBody → null).
   const skillsRoot = path.join(os.homedir(), ".claude", "skills");
+  const projectRoot = path.join(process.cwd(), ".claude", "skills");
   const proposalsRoot = path.join(getStateDir(), "skillopt", "proposals");
   const metaFile = path.join(getStateDir(), "skillopt", "meta.jsonl");
   const metaCache = loadMeta(metaFile);
@@ -37,7 +42,7 @@ async function main(): Promise<void> {
   const res = await runSkillOptCycle({
     query,
     sessionsTable: config.sessionsTableName,
-    readSkillBody: (name, author) => readSkillBodyFromDisk(skillsRoot, name, author),
+    readSkillBody: (name, author) => readSkillBodyFromDisk(skillsRoot, name, author) ?? readSkillBodyFromDisk(projectRoot, name, author),
     writeProposal: (rec) => writeProposalToDisk(proposalsRoot, rec),
     meta: {
       prior: (n, a) => priorEditSummaries(metaCache, n, a),

From a3e31861b5cab17fa802711b01df64d1ac633458 Mon Sep 17 00:00:00 2001
From: kaghni <kamo.aghbalyan@activeloop.ai>
Date: Fri, 5 Jun 2026 06:43:20 +0000
Subject: [PATCH 18/30] review: run judge/proposer with capture disabled (codex
 P2)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

On a machine with the Hivemind hooks installed, each judge/proposer claude -p call
was captured as a real session row — polluting the very sessions data the detector
scans. Spawn the child with HIVEMIND_CAPTURE=false, as the skillify gate runner does.
---
 src/skillify/claude-model.ts | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/skillify/claude-model.ts b/src/skillify/claude-model.ts
index 30be3701..b11f15be 100644
--- a/src/skillify/claude-model.ts
+++ b/src/skillify/claude-model.ts
@@ -25,7 +25,13 @@ export function claudeModel(model: string, opts: { timeoutMs?: number } = {}): M
       "-p", user, "--model", model, "--no-session-persistence",
       "--output-format", "json", "--system-prompt", system, "--disallowed-tools", ...DENY,
     ];
-    const child = spawn("claude", args, { stdio: ["ignore", "pipe", "pipe"] });
+    // HIVEMIND_CAPTURE=false so these judge/proposer calls are NOT captured as
+    // real sessions — otherwise the engine pollutes the very sessions data it
+    // scans (and the synthetic prompts would show up as transcript rows).
+    const child = spawn("claude", args, {
+      stdio: ["ignore", "pipe", "pipe"],
+      env: { ...process.env, HIVEMIND_CAPTURE: "false" },
+    });
     let out = "";
     let err = "";
     const timer = setTimeout(() => { child.kill("SIGKILL"); reject(new Error("claude timed out")); }, timeoutMs);

From a118d0a1ad268346007d97f769cc7100bcb6f526 Mon Sep 17 00:00:00 2001
From: kaghni <kamo.aghbalyan@activeloop.ai>
Date: Fri, 5 Jun 2026 06:43:20 +0000
Subject: [PATCH 19/30] review: strong pushback overrides benign in the anchor
 (codex P2)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

"thanks, but this is still failing" matched PUSHBACK but BENIGN's bare "thank"
also matched, so detectAnchor returned false and the failure was dropped — bad for
a recall-oriented stage. Split into STRONG corrections (always anchor, even amid
polite words) and the ambiguous bare "no" (benign-gated). Test covers polite
corrections.
---
 src/skillify/session-anchor.ts      | 18 +++++++++++-------
 tests/shared/session-anchor.test.ts |  5 +++++
 2 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/src/skillify/session-anchor.ts b/src/skillify/session-anchor.ts
index 677b8598..0b9fac9e 100644
--- a/src/skillify/session-anchor.ts
+++ b/src/skillify/session-anchor.ts
@@ -19,23 +19,27 @@ export interface Anchor {
   evidence: string; // the user turn that triggered it (truncated)
 }
 
-// User pushback: rejection / correction of what the assistant just produced.
-const PUSHBACK = /\b(no|nope|wrong|incorrect|not what|that'?s not|does ?n'?t work|did ?n'?t work|do ?n'?t work|wo ?n'?t work|is ?n'?t|that'?s wrong|broke|broken|still (failing|broken|not working|wrong|the same)|try again|undo|revert that|that fail)/i;
+// Unambiguous correction — ALWAYS an anchor, even amid polite words. This must
+// win over BENIGN so "thanks, but this is still failing" still fires.
+const STRONG = /\b(wrong|incorrect|not what|that'?s not|does ?n'?t work|did ?n'?t work|do ?n'?t work|wo ?n'?t work|is ?n'?t|broke|broken|still (failing|broken|not working|wrong|the same)|try again|undo|revert that|that fail|not right)/i;
 
-// Clear benign negatives we don't want to fire on (keeps obvious false positives
-// out of the judge to save tokens). Intentionally narrow — when in doubt, fire.
-const BENIGN = /\b(no (problem|worries|need|biggie)|no,? thanks|all good|works? (now|great|fine|perfectly)|that works|perfect|looks good|thank)/i;
+// Ambiguous negation: "no" is pushback ("no, that's off") but also benign
+// ("no problem"), so it only anchors when the turn isn't a clear benign phrase.
+const AMBIGUOUS = /\b(no|nope)\b/i;
+const BENIGN = /\b(no (problem|worries|need|biggie)|no,? thanks|all good|works? (now|great|fine|perfectly)|that works|perfect|looks good)\b/i;
 
 /**
  * Detect a correction anchor in a windowed slice of turns. Only a USER turn that
  * immediately follows an ASSISTANT turn can be pushback (the first user turn is
- * the request, not a reaction).
+ * the request, not a reaction). Recall-oriented: a strong correction phrase fires
+ * regardless of polite framing; only the bare "no" is benign-gated.
  */
 export function detectAnchor(turns: Turn[]): Anchor {
   for (let i = 1; i < turns.length; i++) {
     const t = turns[i];
     if (t.role !== "USER" || turns[i - 1].role !== "ASSISTANT") continue;
-    if (PUSHBACK.test(t.text) && !BENIGN.test(t.text)) {
+    const anchored = STRONG.test(t.text) || (AMBIGUOUS.test(t.text) && !BENIGN.test(t.text));
+    if (anchored) {
       return { anchored: true, kind: "correction", evidence: t.text.slice(0, 200) };
     }
   }
diff --git a/tests/shared/session-anchor.test.ts b/tests/shared/session-anchor.test.ts
index 1780584f..62dc7ee3 100644
--- a/tests/shared/session-anchor.test.ts
+++ b/tests/shared/session-anchor.test.ts
@@ -34,6 +34,11 @@ describe("detectAnchor", () => {
     }
   });
 
+  it("fires on polite-but-failing corrections (strong pushback overrides benign words)", () => {
+    expect(detectAnchor([a("here"), u("thanks, but this is still failing")]).anchored).toBe(true);
+    expect(detectAnchor([a("done"), u("perfect start, but that's still wrong")]).anchored).toBe(true);
+  });
+
   it("returns none when the user is satisfied / silent", () => {
     expect(detectAnchor([u("do X"), a("done")]).anchored).toBe(false);
     expect(detectAnchor([]).anchored).toBe(false);

From 50913652fb241ab4e385c1f79d097d903b9019b2 Mon Sep 17 00:00:00 2001
From: kaghni <kamo.aghbalyan@activeloop.ai>
Date: Fri, 5 Jun 2026 18:23:26 +0000
Subject: [PATCH 20/30] review: reject protected-region overlaps, not just
 inside-starts (codex P2)

targetsProtected only checked the target's start offset, so a replace/delete whose
target began just before SLOW_UPDATE_START and spanned into the block slipped
through and could remove protected longitudinal guidance. Now rejects any target
RANGE that overlaps [r0, r1). Test covers a spanning target.
---
 src/skillify/skill-edits.ts      | 6 +++++-
 tests/shared/skill-edits.test.ts | 9 +++++++++
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/src/skillify/skill-edits.ts b/src/skillify/skill-edits.ts
index 218ba6a0..115c1aa9 100644
--- a/src/skillify/skill-edits.ts
+++ b/src/skillify/skill-edits.ts
@@ -30,7 +30,11 @@ function targetsProtected(skill: string, target: string): boolean {
   const r = protectedRange(skill);
   if (!r || !target) return false;
   const idx = skill.indexOf(target);
-  return idx !== -1 && idx >= r[0] && idx < r[1];
+  if (idx === -1) return false;
+  // Reject if the target RANGE [idx, idx+len) overlaps the protected range at all —
+  // not just if it starts inside it (a target that begins just before SLOW_UPDATE_START
+  // and spans into the block must not be allowed to delete protected guidance).
+  return idx < r[1] && idx + target.length > r[0];
 }
 
 /** Enforce the edit budget ("textual learning rate"): keep at most `budget` edits. */
diff --git a/tests/shared/skill-edits.test.ts b/tests/shared/skill-edits.test.ts
index 3322cb48..9143270c 100644
--- a/tests/shared/skill-edits.test.ts
+++ b/tests/shared/skill-edits.test.ts
@@ -43,6 +43,15 @@ describe("applyEdits", () => {
     // appended content sits before the protected block
     expect(r.skill.indexOf("2. b")).toBeLessThan(r.skill.indexOf(SU_START));
   });
+
+  it("rejects a target that SPANS INTO the protected region (not just one starting inside)", () => {
+    const doc = `## Rules\n1. a\n${SU_START}\nLongitudinal guidance.\n${SU_END}`;
+    // target begins before SLOW_UPDATE_START but extends into the protected block
+    const r = applyEdits(doc, [{ op: "delete", target: `1. a\n${SU_START}\nLongitudinal` }]);
+    expect(r.applied).toBe(0);
+    expect(r.skill).toContain("Longitudinal guidance.");
+    expect(r.report.some((l) => l.includes("protected slow-update region"))).toBe(true);
+  });
 });
 
 describe("selectEdits (edit budget)", () => {

From b934458b084ece8127bdc8427b6ca846b690d929 Mon Sep 17 00:00:00 2001
From: kaghni <kamo.aghbalyan@activeloop.ai>
Date: Fri, 5 Jun 2026 18:23:26 +0000
Subject: [PATCH 21/30] review: anchor only post-invocation turns + validate
 skill refs as paths (codex P2)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two findings sharing skill-invocations.ts:
- detectAnchor scanned the whole window incl. pre-invocation turns, so a correction
  from an EARLIER failed attempt (skill used as a repair) was misattributed to this
  skill. windowedTurns now returns a pivot (first post-invocation turn); detectAnchor
  takes fromIndex and only fires when both the reaction AND the assistant it reacts to
  are post-invocation. Test transcripts reordered to real capture order
  (user → tool_call → assistant → reaction).
- splitOrgSkill now rejects refs containing path separators or '..' before name/author
  are used to build skills/proposals filesystem paths (path-escape guard, matching the
  pull path's untrusted treatment).
---
 src/skillify/deficiency-detector.ts      |  4 ++--
 src/skillify/session-anchor.ts           | 15 +++++++++------
 src/skillify/skill-invocations.ts        | 21 +++++++++++++++++----
 tests/shared/deficiency-detector.test.ts |  4 ++--
 tests/shared/session-anchor.test.ts      |  8 ++++++++
 tests/shared/skill-edit-gate.test.ts     |  2 +-
 tests/shared/skill-invocations.test.ts   |  6 ++++++
 tests/shared/skillopt-engine.test.ts     |  2 +-
 8 files changed, 46 insertions(+), 16 deletions(-)

diff --git a/src/skillify/deficiency-detector.ts b/src/skillify/deficiency-detector.ts
index 6ed606ab..73b77514 100644
--- a/src/skillify/deficiency-detector.ts
+++ b/src/skillify/deficiency-detector.ts
@@ -61,8 +61,8 @@ export async function scoreInvocations(
   let confirmed = 0;
   const examples: string[] = [];
   for (const inv of invocations) {
-    const turns = await windowedTurns(query, sessionsTable, inv, cfg.window);
-    const anchor = detectAnchor(turns);
+    const { turns, pivot } = await windowedTurns(query, sessionsTable, inv, cfg.window);
+    const anchor = detectAnchor(turns, pivot); // anchor only on post-invocation reaction
     if (!anchor.anchored) continue; // free filter — no judge call
     anchored++;
     const window = elide(turns.map((t) => `${t.role}: ${t.text}`).join("\n\n"), cfg.window?.maxChars ?? 4000);
diff --git a/src/skillify/session-anchor.ts b/src/skillify/session-anchor.ts
index 0b9fac9e..83056d8d 100644
--- a/src/skillify/session-anchor.ts
+++ b/src/skillify/session-anchor.ts
@@ -29,15 +29,18 @@ const AMBIGUOUS = /\b(no|nope)\b/i;
 const BENIGN = /\b(no (problem|worries|need|biggie)|no,? thanks|all good|works? (now|great|fine|perfectly)|that works|perfect|looks good)\b/i;
 
 /**
- * Detect a correction anchor in a windowed slice of turns. Only a USER turn that
- * immediately follows an ASSISTANT turn can be pushback (the first user turn is
- * the request, not a reaction). Recall-oriented: a strong correction phrase fires
- * regardless of polite framing; only the bare "no" is benign-gated.
+ * Detect a correction anchor in a windowed slice of turns. A pushback is a USER turn
+ * reacting to an ASSISTANT turn — and BOTH must be POST-invocation (index ≥ fromIndex),
+ * so a correction that happened BEFORE the skill ran (e.g. the skill was a repair
+ * attempt) isn't misattributed to this skill. fromIndex defaults to 0 (scan all).
+ * Recall-oriented: a strong correction fires regardless of polite framing; only the
+ * bare "no" is benign-gated.
  */
-export function detectAnchor(turns: Turn[]): Anchor {
-  for (let i = 1; i < turns.length; i++) {
+export function detectAnchor(turns: Turn[], fromIndex = 0): Anchor {
+  for (let i = Math.max(1, fromIndex); i < turns.length; i++) {
     const t = turns[i];
     if (t.role !== "USER" || turns[i - 1].role !== "ASSISTANT") continue;
+    if (i - 1 < fromIndex) continue; // the assistant being reacted to must be post-invocation
     const anchored = STRONG.test(t.text) || (AMBIGUOUS.test(t.text) && !BENIGN.test(t.text));
     if (anchored) {
       return { anchored: true, kind: "correction", evidence: t.text.slice(0, 200) };
diff --git a/src/skillify/skill-invocations.ts b/src/skillify/skill-invocations.ts
index 1a6d5c64..791a65d7 100644
--- a/src/skillify/skill-invocations.ts
+++ b/src/skillify/skill-invocations.ts
@@ -55,6 +55,10 @@ export function invokedSkillRef(msg: ParsedMsg): string | null {
 /** Split "<name>--<author>" → parts. null for plugin-namespaced / bare / malformed refs. */
 export function splitOrgSkill(skill: string): { name: string; author: string } | null {
   if (skill.includes(":")) return null; // plugin-namespaced (e.g. hivemind:hivemind-memory)
+  // name/author are used to build filesystem paths (skills dir, proposals dir), so a
+  // captured tool_input must not smuggle path separators / traversal — same untrusted
+  // treatment the pull path applies to these segments.
+  if (skill.includes("/") || skill.includes("\\") || skill.includes("..")) return null;
   const i = skill.lastIndexOf("--");
   if (i <= 0 || i + 2 >= skill.length) return null; // bare or malformed
   return { name: skill.slice(0, i), author: skill.slice(i + 2) };
@@ -140,16 +144,25 @@ async function sessionTurns(
  * turns after — where the help-or-harm signal lives — head+tail elided to maxChars.
  * `before`/`after` are tunable; defaults chosen as a small starting point.
  */
+/** A windowed slice plus `pivot` = the index in `turns` of the first POST-invocation
+ * turn (turns before it are the pre-invocation context — kept for the judge, but the
+ * anchor must not scan them, or a prior correction gets misattributed to this skill). */
+export interface WindowSlice {
+  turns: Turn[];
+  pivot: number;
+}
+
 export async function windowedTurns(
   query: QueryFn,
   sessionsTable: string,
   inv: SkillInvocation,
   opts: { before?: number; after?: number } = {},
-): Promise<Turn[]> {
+): Promise<WindowSlice> {
   const before = opts.before ?? 3;
   const after = opts.after ?? 6;
   const { turns, invIndex } = await sessionTurns(query, sessionsTable, inv);
-  return turns.slice(Math.max(0, invIndex - before), invIndex + after);
+  const start = Math.max(0, invIndex - before);
+  return { turns: turns.slice(start, invIndex + after), pivot: invIndex - start };
 }
 
 /** Head+tail elide a string to maxChars (so a pasted log/diff can't blow a prompt). */
@@ -166,6 +179,6 @@ export async function windowAroundInvocation(
   inv: SkillInvocation,
   opts: { before?: number; after?: number; maxChars?: number } = {},
 ): Promise<string> {
-  const slice = await windowedTurns(query, sessionsTable, inv, opts);
-  return elide(slice.map((t) => `${t.role}: ${t.text}`).join("\n\n"), opts.maxChars ?? 4000);
+  const { turns } = await windowedTurns(query, sessionsTable, inv, opts);
+  return elide(turns.map((t) => `${t.role}: ${t.text}`).join("\n\n"), opts.maxChars ?? 4000);
 }
diff --git a/tests/shared/deficiency-detector.test.ts b/tests/shared/deficiency-detector.test.ts
index 57972fa4..5c9ee877 100644
--- a/tests/shared/deficiency-detector.test.ts
+++ b/tests/shared/deficiency-detector.test.ts
@@ -9,8 +9,8 @@ const invRow = (skill: string, sid: string) => ({
 });
 const transcript = (skill: string, sid: string, pushback: boolean) => [
   { message: { type: "user_message", content: "do it" } },
-  { message: { type: "assistant_message", content: "done" } },
   { message: { type: "tool_call", tool_name: "Skill", tool_input: JSON.stringify({ skill }), timestamp: sid } },
+  { message: { type: "assistant_message", content: "done" } },
   { message: { type: "user_message", content: pushback ? "no that's wrong, it mocks the client" : "thanks, perfect" } },
 ];
 
@@ -57,8 +57,8 @@ describe("detectDeficientSkills", () => {
     const skill = "bigskill--x", sid = "S1";
     const transcripts = new Map<string, Array<Record<string, unknown>>>([[sid, [
       { message: { type: "user_message", content: "do it" } },
-      { message: { type: "assistant_message", content: huge } },                                  // pasted log
       { message: { type: "tool_call", tool_name: "Skill", tool_input: JSON.stringify({ skill }), timestamp: sid } },
+      { message: { type: "assistant_message", content: huge } },                                  // pasted log
       { message: { type: "user_message", content: "no that's wrong" } },
     ]]]);
     let judgedLen = 0;
diff --git a/tests/shared/session-anchor.test.ts b/tests/shared/session-anchor.test.ts
index 62dc7ee3..942462c7 100644
--- a/tests/shared/session-anchor.test.ts
+++ b/tests/shared/session-anchor.test.ts
@@ -39,6 +39,14 @@ describe("detectAnchor", () => {
     expect(detectAnchor([a("done"), u("perfect start, but that's still wrong")]).anchored).toBe(true);
   });
 
+  it("ignores PRE-invocation pushback (fromIndex) — no misattribution to a repair-attempt skill", () => {
+    // turns: [a, USER pushback (pre-invocation), a (skill output), USER ok] — pivot=2
+    const turns = [a("attempt 1"), u("no that's wrong"), a("retried with the skill"), u("looks good")];
+    expect(detectAnchor(turns, 2).anchored).toBe(false); // the pre-invocation correction is not scanned
+    // a genuine POST-invocation pushback still fires
+    expect(detectAnchor([a("attempt 1"), u("no wrong"), a("fixed"), u("still failing")], 2).anchored).toBe(true);
+  });
+
   it("returns none when the user is satisfied / silent", () => {
     expect(detectAnchor([u("do X"), a("done")]).anchored).toBe(false);
     expect(detectAnchor([]).anchored).toBe(false);
diff --git a/tests/shared/skill-edit-gate.test.ts b/tests/shared/skill-edit-gate.test.ts
index 283ff1fb..4cbb760e 100644
--- a/tests/shared/skill-edit-gate.test.ts
+++ b/tests/shared/skill-edit-gate.test.ts
@@ -25,8 +25,8 @@ const invRow = (skill: string, sid: string) => ({
 });
 const transcript = (skill: string, sid: string, pushback: boolean) => [
   { message: { type: "user_message", content: "do it" } },
-  { message: { type: "assistant_message", content: "done (mocked)" } },
   { message: { type: "tool_call", tool_name: "Skill", tool_input: JSON.stringify({ skill }), timestamp: sid } },
+  { message: { type: "assistant_message", content: "done (mocked)" } },
   { message: { type: "user_message", content: pushback ? "no that's wrong, it mocks the client" : "looks good thanks" } },
 ];
 
diff --git a/tests/shared/skill-invocations.test.ts b/tests/shared/skill-invocations.test.ts
index ecdfadbb..ec8e584d 100644
--- a/tests/shared/skill-invocations.test.ts
+++ b/tests/shared/skill-invocations.test.ts
@@ -39,6 +39,12 @@ describe("splitOrgSkill", () => {
     expect(splitOrgSkill("update-config")).toBeNull();            // bare
     expect(splitOrgSkill("baz--")).toBeNull();                    // empty author
   });
+  it("rejects refs with path separators / traversal (no path escape)", () => {
+    expect(splitOrgSkill("../../etc--x")).toBeNull();
+    expect(splitOrgSkill("ok--..%2f")).toBeNull();   // contains ..
+    expect(splitOrgSkill("a/b--c")).toBeNull();      // separator
+    expect(splitOrgSkill("a--b/c")).toBeNull();      // separator in author
+  });
 });
 
 describe("listSkillInvocations", () => {
diff --git a/tests/shared/skillopt-engine.test.ts b/tests/shared/skillopt-engine.test.ts
index d297fda6..cd9bf3cc 100644
--- a/tests/shared/skillopt-engine.test.ts
+++ b/tests/shared/skillopt-engine.test.ts
@@ -7,8 +7,8 @@ const invRow = (skill: string, sid: string) => ({
 });
 const transcript = (skill: string, sid: string, pushback: boolean) => [
   { message: { type: "user_message", content: "do it" } },
-  { message: { type: "assistant_message", content: "done" } },
   { message: { type: "tool_call", tool_name: "Skill", tool_input: JSON.stringify({ skill }), timestamp: sid } },
+  { message: { type: "assistant_message", content: "done" } },
   { message: { type: "user_message", content: pushback ? "no that's wrong, it mocks the client" : "thanks, perfect" } },
 ];
 

From b3db22edfe473d3ecc86b1dd88d1f2834cb95d34 Mon Sep 17 00:00:00 2001
From: kaghni <kamo.aghbalyan@activeloop.ai>
Date: Fri, 5 Jun 2026 18:37:59 +0000
Subject: [PATCH 22/30] review: no-tools allow-list for judge/proposer, not a
 deny-list (codex P1)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A deny-list only blocks named built-ins — any other tool in the user's env (LS,
NotebookRead, configured MCP tools) stayed callable, breaking the pure-text
isolation for calls that include untrusted captured transcript text. Switched to
`--tools ""` (empty allow-list = zero tools available, authoritative over built-ins
AND MCP). Verified the flag isolates with a real claude -p call.
---
 src/skillify/claude-model.ts | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/src/skillify/claude-model.ts b/src/skillify/claude-model.ts
index b11f15be..7be4bf87 100644
--- a/src/skillify/claude-model.ts
+++ b/src/skillify/claude-model.ts
@@ -9,21 +9,16 @@ import { spawn } from "node:child_process";
 /** (systemPrompt, userPrompt) -> raw model text. */
 export type ModelCall = (systemPrompt: string, userPrompt: string) => Promise<string>;
 
-// Deny EVERY write/exec/network tool — the judge & proposer get untrusted captured
-// transcript text in their prompts, so a prompt-injected failure example must not be
-// able to act. Enumerate the write-capable ones (MultiEdit/NotebookEdit/TodoWrite)
-// too, not just the obvious Edit/Write.
-const DENY = [
-  "Bash", "Edit", "MultiEdit", "Write", "NotebookEdit", "Read", "Glob", "Grep",
-  "WebFetch", "WebSearch", "Task", "TodoWrite",
-];
-
 export function claudeModel(model: string, opts: { timeoutMs?: number } = {}): ModelCall {
   const timeoutMs = opts.timeoutMs ?? 120_000;
   return (system, user) => new Promise<string>((resolve, reject) => {
     const args = [
       "-p", user, "--model", model, "--no-session-persistence",
-      "--output-format", "json", "--system-prompt", system, "--disallowed-tools", ...DENY,
+      "--output-format", "json", "--system-prompt", system,
+      // Empty allow-list = NO tools available. Authoritative: it covers built-ins AND
+      // any MCP/configured tools (a deny-list can't enumerate those), so prompt-injected
+      // transcript text in the judge/proposer prompt can never trigger tool use.
+      "--tools", "",
     ];
     // HIVEMIND_CAPTURE=false so these judge/proposer calls are NOT captured as
     // real sessions — otherwise the engine pollutes the very sessions data it

From ba78b2878b6a7b6be51d69ee775c2b07a03f46d2 Mon Sep 17 00:00:00 2001
From: kaghni <kamo.aghbalyan@activeloop.ai>
Date: Fri, 5 Jun 2026 18:37:59 +0000
Subject: [PATCH 23/30] review: resolve skill body via pull manifest, not the
 worker cwd (codex P2)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Invocations come from ALL projects, but the worker resolved project-scoped skills
from its own SessionStart cwd — so a skill pulled --to project in another cwd was
skipped, or a same-named skill in the current cwd was edited instead.
readSkillBodyViaManifest now resolves the authoritative installRoot from the pull
manifest (global ~/.claude/skills as fallback). Test covers the project-root read.
---
 src/skillify/skillopt-engine.ts      | 24 ++++++++++++++++++++++++
 src/skillify/skillopt-worker.ts      | 14 +++++++-------
 tests/shared/skillopt-engine.test.ts | 23 ++++++++++++++++++++++-
 3 files changed, 53 insertions(+), 8 deletions(-)

diff --git a/src/skillify/skillopt-engine.ts b/src/skillify/skillopt-engine.ts
index 24d1c591..4d1a5f8f 100644
--- a/src/skillify/skillopt-engine.ts
+++ b/src/skillify/skillopt-engine.ts
@@ -18,6 +18,7 @@ import { proposeSkillEdit, type ProposeConfig } from "./skill-proposer.js";
 import { splitFrontmatter } from "./skill-publisher.js";
 import type { QueryFn } from "./skill-invocations.js";
 import type { Edit } from "./skill-edits.js";
+import type { PulledManifest } from "./manifest.js";
 
 export interface ProposalRecord {
   name: string;
@@ -105,3 +106,26 @@ export function readSkillBodyFromDisk(skillsRoot: string, name: string, author:
     return null;
   }
 }
+
+/**
+ * Resolve a skill's body from its ACTUAL install location via the pull manifest,
+ * trying every recorded installRoot, then a fallback root. Authoritative — handles
+ * skills pulled with `--to project` into any cwd (invocations come from all
+ * projects, so the worker can't assume its own cwd), and avoids editing a
+ * same-named skill that happens to sit in the current cwd.
+ */
+export function readSkillBodyViaManifest(
+  name: string,
+  author: string,
+  manifest: PulledManifest,
+  fallbackRoot?: string,
+): string | null {
+  const dirName = `${name}--${author}`;
+  const roots = manifest.entries.filter((e) => e.dirName === dirName).map((e) => e.installRoot);
+  if (fallbackRoot) roots.push(fallbackRoot);
+  for (const root of roots) {
+    const body = readSkillBodyFromDisk(root, name, author);
+    if (body) return body;
+  }
+  return null;
+}
diff --git a/src/skillify/skillopt-worker.ts b/src/skillify/skillopt-worker.ts
index 1e0596c2..ffaa1284 100644
--- a/src/skillify/skillopt-worker.ts
+++ b/src/skillify/skillopt-worker.ts
@@ -16,8 +16,9 @@ import { log as _log } from "../utils/debug.js";
 import { loadConfig } from "../config.js";
 import { DeeplakeApi } from "../deeplake-api.js";
 import { getStateDir } from "./state-dir.js";
-import { runSkillOptCycle, writeProposalToDisk, readSkillBodyFromDisk } from "./skillopt-engine.js";
+import { runSkillOptCycle, writeProposalToDisk, readSkillBodyViaManifest } from "./skillopt-engine.js";
 import { loadMeta, appendMeta, priorEditSummaries, alreadyProposed, metaEntryFor } from "./skillopt-meta.js";
+import { loadManifest } from "./manifest.js";
 
 const log = (m: string) => _log("skillopt-worker", m);
 
@@ -28,12 +29,11 @@ async function main(): Promise<void> {
 
   const api = new DeeplakeApi(config.token, config.apiUrl, config.orgId, config.workspaceId, config.tableName);
   const query = (sql: string) => api.query(sql) as Promise<Array<Record<string, unknown>>>;
-  // Read both the global root and the project-scoped root (skills pulled with
-  // `--to project` live under <cwd>/.claude/skills; the detached worker inherits
-  // the SessionStart cwd). Without the project root, a deficient project-pulled
-  // skill would be silently skipped (readSkillBody → null).
+  // Resolve skill bodies via the pull manifest's recorded installRoot (authoritative)
+  // — invocations come from ALL projects, so we can't assume the worker's own cwd.
+  // The global ~/.claude/skills is a fallback for skills not in the manifest.
+  const manifest = loadManifest();
   const skillsRoot = path.join(os.homedir(), ".claude", "skills");
-  const projectRoot = path.join(process.cwd(), ".claude", "skills");
   const proposalsRoot = path.join(getStateDir(), "skillopt", "proposals");
   const metaFile = path.join(getStateDir(), "skillopt", "meta.jsonl");
   const metaCache = loadMeta(metaFile);
@@ -42,7 +42,7 @@ async function main(): Promise<void> {
   const res = await runSkillOptCycle({
     query,
     sessionsTable: config.sessionsTableName,
-    readSkillBody: (name, author) => readSkillBodyFromDisk(skillsRoot, name, author) ?? readSkillBodyFromDisk(projectRoot, name, author),
+    readSkillBody: (name, author) => readSkillBodyViaManifest(name, author, manifest, skillsRoot),
     writeProposal: (rec) => writeProposalToDisk(proposalsRoot, rec),
     meta: {
       prior: (n, a) => priorEditSummaries(metaCache, n, a),
diff --git a/tests/shared/skillopt-engine.test.ts b/tests/shared/skillopt-engine.test.ts
index cd9bf3cc..b06fda27 100644
--- a/tests/shared/skillopt-engine.test.ts
+++ b/tests/shared/skillopt-engine.test.ts
@@ -1,5 +1,9 @@
 import { describe, it, expect, vi } from "vitest";
-import { runSkillOptCycle, type ProposalRecord } from "../../src/skillify/skillopt-engine.js";
+import fs from "node:fs";
+import os from "node:os";
+import path from "node:path";
+import { runSkillOptCycle, readSkillBodyViaManifest, type ProposalRecord } from "../../src/skillify/skillopt-engine.js";
+import type { PulledManifest } from "../../src/skillify/manifest.js";
 
 const invRow = (skill: string, sid: string) => ({
   message: { type: "tool_call", tool_name: "Skill", tool_input: JSON.stringify({ skill }), session_id: sid, timestamp: sid },
@@ -96,6 +100,23 @@ describe("runSkillOptCycle", () => {
     expect(res.proposals.find((p) => p.name === "bad0")!.changed).toBe(false);
   });
 
+  it("reads a project-pulled skill body via the manifest's installRoot (not the cwd)", () => {
+    const projRoot = fs.mkdtempSync(path.join(os.tmpdir(), "proj-"));
+    try {
+      fs.mkdirSync(path.join(projRoot, "x--a"), { recursive: true });
+      fs.writeFileSync(path.join(projRoot, "x--a", "SKILL.md"), "---\nname: x\nauthor: a\n---\n## Body\nproject body");
+      const manifest = {
+        version: 1,
+        entries: [{ dirName: "x--a", name: "x", author: "a", installRoot: projRoot, projectKey: "", remoteVersion: 1, install: "project", installedAtVersion: 1, pulledAt: "", symlinks: [] }],
+      } as unknown as PulledManifest;
+      expect(readSkillBodyViaManifest("x", "a", manifest, "/nonexistent-global")).toBe("## Body\nproject body");
+      // no manifest entry + no fallback body → null (not a silent wrong-skill edit)
+      expect(readSkillBodyViaManifest("y", "b", manifest, "/nonexistent-global")).toBeNull();
+    } finally {
+      fs.rmSync(projRoot, { recursive: true, force: true });
+    }
+  });
+
   it("honors a custom fireThreshold", async () => {
     const res = await runSkillOptCycle({
       query: world(3), sessionsTable: "sessions", now: "t", fireThreshold: 3,

From fe1b913ff25a5cc3f49d498553660631942338a5 Mon Sep 17 00:00:00 2001
From: kaghni <kamo.aghbalyan@activeloop.ai>
Date: Fri, 5 Jun 2026 18:54:23 +0000
Subject: [PATCH 24/30] review: skip this package's SessionStart hook on
 internal claude -p calls (codex P2)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

HIVEMIND_CAPTURE=false stopped capture, but the spawned judge/proposer claude -p
still ran our SessionStart hook — injecting the large Deeplake context into the
prompt and doing auto-pull/graph work, once per anchored invocation. Set
HIVEMIND_WIKI_WORKER=1 (the guard internal runners use) so the hook returns
immediately: no prompt contamination, no repeated background work.
---
 src/skillify/claude-model.ts | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/skillify/claude-model.ts b/src/skillify/claude-model.ts
index 7be4bf87..f0f7641b 100644
--- a/src/skillify/claude-model.ts
+++ b/src/skillify/claude-model.ts
@@ -20,12 +20,14 @@ export function claudeModel(model: string, opts: { timeoutMs?: number } = {}): M
       // transcript text in the judge/proposer prompt can never trigger tool use.
       "--tools", "",
     ];
-    // HIVEMIND_CAPTURE=false so these judge/proposer calls are NOT captured as
-    // real sessions — otherwise the engine pollutes the very sessions data it
-    // scans (and the synthetic prompts would show up as transcript rows).
+    // HIVEMIND_CAPTURE=false so these calls aren't captured as real sessions, AND
+    // HIVEMIND_WIKI_WORKER=1 so the spawned claude -p skips this package's SessionStart
+    // hook entirely (no Deeplake-context injection into the prompt, no auto-pull/graph
+    // work) — one child per anchored invocation would otherwise contaminate the judge
+    // prompt and pile up background work. Same guard the other internal runners use.
     const child = spawn("claude", args, {
       stdio: ["ignore", "pipe", "pipe"],
-      env: { ...process.env, HIVEMIND_CAPTURE: "false" },
+      env: { ...process.env, HIVEMIND_CAPTURE: "false", HIVEMIND_WIKI_WORKER: "1" },
     });
     let out = "";
     let err = "";

From 62907c66464c05b1838d3c1ed69e75134f63cbdf Mon Sep 17 00:00:00 2001
From: kaghni <kamo.aghbalyan@activeloop.ai>
Date: Fri, 5 Jun 2026 18:54:23 +0000
Subject: [PATCH 25/30] review: match session rows exactly when reconstructing
 windows (codex P3)

path LIKE '%sid%' could pull rows from an unrelated session (sid as a substring, or
SQL LIKE wildcards in the id), feeding the judge a mixed transcript. Escape LIKE
wildcards (+ ESCAPE clause) and drop any row whose message.session_id isn't the exact
session. Test covers a cross-session collision row.
---
 src/skillify/skill-invocations.ts      | 12 ++++++++++--
 tests/shared/skill-invocations.test.ts | 13 +++++++++++++
 2 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/src/skillify/skill-invocations.ts b/src/skillify/skill-invocations.ts
index 791a65d7..ae9ed9b7 100644
--- a/src/skillify/skill-invocations.ts
+++ b/src/skillify/skill-invocations.ts
@@ -108,18 +108,26 @@ export interface Turn { role: "USER" | "ASSISTANT"; text: string }
  * Reconstruct the transcript turns of a session, and mark where (between which two
  * turns) the given invocation happened — so callers can window around it.
  */
+/** Escape SQL LIKE wildcards (% _ \) so a session id with those chars matches literally. */
+function likeEscape(s: string): string {
+  return s.replace(/([\\%_])/g, "\\$1");
+}
+
 async function sessionTurns(
   query: QueryFn, sessionsTable: string, inv: SkillInvocation,
 ): Promise<{ turns: Turn[]; invIndex: number }> {
-  const sid = sqlStr(inv.sessionId);
+  const sid = sqlStr(likeEscape(inv.sessionId));
   const rows = await query(
-    `SELECT message FROM "${sessionsTable}" WHERE path LIKE '/sessions/%${sid}%' ORDER BY creation_date ASC`,
+    `SELECT message FROM "${sessionsTable}" WHERE path LIKE '/sessions/%${sid}%' ESCAPE '\\' ORDER BY creation_date ASC`,
   );
   const turns: Turn[] = [];
   let invIndex = -1;
   for (const r of rows) {
     const j = parseMessage(r.message);
     if (!j) continue;
+    // Exact session match: `path LIKE %sid%` can match a substring/wildcard collision,
+    // so drop any row whose recorded session_id isn't this exact session.
+    if (typeof j.session_id === "string" && j.session_id !== inv.sessionId) continue;
     // The invocation itself is a tool_call (not a turn): mark its position then skip.
     const ref = invokedSkillRef(j);
     if (ref) {
diff --git a/tests/shared/skill-invocations.test.ts b/tests/shared/skill-invocations.test.ts
index ec8e584d..8e8e1bfd 100644
--- a/tests/shared/skill-invocations.test.ts
+++ b/tests/shared/skill-invocations.test.ts
@@ -106,4 +106,17 @@ describe("windowAroundInvocation", () => {
     expect(out).toContain("chars elided");
     expect(out.length).toBeLessThan(300);
   });
+
+  it("drops rows from a different session + escapes LIKE wildcards (exact match)", async () => {
+    const { fn, calls } = mockQuery([
+      { message: { type: "user_message", content: "first", session_id: "S1" } },
+      { message: { type: "tool_call", tool_name: "Skill", tool_input: JSON.stringify({ skill: "posthog-smoke--kamo" }), session_id: "S1", timestamp: "t5" } },
+      { message: { type: "assistant_message", content: "did X", session_id: "S1" } },
+      { message: { type: "user_message", content: "LEAK from other session", session_id: "S2" } }, // collision → dropped
+    ]);
+    const out = await windowAroundInvocation(fn, TABLE, inv, { before: 5, after: 5 });
+    expect(calls[0]).toContain("ESCAPE '\\'");
+    expect(out).toContain("did X");
+    expect(out).not.toContain("LEAK from other session");
+  });
 });

From 78d68a82554a2ec990039341c015ad34572f1f68 Mon Sep 17 00:00:00 2001
From: kaghni <kamo.aghbalyan@activeloop.ai>
Date: Fri, 5 Jun 2026 18:54:23 +0000
Subject: [PATCH 26/30] review: create frontmatter when bumping a SKILL.md
 without one (codex P3)

bumpVersion('') left the frontmatter empty while reporting newVersion: 2, so a
published edit of a no-frontmatter skill carried no version and the gate couldn't
tell versions apart. Now emits a minimal '---\nversion: N\n---\n' block. Test added.
---
 src/skillify/skill-publisher.ts      | 11 ++++++++---
 tests/shared/skill-publisher.test.ts |  5 +++++
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/src/skillify/skill-publisher.ts b/src/skillify/skill-publisher.ts
index 68a3f52d..a647dcb5 100644
--- a/src/skillify/skill-publisher.ts
+++ b/src/skillify/skill-publisher.ts
@@ -31,9 +31,14 @@ export function bumpVersion(frontmatter: string): { frontmatter: string; oldVers
   const m = frontmatter.match(/^version:\s*(\d+)\s*$/m);
   const oldVersion = m ? parseInt(m[1], 10) : 1;
   const newVersion = oldVersion + 1;
-  const next = m
-    ? frontmatter.replace(/^version:\s*\d+\s*$/m, `version: ${newVersion}`)
-    : frontmatter.replace(/\n---\n$/, `\nversion: ${newVersion}\n---\n`);
+  let next: string;
+  if (m) {
+    next = frontmatter.replace(/^version:\s*\d+\s*$/m, `version: ${newVersion}`); // has a version line
+  } else if (/\n---\n$/.test(frontmatter)) {
+    next = frontmatter.replace(/\n---\n$/, `\nversion: ${newVersion}\n---\n`);     // frontmatter, no version
+  } else {
+    next = `---\nversion: ${newVersion}\n---\n`;                                   // no frontmatter → create one
+  }
   return { frontmatter: next, oldVersion, newVersion };
 }
 
diff --git a/tests/shared/skill-publisher.test.ts b/tests/shared/skill-publisher.test.ts
index 065b5929..62916504 100644
--- a/tests/shared/skill-publisher.test.ts
+++ b/tests/shared/skill-publisher.test.ts
@@ -27,6 +27,11 @@ describe("bumpVersion", () => {
     expect(r).toMatchObject({ oldVersion: 1, newVersion: 2 });
     expect(r.frontmatter).toMatch(/version: 2\n---\n$/);
   });
+  it("creates a frontmatter block when the doc has none", () => {
+    const r = bumpVersion("");
+    expect(r).toMatchObject({ oldVersion: 1, newVersion: 2 });
+    expect(r.frontmatter).toBe("---\nversion: 2\n---\n");
+  });
 });
 
 describe("publishSkillEdit", () => {

From 3335238b1146b9c28edb18584c7d52a684cb8681 Mon Sep 17 00:00:00 2001
From: kaghni <kamo.aghbalyan@activeloop.ai>
Date: Fri, 5 Jun 2026 19:12:12 +0000
Subject: [PATCH 27/30] review: resolve the claude binary via findAgentBin, not
 PATH (codex P2)

A detached hook worker may not have claude on PATH (e.g. ~/.claude/local/claude);
spawn("claude") would ENOENT and the callers swallow it as no-change, so the weekly
worker silently produced no proposals in those layouts. Resolve the binary the same
way the rest of skillify does (findAgentBin('claude_code')).
---
 src/skillify/claude-model.ts | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/skillify/claude-model.ts b/src/skillify/claude-model.ts
index f0f7641b..eba80ffc 100644
--- a/src/skillify/claude-model.ts
+++ b/src/skillify/claude-model.ts
@@ -5,6 +5,7 @@
  * unit-testable with zero real calls.
  */
 import { spawn } from "node:child_process";
+import { findAgentBin } from "./gate-runner.js";
 
 /** (systemPrompt, userPrompt) -> raw model text. */
 export type ModelCall = (systemPrompt: string, userPrompt: string) => Promise<string>;
@@ -25,7 +26,10 @@ export function claudeModel(model: string, opts: { timeoutMs?: number } = {}): M
     // hook entirely (no Deeplake-context injection into the prompt, no auto-pull/graph
     // work) — one child per anchored invocation would otherwise contaminate the judge
     // prompt and pile up background work. Same guard the other internal runners use.
-    const child = spawn("claude", args, {
+    // Resolve the claude binary the same way the rest of skillify does — a detached
+    // hook worker may not have it on PATH (e.g. ~/.claude/local/claude), and a bare
+    // "claude" would ENOENT and the callers would swallow it as no-change.
+    const child = spawn(findAgentBin("claude_code"), args, {
       stdio: ["ignore", "pipe", "pipe"],
       env: { ...process.env, HIVEMIND_CAPTURE: "false", HIVEMIND_WIKI_WORKER: "1" },
     });

From f8427c0669df9cbd2fc8e091b1a642850733bf9b Mon Sep 17 00:00:00 2001
From: kaghni <kamo.aghbalyan@activeloop.ai>
Date: Fri, 5 Jun 2026 19:12:12 +0000
Subject: [PATCH 28/30] review: don't burn the weekly throttle when logged out
 (codex P2)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

runWeeklySkillOpt stamped lastRun before the worker checks creds, so firing at a
logged-out SessionStart consumed the week — a user who logged in minutes later was
skipped as throttled until next week. Added a canFire gate (default: creds present)
that returns reason 'no-creds' before any stamp/spawn. Test covers it.
---
 src/skillify/skillopt-trigger.ts      | 13 ++++++++++++-
 tests/shared/skillopt-trigger.test.ts |  8 ++++++++
 2 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/src/skillify/skillopt-trigger.ts b/src/skillify/skillopt-trigger.ts
index f96a757a..81504d31 100644
--- a/src/skillify/skillopt-trigger.ts
+++ b/src/skillify/skillopt-trigger.ts
@@ -16,9 +16,15 @@ import { fileURLToPath } from "node:url";
 import { log as _log } from "../utils/debug.js";
 import { getStateDir } from "./state-dir.js";
 import { tryAcquireWorkerLock, releaseWorkerLock } from "./state.js";
+import { loadCredentials } from "../commands/auth.js";
 
 const log = (m: string) => _log("skillopt-trigger", m);
 
+/** The worker needs creds to query Deeplake; default fire-gate skips when logged out. */
+function defaultHasCreds(): boolean {
+  try { return Boolean(loadCredentials()?.token); } catch { return false; }
+}
+
 export const WEEK_MS = 7 * 24 * 60 * 60 * 1000;
 /** Cross-process lock key arbitrating the weekly fire (see runWeeklySkillOpt). */
 const LOCK_KEY = "skillopt-weekly";
@@ -81,11 +87,12 @@ export interface FireDeps {
   tryLock?: () => boolean;            // cross-process arbiter; default: real worker lock
   releaseLock?: () => void;          // default: release the real worker lock
   reload?: () => SkillOptState;      // fresh state re-read INSIDE the lock; default: loadState
+  canFire?: () => boolean;           // gate before stamping; default: creds present
 }
 
 export interface FireResult {
   fired: boolean;
-  reason?: "disabled" | "in-worker" | "throttled" | "locked" | "spawned";
+  reason?: "disabled" | "in-worker" | "throttled" | "locked" | "no-creds" | "spawned";
 }
 
 /**
@@ -102,6 +109,10 @@ export function runWeeklySkillOpt(deps: FireDeps = {}): FireResult {
   // Cheap pre-lock check: skip the lock entirely when clearly throttled.
   if (!shouldFire(state.lastRun, now)) return { fired: false, reason: "throttled" };
 
+  // Don't burn the weekly throttle when logged out — stamping lastRun before the
+  // worker confirms creds would skip a user who logs in shortly after until next week.
+  if (!(deps.canFire ?? defaultHasCreds)()) return { fired: false, reason: "no-creds" };
+
   // Cross-process arbiter: two SessionStart hooks racing at the weekly boundary
   // could both pass the throttle and spawn duplicate workers (doubling user-side
   // cost once the worker does real LLM work). An atomic openSync(wx) worker-lock
diff --git a/tests/shared/skillopt-trigger.test.ts b/tests/shared/skillopt-trigger.test.ts
index 7049e5da..ff3c3751 100644
--- a/tests/shared/skillopt-trigger.test.ts
+++ b/tests/shared/skillopt-trigger.test.ts
@@ -34,6 +34,7 @@ describe("runWeeklySkillOpt (auto-fire decision)", () => {
       tryLock: () => true,          // injected so the unit test touches no real lock file
       releaseLock: release,
       reload: () => over.state ?? {}, // in-lock re-read mirrors the pre-lock state by default
+      canFire: () => true,           // injected so the unit test doesn't read real credentials
       ...over,
     });
     return { res, saved, spawn, release };
@@ -47,6 +48,13 @@ describe("runWeeklySkillOpt (auto-fire decision)", () => {
     expect(release).toHaveBeenCalledTimes(1); // lock released after firing
   });
 
+  it("does NOT fire or stamp when logged out (preserves the throttle for a fresh login)", () => {
+    const { res, saved, spawn } = harness({ state: {}, canFire: () => false });
+    expect(res).toEqual({ fired: false, reason: "no-creds" });
+    expect(spawn).not.toHaveBeenCalled();
+    expect(saved).toEqual([]); // no stamp → next session after login fires
+  });
+
   it("does NOT fire when another process holds the weekly lock (cross-process race)", () => {
     const { res, saved, spawn } = harness({ state: {}, tryLock: () => false });
     expect(res).toEqual({ fired: false, reason: "locked" });

From ae278af5ea4d5476ee689093bfe704b97335b57a Mon Sep 17 00:00:00 2001
From: kaghni <kamo.aghbalyan@activeloop.ai>
Date: Fri, 5 Jun 2026 19:24:52 +0000
Subject: [PATCH 29/30] review: strict MCP config on model calls + match
 worker's config gate (codex P2)

Two follow-ons from round 6:
- claude-model: --tools "" denies tool USE but claude -p still LOADS the user's MCP
  config; a broken/oversized MCP schema fails every judge/proposer call before JSON.
  Added --strict-mcp-config to ignore user MCP entirely.
- trigger: the canFire gate used the credentials FILE, but the worker uses loadConfig()
  (which also accepts HIVEMIND_TOKEN/ORG_ID env creds). Mismatch skipped env-cred users
  forever / stamped on a token-only malformed file. Now both use loadConfig()?.token.
---
 src/skillify/claude-model.ts     |  4 ++++
 src/skillify/skillopt-trigger.ts | 10 +++++++---
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/src/skillify/claude-model.ts b/src/skillify/claude-model.ts
index eba80ffc..34ea08e6 100644
--- a/src/skillify/claude-model.ts
+++ b/src/skillify/claude-model.ts
@@ -20,6 +20,10 @@ export function claudeModel(model: string, opts: { timeoutMs?: number } = {}): M
       // any MCP/configured tools (a deny-list can't enumerate those), so prompt-injected
       // transcript text in the judge/proposer prompt can never trigger tool use.
       "--tools", "",
+      // --strict-mcp-config ignores the user's MCP config entirely (--tools only denies
+      // USE, not LOADING) — a broken/oversized user MCP schema would otherwise fail every
+      // judge/proposer call before it returns JSON, silently stopping proposals.
+      "--strict-mcp-config",
     ];
     // HIVEMIND_CAPTURE=false so these calls aren't captured as real sessions, AND
     // HIVEMIND_WIKI_WORKER=1 so the spawned claude -p skips this package's SessionStart
diff --git a/src/skillify/skillopt-trigger.ts b/src/skillify/skillopt-trigger.ts
index 81504d31..cc9e7d9e 100644
--- a/src/skillify/skillopt-trigger.ts
+++ b/src/skillify/skillopt-trigger.ts
@@ -16,13 +16,17 @@ import { fileURLToPath } from "node:url";
 import { log as _log } from "../utils/debug.js";
 import { getStateDir } from "./state-dir.js";
 import { tryAcquireWorkerLock, releaseWorkerLock } from "./state.js";
-import { loadCredentials } from "../commands/auth.js";
+import { loadConfig } from "../config.js";
 
 const log = (m: string) => _log("skillopt-trigger", m);
 
-/** The worker needs creds to query Deeplake; default fire-gate skips when logged out. */
+/**
+ * Fire-gate: the worker queries Deeplake via loadConfig(), which accepts both the
+ * credentials file AND env creds (HIVEMIND_TOKEN/HIVEMIND_ORG_ID). Use the SAME check
+ * here so we neither skip env-cred users forever nor stamp on a malformed token-only file.
+ */
 function defaultHasCreds(): boolean {
-  try { return Boolean(loadCredentials()?.token); } catch { return false; }
+  try { return Boolean(loadConfig()?.token); } catch { return false; }
 }
 
 export const WEEK_MS = 7 * 24 * 60 * 60 * 1000;

From 06ca4cb54f229f8added68f03501ac87d222fff8 Mon Sep 17 00:00:00 2001
From: kaghni <kamo.aghbalyan@activeloop.ai>
Date: Fri, 5 Jun 2026 20:24:10 +0000
Subject: [PATCH 30/30] feat(skillopt): env-configurable worker thresholds
 (defaults unchanged)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The weekly worker's lookback + detector thresholds are now overridable via env —
HIVEMIND_SKILLOPT_{LOOKBACK_DAYS,MIN_INVOCATIONS,FAILURE_RATE,FIRE_THRESHOLD} — with
the same defaults (30d, detector min-n, 0.4 rate, >=5 fire). A positive override
wins; non-numeric/<=0 falls back. Useful for tuning + smaller-data testing.
---
 src/skillify/skillopt-worker.ts | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/src/skillify/skillopt-worker.ts b/src/skillify/skillopt-worker.ts
index ffaa1284..d9f1ecd7 100644
--- a/src/skillify/skillopt-worker.ts
+++ b/src/skillify/skillopt-worker.ts
@@ -37,7 +37,12 @@ async function main(): Promise<void> {
   const proposalsRoot = path.join(getStateDir(), "skillopt", "proposals");
   const metaFile = path.join(getStateDir(), "skillopt", "meta.jsonl");
   const metaCache = loadMeta(metaFile);
-  const sinceIso = new Date(Date.now() - 30 * 24 * 60 * 60 * 1000).toISOString(); // 30-day lookback
+  // Lookback + thresholds are env-tunable (defaults: 30-day window, the detector's
+  // own min-n, and a ≥5-deficient fire gate). A positive override wins; anything
+  // non-numeric/≤0 falls back to the default.
+  const envNum = (k: string): number | undefined => { const n = Number(process.env[k]); return Number.isFinite(n) && n > 0 ? n : undefined; };
+  const lookbackDays = envNum("HIVEMIND_SKILLOPT_LOOKBACK_DAYS") ?? 30;
+  const sinceIso = new Date(Date.now() - lookbackDays * 24 * 60 * 60 * 1000).toISOString();
 
   const res = await runSkillOptCycle({
     query,
@@ -49,7 +54,12 @@ async function main(): Promise<void> {
       has: (n, a, edits) => alreadyProposed(metaCache, n, a, edits),
       record: (n, a, edits) => { const e = metaEntryFor(n, a, edits, new Date().toISOString()); appendMeta(metaFile, e); metaCache.push(e); },
     },
-    detector: { sinceIso, limit: 5000 },
+    detector: {
+      sinceIso, limit: 5000,
+      minInvocations: envNum("HIVEMIND_SKILLOPT_MIN_INVOCATIONS"),
+      failureRateThreshold: envNum("HIVEMIND_SKILLOPT_FAILURE_RATE"),
+    },
+    fireThreshold: envNum("HIVEMIND_SKILLOPT_FIRE_THRESHOLD"),
     now: new Date().toISOString(),
   });