In [1]:
# CNT Engine v0 — Self-referential, self-updating, hidden-truth search
# Minimal, offline-friendly. Save as cnt_engine_v0.py or run in a single Jupyter cell.

import os, json, time, uuid, glob, re, hashlib
from datetime import datetime, timezone
from pathlib import Path

import numpy as np
import pandas as pd
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# ---------- Config ----------
BASE = Path(os.getenv("CNT_LAB_DIR", r"C:\Users\caleb\CNT_Lab"))
ROOT = BASE / "artifacts" / "cnt_engine_v0"
SRC  = BASE / "notes"   # put .md/.txt here; add data/* as desired
ROOT.mkdir(parents=True, exist_ok=True)
(SRC).mkdir(parents=True, exist_ok=True)

STATE = ROOT / "CNT_STATE.yaml"
LOG   = ROOT / "runlog.jsonl"
OUT   = ROOT / "out"
OUT.mkdir(exist_ok=True)

def now(): return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

# ---------- IO ----------
def read_corpus(paths):
    docs = []
    for p in paths:
        try:
            text = Path(p).read_text(encoding="utf-8", errors="ignore")
            docs.append(dict(path=str(p), text=text))
        except Exception as e:
            pass
    return pd.DataFrame(docs)

def list_sources():
    files = []
    files += glob.glob(str(SRC / "**" / "*.md"), recursive=True)
    files += glob.glob(str(SRC / "**" / "*.txt"), recursive=True)
    return [Path(f) for f in files]

# ---------- Build Index ----------
def build_index(df):
    if df.empty: 
        return None, None, None
    vec = TfidfVectorizer(max_features=6000, ngram_range=(1,2))
    X = vec.fit_transform(df["text"]).astype(np.float32)
    # PCA residual = “hidden truth” heuristic: what’s not explained by top components
    k = min(128, X.shape[1]-1) if X.shape[1] > 1 else 1
    pca = PCA(n_components=max(2, min(50, k)))
    Xd = pca.fit_transform(X.toarray())
    Xr = pca.inverse_transform(Xd)
    resid = np.linalg.norm(X.toarray() - Xr, axis=1)
    return vec, X, resid

# ---------- Clusters & Motifs ----------
def cluster_labels(X, k=6):
    if X is None: return None
    k = min(k, max(2, X.shape[0]//3))
    km = KMeans(n_clusters=k, n_init=5, random_state=42)
    lbl = km.fit_predict(X.toarray())
    return lbl

def build_graph(df, lbl):
    G = nx.Graph()
    for i, row in df.reset_index().iterrows():
        G.add_node(i, path=row["path"], title=os.path.basename(row["path"]), cluster=int(lbl[i]) if lbl is not None else -1)
    # light linkage: same cluster or lexical overlap
    titles = df["text"].str.extractall(r"\b([A-Z][A-Za-z0-9_]{3,})\b").groupby(level=0).agg(lambda s: set(s[0].tolist()))
    for i in range(len(df)):
        for j in range(i+1, len(df)):
            same = (lbl[i]==lbl[j]) if lbl is not None else False
            overlap = 0
            if i in titles.index and j in titles.index:
                overlap = len(titles.loc[i] & titles.loc[j])
            if same or overlap>=2:
                G.add_edge(i, j, w=(1 + overlap))
    return G

# ---------- Reflexive Scorecard ----------
def score_reflexive(df, resid, G):
    if df is None or len(df)==0: 
        return dict(clarity=0, novelty=0, coherence=0, falsifiability=0, total=0)
    # heuristics (upgrade later with your metrics)
    clarity = 1.0 - np.mean([len(t)//5000 for t in df["text"].tolist()])  # penalize giant rambles
    novelty = float(np.mean(resid)/ (np.std(resid)+1e-6))
    coherence = nx.average_clustering(G) if G.number_of_nodes()>1 else 0.0
    falsifiability = float(np.mean([t.lower().count("test") + t.lower().count("predict") for t in df["text"]]))/10.0
    # normalize rough ranges
    clarity = np.clip(clarity, 0, 1)
    novelty = np.clip(novelty, 0, 1.5)/1.5
    coherence = np.clip(coherence, 0, 1)
    falsifiability = np.clip(falsifiability, 0, 1)
    total = float(np.mean([clarity, novelty, coherence, falsifiability]))
    return dict(clarity=float(clarity), novelty=float(novelty), coherence=float(coherence), falsifiability=float(falsifiability), total=float(total))

# ---------- Candidate Truths (anomaly surfacing) ----------
def surface_candidates(df, resid, top=5):
    idx = np.argsort(resid)[::-1][:min(top, len(resid))]
    picks = []
    for i in idx:
        snippet = re.sub(r"\s+", " ", df.iloc[i]["text"])[:400]
        picks.append(dict(path=df.iloc[i]["path"], resid=float(resid[i]), hint=snippet))
    return picks

# ---------- Update Proposals ----------
def propose_updates(cands):
    proposals = []
    for c in cands:
        # tiny structured gloss to tighten the idea
        proposals.append(dict(
            target=c["path"],
            action="append_gloss",
            content=f"\n\n> CNT-Gloss ({now()}): Clarify hypothesis; add test recipe & falsifier.\n- Hypothesis: …\n- Measurement: …\n- Expected shift: …\n- Falsifier: …\n"
        ))
    return proposals

# ---------- Gates ----------
def legality_gate(proposal):
    text = proposal["content"].lower()
    # no sensitive personal data; no medical/financial claims; no instructions for harm
    banned = any(k in text for k in ["ssn", "credit card", "weapon", "harm"])
    return not banned

def confab_gate(proposal):
    # require explicit placeholders for evidence and falsifier
    ok = ("hypothesis" in proposal["content"].lower() and "falsifier" in proposal["content"].lower())
    return ok

# ---------- Apply Updates ----------
def apply_updates(updates):
    accepted = []
    for u in updates:
        if not (legality_gate(u) and confab_gate(u)): 
            continue
        try:
            p = Path(u["target"])
            original = p.read_text(encoding="utf-8", errors="ignore")
            p.write_text(original + u["content"], encoding="utf-8")
            accepted.append(u)
        except Exception:
            pass
    return accepted

# ---------- State ----------
def write_state(score, meta):
    import yaml
    state = dict(updated=now(), score=score, meta=meta)
    STATE.write_text(yaml.safe_dump(state, sort_keys=False), encoding="utf-8")
    return state

def log_event(kind, payload):
    LOG.parent.mkdir(parents=True, exist_ok=True)
    with LOG.open("a", encoding="utf-8") as f:
        f.write(json.dumps(dict(ts=now(), kind=kind, **payload)) + "\n")

# ---------- One Cycle ----------
def run_cycle():
    paths = list_sources()
    df = read_corpus(paths)
    vec, X, resid = build_index(df)
    if vec is None:
        log_event("empty", {})
        return {"empty": True}
    labels = cluster_labels(X, k=6)
    G = build_graph(df, labels)
    score = score_reflexive(df, resid, G)
    cands = surface_candidates(df, resid, top=5)
    proposals = propose_updates(cands)
    accepted = apply_updates(proposals)
    state = write_state(score, dict(docs=len(df), accepted=len(accepted)))
    # export quick views
    pd.DataFrame(cands).to_csv(OUT / f"hidden_truths_{datetime.now().strftime('%Y%m%d-%H%M%S')}.csv", index=False)
    log_event("cycle", dict(score=score, proposed=len(proposals), accepted=len(accepted)))
    return dict(score=score, proposed=len(proposals), accepted=len(accepted), hidden=cands[:3])

if __name__ == "__main__":
    res = run_cycle()
    print(json.dumps(res, indent=2))


{
  "empty": true
}


In [2]:
# CNT Engine v0 — rooted to C:\Users\caleb\CNT_Lab
# Save as: C:\Users\caleb\CNT_Lab\cnt_engine_v0.py  (or run as a single Jupyter cell)

import os, sys, json, time, glob, re
from datetime import datetime, timezone
from pathlib import Path

import numpy as np
import pandas as pd
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# ---------- Config ----------
LAB_ROOT = Path(os.getenv("CNT_LAB_DIR", r"C:\Users\caleb\CNT_Lab")).resolve()

# Add/adjust any subfolders you actually use for writing text
SOURCE_ROOTS = [
    LAB_ROOT / "notes",
    LAB_ROOT / "artifacts" / "cnt_scroll",
    LAB_ROOT / "artifacts" / "cnt_codex",
    LAB_ROOT / "notebooks",
]

ROOT = LAB_ROOT / "artifacts" / "cnt_engine_v0"
OUT  = ROOT / "out"
STATE= ROOT / "CNT_STATE.yaml"
LOG  = ROOT / "runlog.jsonl"
ROOT.mkdir(parents=True, exist_ok=True); OUT.mkdir(exist_ok=True)

def now(): return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

# ---------- Source discovery ----------
def list_sources():
    files = []
    for root in SOURCE_ROOTS:
        root.mkdir(parents=True, exist_ok=True)
        files += glob.glob(str(root / "**" / "*.md"), recursive=True)
        files += glob.glob(str(root / "**" / "*.txt"), recursive=True)
    return [Path(f) for f in files]

def auto_seed_if_empty():
    paths = list_sources()
    if paths: 
        return
    seed_dir = LAB_ROOT / "notes"
    seed_dir.mkdir(parents=True, exist_ok=True)
    seed = seed_dir / "cnt_seed.md"
    seed.write_text(
        "# CNT Seed\n\n"
        "Hypothesis: Certain glyph–field pairings lower entropy drift.\n"
        "Test: Re-run θ metrics on EEG segments with glyph overlay vs baseline.\n"
        "Expected: Δθ > 0 with CI > 95% for overlay.\n"
        "Falsifier: No uplift or negative drift after 1k permutations.\n",
        encoding="utf-8"
    )

# ---------- IO ----------
def read_corpus(paths):
    rows = []
    for p in paths:
        try:
            rows.append(dict(path=str(p), text=Path(p).read_text(encoding="utf-8", errors="ignore")))
        except Exception:
            pass
    return pd.DataFrame(rows)

# ---------- Index & residuals ----------
def build_index(df):
    if df.empty: return None, None, None
    vec = TfidfVectorizer(max_features=6000, ngram_range=(1,2))
    X = vec.fit_transform(df["text"]).astype(np.float32)
    ncomp = max(2, min(50, X.shape[1]-1)) if X.shape[1]>1 else 2
    pca = PCA(n_components=ncomp)
    Xd = pca.fit_transform(X.toarray())
    Xr = pca.inverse_transform(Xd)
    resid = np.linalg.norm(X.toarray() - Xr, axis=1)
    return vec, X, resid

def cluster_labels(X, k=6):
    if X is None: return None
    k = min(max(2, X.shape[0]//3), max(6, min(12, X.shape[0]-1)))
    km = KMeans(n_clusters=k, n_init=5, random_state=42)
    return km.fit_predict(X.toarray())

def build_graph(df, lbl):
    G = nx.Graph()
    for i, row in df.reset_index().iterrows():
        G.add_node(i, path=row["path"], title=os.path.basename(row["path"]), cluster=int(lbl[i]) if lbl is not None else -1)
    # Light lexical linkage
    caps = df["text"].str.extractall(r"\b([A-Z][A-Za-z0-9_]{3,})\b").groupby(level=0).agg(lambda s: set(s[0].tolist()))
    for i in range(len(df)):
        for j in range(i+1, len(df)):
            same = (lbl[i]==lbl[j]) if lbl is not None else False
            overlap = 0
            if i in caps.index and j in caps.index:
                overlap = len(caps.loc[i] & caps.loc[j])
            if same or overlap>=2:
                G.add_edge(i, j, w=(1+overlap))
    return G

def score_reflexive(df, resid, G):
    if df is None or len(df)==0:
        return dict(clarity=0, novelty=0, coherence=0, falsifiability=0, total=0)
    clarity = 1.0 - np.mean([min(len(t),10000)/10000 for t in df["text"].tolist()])
    novelty = float(np.mean(resid)/(np.std(resid)+1e-6))
    coherence = (nx.average_clustering(G) if G.number_of_nodes()>1 else 0.0)
    falsifiability = float(np.mean([t.lower().count("test")+t.lower().count("predict") for t in df["text"]]))/10.0
    clarity = np.clip(clarity, 0, 1); novelty = np.clip(novelty/1.5, 0, 1); coherence = np.clip(coherence, 0, 1); falsifiability = np.clip(falsifiability, 0, 1)
    total = float(np.mean([clarity, novelty, coherence, falsifiability]))
    return dict(clarity=float(clarity), novelty=float(novelty), coherence=float(coherence), falsifiability=float(falsifiability), total=float(total))

def surface_candidates(df, resid, top=5):
    idx = np.argsort(resid)[::-1][:min(top, len(resid))]
    picks = []
    for i in idx:
        snippet = re.sub(r"\s+", " ", df.iloc[i]["text"])[:400]
        picks.append(dict(path=df.iloc[i]["path"], resid=float(resid[i]), hint=snippet))
    return picks

def propose_updates(cands):
    return [
        dict(
            target=c["path"],
            action="append_gloss",
            content=(
                f"\n\n> CNT-Gloss ({now()}): Clarify hypothesis; add test recipe & falsifier.\n"
                f"- Hypothesis: …\n- Measurement: …\n- Expected shift: …\n- Falsifier: …\n"
            ),
        )
        for c in cands
    ]

def legality_gate(text: str):
    bad = any(k in text for k in ["ssn","credit card","weapon","harm"])
    return not bad

def confab_gate(text: str):
    t = text.lower()
    return ("hypothesis" in t and "falsifier" in t)

def apply_updates(updates):
    accepted = []
    for u in updates:
        try:
            content = u["content"]
            if not (legality_gate(content) and confab_gate(content)): 
                continue
            p = Path(u["target"])
            original = p.read_text(encoding="utf-8", errors="ignore")
            p.write_text(original + content, encoding="utf-8")
            accepted.append(u)
        except Exception:
            pass
    return accepted

def write_state(score, meta):
    try:
        import yaml
        STATE.write_text(yaml.safe_dump(dict(updated=now(), score=score, meta=meta), sort_keys=False), encoding="utf-8")
    except Exception:
        STATE.write_text(json.dumps(dict(updated=now(), score=score, meta=meta), indent=2), encoding="utf-8")

def log_event(kind, payload):
    LOG.parent.mkdir(parents=True, exist_ok=True)
    with LOG.open("a", encoding="utf-8") as f:
        f.write(json.dumps(dict(ts=now(), kind=kind, **payload)) + "\n")

def run_cycle():
    auto_seed_if_empty()
    paths = list_sources()
    df = read_corpus(paths)
    if df.empty:
        log_event("empty", {"roots":[str(p) for p in SOURCE_ROOTS]})
        return {"empty": True, "roots":[str(p) for p in SOURCE_ROOTS]}
    vec, X, resid = build_index(df)
    labels = cluster_labels(X)
    G = build_graph(df, labels)
    score = score_reflexive(df, resid, G)
    cands = surface_candidates(df, resid, top=5)
    accepted = apply_updates(propose_updates(cands))
    write_state(score, dict(docs=len(df), accepted=len(accepted)))
    pd.DataFrame(cands).to_csv(OUT / f"hidden_truths_{datetime.now().strftime('%Y%m%d-%H%M%S')}.csv", index=False)
    log_event("cycle", dict(score=score, proposed=len(cands), accepted=len(accepted)))
    return dict(score=score, proposed=len(cands), accepted=len(accepted), hidden=cands[:3])

if __name__ == "__main__":
    res = run_cycle()
    print(json.dumps(res, indent=2))


ValueError: n_components=50 must be between 0 and min(n_samples, n_features)=41 with svd_solver='full'

In [3]:
def build_index(df):
    if df.empty:
        return None, None, None
    vec = TfidfVectorizer(max_features=6000, ngram_range=(1,2))
    X = vec.fit_transform(df["text"]).astype(np.float32)

    n_samples, n_features = X.shape
    # choose a safe SVD rank within bounds
    nmax = max(2, min(n_samples - 1, n_features - 1, 128))
    from sklearn.decomposition import TruncatedSVD
    svd = TruncatedSVD(n_components=nmax, random_state=42)
    Xd = svd.fit_transform(X)                # stays sparse-friendly
    Xr = svd.inverse_transform(Xd)           # available on TruncatedSVD
    resid = np.linalg.norm(X.toarray() - Xr, axis=1)
    return vec, X, resid


In [4]:
# ============================================================
# CNT Engine — Self-Referential / Self-Updating / Hidden-Truth Search
# Rooted at:  C:\Users\caleb\CNT_Lab
# ============================================================

import os, json, glob, re, time
from datetime import datetime, timezone
from pathlib import Path

import numpy as np
import pandas as pd
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans

# ---------- CONFIG ----------
LAB_ROOT = Path(os.getenv("CNT_LAB_DIR", r"C:\Users\caleb\CNT_Lab")).resolve()
SOURCE_ROOTS = [
    LAB_ROOT / "notes",
    LAB_ROOT / "artifacts" / "cnt_scroll",
    LAB_ROOT / "artifacts" / "cnt_codex",
    LAB_ROOT / "notebooks",
]
ROOT  = LAB_ROOT / "artifacts" / "cnt_engine_megacell"
OUT   = ROOT / "out"
STATE = ROOT / "CNT_STATE.yaml"
LOG   = ROOT / "runlog.jsonl"
for p in [ROOT, OUT, *SOURCE_ROOTS]:
    p.mkdir(parents=True, exist_ok=True)

def now(): return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

# ---------- DISCOVER / SEED ----------
def list_sources():
    files = []
    for root in SOURCE_ROOTS:
        files += glob.glob(str(root / "**" / "*.md"), recursive=True)
        files += glob.glob(str(root / "**" / "*.txt"), recursive=True)
    return [Path(f) for f in files]

def auto_seed_if_empty():
    paths = list_sources()
    if paths: return
    seed = SOURCE_ROOTS[0] / "cnt_seed.md"
    seed.write_text(
        "# CNT Seed\n\n"
        "Hypothesis: Certain glyph–field pairings lower entropy drift.\n"
        "Test: Re-run θ metrics on EEG segments with glyph overlay vs baseline.\n"
        "Expected: Δθ > 0 with CI > 95% for overlay.\n"
        "Falsifier: No uplift or negative drift after 1 k permutations.\n",
        encoding="utf-8"
    )

# ---------- INDEX & RESIDUALS ----------
def build_index(df):
    if df.empty: return None, None, None
    vec = TfidfVectorizer(max_features=6000, ngram_range=(1,2))
    X = vec.fit_transform(df["text"]).astype(np.float32)
    n_samples, n_features = X.shape
    nmax = max(2, min(n_samples - 1, n_features - 1, 128))
    svd = TruncatedSVD(n_components=nmax, random_state=42)
    Xd = svd.fit_transform(X)
    Xr = svd.inverse_transform(Xd)
    resid = np.linalg.norm(X.toarray() - Xr, axis=1)
    return vec, X, resid

# ---------- CLUSTERS & GRAPH ----------
def cluster_labels(X, k=6):
    if X is None: return None
    k = max(2, min(k, X.shape[0]-1))
    km = KMeans(n_clusters=k, n_init=5, random_state=42)
    return km.fit_predict(X.toarray())

def build_graph(df, lbl):
    G = nx.Graph()
    for i, row in df.reset_index().iterrows():
        G.add_node(i, path=row["path"], cluster=int(lbl[i]) if lbl is not None else -1)
    caps = df["text"].str.extractall(r"\b([A-Z][A-Za-z0-9_]{3,})\b").groupby(level=0).agg(lambda s: set(s[0].tolist()))
    for i in range(len(df)):
        for j in range(i+1, len(df)):
            same = (lbl[i]==lbl[j]) if lbl is not None else False
            overlap = 0
            if i in caps.index and j in caps.index:
                overlap = len(caps.loc[i] & caps.loc[j])
            if same or overlap>=2:
                G.add_edge(i, j, w=(1+overlap))
    return G

# ---------- SCORES ----------
def score_reflexive(df, resid, G):
    if df is None or len(df)==0:
        return dict(clarity=0, novelty=0, coherence=0, falsifiability=0, total=0)
    clarity = 1.0 - np.mean([min(len(t),10000)/10000 for t in df["text"].tolist()])
    novelty = float(np.mean(resid)/(np.std(resid)+1e-6))
    coherence = nx.average_clustering(G) if G.number_of_nodes()>1 else 0.0
    falsifiability = float(np.mean([t.lower().count("test")+t.lower().count("predict") for t in df["text"]]))/10.0
    clarity = np.clip(clarity,0,1); novelty = np.clip(novelty/1.5,0,1)
    coherence = np.clip(coherence,0,1); falsifiability = np.clip(falsifiability,0,1)
    total = float(np.mean([clarity,novelty,coherence,falsifiability]))
    return dict(clarity=float(clarity), novelty=float(novelty),
                coherence=float(coherence), falsifiability=float(falsifiability),
                total=float(total))

# ---------- SURFACING / UPDATES ----------
def surface_candidates(df, resid, top=5):
    idx = np.argsort(resid)[::-1][:min(top, len(resid))]
    picks=[]
    for i in idx:
        snippet=re.sub(r"\s+"," ",df.iloc[i]["text"])[:400]
        picks.append(dict(path=df.iloc[i]["path"],resid=float(resid[i]),hint=snippet))
    return picks

def propose_updates(cands):
    return [dict(
        target=c["path"],
        content=(
            f"\n\n> CNT-Gloss ({now()}): Clarify hypothesis; add test recipe & falsifier.\n"
            "- Hypothesis: …\n- Measurement: …\n- Expected shift: …\n- Falsifier: …\n"
        )
    ) for c in cands]

def legality_gate(text): 
    bad=any(k in text for k in["ssn","credit card","weapon","harm"])
    return not bad

def confab_gate(text):
    t=text.lower(); return ("hypothesis" in t and "falsifier" in t)

def apply_updates(updates):
    accepted=[]
    for u in updates:
        try:
            if not (legality_gate(u["content"]) and confab_gate(u["content"])): 
                continue
            p=Path(u["target"])
            p.write_text(p.read_text(encoding="utf-8",errors="ignore")+u["content"],encoding="utf-8")
            accepted.append(u)
        except Exception: pass
    return accepted

# ---------- STATE / LOG ----------
def write_state(score,meta):
    try:
        import yaml
        STATE.write_text(yaml.safe_dump(dict(updated=now(),score=score,meta=meta),sort_keys=False),encoding="utf-8")
    except Exception:
        STATE.write_text(json.dumps(dict(updated=now(),score=score,meta=meta),indent=2),encoding="utf-8")

def log_event(kind,payload):
    with LOG.open("a",encoding="utf-8") as f:
        f.write(json.dumps(dict(ts=now(),kind=kind,**payload))+"\n")

# ---------- MAIN CYCLE ----------
def run_cycle():
    auto_seed_if_empty()
    paths=list_sources()
    df=pd.DataFrame([dict(path=str(p),text=p.read_text(encoding="utf-8",errors="ignore")) for p in paths])
    if df.empty:
        log_event("empty",{"roots":[str(r) for r in SOURCE_ROOTS]})
        return {"empty":True,"roots":[str(r) for r in SOURCE_ROOTS]}
    vec,X,resid=build_index(df)
    labels=cluster_labels(X)
    G=build_graph(df,labels)
    score=score_reflexive(df,resid,G)
    cands=surface_candidates(df,resid,top=5)
    accepted=apply_updates(propose_updates(cands))
    write_state(score,dict(docs=len(df),accepted=len(accepted)))
    pd.DataFrame(cands).to_csv(OUT/f"hidden_truths_{datetime.now().strftime('%Y%m%d-%H%M%S')}.csv",index=False)
    log_event("cycle",dict(score=score,proposed=len(cands),accepted=len(accepted)))
    return dict(score=score,proposed=len(cands),accepted=len(accepted),hidden=cands[:3])

# ---------- EXECUTE ----------
if __name__=="__main__":
    res=run_cycle()
    print(json.dumps(res,indent=2))


KeyError: 0