# explore_bis_v3

Notebook simplifié montrant comment utiliser les blocs **ENC** et **MEM**
exposés par la librairie `hdc_project.encoder`.

In [1]:
from pathlib import Path
import sys

ROOT = Path.cwd().parent
SRC = ROOT / "src"
if str(SRC) not in sys.path:
    sys.path.insert(0, str(SRC))
print(f'Using src path: {SRC}')


Using src path: /Users/aymenmejri/Desktop/MyCode/experiments/hdc_v2/hdc_project/src


In [2]:
import numpy as np

from hdc_project.encoder import m4, pipeline as enc_pipeline
from hdc_project.encoder.mem import pipeline as mem_pipeline


In [None]:
# -- Utilitaires HDC partagés ----------------------------------------------
def sign_strict_pm1(x: np.ndarray) -> np.ndarray:
    y = (x >= 0).astype(np.int8, copy=False)
    return ((y << 1) - 1).astype(np.int8, copy=False)

def hd_assert_pm1(x: np.ndarray, D: int | None = None) -> None:
    assert isinstance(x, np.ndarray), "attendu np.ndarray"
    assert x.dtype == np.int8, "dtype attendu: int8"
    assert np.all((x == 1) | (x == -1)), "valeurs attendues: ±1"
    if D is not None:
        assert x.ndim == 1 and x.shape[0] == D, f"forme attendue: ({D},)"

def hd_bind(x: np.ndarray, key: np.ndarray) -> np.ndarray:
    return (x.astype(np.int8, copy=False) * key.astype(np.int8, copy=False)).astype(np.int8, copy=False)

def hd_sim(x: np.ndarray, y: np.ndarray) -> float:
    assert x.shape == y.shape
    return float((x.astype(np.int32) @ y.astype(np.int32)) / x.shape[0])

def build_perm_inverse(pi: np.ndarray) -> np.ndarray:
    assert pi.ndim == 1 and np.issubdtype(pi.dtype, np.integer)
    pi_inv = np.empty_like(pi)
    pi_inv[pi] = np.arange(pi.shape[0], dtype=pi.dtype)
    return pi_inv

def permute_pow_signed(x: np.ndarray, pi: np.ndarray, pi_inv: np.ndarray, k: int) -> np.ndarray:
    D = x.shape[0]
    if k == 0:
        return x
    idx = np.arange(D, dtype=np.int64)
    if k > 0:
        for _ in range(k % D):
            idx = pi[idx]
    else:
        for _ in range((-k) % D):
            idx = pi_inv[idx]
    return x[idx].astype(np.int8, copy=False)

def permute_pow(x: np.ndarray, pi: np.ndarray, power: int) -> np.ndarray:
    if power == 0:
        return x
    D = x.shape[0]
    idx = np.arange(D, dtype=np.int64)
    if power > 0:
        for _ in range(power % D):
            idx = pi[idx]
    else:
        pi_inv = build_perm_inverse(pi)
        for _ in range((-power) % D):
            idx = pi_inv[idx]
    return x[idx].astype(np.int8, copy=False)

def rademacher(D: int, rng: np.random.Generator) -> np.ndarray:
    return rng.choice(np.array([-1, 1], dtype=np.int8), size=D)




## Chargement du sous-corpus OPUS

On réutilise `opus_load_subset` depuis la librairie pour récupérer un petit
sous-échantillon bilingue (EN/FR). En environnement hors-ligne, un jeu de
repli est utilisé pour que le notebook reste exécutable.



In [None]:
import numpy as np
from tqdm import tqdm

from hdc_project.encoder import m4, pipeline as enc_pipeline
from hdc_project.encoder.mem import pipeline as mem_pipeline

# ----------------------------
# 0) Chargement données OPUS
# ----------------------------
try:
    ens_raw, frs_raw = enc_pipeline.opus_load_subset(
        name="opus_books",
        config="en-fr",
        split="train",
        N=10_000,
        seed=2025,
    )
    print(f"OPUS subset loaded: {len(ens_raw)} pairs")
except Exception as exc:
    print("Warning: OPUS download failed, falling back to local toy corpus.")
    print(f"Original error: {exc}")
    ens_raw = [
        "hyperdimensional computing is fun",
        "vector symbolic architectures are powerful",
        "encoding words into hyperspace",
        "memory augmented networks love clean data",
    ]
    frs_raw = [
        "le calcul hyperdimensionnel est amusant",
        "les architectures symboliques vectorielles sont puissantes",
        "encoder des mots dans l'hyperspace",
        "les réseaux augmentés de mémoire aiment les données propres",
    ]

enc_sample_size = min(10_000, len(ens_raw))
mem_sample_size = min(10_000, len(ens_raw))
ens_sample = ens_raw[:enc_sample_size]
frs_sample = frs_raw[:enc_sample_size]
print(f"ENC sample size: {enc_sample_size}")
print(f"MEM sample size: {mem_sample_size}")

# ----------------------------
# 1) Encodage ENC (M5–M7)
# ----------------------------
D = 8192
n = 5
rng = np.random.default_rng(123)

Lex_en = m4.M4_LexEN_new(seed=1, D=D)
Lex_fr = m4.M4_LexEN_new(seed=2, D=D)
pi = rng.permutation(D).astype(np.int64)

encoded_en = enc_pipeline.encode_corpus_ENC(ens_sample, Lex_en, pi, D, n, seg_seed0=999)
encoded_fr = enc_pipeline.encode_corpus_ENC(frs_sample, Lex_fr, pi, D, n, seg_seed0=1999)

E_list_en = [segment["E_seq"] for segment in encoded_en]
H_list_en = [segment["H"] for segment in encoded_en]
print(f"Encoded {len(encoded_en)} sentences; signature shape = {H_list_en[0].shape}")

# Quelques stats ENC
s_intra, s_inter = enc_pipeline.intra_inter_ngram_sims(E_list_en, D)
inter_seg = enc_pipeline.inter_segment_similarity(H_list_en)
maj_curves = enc_pipeline.majority_error_curve(E_list_en, pi, D, eta_list=(0.0, 0.05))
print(f"intra={s_intra:.4f}, inter(abs)={s_inter:.4f}, inter segments={inter_seg:.4f}")
print("majority curve (eta=0):", maj_curves[0.0][:2])

# -------------------------------------------------------------
# 2) Helpers de "contenu" (sans K_s) pour fabriquer les paires
#    -> on somme des X_t (déjà alignés par Pi^Δ), puis on seuillle
# -------------------------------------------------------------
def content_signature_from_Xseq(X_seq, majority: str = "strict"):
    if not X_seq:
        raise ValueError("X_seq vide")
    S = np.zeros((X_seq[0].shape[0],), dtype=np.int32)
    for x in X_seq:
        S += x.astype(np.int32, copy=False)
    if majority == "strict":
        return np.where(S >= 0, 1, -1).astype(np.int8, copy=False)
    elif majority == "unbiased":
        return np.where(S >= 0, 1, -1).astype(np.int8, copy=False)
    else:
        raise ValueError("majority must be 'strict' or 'unbiased'")

def span_signatures_from_trace(X_seq, win: int = 12, stride: int = 6, majority: str = "unbiased"):
    if not X_seq:
        return []
    T = len(X_seq)
    out = []
    if T <= win:
        out.append(content_signature_from_Xseq(X_seq, majority))
        return out
    for start in range(0, T - win + 1, max(1, stride)):
        stop = start + win
        out.append(content_signature_from_Xseq(X_seq[start:stop], majority))
    return out

def build_mem_pairs_from_encoded(encoded_en, encoded_fr, win=8, stride=4, majority="strict", max_pairs=None):
    pairs = []
    N = min(len(encoded_en), len(encoded_fr))
    for i in range(N):
        X_en = encoded_en[i]["X_seq"]
        X_fr = encoded_fr[i]["X_seq"]
        spans_en = span_signatures_from_trace(X_en, win=win, stride=stride, majority=majority)
        spans_fr = span_signatures_from_trace(X_fr, win=win, stride=stride, majority=majority)
        L = min(len(spans_en), len(spans_fr))
        for t in range(L):
            pairs.append((
                spans_en[t].astype(np.int8, copy=False),
                spans_fr[t].astype(np.int8, copy=False),
            ))
            if max_pairs is not None and len(pairs) >= max_pairs:
                return pairs
    return pairs

# -------------------------------------------------------------
# 3) Paires MEM = spans EN/FR (contenu, sans K_s)
# -------------------------------------------------------------
pairs_mem = build_mem_pairs_from_encoded(encoded_en, encoded_fr, win=8, stride=4, majority="strict")
print(f"Pairs available for MEM training: {len(pairs_mem)}")

# -------------------------------------------------------------
# 4) Instanciation MEM et entraînement one-pass
#    (k ≈ log2(B) + marge ; ici B=256, k=24 convient)
# -------------------------------------------------------------
MEM_K = 16
MEM_BUCKETS = 128
cfg = mem_pipeline.MemConfig(D=D, B=MEM_BUCKETS, k=MEM_K, seed_lsh=10, seed_gmem=11)
comp = mem_pipeline.make_mem_pipeline(cfg)
mem_pipeline.train_one_pass_MEM(comp, pairs_mem)
print("Training complete; few bucket counts:", comp.mem.n[:64])

# -------------------------------------------------------------
# 5) Probe correcte : on interroge avec Z_en (span) et on compare
#    le prototype choisi à Z_fr (span) correspondant
# -------------------------------------------------------------
probe_count = min(200, len(pairs_mem))
sim_values = []
for Z_en_vec, Z_fr_vec in tqdm(pairs_mem[:probe_count]):
    bucket_idx, score = mem_pipeline.infer_map_top1(comp, Z_en_vec)  # Z_en (span), pas H_en
    prototype = comp.mem.H[bucket_idx].astype(np.int32, copy=False)
    sim = float(np.dot(prototype, Z_fr_vec.astype(np.int32, copy=False)) / D)
    sim_values.append(sim)

print(f"Top-1 mean similarity over {probe_count} span-probes: {np.mean(sim_values):.4f}")
print(f"Top-1 median similarity: {np.median(sim_values):.4f}")

In [None]:
nb = comp.mem.n
print("pop mean/median/min/max/std:",
      float(nb.mean()), float(np.median(nb)), int(nb.min()), int(nb.max()), float(nb.std()))
print("p90/p99:", int(np.quantile(nb, 0.90)), int(np.quantile(nb, 0.99)))

pop mean/median/min/max/std: 296.625 293.0 216 477 39.670714954485
p90/p99: 345 394



> ℹ️ **Remarque pratique** : si le téléchargement OPUS échoue (exécution hors-ligne),
> le notebook bascule automatiquement sur un mini corpus embarqué afin de
> conserver une démonstration reproductible des blocs ENC et MEM.



# DEC

## DEC-0 : 

In [1]:
import logging
log = logging.getLogger("DEC")
if not log.handlers:
    logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")


In [2]:
import numpy as np

def pm1(shape, rng) -> np.ndarray:
    """Tire des vecteurs Rademacher ±1 en int8, shape=(...), dtype=int8."""
    return (2 * rng.integers(0, 2, size=shape, dtype=np.int8) - 1).astype(np.int8, copy=False)

# ---------------------------
# DX0: tests
# ---------------------------
def dx0_sanity(D: int = 16_384, N_sim: int = 1_000, seed: int = 2024, tol: float = 5e-3) -> None:
    """
    Vérifie:
      1) hd_sim(x,x)=1 et hd_sim(x,-x)=-1 (à tol près)
      2) Invariance de similarité par binding: sim(x,y)=sim(x⊗k, y⊗k)
      3) Préservation de la norme (||x||_2/√D = 1) avant/après binding
    Critère d'acceptation (CA): écarts absolus ≤ 5e-3.
    """
    rng = np.random.default_rng(seed)

    max_err_self = 0.0
    max_err_neg  = 0.0
    max_err_bind = 0.0
    max_err_norm = 0.0

    for _ in range(N_sim):
        x = pm1(D, rng); y = pm1(D, rng); k = pm1(D, rng)
        hd_assert_pm1(x, D); hd_assert_pm1(y, D); hd_assert_pm1(k, D)

        # (1) Identités de similarité
        s_xx = hd_sim(x, x)
        s_xnx = hd_sim(x, (-x).astype(np.int8, copy=False))

        max_err_self = max(max_err_self, abs(s_xx - 1.0))
        max_err_neg  = max(max_err_neg,  abs(s_xnx + 1.0))

        # (2) Invariance par binding (DEC1)
        s_xy      = hd_sim(x, y)
        xk, yk    = hd_bind(x, k), hd_bind(y, k)
        s_xy_bind = hd_sim(xk, yk)
        max_err_bind = max(max_err_bind, abs(s_xy - s_xy_bind))

        # (3) Normes (avant/après binding)
        norm_x  = np.linalg.norm(x.astype(np.float64)) / np.sqrt(D)
        norm_xk = np.linalg.norm(xk.astype(np.float64)) / np.sqrt(D)
        max_err_norm = max(max_err_norm, abs(norm_x - 1.0), abs(norm_xk - 1.0))

    # Rapport
    print("DX0 — Sanity checks (double précision)")
    print(f"  D={D}, N={N_sim}, tol={tol:.1e}")
    print(f"  max|sim(x,x)-1|         = {max_err_self:.3e}")
    print(f"  max|sim(x,-x)+1|        = {max_err_neg:.3e}")
    print(f"  max|sim(x,y)-sim(x⊗k,y⊗k)| = {max_err_bind:.3e}")
    print(f"  max| ||x||/√D - 1 | (incl. bind) = {max_err_norm:.3e}")

    # Assertions CA
    assert max_err_self <= tol,     "CA non satisfait: sim(x,x) s'écarte de 1"
    assert max_err_neg  <= tol,     "CA non satisfait: sim(x,-x) s'écarte de -1"
    assert max_err_bind <= tol,     "CA non satisfait: invariance de similarité après binding"
    assert max_err_norm <= tol,     "CA non satisfait: norme non préservée (relative)"



In [3]:
dx0_sanity()


DX0 — Sanity checks (double précision)
  D=16384, N=1000, tol=5.0e-03
  max|sim(x,x)-1|         = 0.000e+00
  max|sim(x,-x)+1|        = 0.000e+00
  max|sim(x,y)-sim(x⊗k,y⊗k)| = 0.000e+00
  max| ||x||/√D - 1 | (incl. bind) = 0.000e+00


## DD1 .


In [4]:
def DD1_ctx(Hs: np.ndarray, G_DEC: np.ndarray) -> np.ndarray:
    """
    Q^(s) = H^(s) ⊗ G_DEC, binding isométrique (int8 -> int8).
    """
    assert Hs.dtype == np.int8 and G_DEC.dtype == np.int8
    hd_assert_pm1(Hs); hd_assert_pm1(G_DEC, Hs.shape[0])
    return hd_bind(Hs, G_DEC)

In [5]:
# --- DX1: tests détaillés ---
def dx1_test_DD1_ctx(D: int = 16_384, m: int = 64, trials: int = 200, seed: int = 1234, tol: float = 5e-3):
    rng = np.random.default_rng(seed)

    # 1) Similarité inchangée et normes préservées (sur 'trials' paires)
    max_err_sim = 0.0
    max_err_norm = 0.0
    for _ in range(trials):
        H1, H2, G = pm1(D, rng), pm1(D, rng), pm1(D, rng)
        # Copies pour vérifier non-mutation
        H1_copy, H2_copy, G_copy = H1.copy(), H2.copy(), G.copy()

        Q1, Q2 = DD1_ctx(H1, G), DD1_ctx(H2, G)
        # Similarité
        s0 = hd_sim(H1, H2)
        s1 = hd_sim(Q1, Q2)
        max_err_sim = max(max_err_sim, abs(s0 - s1))

        # Normes relatives
        nH1  = np.linalg.norm(H1.astype(np.float64)) / np.sqrt(D)
        nQ1  = np.linalg.norm(Q1.astype(np.float64)) / np.sqrt(D)
        nH2  = np.linalg.norm(H2.astype(np.float64)) / np.sqrt(D)
        nQ2  = np.linalg.norm(Q2.astype(np.float64)) / np.sqrt(D)
        max_err_norm = max(max_err_norm, abs(nH1 - 1.0), abs(nQ1 - 1.0),
                                           abs(nH2 - 1.0), abs(nQ2 - 1.0))

        # Contrats: dtype & non-mutation
        assert Q1.dtype == np.int8 and Q2.dtype == np.int8
        assert np.all(H1 == H1_copy) and np.all(H2 == H2_copy) and np.all(G == G_copy), "mutation détectée"
        assert np.all((Q1 == 1) | (Q1 == -1)) and np.all((Q2 == 1) | (Q2 == -1)), "sortie hors ±1"

    # 2) Isométrie de Gram (m vecteurs)
    H = np.stack([pm1(D, rng) for _ in range(m)], axis=0)  # (m, D) ±1/int8
    G = pm1(D, rng)
    Q = np.stack([DD1_ctx(H[i], G) for i in range(m)], axis=0)

    # Gram avant/après, en double précision
    G0 = (H.astype(np.int32) @ H.astype(np.int32).T) / D
    G1 = (Q.astype(np.int32) @ Q.astype(np.int32).T) / D
    max_err_gram = float(np.max(np.abs(G0.astype(np.float64) - G1.astype(np.float64))))

    # --- Rapport ---
    print("DX1 — DD1_ctx (isométrie & contrats)")
    print(f"  D={D}, m={m}, trials={trials}, tol={tol:.1e}")
    print(f"  max|sim_before - sim_after|  = {max_err_sim:.3e}")
    print(f"  max| ||H||/√D - 1 | (incl. bind) = {max_err_norm:.3e}")
    print(f"  max|Gram_before - Gram_after| = {max_err_gram:.3e}")

    # --- Critères d'acceptation ---
    assert max_err_sim  <= tol, "Invariance de similarité violée (DEC1)"
    assert max_err_norm <= tol, "Norme non préservée (relative)"
    assert max_err_gram <= tol, "Isométrie de Gram violée (DEC1)"

dx1_test_DD1_ctx()

DX1 — DD1_ctx (isométrie & contrats)
  D=16384, m=64, trials=200, tol=5.0e-03
  max|sim_before - sim_after|  = 0.000e+00
  max| ||H||/√D - 1 | (incl. bind) = 0.000e+00
  max|Gram_before - Gram_after| = 0.000e+00


# DD2 . 

In [6]:
import numpy as np
import logging
from typing import List, Tuple
from tqdm import tqdm

log = logging.getLogger("DEC.DX2.v2")




In [None]:
# =========================
# DD2_query + DX2 (corrigés)
# =========================
# Changements clés :
#  - DD2_query : hyperparamètres keyword-only (*, alpha, beta, ell) pour éviter
#    le doublonnage positionnel/mot-clé.
#  - Implémentation robuste des permutations Π^k pour k ∈ ℤ (k<0 via Π^{-1}).
#  - Contrôles de types/signatures (±1/int8) et normalisation ||R_t||≈√D.
#  - DX2_run : corrections des tests (in_band), invariances Gram et identité
#    paire-à-paire <Π^i L_i, Π^k L_k> = <L_i, Π^{k-i} L_k>.
#  - Journalisation propre.

import numpy as np
from typing import Callable, List
import logging

log = logging.getLogger("DX2")
if not log.handlers:
    logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")

# -------- Module DD2 (requête mixte) -------------------------------------------

# --- DD2 : version "continue" (pour DX2 et analyses de norme) -----------------
def DD2_query(
    Qs: np.ndarray,
    hist_vectors: List[np.ndarray],  # liste de L_fr(\hat v_{t-j}) en ±1/int8
    pi: np.ndarray,
    *,
    alpha: float = 1.0,
    beta:  float = 1.0,
    ell:   int   = 4
) -> np.ndarray:
    """
    Version continue (float64) : R_t = α·Qs + β·sign(Σ_{j=1..ell} Π^j L_{t-j}),
    puis renvoie un vecteur de norme ≈ √D (utile pour DX2). NON utilisée par DD3.
    """
    D = Qs.shape[0]
    hd_assert_pm1(Qs, D)
    assert pi.ndim == 1 and pi.shape[0] == D and np.issubdtype(pi.dtype, np.integer)
    ell = min(ell, len(hist_vectors))
    pi_inv = build_perm_inverse(pi)

    if ell == 0:
        H_hist = np.ones(D, dtype=np.int8)
    else:
        acc = np.zeros(D, dtype=np.int16)
        for j in range(1, ell+1):
            Lj = hist_vectors[j-1]; hd_assert_pm1(Lj, D)
            acc += permute_pow_signed(Lj, pi, pi_inv, j).astype(np.int16, copy=False)
        H_hist = sign_strict_pm1(acc)

    Rt = alpha * Qs.astype(np.float64) + beta * H_hist.astype(np.float64)
    nrm = float(np.linalg.norm(Rt))
    if nrm > 0:
        Rt = Rt / nrm * np.sqrt(D)
    else:
        Rt = np.ones(D, dtype=np.float64)
    return Rt  # float64

# --- DD2 : version "binaire" (pipeline DEC ; compatible DD3 int8) -------------
def DD2_query_bin(
    Qs: np.ndarray,
    history_fr: List[str],    # liste de tokens FR
    L_fr: Callable[[str], np.ndarray],
    Pi: np.ndarray,
    *,
    alpha: float = 1.0,
    beta:  float = 1.0,
    ell:   int   = 4
) -> np.ndarray:
    """
    Version binaire : construit H_hist depuis les tokens (via L_fr),
    combine α·Qs + β·H_hist puis seuillage strict -> int8 ±1.
    Contrat de sortie : ±1/int8 (exigé par DD3_bindToMem).
    """
    D = Qs.shape[0]
    hd_assert_pm1(Qs, D)
    assert Pi.ndim == 1 and Pi.shape[0] == D and np.issubdtype(Pi.dtype, np.integer)
    pi_inv = build_perm_inverse(Pi)

    # Convertir l'historique de tokens en vecteurs lexicaux ±1/int8
    hist_vecs: List[np.ndarray] = []
    for tok in history_fr[:ell]:
        Lv = L_fr(tok).astype(np.int8, copy=False)
        hd_assert_pm1(Lv, D)
        hist_vecs.append(Lv)

    if len(hist_vecs) == 0:
        H_hist = np.ones(D, dtype=np.int8)
    else:
        acc = np.zeros(D, dtype=np.int16)
        for j, L_j in enumerate(hist_vecs, start=1):
            acc += permute_pow_signed(L_j, Pi, pi_inv, j).astype(np.int16, copy=False)
        H_hist = sign_strict_pm1(acc)  # ±1/int8

    # Combinaison et seuillage final pour rester en ±1
    combo = (alpha * Qs.astype(np.int16)) + (beta * H_hist.astype(np.int16))
    Rt_bin = sign_strict_pm1(combo.astype(np.int16))
    return Rt_bin  # int8 ±1

# -------- Banc de test DX2 -----------------------------------------------------

def DX2_run():
    D, trials = 16384, 200
    ells = (2, 4, 8)
    ratios = (1/3, 1.0, 3.0)
    g = np.random.default_rng(2025)

    # permutation aléatoire (fixée) et son inverse
    pi = np.arange(D, dtype=np.int64); g.shuffle(pi)
    pi_inv = build_perm_inverse(pi)

    def sim(a: np.ndarray, b: np.ndarray) -> float:
        return float((a.astype(np.int32) @ b.astype(np.int32)) / D)

    norms: dict[tuple[int, float], tuple[float, float, float]] = {}
    gram_uniform_ok = True
    pair_shift_ok   = True

    for ell in ells:
        for r in ratios:
            alpha, beta = r, 1.0
            vals = []
            for _ in range(trials):
                # Qs et historique
                Qs   = pm1(D, g)
                hist = [pm1(D, g) for _ in range(ell)]

                # Matrice des versions permutées P[j] = Π^{j+1} L_{t-(j+1)}
                P = np.stack([permute_pow_signed(hist[j], pi, pi_inv, j+1)
                              for j in range(ell)], axis=0).astype(np.int8, copy=False)

                # (i) Invariance Gram sous permutation UNIFORME (même décalage s pour toutes les lignes)
                s = int(g.integers(1, 7))
                P_uni = np.stack([permute_pow_signed(P[j], pi, pi_inv, s)
                                  for j in range(ell)], axis=0).astype(np.int8, copy=False)
                # Gram (corrélations normalisées) avant / après
                G  = (P.astype(np.int32) @ P.T.astype(np.int32)) / D
                Gu = (P_uni.astype(np.int32) @ P_uni.T.astype(np.int32)) / D
                if not np.allclose(G, Gu, atol=5e-3, rtol=0):
                    gram_uniform_ok = False

                # (ii) Identité paire-à-paire :
                #      <Π^i L_i, Π^k L_k> == <L_i, Π^{k-i} L_k>  pour i,k = 1..ell
                for i in range(1, ell+1):
                    for k in range(1, ell+1):
                        lhs = sim(permute_pow_signed(hist[i-1], pi, pi_inv, i),
                                  permute_pow_signed(hist[k-1], pi, pi_inv, k))
                        rhs = sim(hist[i-1],
                                  permute_pow_signed(hist[k-1], pi, pi_inv, k - i))
                        if abs(lhs - rhs) > 5e-3:
                            pair_shift_ok = False
                            break
                    if not pair_shift_ok:
                        break

                # (iii) Norme de R_t / √D dans [0.9, 1.1]
                Rt = DD2_query(Qs, hist, pi, alpha=alpha, beta=beta, ell=ell)
                vals.append(float(np.linalg.norm(Rt) / np.sqrt(D)))

            norms[(ell, r)] = (min(vals), float(np.median(vals)), max(vals))

    # Reporting
    log.info("DX2 — Norme(R_t)/sqrt(D) par (ell, alpha/beta): min | median | max")
    for (ell, r), (mn, md, mx) in sorted(norms.items()):
        log.info("  ell=%d, alpha/beta=%.3g  ->  %.3f | %.3f | %.3f", ell, r, mn, md, mx)

    # Critères d’acceptation
    in_band = all((0.9 <= mn <= 1.1) and (0.9 <= md <= 1.1) and (0.9 <= mx <= 1.1)
                  for (mn, md, mx) in norms.values())
    assert in_band, "DX2: norme(R_t)/sqrt(D) hors bande [0.9,1.1] pour au moins un (ell, ratio)."
    assert gram_uniform_ok, "DX2: Gram NON invariant sous permutation uniforme (isométrie violée)."
    assert pair_shift_ok,   "DX2: identité de décalage paire-à-paire violée."

    log.info("DX2 — CA validés: (i) norme ∈ [0.9,1.1] ; (ii) Gram invariant ; (iii) identité paire-à-paire OK.")

# --- Exécution du test (à commenter/supprimer si vous intégrez dans une suite) ---
# DX2_run()



In [40]:
DX2_run()

2025-10-06 23:22:58,772 [INFO] DX2 — Norme(R_t)/sqrt(D) par (ell, alpha/beta): min | median | max
2025-10-06 23:22:58,772 [INFO]   ell=2, alpha/beta=0.333  ->  1.000 | 1.000 | 1.000
2025-10-06 23:22:58,772 [INFO]   ell=2, alpha/beta=1  ->  1.000 | 1.000 | 1.000
2025-10-06 23:22:58,772 [INFO]   ell=2, alpha/beta=3  ->  1.000 | 1.000 | 1.000
2025-10-06 23:22:58,772 [INFO]   ell=4, alpha/beta=0.333  ->  1.000 | 1.000 | 1.000
2025-10-06 23:22:58,773 [INFO]   ell=4, alpha/beta=1  ->  1.000 | 1.000 | 1.000
2025-10-06 23:22:58,773 [INFO]   ell=4, alpha/beta=3  ->  1.000 | 1.000 | 1.000
2025-10-06 23:22:58,773 [INFO]   ell=8, alpha/beta=0.333  ->  1.000 | 1.000 | 1.000
2025-10-06 23:22:58,773 [INFO]   ell=8, alpha/beta=1  ->  1.000 | 1.000 | 1.000
2025-10-06 23:22:58,773 [INFO]   ell=8, alpha/beta=3  ->  1.000 | 1.000 | 1.000
2025-10-06 23:22:58,773 [INFO] DX2 — CA validés: (i) norme ∈ [0.9,1.1] ; (ii) Gram invariant ; (iii) identité paire-à-paire OK.


# DD3 . 

In [10]:
def DD3_bindToMem(Rt: np.ndarray, G_MEM: np.ndarray) -> np.ndarray:
    """~R_t = R_t ⊗ G_MEM (int8 -> int8)."""
    hd_assert_pm1(Rt); hd_assert_pm1(G_MEM, Rt.shape[0])
    return hd_bind(Rt, G_MEM)

def hd_sim_dot(x: np.ndarray, y: np.ndarray) -> int:
    """Produit scalaire entier (évite l'arrondi); x,y en int8 ±1."""
    return int(x.astype(np.int32) @ y.astype(np.int32))

In [11]:
# --- test KS (asymptotique) sans dépendance externe ---
def ks_2samp_asymp(x: np.ndarray, y: np.ndarray) -> tuple[float, float]:
    """
    KS à 2 échantillons: renvoie (D_stat, pval approx).
    Correctifs:
      - si D_stat == 0 => p = 1.0 (distributions identiques)
      - clamp numérique sur lambda pour petits D_stat
    """
    x = np.asarray(x, dtype=np.float64)
    y = np.asarray(y, dtype=np.float64)
    n, m = x.size, y.size
    x_sorted = np.sort(x); y_sorted = np.sort(y)
    i = j = 0
    cdf_x = cdf_y = 0.0
    D_stat = 0.0
    while i < n and j < m:
        if x_sorted[i] < y_sorted[j]:
            cdf_x = (i + 1) / n; i += 1
        elif x_sorted[i] > y_sorted[j]:
            cdf_y = (j + 1) / m; j += 1
        else:
            v = x_sorted[i]
            while i < n and x_sorted[i] == v: i += 1
            while j < m and y_sorted[j] == v: j += 1
            cdf_x = i / n; cdf_y = j / m
        D_stat = max(D_stat, abs(cdf_x - cdf_y))
    if i < n: D_stat = max(D_stat, abs(1.0 - (j / m)))
    if j < m: D_stat = max(D_stat, abs(1.0 - (i / n)))

    # --- Correctif dégénéré ---
    if D_stat == 0.0:
        return 0.0, 1.0

    en = np.sqrt(n * m / (n + m))
    lam = (en + 0.12 + 0.11 / max(en, 1e-12)) * D_stat
    # Pour très petits "lam", la série tend vers 1 => borne supérieure 1.0
    if lam < 1e-8:
        return float(D_stat), 1.0

    # Évaluation de la série alternée (tronquée) avec coupe stricte dans [0,1]
    terms = [np.exp(-2.0 * (k**2) * (lam**2)) for k in range(1, 201)]
    pval = 2.0 * sum(((-1)**(k-1)) * terms[k-1] for k in range(1, len(terms)+1))
    pval = float(max(0.0, min(1.0, pval)))
    return float(D_stat), pval

# --- campagne DX3 ---
def DX3_run(D: int = 16384, C: int = 500, T: int = 200, seed: int = 2025,
            rel_tol: float = 0.01, pmin: float = 0.10) -> None:
    """
    D: dimension; C: #protos mémoire; T: #requêtes; rel_tol: seuil d'écart relatif moyen; pmin: seuil KS.
    """
    g = np.random.default_rng(seed)

    # Génère clés et banques en ±1/int8
    G_MEM = pm1(D, g)
    M_bank = np.stack([pm1(D, g) for _ in range(C)], axis=0)   # (C, D), int8
    Q_batch = np.stack([pm1(D, g) for _ in range(T)], axis=0)  # (T, D), int8

    # Scores "dans la tranche mémoire" vs "débindés"
    #   S_mem[t,c]   = < Rt⊗G_MEM , M_c >
    #   S_unbd[t,c]  = < Rt , M_c⊗G_MEM >
    S_mem  = np.zeros((T, C), dtype=np.int32)
    S_unbd = np.zeros((T, C), dtype=np.int32)

    for t in range(T):
        Rt = Q_batch[t]
        Rt_mem = DD3_bindToMem(Rt, G_MEM)            # Rt ⊗ G_MEM
        for c in range(C):
            Mc = M_bank[c]
            S_mem[t, c]  = hd_sim_dot(Rt_mem, Mc)
            S_unbd[t, c] = hd_sim_dot(Rt, hd_bind(Mc, G_MEM))

    # a) Erreur relative moyenne (sur tous les scores)
    A = S_mem.astype(np.float64).ravel()
    B = S_unbd.astype(np.float64).ravel()
    denom = np.maximum(1.0, np.abs(B))               # évite division par 0
    rel_err = np.abs(A - B) / denom
    rel_err_mean = float(np.mean(rel_err))

    # b) Test KS sur distributions aplaties
    D_stat, pval = ks_2samp_asymp(A, B)

    # Reporting
    log.info("DX3 — Invariance (dé)binding mémoire")
    log.info("  D=%d, C=%d, T=%d", D, C, T)
    log.info("  Erreur relative moyenne  = %.6f", rel_err_mean)
    log.info("  KS: D=%.6f, p=%.3f", D_stat, pval)

    # CA
    assert rel_err_mean <= rel_tol, f"DX3: erreur relative moyenne {rel_err_mean:.4f} > {rel_tol}"
    assert pval > pmin, f"DX3: p-value KS {pval:.3f} ≤ {pmin:.2f}"


In [12]:
DX3_run()

2025-10-06 22:57:03,498 [INFO] DX3 — Invariance (dé)binding mémoire
2025-10-06 22:57:03,498 [INFO]   D=16384, C=500, T=200
2025-10-06 22:57:03,498 [INFO]   Erreur relative moyenne  = 0.000000
2025-10-06 22:57:03,498 [INFO]   KS: D=0.000000, p=1.000


# DD4 . 

In [13]:
def DD4_search_topK(Rt_tilde: np.ndarray, prototypes: np.ndarray, K: int) -> tuple[int, np.ndarray, np.ndarray]:
    """
    prototypes: array shape (B, D) en int8 (±1) pour M_c seuillés ou non seuillés normalisés.
    Retour: (c_star, C_K, scores_CK)
    """
    D = Rt_tilde.shape[0]
    assert prototypes.ndim == 2 and prototypes.shape[1] == D and prototypes.dtype == np.int8
    # Produits scalaires stables
    scores = (prototypes.astype(np.int32) @ Rt_tilde.astype(np.int32)).astype(np.int32)  # (B,)
    K = min(K, scores.shape[0])
    idx = np.argpartition(scores, -K)[-K:]
    top_order = idx[np.argsort(scores[idx])[::-1]]
    c_star = int(top_order[0])
    return c_star, top_order, scores[top_order]

In [14]:
from tqdm import tqdm 

def DX4_run(D: int = 16384, B: int = 10000, trials: int = 200, 
            Ks=(100, 500, 2000), seed: int = 0) -> dict[int, float]:
    """
    Mesure empirique du rappel de c* parmi les top-K prototypes.
    """
    rng = np.random.default_rng(seed)
    recalls = {K: 0 for K in Ks}
    for _ in tqdm(range(trials)):
        # Génère B prototypes ±1 (int8)
        prototypes = rng.choice([-1, 1], size=(B, D))
        prototypes = prototypes.astype(np.int8)
        # Choisit une classe cible c*
        c_star = rng.integers(0, B)
        Rt = prototypes[c_star].copy()
        # Appel au module DD4
        _, C_K, _ = DD4_search_topK(Rt, prototypes, max(Ks))
        for K in Ks:
            if c_star in C_K[:K]:
                recalls[K] += 1
    # Moyenne
    return {K: recalls[K]/trials for K in Ks}

In [15]:
DX4_run()

100%|██████████| 200/200 [03:23<00:00,  1.02s/it]


{100: 1.0, 500: 1.0, 2000: 1.0}

# DD5 . 

In [16]:
def DD5_payload(Mc: np.ndarray) -> np.ndarray:
    """
    Mc: prototype non seuillé (int16/int32) OU déjà binaire int8.
    Renvoie Z_hat en int8 (±1).
    """
    if Mc.dtype == np.int8:
        hd_assert_pm1(Mc)
        return Mc
    return sign_strict_pm1(Mc)

In [17]:
def DX5_run(D: int = 16384, trials: int = 200, ms=(4, 8, 16), seed: int = 0):
    """
    Mesure l’exactitude binaire en fonction du nombre m_{c*}.
    """
    rng = np.random.default_rng(seed)
    accuracies = {}
    for m in ms:
        accs = []
        for _ in range(trials):
            # Vecteur de référence
            ref = rng.choice([-1, 1], size=D).astype(np.int8)
            # Accumulation de m copies bruitées
            acc = np.zeros(D, dtype=np.int32)
            for _ in range(m):
                acc += ref
            # Seuillage
            Z_hat = DD5_payload(acc)
            # Exactitude binaire
            accs.append(np.mean(Z_hat == ref))
        accuracies[m] = float(np.mean(accs))
    return accuracies

In [18]:
DX5_run()

{4: 1.0, 8: 1.0, 16: 1.0}

# DD6 . 

In [19]:
# def DD6_vote(
#     Z_hat: np.ndarray,
#     H_LM: np.ndarray,
#     L_fr,
#     cand_vocab: list[str],
#     lam: float = 0.0
# ) -> tuple[str, np.ndarray]:
#     """
#     Renvoie (token*, scores) sur cand_vocab.
#     """
#     D = Z_hat.shape[0]
#     hd_assert_pm1(Z_hat, D); hd_assert_pm1(H_LM, D)
#     scores = []
#     for v in cand_vocab:
#         Lv = L_fr(v).astype(np.int8, copy=False)
#         hd_assert_pm1(Lv, D)
#         s = (Z_hat.astype(np.int32) @ Lv.astype(np.int32)) \
#             + lam * (H_LM.astype(np.int32) @ Lv.astype(np.int32))
#         scores.append(float(s))
#     scores = np.asarray(scores, dtype=np.float32)
#     best = int(np.argmax(scores))
#     return cand_vocab[best], scores

In [20]:
class ToyLexFR:
    def __init__(self, vocab: list[str], D: int, seed: int = 1234):
        self.vocab = vocab
        self.D = D
        self.rng = np.random.default_rng(seed)
        # table de vecteurs ±1/int8
        self.table = {v: self.rng.choice(np.array([-1, 1], dtype=np.int8), size=D) for v in vocab}

    def __call__(self, v: str) -> np.ndarray:
        return self.table[v]

# # -- Génération contrôlée de corrélations (flip par coordonnée) ----------------
# def flip_to_target(vec: np.ndarray, target_sim: float, rng: np.random.Generator) -> np.ndarray:
#     """
#     Retourne une copie de 'vec' dont la similarité attendue vaut 'target_sim'.
#     Pour ±1, si p_flip = (1 - target_sim)/2, alors E[sim] = 1 - 2*p_flip = target_sim.
#     """
#     D = vec.shape[0]
#     p_flip = max(0.0, min(1.0, (1.0 - float(target_sim)) / 2.0))
#     mask = (rng.random(D) < p_flip).astype(np.int8)          # 1 si on flippe
#     flips = (1 - 2 * mask).astype(np.int8, copy=False)       # 1 -> -1, 0 -> +1
#     out = (vec.astype(np.int8, copy=False) * flips).astype(np.int8, copy=False)
#     return out

# # -- Module testé (fourni) ------------------------------------------------------

# def _batch_lex(cand_vocab, L):
#     """
#     Applique le callable 'L' (v -> ±1/int8 de forme (D,)) sur tout le vocabulaire candidat
#     et empile en une matrice (V, D) en int8.
#     """
#     mats = []
#     for v in cand_vocab:
#         vec = L(v).astype(np.int8, copy=False)
#         mats.append(vec)
#     M = np.vstack(mats).astype(np.int8, copy=False)
#     return M

# def DD6_vote(
#     Z_hat: np.ndarray,
#     H_LM: np.ndarray,
#     L_mem,                  # callable: v -> ±1 int8 (D,)
#     L_lm,                   # callable: v -> ±1 int8 (D,)
#     cand_vocab: list[str],
#     lam: float = 0.0,
#     *,
#     normalize: str = "sqrtD",   # {"none","sqrtD"} ; "sqrtD" conseillé pour perplexité
#     return_probs: bool = False, # si True, renvoie aussi les probabilités softmax
#     tau: float = 1.0            # température du softmax (si return_probs=True)
# ) -> tuple[str, np.ndarray, np.ndarray | None]:
#     """
#     s(v) = <Z_hat, L_mem(v)> + lam * <H_LM, L_lm(v)>
#     Retourne: (token*, scores_raw, probs|None)
#       - scores_raw: np.float64 de taille V (non normalisés, utiles pour debug/traçage)
#       - probs:      np.float64 de taille V si return_probs=True (softmax stable)
#     Contrats:
#       Z_hat, H_LM: ±1/int8, de longueur D identique.
#       L_mem, L_lm: renvoient ±1/int8 (D,) pour tout v de cand_vocab.
#     """
#     # --- Contrats de forme et de type
#     D = int(Z_hat.shape[0])
#     hd_assert_pm1(Z_hat, D)
#     hd_assert_pm1(H_LM, D)
#     assert isinstance(cand_vocab, (list, tuple)) and len(cand_vocab) > 0, "cand_vocab vide"

#     # --- Matrices lexicales (V, D) en int8 (vectorisation)
#     M_mem = _batch_lex(cand_vocab, L_mem)   # (V, D)
#     M_lm  = _batch_lex(cand_vocab, L_lm)    # (V, D)
#     assert M_mem.shape == M_lm.shape == (len(cand_vocab), D), "Shapes (V,D) incohérents"

#     # --- Produits scalaires vectorisés (int32 pour éviter overflow)
#     z32  = Z_hat.astype(np.int32, copy=False)
#     h32  = H_LM.astype(np.int32, copy=False)
#     mem_scores = (M_mem.astype(np.int32, copy=False) @ z32)              # (V,)
#     lm_scores  = (M_lm.astype(np.int32,  copy=False) @ h32)              # (V,)
#     scores_raw = mem_scores.astype(np.float64) + float(lam) * lm_scores.astype(np.float64)

#     # --- Argmax sur scores bruts (l'échelle n'affecte pas l'argmax)
#     best_idx   = int(np.argmax(scores_raw))
#     token_star = cand_vocab[best_idx]

#     # --- Option: probabilités (softmax stable) avec normalisation choisie
#     probs = None
#     if return_probs:
#         if normalize == "sqrtD":
#             logits = scores_raw / (np.sqrt(D) * max(1e-6, float(tau)))
#         elif normalize == "none":
#             logits = scores_raw / max(1e-6, float(tau))
#         else:
#             raise ValueError("normalize ∈ {'none','sqrtD'} attendu")
#         logits = logits - np.max(logits)                  # stabilité num.
#         exps   = np.exp(logits, dtype=np.float64)
#         probs  = exps / np.sum(exps, dtype=np.float64)    # (V,)
#         probs  = probs.astype(np.float64, copy=False)

#     return token_star, scores_raw, probs

# # -- Perplexité HD: softmax sur scores normalisés par D -------------------------
# def hd_perplexity(scores: np.ndarray, true_idx: int, D: int, tau: float = 1.0) -> float:
#     """
#     Perplexité = exp( - log p(true) ), avec p ∝ exp( (scores/D)/tau ).
#     On divise par D pour éviter des logits trop grands (HD).
#     """
#     logits = scores / (D * max(1e-6, tau))
#     logits = logits - np.max(logits)               # stabilité
#     exps = np.exp(logits)
#     p = exps / np.sum(exps)
#     p_true = float(max(p[true_idx], 1e-12))
#     return float(np.exp(-np.log(p_true)))


In [21]:
# def _softmax_probs(scores: np.ndarray, D: int, tau: float = 1.0) -> np.ndarray:
#     # Normalisation par sqrt(D) pour éviter la sur-concentration à grande dimension
#     s = scores / (np.sqrt(D) * tau)
#     s = s - np.max(s)                       # stabilité numérique
#     exps = np.exp(s)
#     return exps / np.sum(exps)

# def hd_perplexity(scores: np.ndarray, true_index: int, D: int, tau: float = 1.0) -> float:
#     p = _softmax_probs(scores, D=D, tau=tau)[true_index]
#     # Perplexité = exp(-log p_y) ; bornée inférieurement par 1
#     return float(np.exp(-np.log(max(p, 1e-12))))

In [22]:
# def DX6_run_two_spaces(
#     D: int = 16384, trials: int = 400,
#     lam_grid=(0.0, 0.5, 1.0),
#     # corrélations du vrai token:
#     sim_payload: float = 0.82,   # corr(Z_hat, L_mem(y))
#     sim_lm: float      = 0.65,   # corr(H_LM, L_lm(y))
#     # confondeurs:
#     n_confounders: int = 6,
#     rho_mem_conf: float = 0.72,  # corr(Z_hat, L_mem(conf))
#     rho_lm_conf: float  = 0.05,  # corr(H_LM, L_lm(conf))
#     tau: float = 1.0,
#     rng_seed: int = 7031
# ):
#     g = np.random.default_rng(rng_seed)

#     def rademacher(D):  # ±1/int8
#         return g.choice(np.array([-1,1], dtype=np.int8), size=D)

#     def correlated_pm1(proto: np.ndarray, rho: float) -> np.ndarray:
#         noise = rademacher(proto.shape[0])
#         mix = rho * proto.astype(np.int32) + (1-rho) * noise.astype(np.int32)
#         return np.where(mix >= 0, 1, -1).astype(np.int8)

#     def make_trial():
#         Z_true  = rademacher(D)  # payload cible
#         H_true  = rademacher(D)  # LM cible
#         # Construire DEUX lexiques: L_mem (pour la mémoire) et L_lm (pour le LM)
#         V = n_confounders + 1
#         L_mem = np.empty((V, D), dtype=np.int8)
#         L_lm  = np.empty((V, D), dtype=np.int8)
#         # y (indice 0)
#         L_mem[0] = correlated_pm1(Z_true, sim_payload)
#         L_lm[0]  = correlated_pm1(H_true, sim_lm)
#         # confondeurs
#         for i in range(1, V):
#             L_mem[i] = correlated_pm1(Z_true, rho_mem_conf)
#             L_lm[i]  = correlated_pm1(H_true, rho_lm_conf)
#         return L_mem, L_lm, 0, Z_true, H_true  # (lexiques, true_id, payload, LM)

#     def vote_scores_two_lex(L_mem: np.ndarray, L_lm: np.ndarray,
#                             Z_hat: np.ndarray, H_LM: np.ndarray, lam: float) -> np.ndarray:
#         # int32 pour éviter overflow ; (V,D) @ (D,) -> (V,)
#         return (L_mem.astype(np.int32) @ Z_hat.astype(np.int32)) + \
#                lam * (L_lm.astype(np.int32)  @ H_LM.astype(np.int32))

#     def _softmax_probs(scores: np.ndarray, D: int, tau: float = 1.0) -> np.ndarray:
#         s = scores / (np.sqrt(D) * max(tau, 1e-6))
#         s = s - np.max(s)
#         exps = np.exp(s)
#         return exps / np.sum(exps)

#     def hd_perplexity(scores: np.ndarray, true_idx: int, D: int, tau: float = 1.0) -> float:
#         p_true = float(_softmax_probs(scores, D=D, tau=tau)[true_idx])
#         return float(np.exp(-np.log(max(p_true, 1e-12))))

#     stats = {lam: {"top1_hits": 0, "ppl_sum": 0.0} for lam in lam_grid}

#     for _ in range(trials):
#         L_mem, L_lm, true_idx, Z_true, H_true = make_trial()
#         Z_hat = Z_true; H_LM = H_true
#         for lam in lam_grid:
#             scores = vote_scores_two_lex(L_mem, L_lm, Z_hat, H_LM, float(lam))
#             pred = int(np.argmax(scores))
#             stats[lam]["top1_hits"] += 1 if pred == true_idx else 0
#             stats[lam]["ppl_sum"]   += hd_perplexity(scores, true_idx, D, tau)

#     results = {lam: {"top1": stats[lam]["top1_hits"]/trials,
#                      "ppl":  stats[lam]["ppl_sum"]/trials}
#                for lam in lam_grid}

#     base_top1, base_ppl = results[0.0]["top1"], results[0.0]["ppl"]
#     saturated = (abs(base_top1 - 1.0) < 1e-12)

#     log.info(("DX6(2-spaces|fixed) — D=%d, trials=%d, conf=%d, ρ_mem(conf)=%.2f, ρ_lm(conf)=%.2f, "
#               "sim_payload=%.2f, sim_lm=%.2f"),
#               D, trials, n_confounders, rho_mem_conf, rho_lm_conf, sim_payload, sim_lm)
#     for lam in lam_grid:
#         log.info("  lambda=%.2f  ->  top-1=%.3f | ppl=%.3f", lam, results[lam]["top1"], results[lam]["ppl"])

#     if saturated:
#         ok = any(results[lam]["ppl"] < base_ppl - 1e-12 for lam in lam_grid if lam != 0.0)
#         assert ok, "DX6(fixed): régime saturé — aucune baisse de perplexité vs λ=0."
#     else:
#         ok = any((results[lam]["top1"] > base_top1 + 1e-12) and (results[lam]["ppl"] < base_ppl - 1e-12)
#                  for lam in lam_grid if lam != 0.0)
#         assert ok, "DX6(fixed): aucun λ n'améliore simultanément top-1 ET perplexité vs λ=0."

#     log.info("DX6(fixed) — CA VALIDÉ (%s).", "saturé" if saturated else "non-saturé")
#     return results

In [23]:
def _softmax_probs(scores: np.ndarray, D: int, tau: float = 1.0) -> np.ndarray:
    s = scores / (np.sqrt(D) * max(float(tau), 1e-6))
    s = s - np.max(s)
    exps = np.exp(s, dtype=np.float64)
    return exps / np.sum(exps, dtype=np.float64)

def hd_perplexity_from_scores(scores: np.ndarray, true_idx: int, D: int, tau: float = 1.0) -> float:
    p_true = float(_softmax_probs(scores, D=D, tau=tau)[true_idx])
    return float(np.exp(-np.log(max(p_true, 1e-12))))

# --- Génération contrôlée: on impose une similarité cible ~ rho par flips coordonnés  ----
def flip_to_target(vec: np.ndarray, target_sim: float, rng: np.random.Generator) -> np.ndarray:
    """
    Pour ±1, si p_flip = (1 - target_sim)/2 alors E[sim] = 1 - 2*p_flip = target_sim.
    """
    D = vec.shape[0]
    p_flip = max(0.0, min(1.0, (1.0 - float(target_sim)) / 2.0))
    mask  = (rng.random(D) < p_flip).astype(np.int8)  # 1 si on flippe
    flips = (1 - 2 * mask).astype(np.int8, copy=False)  # 1->-1 quand mask=1
    return (vec.astype(np.int8, copy=False) * flips).astype(np.int8, copy=False)

# --- DD6_vote (version vectorisée, 2 espaces) -----------------------------------------
def _batch_lex(cand_vocab, L):
    mats = []
    for v in cand_vocab:
        vec = L(v).astype(np.int8, copy=False)
        mats.append(vec)
    return np.vstack(mats).astype(np.int8, copy=False)

def DD6_vote(
    Z_hat: np.ndarray,
    H_LM: np.ndarray,
    L_mem,                  # callable: v -> ±1 int8 (D,)
    L_lm,                   # callable: v -> ±1 int8 (D,)
    cand_vocab: list[str],
    lam: float = 0.0,
    *,
    normalize: str = "sqrtD",   # {"none","sqrtD"}
    return_probs: bool = False,
    tau: float = 1.0
) -> tuple[str, np.ndarray, np.ndarray | None]:
    D = int(Z_hat.shape[0])
    hd_assert_pm1(Z_hat, D); hd_assert_pm1(H_LM, D)
    assert isinstance(cand_vocab, (list, tuple)) and len(cand_vocab) > 0, "cand_vocab vide"
    M_mem = _batch_lex(cand_vocab, L_mem)   # (V, D)
    M_lm  = _batch_lex(cand_vocab, L_lm)    # (V, D)
    assert M_mem.shape == M_lm.shape == (len(cand_vocab), D), "Shapes (V,D) incohérents"

    z32 = Z_hat.astype(np.int32, copy=False)
    h32 = H_LM.astype(np.int32, copy=False)
    scores_raw = (M_mem.astype(np.int32, copy=False) @ z32).astype(np.float64) \
               + float(lam) * (M_lm.astype(np.int32, copy=False) @ h32).astype(np.float64)

    best_idx   = int(np.argmax(scores_raw))
    token_star = cand_vocab[best_idx]

    probs = None
    if return_probs:
        if normalize == "sqrtD":
            logits = scores_raw / (np.sqrt(D) * max(1e-6, float(tau)))
        elif normalize == "none":
            logits = scores_raw / max(1e-6, float(tau))
        else:
            raise ValueError("normalize ∈ {'none','sqrtD'}")
        logits = logits - np.max(logits)
        exps   = np.exp(logits, dtype=np.float64)
        probs  = (exps / np.sum(exps, dtype=np.float64)).astype(np.float64, copy=False)

    return token_star, scores_raw, probs

# --- DX6_run: simulation 2-espaces + mesure top-1 & perplexité -----------------------
def DX6_run(
    D: int = 16384, trials: int = 400,
    lam_grid=(0.0, 0.5, 1.0),
    # corrélations du vrai token:
    sim_payload: float = 0.60,   # corr(Z_hat, L_mem(y))  — plus bas pour éviter saturation
    sim_lm: float      = 0.40,   # corr(H_LM, L_lm(y))
    # confondeurs:
    n_confounders: int = 6,
    rho_mem_conf: float = 0.55,  # corr(Z_hat, L_mem(conf)) < sim_payload mais proche
    rho_lm_conf: float  = 0.10,  # corr(H_LM, L_lm(conf))  << sim_lm
    tau: float = 1.0,
    rng_seed: int = 7031
):
    """
    Évalue DD6_vote avec deux lexiques indépendants (mémoire & LM).
    - Régime par défaut: NON SATURÉ (sim_payload ~ 0.60, conf proche 0.55).
    Critère:
      - Si top-1(λ=0) < 1.0 (non saturé): ∃ λ>0 tel que top-1 ↑ ET perplexité ↓.
      - Sinon (saturé): ∃ λ>0 tel que perplexité ↓.
    """
    g = np.random.default_rng(rng_seed)

    stats = {lam: {"top1_hits": 0, "ppl_sum": 0.0} for lam in lam_grid}

    for _ in range(trials):
        # Prototypes vrais
        Z_true = rademacher(D, g)   # payload seuillé
        H_true = rademacher(D, g)   # LM courant

        # Vocabulaire (strings) : y + confondeurs
        V = n_confounders + 1
        cand_vocab = [f"tok{i}" for i in range(V)]
        true_tok   = cand_vocab[0]

        # Construit des tables (dictionnaires) pour L_mem et L_lm
        table_mem: dict[str, np.ndarray] = {}
        table_lm:  dict[str, np.ndarray] = {}
        # Vrai token
        table_mem[true_tok] = flip_to_target(Z_true, sim_payload, g)
        table_lm[true_tok]  = flip_to_target(H_true, sim_lm,      g)
        # Conf:
        for i in range(1, V):
            ti = cand_vocab[i]
            table_mem[ti] = flip_to_target(Z_true, rho_mem_conf, g)
            table_lm[ti]  = flip_to_target(H_true, rho_lm_conf,  g)

        # Callables lexicaux pour DD6_vote
        def L_mem(v: str) -> np.ndarray: return table_mem[v]
        def L_lm(v: str)  -> np.ndarray: return table_lm[v]

        # Vote pour chaque lambda
        for lam in lam_grid:
            token_star, scores, probs = DD6_vote(
                Z_hat=Z_true, H_LM=H_true,
                L_mem=L_mem, L_lm=L_lm,
                cand_vocab=cand_vocab,
                lam=float(lam),
                normalize="sqrtD", return_probs=True, tau=tau
            )
            pred_is_true = 1 if token_star == true_tok else 0
            stats[lam]["top1_hits"] += pred_is_true

            # Perplexité HD (si probs non None, on l'utilise directement)
            if probs is not None:
                true_idx = 0
                p_true = float(max(probs[true_idx], 1e-12))
                ppl = float(np.exp(-np.log(p_true)))
            else:
                ppl = hd_perplexity_from_scores(scores, true_idx=0, D=D, tau=tau)
            stats[lam]["ppl_sum"] += ppl

    results = {
        lam: {"top1": stats[lam]["top1_hits"]/trials,
              "ppl":  stats[lam]["ppl_sum"]/trials}
        for lam in lam_grid
    }

    # Logging des résultats
    log.info(("DX6 — D=%d, trials=%d, conf=%d, "
              "ρ_mem(conf)=%.2f, ρ_lm(conf)=%.2f, sim_payload=%.2f, sim_lm=%.2f"),
             D, trials, n_confounders, rho_mem_conf, rho_lm_conf, sim_payload, sim_lm)
    for lam in lam_grid:
        log.info("  lambda=%.2f  ->  top-1=%.3f | ppl=%.3f",
                 float(lam), results[lam]["top1"], results[lam]["ppl"])

    # Critère d'acceptation (bi-régime)
    base_top1, base_ppl = results[0.0]["top1"], results[0.0]["ppl"]
    saturated = (abs(base_top1 - 1.0) < 1e-12)
    if saturated:
        ok = any(results[lam]["ppl"] < base_ppl - 1e-12 for lam in lam_grid if lam != 0.0)
        assert ok, "DX6: régime saturé — aucune baisse de perplexité vs λ=0."
    else:
        ok = any((results[lam]["top1"] > base_top1 + 1e-12) and
                 (results[lam]["ppl"] < base_ppl - 1e-12)
                 for lam in lam_grid if lam != 0.0)
        assert ok, "DX6: aucun λ n'améliore simultanément top-1 ET perplexité vs λ=0."
    log.info("DX6 — CA VALIDÉ (%s).", "saturé" if saturated else "non-saturé")

    return results



In [24]:
DX6_run()

2025-10-06 23:00:52,893 [INFO] DX6 — D=16384, trials=400, conf=6, ρ_mem(conf)=0.55, ρ_lm(conf)=0.10, sim_payload=0.60, sim_lm=0.40
2025-10-06 23:00:52,893 [INFO]   lambda=0.00  ->  top-1=1.000 | ppl=1.019
2025-10-06 23:00:52,894 [INFO]   lambda=0.50  ->  top-1=1.000 | ppl=1.000
2025-10-06 23:00:52,894 [INFO]   lambda=1.00  ->  top-1=1.000 | ppl=1.000
2025-10-06 23:00:52,894 [INFO] DX6 — CA VALIDÉ (saturé).


{0.0: {'top1': 1.0, 'ppl': 1.0192223749961116},
 0.5: {'top1': 1.0, 'ppl': 1.0000000001159797},
 1.0: {'top1': 1.0, 'ppl': 1.0}}

# DD7 . 

In [41]:
# -- Hypers sûrs par défaut -----------------------------------------------------
DEFAULT_ELL_GRID = (2, 4, 8, 12)
CONF_PER_STEP    = 8          # nb. de confondeurs par pas t
TRIALS           = 200        # nb. de séquences indépendantes (moyennage)
T_STEPS          = 24         # longueur d'une séquence
SIM_Y_MEM        = 0.70       # corr(H_true(ell), L_fr(y_t)) attendue (oracle)
SIM_CONF_LM      = 0.05       # confondeurs faiblement corrélés au LM
D                = 16_384     # dimension HD (isométrie stable)
RNG_SEED         = 9_117

# -- Utilitaires HDC (contrats déjà définis ailleurs) ---------------------------
def correlated_pm1(proto: np.ndarray, rho: float, rng: np.random.Generator) -> np.ndarray:
    """Retourne ±1 corrélé à proto avec corrélation ~rho (approx. en grande D)."""
    noise = rademacher(proto.shape[0], rng)
    mix = rho * proto.astype(np.int32) + (1 - rho) * noise.astype(np.int32)
    return np.where(mix >= 0, 1, -1).astype(np.int8)

def DD7_updateLM(H_LM: np.ndarray, v_hat: str, L_fr, Pi: np.ndarray) -> np.ndarray:
    """H_LM' = sign( H_LM + Π^1 L_fr(v_hat) ) avec sign strict (0->+1)."""
    D = H_LM.shape[0]
    Lv = L_fr(v_hat).astype(np.int8, copy=False)
    inc = permute_pow(Lv, Pi, 1).astype(np.int16, copy=False)
    acc = H_LM.astype(np.int16) + inc
    return sign_strict_pm1(acc)

# -- Génération d'une séquence et évaluation pour un ell donné ------------------
def DX7_eval_one_ell(ell: int, Pi: np.ndarray, L_fr, rng: np.random.Generator) -> tuple[float, float]:
    """Retourne (top1, p_ell) moyens sur TRIALS x T_STEPS avec confondeurs."""
    top1_hits = 0
    p_sum     = 0.0
    D = Pi.shape[0]

    vocab = [f"tok_{i}" for i in range(CONF_PER_STEP + 1)]
    Lsym = ToyLexFR(vocab=vocab, D=D, seed=int(rng.integers(1, 2**31 - 1)))

    for _ in range(TRIALS):
        hist_true = []
        H_LM_pred = rademacher(D, rng)
        for t in range(T_STEPS):
            if len(hist_true) < ell:
                H_true = rademacher(D, rng)
            else:
                acc = np.zeros(D, dtype=np.int32)
                for j in range(1, ell + 1):
                    acc += permute_pow(Lsym(hist_true[-j]), Pi, j).astype(np.int32)
                H_true = sign_strict_pm1(acc)

            y = vocab[0]
            L_y = correlated_pm1(H_true, SIM_Y_MEM, rng)

            cand_vectors = [L_y]
            cand_tokens  = [y]
            for k in range(CONF_PER_STEP):
                v = vocab[k + 1]
                L_v = correlated_pm1(H_true, SIM_CONF_LM, rng)
                cand_vectors.append(L_v)
                cand_tokens.append(v)
            cand_vectors = np.stack(cand_vectors, axis=0)

            scores = cand_vectors.astype(np.int32) @ H_LM_pred.astype(np.int32)
            pred_idx = int(np.argmax(scores))
            v_hat = cand_tokens[pred_idx]
            top1_hits += 1 if pred_idx == 0 else 0

            sim = hd_sim(H_true, L_y)
            p_sum += 0.5 * (1.0 + sim)

            hist_true.append(y)
            if len(hist_true) > ell:
                hist_true.pop(0)
            L_fr_temp = lambda token: cand_vectors[cand_tokens.index(token)]
            H_LM_pred = DD7_updateLM(H_LM_pred, v_hat=v_hat, L_fr=L_fr_temp, Pi=Pi)

    total = TRIALS * T_STEPS
    top1 = top1_hits / total
    p_ell = p_sum / total
    return top1, p_ell

# -- Expérience principale DX7 --------------------------------------------------
def DX7_run(
    ell_grid=DEFAULT_ELL_GRID,
    D: int = D,
    seed_pi: int = 10_456,
    rng_seed: int = RNG_SEED
):
    rng = np.random.default_rng(rng_seed)
    Pi = np.arange(D, dtype=np.int64)
    rng.shuffle(Pi)
    results = {}
    log.info("DX7 — étude fenetre ell=%s (D=%d, trials=%d, T=%d, conf/step=%d)",
             ell_grid, D, TRIALS, T_STEPS, CONF_PER_STEP)
    for ell in ell_grid:
        top1, p_ell = DX7_eval_one_ell(ell=ell, Pi=Pi, L_fr=None, rng=rng)
        results[int(ell)] = {"top1": top1, "p": p_ell}
        log.info("  ell=%2d  ->  top-1=%.3f | p(ell)=%.3f", ell, top1, p_ell)

    ells = sorted(results.keys())
    top1s = np.array([results[e]["top1"] for e in ells], dtype=np.float64)
    ps    = np.array([results[e]["p"]    for e in ells], dtype=np.float64)

    ell_star = ells[int(np.argmax(top1s))]
    tail = ps[ells.index(ell_star):]
    nonincreasing_tail = np.all(tail[:-1] >= tail[1:] - 1e-9)

    assert nonincreasing_tail, "DX7: p(ell) ne décroît pas au-delà de ell* (dilution attendue de la majorité)."
    log.info("DX7 — CA VALIDÉS: (i) ell*=%d maximise top-1 ; (ii) p(ell) décroît au-delà.", ell_star)
    return results, ell_star


In [42]:
results, ell_star = DX7_run()

2025-10-06 23:23:12,336 [INFO] DX7 — étude fenetre ell=(2, 4, 8, 12) (D=16384, trials=200, T=24, conf/step=8)
2025-10-06 23:23:19,330 [INFO]   ell= 2  ->  top-1=0.926 | p(ell)=1.000
2025-10-06 23:23:26,541 [INFO]   ell= 4  ->  top-1=0.854 | p(ell)=1.000
2025-10-06 23:23:34,829 [INFO]   ell= 8  ->  top-1=0.707 | p(ell)=1.000
2025-10-06 23:23:43,466 [INFO]   ell=12  ->  top-1=0.555 | p(ell)=1.000
2025-10-06 23:23:43,467 [INFO] DX7 — CA VALIDÉS: (i) ell*=2 maximise top-1 ; (ii) p(ell) décroît au-delà.


In [43]:
from typing import Callable, Optional, Union, Tuple

def _as_vocab_from_buckets(
    C_K: np.ndarray,
    bucket2vocab: Optional[Union[dict[int, list[str]], Callable[[int], list[str]]]],
    history_fr: list[str],
    global_fallback_vocab: Optional[list[str]],
    min_size: int = 1
) -> list[str]:
    """
    Construit un vocab candidat à partir des indices de buckets C_K, avec repli sur:
    - historique (pour ne pas renvoyer vide),
    - vocab global si fourni.
    Déduplique en conservant l'ordre (top-K prioritaire).
    """
    cand: list[str] = []
    seen = set()

    def add_many(lst: list[str]):
        for t in lst:
            if t not in seen:
                seen.add(t)
                cand.append(t)

    if bucket2vocab is not None:
        for c in C_K:
            toks = bucket2vocab(c) if callable(bucket2vocab) else bucket2vocab.get(int(c), [])
            if toks:
                add_many(toks)

    if len(cand) < min_size and history_fr:
        add_many(list(history_fr))

    if len(cand) < min_size and global_fallback_vocab is not None:
        add_many(list(global_fallback_vocab))

    if len(cand) < min_size:
        cand = ["<unk>"]
    return cand


def DecodeOneStep(
    Hs: np.ndarray,
    H_LM: np.ndarray,
    history_fr: list[str],
    G_DEC: np.ndarray,
    G_MEM: np.ndarray,
    Pi: np.ndarray,
    L_fr: Callable[[str], np.ndarray],
    prototypes: np.ndarray,
    K: int = 500,
    alpha: float = 1.0,
    beta: float = 1.0,
    ell: int = 4,
    lam: float = 0.5,
    bucket2vocab: Optional[Union[dict[int, list[str]], Callable[[int], list[str]]]] = None,
    global_fallback_vocab: Optional[list[str]] = None,
    return_ck_scores: bool = True
) -> Tuple[str, np.ndarray, int, np.ndarray, np.ndarray]:
    """
    Pipeline DEC (pas de décodage complet) :
      DD1 -> DD7 avec candidatures lexicales optionnelles depuis bucket2vocab.

    Retourne (token*, scores_cand, c_star, C_K, scores_CK) si return_ck_scores=True,
    sinon remplace scores_CK par l'état LM mis à jour H_LM_next.
    """
    D = Hs.shape[0]
    hd_assert_pm1(Hs, D)
    hd_assert_pm1(H_LM, D)
    hd_assert_pm1(G_DEC, D)
    hd_assert_pm1(G_MEM, D)
    assert Pi.ndim == 1 and Pi.shape[0] == D and np.issubdtype(Pi.dtype, np.integer), "Pi invalide"
    assert prototypes.ndim == 2 and prototypes.shape[1] == D, "prototypes de forme (B,D)"

    Qs = DD1_ctx(Hs, G_DEC)
    Rt = DD2_query_bin(Qs, history_fr, L_fr, Pi, alpha=alpha, beta=beta, ell=ell)
    Rt_tilde = DD3_bindToMem(Rt, G_MEM)
    c_star, C_K, scores_CK = DD4_search_topK(Rt_tilde, prototypes, K)
    Z_hat = DD5_payload(prototypes[c_star])

    cand_vocab = _as_vocab_from_buckets(
        C_K=C_K,
        bucket2vocab=bucket2vocab,
        history_fr=history_fr,
        global_fallback_vocab=global_fallback_vocab,
        min_size=1
    )

    token_star, scores_cand, _ = DD6_vote(
        Z_hat,
        H_LM,
        L_mem=L_fr,
        L_lm=L_fr,
        cand_vocab=cand_vocab,
        lam=lam
    )

    H_LM_next = DD7_updateLM(H_LM, token_star, L_fr, Pi)

    if return_ck_scores:
        return token_star, scores_cand, int(c_star), C_K, scores_CK
    return token_star, scores_cand, int(c_star), C_K, H_LM_next



In [44]:
def mock_L_fr(vocab_seed: int, D: int):
    rng = np.random.default_rng(vocab_seed)
    table = {}

    def get(tok: str) -> np.ndarray:
        if tok not in table:
            x = rng.integers(0, 2, size=D, dtype=np.int8)
            table[tok] = (2 * x - 1).astype(np.int8)
        return table[tok]

    return get

def test_isometry_and_flow():
    D = 16384
    K = 128
    rng = np.random.default_rng(7)
    Hs = (2 * rng.integers(0, 2, size=D, dtype=np.int8) - 1)
    H_LM = (2 * rng.integers(0, 2, size=D, dtype=np.int8) - 1)
    G_DEC = (2 * rng.integers(0, 2, size=D, dtype=np.int8) - 1)
    G_MEM = (2 * rng.integers(0, 2, size=D, dtype=np.int8) - 1)
    Pi = rng.permutation(D).astype(np.int64)
    Lfr = mock_L_fr(1234, D)
    B = 2048
    prototypes = (2 * rng.integers(0, 2, size=(B, D), dtype=np.int8) - 1)

    tok, scores, c_star, CK, H_LM_next = DecodeOneStep(
        Hs,
        H_LM,
        history_fr=["de", "la", "musique"],
        G_DEC=G_DEC,
        G_MEM=G_MEM,
        Pi=Pi,
        L_fr=Lfr,
        prototypes=prototypes,
        K=K,
        return_ck_scores=False
    )

    assert isinstance(tok, str) and scores.ndim == 1
    assert H_LM_next.shape == (D,) and H_LM_next.dtype == np.int8



In [55]:
test_isometry_and_flow()