# explore_bis_v3

Notebook simplifié montrant comment utiliser les blocs **ENC** et **MEM**
exposés par la librairie `hdc_project.encoder`.

In [4]:
from pathlib import Path
import sys

# Make the project sources importable: `src` is looked up next to the parent
# of the current working directory (assumes the notebook is launched from a
# direct sub-folder of the project root — TODO confirm).
ROOT = Path.cwd().parent
SRC = ROOT / "src"
if str(SRC) not in sys.path:  # do not insert twice when the cell is re-run
    sys.path.insert(0, str(SRC))
print(f'Using src path: {SRC}')


Using src path: /Users/aymenmejri/Desktop/MyCode/experiments/hdc_v2/hdc_project/src


In [5]:
import numpy as np

from hdc_project.encoder import m4, pipeline as enc_pipeline
from hdc_project.encoder.mem import pipeline as mem_pipeline



## Chargement du sous-corpus OPUS

On réutilise `opus_load_subset` depuis la librairie pour récupérer un petit
sous-échantillon bilingue (EN/FR). En environnement hors-ligne, un jeu de
repli est utilisé pour que le notebook reste exécutable.



In [6]:
import numpy as np
from tqdm import tqdm

from hdc_project.encoder import m4, pipeline as enc_pipeline
from hdc_project.encoder.mem import pipeline as mem_pipeline

# ----------------------------
# 0) Load the OPUS data
# ----------------------------
# Any failure of the loader (the warning below assumes a download problem)
# switches to a four-sentence embedded EN/FR corpus so the cell still runs.
try:
    ens_raw, frs_raw = enc_pipeline.opus_load_subset(
        name="opus_books",
        config="en-fr",
        split="train",
        N=10_000,
        seed=2025,
    )
    print(f"OPUS subset loaded: {len(ens_raw)} pairs")
except Exception as exc:
    print("Warning: OPUS download failed, falling back to local toy corpus.")
    print(f"Original error: {exc}")
    ens_raw = [
        "hyperdimensional computing is fun",
        "vector symbolic architectures are powerful",
        "encoding words into hyperspace",
        "memory augmented networks love clean data",
    ]
    frs_raw = [
        "le calcul hyperdimensionnel est amusant",
        "les architectures symboliques vectorielles sont puissantes",
        "encoder des mots dans l'hyperspace",
        "les réseaux augmentés de mémoire aiment les données propres",
    ]

# Cap the number of sentence pairs that get encoded.
# NOTE(review): mem_sample_size is only printed; nothing in this cell uses it
# to limit the MEM training pairs.
enc_sample_size = min(10_000, len(ens_raw))
mem_sample_size = min(10_000, len(ens_raw))
ens_sample = ens_raw[:enc_sample_size]
frs_sample = frs_raw[:enc_sample_size]
print(f"ENC sample size: {enc_sample_size}")
print(f"MEM sample size: {mem_sample_size}")

# ----------------------------
# 1) ENC encoding (M5–M7)
# ----------------------------
D = 8192  # hypervector dimension
n = 5     # passed to encode_corpus_ENC as `n` (presumably the n-gram order — verify)
rng = np.random.default_rng(123)

# One lexicon per language, distinguished only by the seed.
# NOTE(review): the FR lexicon is also built with M4_LexEN_new — confirm that
# no dedicated FR constructor is expected here.
Lex_en = m4.M4_LexEN_new(seed=1, D=D)
Lex_fr = m4.M4_LexEN_new(seed=2, D=D)
# A single random permutation of range(D), shared by both languages.
pi = rng.permutation(D).astype(np.int64)

encoded_en = enc_pipeline.encode_corpus_ENC(ens_sample, Lex_en, pi, D, n, seg_seed0=999)
encoded_fr = enc_pipeline.encode_corpus_ENC(frs_sample, Lex_fr, pi, D, n, seg_seed0=1999)

# Per-segment fields read from the encoder output: "E_seq" and "H"
# (H is the per-sentence signature whose shape is printed below).
E_list_en = [segment["E_seq"] for segment in encoded_en]
H_list_en = [segment["H"] for segment in encoded_en]
print(f"Encoded {len(encoded_en)} sentences; signature shape = {H_list_en[0].shape}")

# A few ENC statistics (computed on the EN side only)
s_intra, s_inter = enc_pipeline.intra_inter_ngram_sims(E_list_en, D)
inter_seg = enc_pipeline.inter_segment_similarity(H_list_en)
maj_curves = enc_pipeline.majority_error_curve(E_list_en, pi, D, eta_list=(0.0, 0.05))
print(f"intra={s_intra:.4f}, inter(abs)={s_inter:.4f}, inter segments={inter_seg:.4f}")
print("majority curve (eta=0):", maj_curves[0.0][:2])

# -------------------------------------------------------------
# 2) Helpers de "contenu" (sans K_s) pour fabriquer les paires
#    -> on somme des X_t (déjà alignés par Pi^Δ), puis on seuille
# -------------------------------------------------------------
def content_signature_from_Xseq(X_seq, majority: str = "strict"):
    """Bundle a sequence of ±1 vectors into a single ±1 content signature.

    The vectors are summed coordinate-wise in int32 and the sum is thresholded.
    The two modes differ only on coordinates whose sum is exactly 0:

    * ``"strict"``   -- ties map to +1.
    * ``"unbiased"`` -- ties map to +1 on even coordinates and -1 on odd ones,
      a deterministic rule that does not push every tie towards +1.
      (Previously this branch was a byte-for-byte copy of ``"strict"``.)

    Args:
        X_seq: non-empty sequence of 1-D integer arrays of equal length.
        majority: ``"strict"`` or ``"unbiased"``.

    Returns:
        int8 array with entries in {-1, +1}, same length as the inputs.

    Raises:
        ValueError: if ``X_seq`` is empty or ``majority`` is not recognised.
    """
    # len() rather than truthiness so that an ndarray of vectors is accepted too.
    if len(X_seq) == 0:
        raise ValueError("X_seq vide")
    # Validate before doing the (potentially large) accumulation.
    if majority not in ("strict", "unbiased"):
        raise ValueError("majority must be 'strict' or 'unbiased'")

    total = np.zeros((X_seq[0].shape[0],), dtype=np.int32)
    for x in X_seq:
        total += x.astype(np.int32, copy=False)

    signature = np.where(total >= 0, 1, -1).astype(np.int8, copy=False)
    if majority == "unbiased":
        tied = np.flatnonzero(total == 0)
        signature[tied[tied % 2 == 1]] = -1  # odd-indexed ties go to -1
    return signature

def span_signatures_from_trace(X_seq, win: int = 12, stride: int = 6, majority: str = "unbiased"):
    """Cut a trace into sliding windows and return one content signature per window.

    Args:
        X_seq: sequence of per-position vectors (may be empty).
        win: window length, in positions.
        stride: step between window starts; values below 1 are treated as 1.
        majority: forwarded to ``content_signature_from_Xseq``.

    Returns:
        A list of signatures: empty for an empty trace, a single signature
        covering the whole trace when it has at most ``win`` positions,
        otherwise one signature per full window (a trailing partial window is
        not emitted).
    """
    if not X_seq:
        return []

    length = len(X_seq)
    if length <= win:
        return [content_signature_from_Xseq(X_seq, majority)]

    step = max(1, stride)
    return [
        content_signature_from_Xseq(X_seq[begin:begin + win], majority)
        for begin in range(0, length - win + 1, step)
    ]

def build_mem_pairs_from_encoded(encoded_en, encoded_fr, win=8, stride=4, majority="strict", max_pairs=None):
    """Build (EN span, FR span) training pairs from two encoded corpora.

    Sentences are matched by position; within a sentence pair the t-th EN span
    is paired with the t-th FR span, and surplus spans on the longer side are
    dropped.

    Args:
        encoded_en, encoded_fr: sequences of dicts exposing an ``"X_seq"`` entry.
        win, stride, majority: forwarded to ``span_signatures_from_trace``.
        max_pairs: optional cap; checked after each append, so the list is
            returned as soon as it holds at least ``max_pairs`` pairs.

    Returns:
        List of ``(span_en, span_fr)`` tuples of int8 arrays.
    """
    collected = []
    for seg_en, seg_fr in zip(encoded_en, encoded_fr):
        trace_en = seg_en["X_seq"]
        trace_fr = seg_fr["X_seq"]
        sig_en = span_signatures_from_trace(trace_en, win=win, stride=stride, majority=majority)
        sig_fr = span_signatures_from_trace(trace_fr, win=win, stride=stride, majority=majority)
        for z_en, z_fr in zip(sig_en, sig_fr):
            pair = (z_en.astype(np.int8, copy=False), z_fr.astype(np.int8, copy=False))
            collected.append(pair)
            if max_pairs is not None and len(collected) >= max_pairs:
                return collected
    return collected

# -------------------------------------------------------------
# 3) MEM pairs = EN/FR spans (content only, without K_s)
# -------------------------------------------------------------
pairs_mem = build_mem_pairs_from_encoded(encoded_en, encoded_fr, win=8, stride=4, majority="strict")
print(f"Pairs available for MEM training: {len(pairs_mem)}")

# -------------------------------------------------------------
# 4) MEM instantiation and one-pass training
#    NOTE(review): the original note here read "k ≈ log2(B) + margin; here
#    B=256, k=24", but the values actually used below are B=128 and k=16.
# -------------------------------------------------------------
MEM_K = 16
MEM_BUCKETS = 128
cfg = mem_pipeline.MemConfig(D=D, B=MEM_BUCKETS, k=MEM_K, seed_lsh=10, seed_gmem=11)
comp = mem_pipeline.make_mem_pipeline(cfg)
mem_pipeline.train_one_pass_MEM(comp, pairs_mem)
print("Training complete; few bucket counts:", comp.mem.n[:64])

# -------------------------------------------------------------
# 5) Probe: query with the EN span Z_en and compare the selected
#    prototype with the matching FR span Z_fr.
#    The probes are the first training pairs, so this is not a held-out score.
# -------------------------------------------------------------
probe_count = min(200, len(pairs_mem))
sim_values = []
for Z_en_vec, Z_fr_vec in tqdm(pairs_mem[:probe_count]):
    # `score` is returned by the lookup but not used afterwards.
    bucket_idx, score = mem_pipeline.infer_map_top1(comp, Z_en_vec)  # Z_en (span), not H_en
    prototype = comp.mem.H[bucket_idx].astype(np.int32, copy=False)
    # Dot product divided by D (int32 operands avoid int8 overflow).
    sim = float(np.dot(prototype, Z_fr_vec.astype(np.int32, copy=False)) / D)
    sim_values.append(sim)

print(f"Top-1 mean similarity over {probe_count} span-probes: {np.mean(sim_values):.4f}")
print(f"Top-1 median similarity: {np.median(sim_values):.4f}")

OPUS subset loaded: 10000 pairs
ENC sample size: 10000
MEM sample size: 10000


Processing: 100%|██████████| 10000/10000 [00:26<00:00, 381.24it/s]
Processing: 100%|██████████| 10000/10000 [00:26<00:00, 381.62it/s]


Encoded 10000 sentences; signature shape = (8192,)


100%|██████████| 10000/10000 [07:08<00:00, 23.35it/s]


intra=0.0009, inter(abs)=0.0253, inter segments=0.0089
majority curve (eta=0): [(1, 0.0), (2, 0.0)]
Pairs available for MEM training: 37968
Training complete; few bucket counts: [249 229 310 312 265 289 325 291 326 361 249 251 224 314 335 299 287 308
 255 302 318 347 296 319 249 297 259 312 271 270 345 281 287 257 305 318
 301 303 321 308 326 307 280 261 338 296 279 330 259 224 269 284 252 293
 394 297 332 343 256 333 336 246 293 266]


100%|██████████| 200/200 [00:00<00:00, 2307.57it/s]

Top-1 mean similarity over 200 span-probes: 0.2727
Top-1 median similarity: 0.2773





In [8]:
# Summary statistics of comp.mem.n (printed earlier as the bucket counts):
# mean, median, min, max, standard deviation, then the 90th/99th percentiles.
nb = comp.mem.n
print("pop mean/median/min/max/std:",
      float(nb.mean()), float(np.median(nb)), int(nb.min()), int(nb.max()), float(nb.std()))
print("p90/p99:", int(np.quantile(nb, 0.90)), int(np.quantile(nb, 0.99)))

pop mean/median/min/max/std: 296.625 293.0 216 477 39.670714954485
p90/p99: 345 394



> ℹ️ **Remarque pratique** : si le téléchargement OPUS échoue (exécution hors-ligne),
> le notebook bascule automatiquement sur un mini corpus embarqué afin de
> conserver une démonstration reproductible des blocs ENC et MEM.



# DEC

## DEC-0 : 

In [1]:
import numpy as np, logging
# Module-level logger for the DEC cells. basicConfig configures the root
# logger; it is only called when the "DEC" logger itself has no handler.
log = logging.getLogger("DEC")
if not log.handlers:
    logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")

def sign_strict_pm1(x: np.ndarray) -> np.ndarray:
    """Threshold ``x`` to int8 values in {-1, +1}, mapping 0 to +1."""
    return np.where(x >= 0, 1, -1).astype(np.int8, copy=False)

def hd_assert_pm1(x: np.ndarray, D: int | None = None) -> None:
    assert x.dtype == np.int8 and np.all((x == 1) | (x == -1)), "Vecteur non binaire ±1/int8"
    if D is not None:
        assert x.ndim == 1 and x.shape[0] == D, "Dimension inattendue"

def hd_bind(x: np.ndarray, key: np.ndarray) -> np.ndarray:
    """Bind ``x`` with ``key``: element-wise product, computed and returned in int8."""
    left = x.astype(np.int8, copy=False)
    right = key.astype(np.int8, copy=False)
    product = left * right
    return product.astype(np.int8, copy=False)

def hd_sim(x: np.ndarray, y: np.ndarray) -> float:
    """Return ``<x, y> / D`` where ``D = x.shape[0]`` (cosine similarity for ±1 vectors).

    Both operands must have the same shape (asserted).
    """
    assert x.shape == y.shape
    dim = x.shape[0]
    # int32 operands: an int8 dot product would overflow.
    dot = np.matmul(x.astype(np.int32), y.astype(np.int32))
    return float(dot / dim)

def permute_pow(x: np.ndarray, pi: np.ndarray, power: int) -> np.ndarray:
    """Apply the permutation ``pi`` to ``x`` ``power`` times.

    One application is ``x[pi]``; the result satisfies
    ``out[i] = x[pi^power(i)]``. Negative powers use the inverse permutation,
    so ``permute_pow(permute_pow(x, pi, k), pi, -k)`` gives back ``x``.

    The exponent is used as-is. The previous ``power % D`` reduction was only
    valid for a permutation whose order divides ``D`` (e.g. a cyclic shift)
    and returned wrong results for general permutations.

    Args:
        x: 1-D array to permute.
        pi: 1-D integer index array, a permutation of ``range(len(x))``.
        power: signed number of applications.

    Returns:
        A new array with the same dtype as ``x``.
    """
    n = x.shape[0]
    if power < 0:
        # Inverse permutation: step[pi[i]] = i.
        step = np.empty_like(pi)
        step[pi] = np.arange(n, dtype=pi.dtype)
    else:
        step = pi

    remaining = abs(int(power))
    idx = np.arange(n, dtype=np.int64)
    # Exponentiation by squaring: O(log |power|) index compositions.
    while remaining:
        if remaining & 1:
            idx = step[idx]
        remaining >>= 1
        if remaining:
            step = step[step]
    return x[idx]

In [2]:
import numpy as np

# ---------------------------
# Helpers (cf. DD0 du tutoriel)
# ---------------------------
def hd_assert_pm1(x: np.ndarray, D: int | None = None) -> None:
    assert x.dtype == np.int8 and np.all((x == 1) | (x == -1)), "expect ±1/int8"
    if D is not None:
        assert x.ndim == 1 and x.shape[0] == D, "wrong shape"

def hd_bind(x: np.ndarray, key: np.ndarray) -> np.ndarray:
    """Element-wise product of ``x`` and ``key``, both cast to int8; result is int8."""
    bound = np.multiply(x.astype(np.int8, copy=False), key.astype(np.int8, copy=False))
    return bound.astype(np.int8, copy=False)

def hd_sim(x: np.ndarray, y: np.ndarray) -> float:
    """Dot product of ``x`` and ``y`` divided by ``x.shape[0]``; shapes must match."""
    assert x.shape == y.shape
    # Accumulate in int32 (int8 would overflow); the result is a Python float.
    x32, y32 = x.astype(np.int32), y.astype(np.int32)
    return float((x32 @ y32) / len(x))

def pm1(shape, rng) -> np.ndarray:
    """Draw an int8 array of the given shape with entries in {-1, +1} from ``rng``."""
    bits = rng.integers(0, 2, size=shape, dtype=np.int8)  # 0/1 draws
    signs = (bits << 1) - 1                               # 0 -> -1, 1 -> +1
    return signs.astype(np.int8, copy=False)

# ---------------------------
# DX0: tests
# ---------------------------
def dx0_sanity(D: int = 16_384, N_sim: int = 1_000, seed: int = 2024, tol: float = 5e-3) -> None:
    """
    Sanity-check the basic ±1 helpers on N_sim random triples (x, y, k).

    Checks:
      1) hd_sim(x,x)=1 and hd_sim(x,-x)=-1 (within tol)
      2) Similarity is invariant under binding: sim(x,y)=sim(x⊗k, y⊗k)
      3) The relative norm (||x||_2/√D = 1) is preserved before/after binding
    Acceptance criterion: every maximum absolute deviation is ≤ tol
    (default 5e-3). Prints a report, then raises AssertionError on failure.
    """
    rng = np.random.default_rng(seed)

    # Worst-case deviations observed over all trials.
    max_err_self = 0.0
    max_err_neg  = 0.0
    max_err_bind = 0.0
    max_err_norm = 0.0

    for _ in range(N_sim):
        # Draw order (x, y, k) fixes the random stream for a given seed.
        x = pm1(D, rng); y = pm1(D, rng); k = pm1(D, rng)
        hd_assert_pm1(x, D); hd_assert_pm1(y, D); hd_assert_pm1(k, D)

        # (1) Similarity identities
        s_xx = hd_sim(x, x)
        s_xnx = hd_sim(x, (-x).astype(np.int8, copy=False))

        max_err_self = max(max_err_self, abs(s_xx - 1.0))
        max_err_neg  = max(max_err_neg,  abs(s_xnx + 1.0))

        # (2) Invariance under binding (DEC1)
        s_xy      = hd_sim(x, y)
        xk, yk    = hd_bind(x, k), hd_bind(y, k)
        s_xy_bind = hd_sim(xk, yk)
        max_err_bind = max(max_err_bind, abs(s_xy - s_xy_bind))

        # (3) Norms (before/after binding)
        # For ±1 vectors, ||x||_2 = sqrt(D); the relative normalisation is checked.
        norm_x  = np.linalg.norm(x.astype(np.float64)) / np.sqrt(D)
        norm_xk = np.linalg.norm(xk.astype(np.float64)) / np.sqrt(D)
        max_err_norm = max(max_err_norm, abs(norm_x - 1.0), abs(norm_xk - 1.0))

    # Report
    print("DX0 — Sanity checks (double précision)")
    print(f"  D={D}, N={N_sim}, tol={tol:.1e}")
    print(f"  max|sim(x,x)-1|         = {max_err_self:.3e}")
    print(f"  max|sim(x,-x)+1|        = {max_err_neg:.3e}")
    print(f"  max|sim(x,y)-sim(x⊗k,y⊗k)| = {max_err_bind:.3e}")
    print(f"  max| ||x||/√D - 1 | (incl. bind) = {max_err_norm:.3e}")

    # Acceptance assertions
    assert max_err_self <= tol,     "CA non satisfait: sim(x,x) s'écarte de 1"
    assert max_err_neg  <= tol,     "CA non satisfait: sim(x,-x) s'écarte de -1"
    assert max_err_bind <= tol,     "CA non satisfait: invariance de similarité après binding"
    assert max_err_norm <= tol,     "CA non satisfait: norme non préservée (relative)"


In [3]:
dx0_sanity()


DX0 — Sanity checks (double précision)
  D=16384, N=1000, tol=5.0e-03
  max|sim(x,x)-1|         = 0.000e+00
  max|sim(x,-x)+1|        = 0.000e+00
  max|sim(x,y)-sim(x⊗k,y⊗k)| = 0.000e+00
  max| ||x||/√D - 1 | (incl. bind) = 0.000e+00


## DD1 .


In [4]:
def DD1_ctx(Hs: np.ndarray, G_DEC: np.ndarray) -> np.ndarray:
    """
    Context query Q^(s) = H^(s) ⊗ G_DEC (int8 in, int8 out).

    Both operands must be ±1/int8, and G_DEC must be 1-D with the same length
    as Hs; violations raise AssertionError.
    """
    assert Hs.dtype == np.int8
    assert G_DEC.dtype == np.int8
    hd_assert_pm1(Hs)
    expected_dim = Hs.shape[0]
    hd_assert_pm1(G_DEC, expected_dim)
    query = hd_bind(Hs, G_DEC)
    return query

In [5]:
# --- DX1: tests détaillés ---
def dx1_test_DD1_ctx(D: int = 16_384, m: int = 64, trials: int = 200, seed: int = 1234, tol: float = 5e-3):
    """
    Test DD1_ctx on random ±1 vectors: similarity and relative norm are
    preserved by the binding, inputs are not mutated, outputs stay ±1/int8,
    and the Gram matrix of m vectors is unchanged. Prints a report and raises
    AssertionError when any maximum deviation exceeds tol.
    """
    rng = np.random.default_rng(seed)

    # 1) Similarity unchanged and norms preserved (over 'trials' pairs)
    max_err_sim = 0.0
    max_err_norm = 0.0
    for _ in range(trials):
        H1, H2, G = pm1(D, rng), pm1(D, rng), pm1(D, rng)
        # Copies used to verify that the inputs are not mutated
        H1_copy, H2_copy, G_copy = H1.copy(), H2.copy(), G.copy()

        Q1, Q2 = DD1_ctx(H1, G), DD1_ctx(H2, G)
        # Similarity before vs. after binding with the same key
        s0 = hd_sim(H1, H2)
        s1 = hd_sim(Q1, Q2)
        max_err_sim = max(max_err_sim, abs(s0 - s1))

        # Relative norms
        nH1  = np.linalg.norm(H1.astype(np.float64)) / np.sqrt(D)
        nQ1  = np.linalg.norm(Q1.astype(np.float64)) / np.sqrt(D)
        nH2  = np.linalg.norm(H2.astype(np.float64)) / np.sqrt(D)
        nQ2  = np.linalg.norm(Q2.astype(np.float64)) / np.sqrt(D)
        max_err_norm = max(max_err_norm, abs(nH1 - 1.0), abs(nQ1 - 1.0),
                                           abs(nH2 - 1.0), abs(nQ2 - 1.0))

        # Contracts: dtype & non-mutation
        assert Q1.dtype == np.int8 and Q2.dtype == np.int8
        assert np.all(H1 == H1_copy) and np.all(H2 == H2_copy) and np.all(G == G_copy), "mutation détectée"
        assert np.all((Q1 == 1) | (Q1 == -1)) and np.all((Q2 == 1) | (Q2 == -1)), "sortie hors ±1"

    # 2) Gram isometry (m vectors bound with one shared key)
    H = np.stack([pm1(D, rng) for _ in range(m)], axis=0)  # (m, D) ±1/int8
    G = pm1(D, rng)
    Q = np.stack([DD1_ctx(H[i], G) for i in range(m)], axis=0)

    # Gram matrices before/after; int32 products, compared in float64
    G0 = (H.astype(np.int32) @ H.astype(np.int32).T) / D
    G1 = (Q.astype(np.int32) @ Q.astype(np.int32).T) / D
    max_err_gram = float(np.max(np.abs(G0.astype(np.float64) - G1.astype(np.float64))))

    # --- Report ---
    print("DX1 — DD1_ctx (isométrie & contrats)")
    print(f"  D={D}, m={m}, trials={trials}, tol={tol:.1e}")
    print(f"  max|sim_before - sim_after|  = {max_err_sim:.3e}")
    print(f"  max| ||H||/√D - 1 | (incl. bind) = {max_err_norm:.3e}")
    print(f"  max|Gram_before - Gram_after| = {max_err_gram:.3e}")

    # --- Acceptance criteria ---
    assert max_err_sim  <= tol, "Invariance de similarité violée (DEC1)"
    assert max_err_norm <= tol, "Norme non préservée (relative)"
    assert max_err_gram <= tol, "Isométrie de Gram violée (DEC1)"

dx1_test_DD1_ctx()

DX1 — DD1_ctx (isométrie & contrats)
  D=16384, m=64, trials=200, tol=5.0e-03
  max|sim_before - sim_after|  = 0.000e+00
  max| ||H||/√D - 1 | (incl. bind) = 0.000e+00
  max|Gram_before - Gram_after| = 0.000e+00


# DD2 . 

In [None]:
import numpy as np
import logging
from typing import List, Tuple
from tqdm import tqdm

log = logging.getLogger("DEC.DX2.v2")

# -- utilitaires (identiques à DD0) --
def sign_strict_pm1(x: np.ndarray) -> np.ndarray:
    """Map ``x`` to int8 signs: +1 where ``x >= 0`` (zero included), -1 elsewhere."""
    signs = np.full(x.shape, -1, dtype=np.int8)
    signs[x >= 0] = 1
    return signs

def hd_assert_pm1(x: np.ndarray, D: int | None = None) -> None:
    assert x.dtype == np.int8 and np.all((x == 1) | (x == -1)), "±1/int8 attendu"
    if D is not None:
        assert x.ndim == 1 and x.shape[0] == D, "shape inattendu"

def permute_pow(x: np.ndarray, pi: np.ndarray, power: int) -> np.ndarray:
    """Return ``x`` permuted by ``pi`` applied ``power`` times (``out[i] = x[pi^power(i)]``).

    A single application is ``x[pi]``. Negative powers go through the inverse
    permutation. The exponent is no longer reduced modulo ``len(x)``: that
    shortcut is only correct when the order of ``pi`` divides ``len(x)`` and
    gave wrong answers for arbitrary permutations.

    Args:
        x: 1-D array.
        pi: 1-D integer index array, a permutation of ``range(len(x))``.
        power: signed exponent.

    Returns:
        A new array with the dtype of ``x``.
    """
    size = x.shape[0]
    base = pi
    if power < 0:
        base = np.empty_like(pi)
        base[pi] = np.arange(size, dtype=pi.dtype)  # inverse: base[pi[i]] = i

    idx = np.arange(size, dtype=np.int64)
    exponent = abs(int(power))
    # Square-and-multiply keeps large exponents cheap without precomputed tables.
    while exponent:
        if exponent & 1:
            idx = base[idx]
        exponent >>= 1
        if exponent:
            base = base[base]
    return x[idx]

def hd_sim(x: np.ndarray, y: np.ndarray) -> float:
    """Return ``<x, y> / x.shape[0]`` as a Python float (cosine similarity for ±1 vectors).

    The shape check matches the other ``hd_sim`` definitions in this notebook.
    Without it, broadcast-compatible but mismatched operands (e.g. ``(D,)``
    against ``(D, 1)``) were silently accepted.

    Raises:
        AssertionError: if ``x`` and ``y`` do not have the same shape.
    """
    assert x.shape == y.shape
    # int32 operands: an int8 dot product would overflow.
    return float((x.astype(np.int32) @ y.astype(np.int32)) / x.shape[0])



$$H_{\text{hist}} = \operatorname{sign}\Big(\sum_{j=1}^{\ell} \Pi^{j}\, L_{\mathrm{fr}}(\hat v_{t-j})\Big)$$


In [34]:
def build_perm_inverse(pi: np.ndarray) -> np.ndarray:
    """Return the inverse of the index permutation ``pi`` (``inv[pi[i]] == i``), same dtype as ``pi``."""
    assert isinstance(pi, np.ndarray) and pi.ndim == 1
    positions = np.arange(pi.shape[0], dtype=pi.dtype)
    inverse = np.empty_like(pi)
    np.put(inverse, pi, positions)  # scatter: inverse[pi[i]] = i
    return inverse

def permute_pow_signed(x: np.ndarray, pi: np.ndarray, pi_inv: np.ndarray, power: int) -> np.ndarray:
    r"""
    Apply Π^power to x, using Π^{-1} (``pi_inv``) for negative powers.

    One application of Π is ``x[pi]``. The result is always a fresh int8
    array, including for ``power == 0``; that case used to return the input
    object itself, un-cast and aliased, unlike every other power.

    - Cost is |power| index compositions, fine here since |power| ≤ ell
      (≤ 8 in DX2).
    - For large exponents, prefer a precomputed table of powers of pi.

    Args:
        x: 1-D array (±1 values expected).
        pi: permutation of ``range(len(x))`` as an index array.
        pi_inv: inverse of ``pi``.
        power: signed exponent.

    Raises:
        AssertionError: if ``x`` is not 1-D or the three lengths differ.
    """
    assert x.ndim == 1 and x.shape[0] == pi.shape[0] == pi_inv.shape[0]
    step = pi if power >= 0 else pi_inv
    idx = np.arange(x.shape[0], dtype=np.int64)
    for _ in range(abs(power)):
        idx = step[idx]
    # Fancy indexing copies, so the caller never gets a view of x.
    return x[idx].astype(np.int8, copy=False)

In [36]:
def DD2_query(Qs: np.ndarray,
              hist_tokens: list[np.ndarray],  # vectors L_fr(\hat v)
              pi: np.ndarray,
              alpha: float, beta: float,
              ell: int) -> np.ndarray:
    r"""R_t = α Qs + β * sign( sum_{j=1..ell} Π^j L_{t-j} ), rescaled to norm sqrt(D).

    ``hist_tokens[j]`` is shifted by Π^(j+1), i.e. index 0 plays the role of
    L_{t-1}. When fewer than ``ell`` history vectors are available, only the
    available ones are used; with no history at all the history term is
    dropped (instead of failing in ``np.stack`` or indexing past the list).

    Args:
        Qs: context vector of length D.
        hist_tokens: history vectors, each of length D.
        pi: permutation of ``range(D)`` as an index array.
        alpha, beta: weights of the context and history terms.
        ell: maximum history depth.

    Returns:
        float64 array of length D with Euclidean norm sqrt(D).

    Raises:
        ValueError: if the weighted combination is the zero vector, which
            cannot be normalised (this used to yield an all-NaN array).
    """
    D = Qs.shape[0]
    Rt = alpha * Qs.astype(np.float64)

    depth = min(int(ell), len(hist_tokens))
    if depth > 0:
        pi_inv = build_perm_inverse(pi)
        # Positional encoding of the history: token j gets Π^(j+1).
        shifted = [permute_pow_signed(hist_tokens[j], pi, pi_inv, j + 1) for j in range(depth)]
        votes = np.sum(np.stack(shifted, axis=0).astype(np.int16), axis=0)
        H_hist = np.where(votes >= 0, 1, -1).astype(np.int8)  # strict sign, 0 -> +1
        Rt = Rt + beta * H_hist.astype(np.float64)

    norm = np.linalg.norm(Rt)
    if norm == 0.0:
        raise ValueError("DD2_query: zero query vector, cannot normalise")
    Rt = Rt / norm * np.sqrt(D)
    return Rt.astype(np.float64)

def DX2_run():
    """Run the DX2 campaign for DD2_query and the positional permutation.

    For every (ell, alpha/beta) combination and 200 random trials it checks:
      (i)   the Gram matrix of the shifted history is unchanged when one extra
            permutation power is applied to every row;
      (ii)  the pairwise shift identity <Π^i L_i, Π^k L_k> = <L_i, Π^{k-i} L_k>;
      (iii) norm(R_t)/sqrt(D) stays within [0.9, 1.1].
    Logs the norm statistics and raises AssertionError if a check failed.
    """
    D, trials = 16384, 200
    ells = (2,4,8); ratios = (1/3, 1.0, 3.0)
    g = np.random.default_rng(2025)
    # Random permutation of range(D) and its inverse.
    pi = np.arange(D, dtype=np.int64); g.shuffle(pi)
    pi_inv = build_perm_inverse(pi)

    def rand_pm1():
        # One random ±1/int8 vector of length D drawn from the shared generator.
        r = g.integers(0, 2, size=D, dtype=np.int8)
        return ((r << 1) - 1).astype(np.int8, copy=False)

    def sim(a,b): return float((a.astype(np.int32) @ b.astype(np.int32)) / D)

    norms = {}
    gram_uniform_ok = True
    pair_shift_ok = True

    for ell in tqdm(ells):
        for r in ratios:
            alpha, beta = r, 1.0
            vals = []
            for _ in range(trials):
                Qs   = rand_pm1()
                hist = [rand_pm1() for _ in range(ell)]
                # Row j holds Π^(j+1) applied to hist[j].
                P = np.stack([permute_pow_signed(hist[j], pi, pi_inv, j+1) for j in range(ell)], axis=0)

                # (i) Gram invariance under a uniform permutation (same power s on all rows)
                s = int(g.integers(1, 7))
                P_uni = np.stack([permute_pow_signed(P[j], pi, pi_inv, s) for j in range(ell)], axis=0)
                G  = (P.astype(np.int32) @ P.T.astype(np.int32)) / D
                Gu = (P_uni.astype(np.int32) @ P_uni.T.astype(np.int32)) / D
                if not np.allclose(G, Gu, atol=5e-3, rtol=0):
                    gram_uniform_ok = False

                # (ii) pairwise identity: <Π^i Li, Π^k Lk> == <Li, Π^{k-i} Lk>
                # NOTE(review): `break` only leaves the inner k-loop; the i-loop
                # continues. The flag is sticky, so the verdict is unaffected.
                for i in range(ell):
                    for k in range(ell):
                        lhs = sim(P[i], P[k])
                        rhs = sim(hist[i], permute_pow_signed(hist[k], pi, pi_inv, (k+1)-(i+1)))
                        if abs(lhs - rhs) > 5e-3:
                            pair_shift_ok = False
                            break

                # (iii) norm of R_t
                Rt = DD2_query(Qs, hist, pi, alpha=alpha, beta=beta, ell=ell)
                vals.append(float(np.linalg.norm(Rt) / np.sqrt(D)))

            norms[(ell, r)] = (min(vals), float(np.median(vals)), max(vals))

    # Reporting (uses a function-local "DX2" logger)
    import logging
    log = logging.getLogger("DX2")
    if not log.handlers:
        logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
    log.info("DX2 — Norme(R_t)/sqrt(D) par (ell, alpha/beta): min | median | max")
    for (ell, r), (mn, md, mx) in sorted(norms.items()):
        log.info("  ell=%d, alpha/beta=%.3g  ->  %.3f | %.3f | %.3f", ell, r, mn, md, mx)

    # Acceptance criteria
    in_band = all(0.9 <= s <= 1.1 for stats in norms.values() for s in stats)
    assert in_band, "DX2: norme(R_t)/sqrt(D) hors bande [0.9,1.1]."
    assert gram_uniform_ok, "DX2: Gram NON invariant sous permutation uniforme."
    assert pair_shift_ok, "DX2: identité de décalage paire-à-paire violée."

In [37]:
DX2_run()

100%|██████████| 3/3 [00:06<00:00,  2.21s/it]
2025-10-06 10:41:53,673 [INFO] DX2 — Norme(R_t)/sqrt(D) par (ell, alpha/beta): min | median | max
2025-10-06 10:41:53,673 [INFO]   ell=2, alpha/beta=0.333  ->  1.000 | 1.000 | 1.000
2025-10-06 10:41:53,673 [INFO]   ell=2, alpha/beta=1  ->  1.000 | 1.000 | 1.000
2025-10-06 10:41:53,674 [INFO]   ell=2, alpha/beta=3  ->  1.000 | 1.000 | 1.000
2025-10-06 10:41:53,674 [INFO]   ell=4, alpha/beta=0.333  ->  1.000 | 1.000 | 1.000
2025-10-06 10:41:53,674 [INFO]   ell=4, alpha/beta=1  ->  1.000 | 1.000 | 1.000
2025-10-06 10:41:53,674 [INFO]   ell=4, alpha/beta=3  ->  1.000 | 1.000 | 1.000
2025-10-06 10:41:53,674 [INFO]   ell=8, alpha/beta=0.333  ->  1.000 | 1.000 | 1.000
2025-10-06 10:41:53,674 [INFO]   ell=8, alpha/beta=1  ->  1.000 | 1.000 | 1.000
2025-10-06 10:41:53,674 [INFO]   ell=8, alpha/beta=3  ->  1.000 | 1.000 | 1.000


# DD3 . 

In [40]:
def DD3_bindToMem(Rt: np.ndarray, G_MEM: np.ndarray) -> np.ndarray:
    """Return ~R_t = R_t ⊗ G_MEM (int8 in, int8 out).

    Both operands must be ±1/int8 and G_MEM must be 1-D with the same length
    as R_t; violations raise AssertionError.
    """
    hd_assert_pm1(Rt)
    expected_dim = Rt.shape[0]
    hd_assert_pm1(G_MEM, expected_dim)
    bound = hd_bind(Rt, G_MEM)
    return bound

def hd_sim_dot(x: np.ndarray, y: np.ndarray) -> int:
    """Exact integer dot product of ``x`` and ``y`` (no division, hence no rounding)."""
    lhs = x.astype(np.int32)  # widen first: int8 accumulation would overflow
    rhs = y.astype(np.int32)
    return int(np.matmul(lhs, rhs))

In [43]:
# --- test KS (asymptotique) sans dépendance externe ---
def ks_2samp_asymp(x: np.ndarray, y: np.ndarray) -> tuple[float, float]:
    """
    Two-sample Kolmogorov-Smirnov test with the asymptotic p-value.

    The statistic is the largest absolute gap between the two empirical CDFs,
    evaluated at every observed value. The p-value is the Kolmogorov series
    2 * sum_{k>=1} (-1)^(k-1) exp(-2 k^2 lam^2) with the small-sample
    correction lam = (en + 0.12 + 0.11/en) * D_stat, en = sqrt(n*m/(n+m)).

    For small lam the series converges too slowly for a 200-term truncation
    (large, nearly identical samples then got p-values well below 1). Below
    lam = 0.2 the survival function differs from 1 by less than 1e-12, so 1.0
    is returned directly.

    Args:
        x, y: samples (flattened and converted to float64).

    Returns:
        (D_stat, p_value), both Python floats; (0.0, 1.0) when the empirical
        distributions coincide.

    Raises:
        ValueError: if either sample is empty.
    """
    x_sorted = np.sort(np.asarray(x, dtype=np.float64).ravel())
    y_sorted = np.sort(np.asarray(y, dtype=np.float64).ravel())
    n, m = x_sorted.size, y_sorted.size
    if n == 0 or m == 0:
        raise ValueError("ks_2samp_asymp: both samples must be non-empty")

    # Empirical CDFs of both samples at every pooled observation.
    pooled = np.concatenate((x_sorted, y_sorted))
    cdf_x = np.searchsorted(x_sorted, pooled, side="right") / n
    cdf_y = np.searchsorted(y_sorted, pooled, side="right") / m
    D_stat = float(np.max(np.abs(cdf_x - cdf_y)))

    # Degenerate case: identical empirical distributions.
    if D_stat == 0.0:
        return 0.0, 1.0

    en = np.sqrt(n * m / (n + m))
    lam = float((en + 0.12 + 0.11 / en) * D_stat)
    if lam < 0.2:
        return D_stat, 1.0

    # For lam >= 0.2 the terms underflow long before k = 200.
    k = np.arange(1, 201, dtype=np.float64)
    signs = np.where(k % 2 == 1, 1.0, -1.0)
    series = float(np.sum(signs * np.exp(-2.0 * (k * lam) ** 2)))
    pval = min(1.0, max(0.0, 2.0 * series))
    return D_stat, pval

# --- campagne DX3 ---
def DX3_run(D: int = 16384, C: int = 500, T: int = 200, seed: int = 2025,
            rel_tol: float = 0.01, pmin: float = 0.10) -> None:
    """
    Check that scoring in the memory slice equals scoring after un-binding:
    < Rt⊗G_MEM , M_c > versus < Rt , M_c⊗G_MEM > for random ±1 vectors.

    D: dimension; C: number of memory prototypes; T: number of queries;
    rel_tol: threshold on the mean relative error; pmin: KS p-value threshold.
    Logs through the module-level `log` object (not defined in this function)
    and raises AssertionError when an acceptance criterion fails.
    """
    g = np.random.default_rng(seed)

    # Generates the key and the banks as ±1/int8
    def rand_pm1(size: int) -> np.ndarray:
        r = g.integers(0, 2, size=size, dtype=np.int8)
        return ((r << 1) - 1).astype(np.int8, copy=False)

    # Draw order (key, prototypes, queries) fixes the random stream for a seed.
    G_MEM = rand_pm1(D)
    M_bank = np.stack([rand_pm1(D) for _ in range(C)], axis=0)   # (C, D), int8
    Q_batch = np.stack([rand_pm1(D) for _ in range(T)], axis=0)  # (T, D), int8

    # Scores "inside the memory slice" vs "un-bound"
    #   S_mem[t,c]   = < Rt⊗G_MEM , M_c >
    #   S_unbd[t,c]  = < Rt , M_c⊗G_MEM >
    S_mem  = np.zeros((T, C), dtype=np.int32)
    S_unbd = np.zeros((T, C), dtype=np.int32)

    for t in range(T):
        Rt = Q_batch[t]
        Rt_mem = DD3_bindToMem(Rt, G_MEM)            # Rt ⊗ G_MEM
        for c in range(C):
            Mc = M_bank[c]
            S_mem[t, c]  = hd_sim_dot(Rt_mem, Mc)
            S_unbd[t, c] = hd_sim_dot(Rt, hd_bind(Mc, G_MEM))

    # a) Mean relative error (over all scores)
    A = S_mem.astype(np.float64).ravel()
    B = S_unbd.astype(np.float64).ravel()
    denom = np.maximum(1.0, np.abs(B))               # avoids division by 0
    rel_err = np.abs(A - B) / denom
    rel_err_mean = float(np.mean(rel_err))

    # b) KS test on the flattened score distributions
    D_stat, pval = ks_2samp_asymp(A, B)

    # Reporting
    log.info("DX3 — Invariance (dé)binding mémoire")
    log.info("  D=%d, C=%d, T=%d", D, C, T)
    log.info("  Erreur relative moyenne  = %.6f", rel_err_mean)
    log.info("  KS: D=%.6f, p=%.3f", D_stat, pval)

    # Acceptance criteria
    assert rel_err_mean <= rel_tol, f"DX3: erreur relative moyenne {rel_err_mean:.4f} > {rel_tol}"
    assert pval > pmin, f"DX3: p-value KS {pval:.3f} ≤ {pmin:.2f}"

In [44]:
DX3_run()

2025-10-06 10:54:12,268 [INFO] DX3 — Invariance (dé)binding mémoire
2025-10-06 10:54:12,268 [INFO]   D=16384, C=500, T=200
2025-10-06 10:54:12,268 [INFO]   Erreur relative moyenne  = 0.000000
2025-10-06 10:54:12,268 [INFO]   KS: D=0.000000, p=1.000


# DD4 . 

In [49]:
def DD4_search_topK(Rt_tilde: np.ndarray, prototypes: np.ndarray, K: int) -> tuple[int, np.ndarray, np.ndarray]:
    """
    Rank the prototypes by dot product with the query and keep the K best.

    Args:
        Rt_tilde: query vector of length D (cast to int32 for scoring).
        prototypes: int8 array of shape (B, D); any other dtype is rejected.
        K: number of candidates to return, clamped to B.

    Returns:
        (c_star, C_K, scores_CK): index of the best prototype, the K best
        indices sorted by decreasing score, and their int32 scores.

    Raises:
        AssertionError: if ``prototypes`` is not a (B, D) int8 array.
        ValueError: if the bank is empty or ``K < 1`` (K = 0 used to return
            the whole bank, and a negative K an arbitrary slice).
    """
    D = Rt_tilde.shape[0]
    assert prototypes.ndim == 2 and prototypes.shape[1] == D and prototypes.dtype == np.int8
    bank_size = prototypes.shape[0]
    if bank_size == 0:
        raise ValueError("DD4_search_topK: empty prototype bank")
    if K < 1:
        raise ValueError("DD4_search_topK: K must be >= 1")

    # int32 dot products: int8 accumulation would overflow.
    scores = (prototypes.astype(np.int32) @ Rt_tilde.astype(np.int32)).astype(np.int32)  # (B,)
    K = min(K, bank_size)
    # Partial selection of the K largest scores, then a full sort of those K only.
    idx = np.argpartition(scores, -K)[-K:]
    top_order = idx[np.argsort(scores[idx])[::-1]]
    c_star = int(top_order[0])
    return c_star, top_order, scores[top_order]

In [52]:
def DX4_run(D: int = 16384, B: int = 10000, trials: int = 200, 
            Ks=(100, 500, 2000), seed: int = 0) -> dict[int, float]:
    """
    Empirical recall of the target c* among the top-K prototypes.

    Each trial draws a fresh bank of B random ±1 prototypes, picks one of them
    as target and queries DD4_search_topK with an exact copy of it.

    The bank is drawn directly in int8. The previous
    ``rng.choice([-1, 1], size=(B, D))`` built an int64 matrix first, about
    1.3 GB per trial at the default sizes. The random stream therefore differs
    from the earlier version.

    NOTE(review): the query is a noise-free copy of the target, so recall is
    1.0 unless two prototypes collide — consider perturbing the query.

    Returns:
        {K: fraction of trials in which c* is among the K best}.
    """
    rng = np.random.default_rng(seed)
    hits = {K: 0 for K in Ks}
    k_max = max(Ks)
    for _ in range(trials):
        # B prototypes in ±1/int8, converted in place from 0/1 draws.
        prototypes = rng.integers(0, 2, size=(B, D), dtype=np.int8)
        prototypes <<= 1
        prototypes -= 1
        # Target class and its exact prototype as the query.
        c_star = int(rng.integers(0, B))
        Rt = prototypes[c_star].copy()
        # One search at the largest K; smaller K are prefixes of the ranking.
        _, C_K, _ = DD4_search_topK(Rt, prototypes, k_max)
        for K in Ks:
            if c_star in C_K[:K]:
                hits[K] += 1
    # Average over trials
    return {K: hits[K] / trials for K in Ks}

In [53]:
DX4_run()

{100: 1.0, 500: 1.0, 2000: 1.0}

# DD5 . 

In [54]:
def DD5_payload(Mc: np.ndarray) -> np.ndarray:
    """
    Turn a prototype into a ±1/int8 payload Z_hat.

    An int8 input is validated as ±1 and returned as-is (same object); any
    other dtype is thresholded with the strict sign (0 -> +1).
    """
    if Mc.dtype != np.int8:
        return sign_strict_pm1(Mc)
    hd_assert_pm1(Mc)
    return Mc

In [55]:
def DX5_run(D: int = 16384, trials: int = 200, ms=(4, 8, 16), seed: int = 0,
            p_flip: float = 0.0):
    """
    Bit accuracy of the thresholded accumulator as a function of the number
    of accumulated copies m.

    Each accumulated copy of the reference has every coordinate flipped
    independently with probability ``p_flip``. The default 0.0 reproduces the
    previous behaviour, where the copies were exact despite the "noisy copies"
    comment, so the accuracy was trivially 1.0; pass e.g. ``p_flip=0.2`` for
    an informative measurement.

    Args:
        D: dimension.
        trials: number of random references per value of m.
        ms: numbers of accumulated copies to evaluate.
        seed: seed of the random generator.
        p_flip: per-coordinate flip probability, in [0, 1].

    Returns:
        {m: mean fraction of coordinates where the payload equals the reference}.

    Raises:
        ValueError: if ``p_flip`` is outside [0, 1].
    """
    if not 0.0 <= p_flip <= 1.0:
        raise ValueError("p_flip must be in [0, 1]")
    rng = np.random.default_rng(seed)
    accuracies = {}
    for m in ms:
        accs = []
        for _trial in range(trials):
            # Reference vector
            ref = rng.choice([-1, 1], size=D).astype(np.int8)
            # Accumulate m (optionally noisy) copies
            acc = np.zeros(D, dtype=np.int32)
            for _copy in range(m):
                if p_flip > 0.0:
                    flips = np.where(rng.random(D) < p_flip, -1, 1).astype(np.int8)
                    acc += ref * flips
                else:
                    # No extra random draws, so the default stream is unchanged.
                    acc += ref
            # Thresholding
            Z_hat = DD5_payload(acc)
            # Bit accuracy
            accs.append(np.mean(Z_hat == ref))
        accuracies[m] = float(np.mean(accs))
    return accuracies

In [56]:
DX5_run()

{4: 1.0, 8: 1.0, 16: 1.0}

# DD6 . 

In [None]:
# def DD6_vote(
#     Z_hat: np.ndarray,
#     H_LM: np.ndarray,
#     L_fr,
#     cand_vocab: list[str],
#     lam: float = 0.0
# ) -> tuple[str, np.ndarray]:
#     """
#     Renvoie (token*, scores) sur cand_vocab.
#     """
#     D = Z_hat.shape[0]
#     hd_assert_pm1(Z_hat, D); hd_assert_pm1(H_LM, D)
#     scores = []
#     for v in cand_vocab:
#         Lv = L_fr(v).astype(np.int8, copy=False)
#         hd_assert_pm1(Lv, D)
#         s = (Z_hat.astype(np.int32) @ Lv.astype(np.int32)) \
#             + lam * (H_LM.astype(np.int32) @ Lv.astype(np.int32))
#         scores.append(float(s))
#     scores = np.asarray(scores, dtype=np.float32)
#     best = int(np.argmax(scores))
#     return cand_vocab[best], scores

In [None]:
class ToyLexFR:
    """Toy lexicon: one fixed random ±1/int8 vector of length D per vocabulary entry.

    Calling the instance with a token returns its vector; a token outside the
    vocabulary raises KeyError.
    """

    def __init__(self, vocab: list[str], D: int, seed: int = 1234):
        self.vocab = vocab
        self.D = D
        self.rng = np.random.default_rng(seed)
        # Vectors are drawn in vocabulary order, so the table depends on that
        # order as well as on the seed.
        table = {}
        for token in vocab:
            alphabet = np.array([-1, 1], dtype=np.int8)
            table[token] = self.rng.choice(alphabet, size=D)
        self.table = table

    def __call__(self, v: str) -> np.ndarray:
        return self.table[v]

# -- Génération contrôlée de corrélations (flip par coordonnée) ----------------
def flip_to_target(vec: np.ndarray, target_sim: float, rng: np.random.Generator) -> np.ndarray:
    """
    Return a noisy int8 copy of ``vec`` whose expected similarity to ``vec`` is ``target_sim``.

    Each coordinate is flipped independently with probability
    p_flip = (1 - target_sim) / 2, clamped to [0, 1], so that for ±1 vectors
    E[sim] = 1 - 2 * p_flip = target_sim.
    """
    dim = vec.shape[0]
    p_flip = min(1.0, (1.0 - float(target_sim)) / 2.0)
    p_flip = max(0.0, p_flip)
    flip_here = rng.random(dim) < p_flip                     # True where the sign is flipped
    signs = np.where(flip_here, -1, 1).astype(np.int8, copy=False)
    noisy = vec.astype(np.int8, copy=False) * signs
    return noisy.astype(np.int8, copy=False)

# -- Module testé (fourni) ------------------------------------------------------
def DD6_vote(
    Z_hat: np.ndarray,
    H_LM: np.ndarray,
    L_fr,
    cand_vocab: list[str],
    lam: float = 0.0
) -> tuple[str, np.ndarray]:
    """
    Score every candidate token and return the best one.

    scores[v] = <Z_hat, L_fr(v)> + lam * <H_LM, L_fr(v)>.

    Z_hat, H_LM and every L_fr(v) must be ±1/int8 vectors of the same length
    (AssertionError otherwise). Returns (best token, float64 scores in the
    order of cand_vocab); on ties the first maximum wins.
    """
    D = Z_hat.shape[0]
    hd_assert_pm1(Z_hat, D)
    hd_assert_pm1(H_LM, D)

    # Widen once; int8 dot products would overflow.
    payload32 = Z_hat.astype(np.int32)
    lm32 = H_LM.astype(np.int32)

    def _score(token: str) -> float:
        entry = L_fr(token).astype(np.int8, copy=False)
        hd_assert_pm1(entry, D)
        entry32 = entry.astype(np.int32)
        return float((payload32 @ entry32) + lam * (lm32 @ entry32))

    scores = np.asarray([_score(v) for v in cand_vocab], dtype=np.float64)   # float64 for precision
    winner = int(np.argmax(scores))
    return cand_vocab[winner], scores

# -- Perplexité HD: softmax sur scores normalisés par D -------------------------
def hd_perplexity(scores: np.ndarray, true_idx: int, D: int, tau: float = 1.0) -> float:
    """
    Perplexity of the true candidate: exp(-log p(true)), with
    p = softmax(scores / (D * tau)).

    Scores are divided by D to keep the logits small; tau is floored at 1e-6
    and p(true) at 1e-12.
    """
    temperature = max(1e-6, tau)
    scaled = scores / (D * temperature)
    shifted = scaled - np.max(scaled)              # softmax stability shift
    weights = np.exp(shifted)
    probs = weights / np.sum(weights)
    p_true = float(max(probs[true_idx], 1e-12))
    neg_log_p = -np.log(p_true)
    return float(np.exp(neg_log_p))
