# explore_bis_v3

Notebook simplifié montrant comment utiliser les blocs **ENC** et **MEM**
exposés par la bibliothèque `hdc_project.encoder`, puis comment exécuter les
tests du décodeur (**DEC**) fournis par `hdc_project.decoder`.

In [1]:
from pathlib import Path
import sys

# Make the project's `src/` directory importable: it is expected to sit next to
# the directory the notebook is launched from (parent of the current working dir).
ROOT = Path.cwd().parent
SRC = ROOT.joinpath("src")
if not any(entry == str(SRC) for entry in sys.path):
    # Put it first so the local sources take precedence on import.
    sys.path.insert(0, str(SRC))
print(f'Using src path: {SRC}')


Using src path: /Users/aymenmejri/Desktop/MyCode/experiments/hdc_v2/hdc_project/src


In [2]:
import numpy as np

from hdc_project.encoder import m4, pipeline as enc_pipeline
from hdc_project.encoder.mem import pipeline as mem_pipeline
from hdc_project.decoder import (
    DD1_ctx,
    DD2_query,
    DD2_query_bin,
    DD3_bindToMem,
    DD4_search_topK,
    DD5_payload,
    DD6_vote,
    DD7_updateLM,
    DecodeOneStep,
    DX2_run,
    DX3_run,
    DX4_run,
    DX5_run,
    DX6_run,
    DX7_run,
)
from hdc_project.decoder.dec import (
    hd_assert_pm1,
    hd_bind,
    hd_sim,
    hd_sim_dot,
    build_perm_inverse,
    permute_pow,
    permute_pow_signed,
    rademacher,
    ks_2samp_asymp,
    hd_perplexity_from_scores,
    flip_to_target,
    correlated_pm1,
    DEFAULT_ELL_GRID,
    CONF_PER_STEP,
    TRIALS,
    T_STEPS,
    SIM_Y_MEM,
    SIM_CONF_LM,
    DEC_D,
    RNG_SEED,
)

In [None]:
# Utilitaires HDC désormais importés depuis hdc_project.decoder.dec



## Chargement du sous-corpus OPUS

On réutilise `opus_load_subset` depuis la librairie pour récupérer un petit
sous-échantillon bilingue (EN/FR). En environnement hors-ligne, un jeu de
repli est utilisé pour que le notebook reste exécutable.



In [None]:
import numpy as np
from tqdm import tqdm

from hdc_project.encoder import m4, pipeline as enc_pipeline
from hdc_project.encoder.mem import pipeline as mem_pipeline

# ----------------------------
# 0) Load the OPUS data
# ----------------------------
# Try to fetch an EN/FR subset through the project pipeline; on any failure
# (e.g. the download is not possible) fall back to a tiny hard-coded parallel
# corpus so the rest of the cell can still run.
try:
    ens_raw, frs_raw = enc_pipeline.opus_load_subset(
        name="opus_books",
        config="en-fr",
        split="train",
        N=10_000,
        seed=2025,
    )
    print(f"OPUS subset loaded: {len(ens_raw)} pairs")
except Exception as exc:
    # Deliberate best-effort: the error is reported, not re-raised.
    print("Warning: OPUS download failed, falling back to local toy corpus.")
    print(f"Original error: {exc}")
    ens_raw = [
        "hyperdimensional computing is fun",
        "vector symbolic architectures are powerful",
        "encoding words into hyperspace",
        "memory augmented networks love clean data",
    ]
    frs_raw = [
        "le calcul hyperdimensionnel est amusant",
        "les architectures symboliques vectorielles sont puissantes",
        "encoder des mots dans l'hyperspace",
        "les réseaux augmentés de mémoire aiment les données propres",
    ]

# Cap the number of sentences that get encoded (at most 10_000).
# NOTE(review): `mem_sample_size` is only printed in this cell; nothing visible
# here uses it to subsample the MEM training pairs — confirm whether it should.
enc_sample_size = min(10_000, len(ens_raw))
mem_sample_size = min(10_000, len(ens_raw))
ens_sample = ens_raw[:enc_sample_size]
frs_sample = frs_raw[:enc_sample_size]
print(f"ENC sample size: {enc_sample_size}")
print(f"MEM sample size: {mem_sample_size}")

# ----------------------------
# 1) ENC encoding (M5–M7)
# ----------------------------
D = 8192  # hypervector dimension, shared by the lexicons, the encoder and MEM
n = 5  # forwarded as the `n` argument of encode_corpus_ENC — presumably the n-gram order; TODO confirm
rng = np.random.default_rng(123)

# NOTE(review): the FR lexicon is created with the same `M4_LexEN_new`
# constructor as the EN one, only the seed differs — confirm this is intended.
Lex_en = m4.M4_LexEN_new(seed=1, D=D)
Lex_fr = m4.M4_LexEN_new(seed=2, D=D)
# Random permutation of range(D), passed to both the EN and the FR encoders.
pi = rng.permutation(D).astype(np.int64)

encoded_en = enc_pipeline.encode_corpus_ENC(ens_sample, Lex_en, pi, D, n, seg_seed0=999)
encoded_fr = enc_pipeline.encode_corpus_ENC(frs_sample, Lex_fr, pi, D, n, seg_seed0=1999)

# Per-sentence fields pulled out of the encoder output ("E_seq" and "H" keys).
E_list_en = [segment["E_seq"] for segment in encoded_en]
H_list_en = [segment["H"] for segment in encoded_en]
print(f"Encoded {len(encoded_en)} sentences; signature shape = {H_list_en[0].shape}")

# A few ENC statistics computed by the project pipeline on the EN side.
s_intra, s_inter = enc_pipeline.intra_inter_ngram_sims(E_list_en, D)
inter_seg = enc_pipeline.inter_segment_similarity(H_list_en)
maj_curves = enc_pipeline.majority_error_curve(E_list_en, pi, D, eta_list=(0.0, 0.05))
print(f"intra={s_intra:.4f}, inter(abs)={s_inter:.4f}, inter segments={inter_seg:.4f}")
print("majority curve (eta=0):", maj_curves[0.0][:2])

# -------------------------------------------------------------
# 2) Helpers de "contenu" (sans K_s) pour fabriquer les paires
#    -> on somme des X_t (déjà alignés par Pi^Δ), puis on seuille
# -------------------------------------------------------------
def content_signature_from_Xseq(X_seq, majority: str = "strict"):
    """Collapse a sequence of hypervectors into one ±1 signature by majority vote.

    The vectors are summed coordinate-wise (int32 accumulator) and the sum is
    thresholded to ±1. The two policies only differ on coordinates whose sum is
    exactly 0 (a tie, which can only happen with an even number of ±1 vectors).

    Args:
        X_seq: Non-empty sequence (list, tuple or 2-D array) of 1-D integer
            vectors of identical length, expected to hold ±1 values.
        majority: Tie-breaking policy.
            - "strict": ties go to +1 (historical behaviour; biased toward +1).
            - "unbiased": ties take the sign of the first vector of the
              sequence, so tied coordinates are not systematically pushed
              toward +1. The rule is deterministic (no RNG involved).

    Returns:
        np.ndarray of dtype int8 with values in {-1, +1}, same length as the
        input vectors.

    Raises:
        ValueError: if `X_seq` is empty/None or `majority` is not a known policy.
    """
    # `len()` instead of truthiness so that 2-D arrays are accepted as well.
    if X_seq is None or len(X_seq) == 0:
        raise ValueError("X_seq vide")
    if majority not in ("strict", "unbiased"):
        raise ValueError("majority must be 'strict' or 'unbiased'")

    first = np.asarray(X_seq[0])
    S = np.zeros((first.shape[0],), dtype=np.int32)
    for x in X_seq:
        S += np.asarray(x).astype(np.int32, copy=False)

    # Baseline threshold: ties (S == 0) fall on +1.
    signature = np.where(S >= 0, 1, -1).astype(np.int8)

    if majority == "unbiased":
        # Previously this branch was identical to "strict"; resolve ties with
        # the first vector instead so they no longer all collapse to +1.
        ties = S == 0
        if ties.any():
            signature[ties] = np.where(first[ties] >= 0, 1, -1)

    return signature

def span_signatures_from_trace(X_seq, win: int = 12, stride: int = 6, majority: str = "unbiased"):
    """Compute sliding-window content signatures over a trace of vectors.

    Args:
        X_seq: Sequence of vectors (one per time step). An empty sequence
            yields an empty list.
        win: Window length, in time steps.
        stride: Step between consecutive window starts; values below 1 are
            treated as 1.
        majority: Forwarded unchanged to `content_signature_from_Xseq`.

    Returns:
        A list of signatures. If the trace has at most `win` steps, the list
        holds a single signature computed over the whole trace. Otherwise it
        holds one signature per full window; a trailing partial window is not
        emitted.
    """
    if not X_seq:
        return []

    n_steps = len(X_seq)
    if n_steps <= win:
        # Trace shorter than (or equal to) one window: summarise it as a whole.
        return [content_signature_from_Xseq(X_seq, majority)]

    window_starts = range(0, n_steps - win + 1, max(1, stride))
    return [
        content_signature_from_Xseq(X_seq[begin:begin + win], majority)
        for begin in window_starts
    ]

def build_mem_pairs_from_encoded(encoded_en, encoded_fr, win=8, stride=4, majority="strict", max_pairs=None):
    """Build (EN span, FR span) training pairs from two aligned encoded corpora.

    Sentences are paired by position (extra sentences on the longer side are
    ignored). For each sentence pair, span signatures are computed on the
    "X_seq" trace of both sides with `span_signatures_from_trace`, and spans
    are paired by position as well (extra spans on the longer side are ignored).

    Args:
        encoded_en: Iterable of dicts exposing an "X_seq" entry (EN side).
        encoded_fr: Iterable of dicts exposing an "X_seq" entry (FR side).
        win: Window length forwarded to `span_signatures_from_trace`.
        stride: Window stride forwarded to `span_signatures_from_trace`.
        majority: Majority policy forwarded to `span_signatures_from_trace`.
        max_pairs: Optional cap on the number of pairs returned. `None` means
            no cap; a value <= 0 yields an empty list.

    Returns:
        List of `(span_en, span_fr)` tuples of int8 arrays.
    """
    pairs = []
    # Honour the cap before doing any work: previously the cap was only
    # checked after an append, so max_pairs=0 still produced one pair.
    if max_pairs is not None and max_pairs <= 0:
        return pairs

    for seg_en, seg_fr in zip(encoded_en, encoded_fr):
        spans_en = span_signatures_from_trace(seg_en["X_seq"], win=win, stride=stride, majority=majority)
        spans_fr = span_signatures_from_trace(seg_fr["X_seq"], win=win, stride=stride, majority=majority)
        # zip() truncates to the shorter side, i.e. min(len(spans_en), len(spans_fr)).
        for span_en, span_fr in zip(spans_en, spans_fr):
            pairs.append((
                span_en.astype(np.int8, copy=False),
                span_fr.astype(np.int8, copy=False),
            ))
            if max_pairs is not None and len(pairs) >= max_pairs:
                return pairs
    return pairs

# -------------------------------------------------------------
# 3) MEM pairs = EN/FR spans (content only, without K_s)
# -------------------------------------------------------------
pairs_mem = build_mem_pairs_from_encoded(encoded_en, encoded_fr, win=8, stride=4, majority="strict")
print(f"Pairs available for MEM training: {len(pairs_mem)}")

# -------------------------------------------------------------
# 4) MEM instantiation and one-pass training
#    (rule of thumb from the original note: k ≈ log2(B) + margin;
#     the values actually used below are B=128 buckets and k=16)
# -------------------------------------------------------------
MEM_K = 16
MEM_BUCKETS = 128
cfg = mem_pipeline.MemConfig(D=D, B=MEM_BUCKETS, k=MEM_K, seed_lsh=10, seed_gmem=11)
comp = mem_pipeline.make_mem_pipeline(cfg)
mem_pipeline.train_one_pass_MEM(comp, pairs_mem)
print("Training complete; few bucket counts:", comp.mem.n[:64])

# -------------------------------------------------------------
# 5) Probe: query with Z_en (span) and compare the selected
#    prototype with the matching Z_fr (span).
#    The probes are the first `probe_count` entries of `pairs_mem`,
#    i.e. pairs that were also used for training above.
# -------------------------------------------------------------
probe_count = min(200, len(pairs_mem))
sim_values = []
for Z_en_vec, Z_fr_vec in tqdm(pairs_mem[:probe_count]):
    bucket_idx, score = mem_pipeline.infer_map_top1(comp, Z_en_vec)  # query with Z_en (span), not H_en; `score` is unused below
    prototype = comp.mem.H[bucket_idx].astype(np.int32, copy=False)
    # Dot product in int32, divided by D.
    sim = float(np.dot(prototype, Z_fr_vec.astype(np.int32, copy=False)) / D)
    sim_values.append(sim)

print(f"Top-1 mean similarity over {probe_count} span-probes: {np.mean(sim_values):.4f}")
print(f"Top-1 median similarity: {np.median(sim_values):.4f}")

In [None]:
# Summary statistics of the per-bucket counters held in `comp.mem.n`.
nb = comp.mem.n
pop_stats = (
    float(nb.mean()),
    float(np.median(nb)),
    int(nb.min()),
    int(nb.max()),
    float(nb.std()),
)
print("pop mean/median/min/max/std:", *pop_stats)
# Upper quantiles, truncated to int for display.
p90, p99 = (int(np.quantile(nb, q)) for q in (0.90, 0.99))
print("p90/p99:", p90, p99)

pop mean/median/min/max/std: 296.625 293.0 216 477 39.670714954485
p90/p99: 345 394



> ℹ️ **Remarque pratique** : si le téléchargement OPUS échoue (exécution hors-ligne),
> le notebook bascule automatiquement sur un mini corpus embarqué afin de
> conserver une démonstration reproductible des blocs ENC et MEM.



# DEC

## DEC-0 : vérifications de base (DX0)

In [1]:
import logging
# Logger used by the DEC section. Logging is configured (INFO level,
# timestamped format) only when this logger has no handler attached yet.
log = logging.getLogger("DEC")
if len(log.handlers) == 0:
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s [%(levelname)s] %(message)s",
    )


In [2]:
def pm1(D: int, rng: np.random.Generator) -> np.ndarray:
    """Thin alias of `rademacher`: draw one vector of size `D` using `rng`.

    Per the original note, the result is a +/-1 vector stored as int8; the
    actual sampling is entirely delegated to `rademacher`.
    """
    vector = rademacher(D, rng)
    return vector

# ---------------------------
# DX0: tests
# ---------------------------
def dx0_sanity(D: int = 16_384, N_sim: int = 1_000, seed: int = 2024, tol: float = 5e-3) -> None:
    """
    Monte-Carlo sanity check of the basic ±1 hypervector primitives.

    For `N_sim` random triples (x, y, k) drawn with `pm1`, tracks the largest
    absolute deviation observed for:
      1) hd_sim(x, x) from 1 and hd_sim(x, -x) from -1;
      2) hd_sim(x, y) versus hd_sim(hd_bind(x, k), hd_bind(y, k));
      3) ||v||_2 / sqrt(D) from 1, for v = x and v = hd_bind(x, k).
    A report is printed, then each deviation is asserted to be <= `tol`.

    Args:
        D: Vector dimension.
        N_sim: Number of random triples.
        seed: Seed of the NumPy generator used for sampling.
        tol: Acceptance threshold on every tracked absolute deviation.

    Raises:
        AssertionError: if any tracked deviation exceeds `tol`. Errors raised
            by `hd_assert_pm1` on the sampled vectors propagate as well.
    """
    rng = np.random.default_rng(seed)
    sqrt_d = np.sqrt(D)

    def unit_norm_gap(vec: np.ndarray) -> float:
        """Absolute gap between ||vec||_2 / sqrt(D) and 1 (float64 arithmetic)."""
        return abs(np.linalg.norm(vec.astype(np.float64)) / sqrt_d - 1.0)

    worst_self = worst_neg = worst_bind = worst_norm = 0.0

    for _ in range(N_sim):
        x, y, k = pm1(D, rng), pm1(D, rng), pm1(D, rng)
        for vec in (x, y, k):
            hd_assert_pm1(vec, D)

        # (1) Similarity identities
        worst_self = max(worst_self, abs(hd_sim(x, x) - 1.0))
        minus_x = (-x).astype(np.int8, copy=False)
        worst_neg = max(worst_neg, abs(hd_sim(x, minus_x) + 1.0))

        # (2) Similarity must be unchanged when both sides are bound with k (DEC1)
        sim_plain = hd_sim(x, y)
        x_bound = hd_bind(x, k)
        y_bound = hd_bind(y, k)
        worst_bind = max(worst_bind, abs(sim_plain - hd_sim(x_bound, y_bound)))

        # (3) Relative norms, before and after binding
        worst_norm = max(worst_norm, unit_norm_gap(x), unit_norm_gap(x_bound))

    # Report
    print("DX0 — Sanity checks (double précision)")
    print(f"  D={D}, N={N_sim}, tol={tol:.1e}")
    print(f"  max|sim(x,x)-1|         = {worst_self:.3e}")
    print(f"  max|sim(x,-x)+1|        = {worst_neg:.3e}")
    print(f"  max|sim(x,y)-sim(x⊗k,y⊗k)| = {worst_bind:.3e}")
    print(f"  max| ||x||/√D - 1 | (incl. bind) = {worst_norm:.3e}")

    # Acceptance criteria
    assert worst_self <= tol, "CA non satisfait: sim(x,x) s'écarte de 1"
    assert worst_neg <= tol, "CA non satisfait: sim(x,-x) s'écarte de -1"
    assert worst_bind <= tol, "CA non satisfait: invariance de similarité après binding"
    assert worst_norm <= tol, "CA non satisfait: norme non préservée (relative)"


In [3]:
# Run the DX0 sanity checks with their default parameters (prints a report, asserts on failure).
dx0_sanity()


DX0 — Sanity checks (double précision)
  D=16384, N=1000, tol=5.0e-03
  max|sim(x,x)-1|         = 0.000e+00
  max|sim(x,-x)+1|        = 0.000e+00
  max|sim(x,y)-sim(x⊗k,y⊗k)| = 0.000e+00
  max| ||x||/√D - 1 | (incl. bind) = 0.000e+00


## DD1 — `DD1_ctx`


In [4]:
# DD1_ctx est importé depuis hdc_project.decoder


In [5]:
# --- DX1: tests détaillés ---
def dx1_test_DD1_ctx(D: int = 16_384, m: int = 64, trials: int = 200, seed: int = 1234, tol: float = 5e-3):
    """Check that `DD1_ctx(H, G)` preserves similarities and honours its contracts.

    Two stages:
      1) Over `trials` random triples (H1, H2, G): similarity between H1 and H2
         versus similarity between DD1_ctx(H1, G) and DD1_ctx(H2, G), relative
         norms of inputs and outputs, output dtype (int8), output values (±1)
         and absence of in-place mutation of the inputs.
      2) On `m` random vectors sharing one G: largest entry-wise difference
         between the Gram matrices (dot products / D) before and after DD1_ctx.
    A report is printed, then the three worst deviations are asserted <= `tol`.

    Args:
        D: Vector dimension.
        m: Number of vectors used for the Gram-matrix check.
        trials: Number of random triples for the pairwise checks.
        seed: Seed of the NumPy generator used for sampling.
        tol: Acceptance threshold on each tracked deviation.

    Raises:
        AssertionError: on any contract violation or deviation above `tol`.
    """
    rng = np.random.default_rng(seed)
    sqrt_d = np.sqrt(D)

    def unit_norm_gap(vec: np.ndarray) -> float:
        """Absolute gap between ||vec||_2 / sqrt(D) and 1 (float64 arithmetic)."""
        return abs(np.linalg.norm(vec.astype(np.float64)) / sqrt_d - 1.0)

    def gram(rows: np.ndarray) -> np.ndarray:
        """Pairwise dot products of the rows (int32 accumulation), divided by D."""
        rows32 = rows.astype(np.int32)
        return (rows32 @ rows32.T) / D

    # 1) Pairwise similarity and norms, over `trials` random triples
    worst_sim = 0.0
    worst_norm = 0.0
    for _ in range(trials):
        H1, H2, G = pm1(D, rng), pm1(D, rng), pm1(D, rng)
        # Snapshots taken before the call, to detect in-place mutation
        snapshots = (H1.copy(), H2.copy(), G.copy())

        Q1, Q2 = DD1_ctx(H1, G), DD1_ctx(H2, G)

        sim_before = hd_sim(H1, H2)
        sim_after = hd_sim(Q1, Q2)
        worst_sim = max(worst_sim, abs(sim_before - sim_after))

        worst_norm = max(
            worst_norm,
            unit_norm_gap(H1),
            unit_norm_gap(Q1),
            unit_norm_gap(H2),
            unit_norm_gap(Q2),
        )

        # Contracts: dtype, non-mutation, ±1 outputs
        assert all(out.dtype == np.int8 for out in (Q1, Q2))
        unchanged = all(
            np.all(current == saved)
            for current, saved in zip((H1, H2, G), snapshots)
        )
        assert unchanged, "mutation détectée"
        outputs_pm1 = all(np.all((out == 1) | (out == -1)) for out in (Q1, Q2))
        assert outputs_pm1, "sortie hors ±1"

    # 2) Gram isometry on m vectors sharing the same G
    H = np.stack([pm1(D, rng) for _ in range(m)], axis=0)  # (m, D)
    G = pm1(D, rng)
    Q = np.stack([DD1_ctx(row, G) for row in H], axis=0)

    gram_before = gram(H).astype(np.float64)
    gram_after = gram(Q).astype(np.float64)
    worst_gram = float(np.max(np.abs(gram_before - gram_after)))

    # --- Report ---
    print("DX1 — DD1_ctx (isométrie & contrats)")
    print(f"  D={D}, m={m}, trials={trials}, tol={tol:.1e}")
    print(f"  max|sim_before - sim_after|  = {worst_sim:.3e}")
    print(f"  max| ||H||/√D - 1 | (incl. bind) = {worst_norm:.3e}")
    print(f"  max|Gram_before - Gram_after| = {worst_gram:.3e}")

    # --- Acceptance criteria ---
    assert worst_sim <= tol, "Invariance de similarité violée (DEC1)"
    assert worst_norm <= tol, "Norme non préservée (relative)"
    assert worst_gram <= tol, "Isométrie de Gram violée (DEC1)"

# Run the DX1 checks with their default parameters (prints a report, asserts on failure).
dx1_test_DD1_ctx()

DX1 — DD1_ctx (isométrie & contrats)
  D=16384, m=64, trials=200, tol=5.0e-03
  max|sim_before - sim_after|  = 0.000e+00
  max| ||H||/√D - 1 | (incl. bind) = 0.000e+00
  max|Gram_before - Gram_after| = 0.000e+00


## DD2 — `DD2_query`

In [6]:
import numpy as np
import logging
from typing import List, Tuple
from tqdm import tqdm

# Rebinds the notebook-level `log` name to the "DEC.DX2.v2" logger
# (it previously pointed to the "DEC" logger created in the DEC-0 section).
log = logging.getLogger("DEC.DX2.v2")




In [None]:
# DD2_query / DD2_query_bin et la campagne DX2 proviennent de hdc_project.decoder


In [40]:
# Run the DX2 campaign (imported from hdc_project.decoder) with its default arguments;
# its report is emitted through logging (see the recorded output below).
DX2_run()

2025-10-06 23:22:58,772 [INFO] DX2 — Norme(R_t)/sqrt(D) par (ell, alpha/beta): min | median | max
2025-10-06 23:22:58,772 [INFO]   ell=2, alpha/beta=0.333  ->  1.000 | 1.000 | 1.000
2025-10-06 23:22:58,772 [INFO]   ell=2, alpha/beta=1  ->  1.000 | 1.000 | 1.000
2025-10-06 23:22:58,772 [INFO]   ell=2, alpha/beta=3  ->  1.000 | 1.000 | 1.000
2025-10-06 23:22:58,772 [INFO]   ell=4, alpha/beta=0.333  ->  1.000 | 1.000 | 1.000
2025-10-06 23:22:58,773 [INFO]   ell=4, alpha/beta=1  ->  1.000 | 1.000 | 1.000
2025-10-06 23:22:58,773 [INFO]   ell=4, alpha/beta=3  ->  1.000 | 1.000 | 1.000
2025-10-06 23:22:58,773 [INFO]   ell=8, alpha/beta=0.333  ->  1.000 | 1.000 | 1.000
2025-10-06 23:22:58,773 [INFO]   ell=8, alpha/beta=1  ->  1.000 | 1.000 | 1.000
2025-10-06 23:22:58,773 [INFO]   ell=8, alpha/beta=3  ->  1.000 | 1.000 | 1.000
2025-10-06 23:22:58,773 [INFO] DX2 — CA validés: (i) norme ∈ [0.9,1.1] ; (ii) Gram invariant ; (iii) identité paire-à-paire OK.


## DD3 — `DD3_bindToMem`

In [10]:
# DD3_bindToMem et hd_sim_dot sont fournis par hdc_project.decoder


In [11]:
# ks_2samp_asymp et DX3_run sont désormais accessibles via hdc_project.decoder


In [12]:
# Run the DX3 campaign (imported from hdc_project.decoder) with its default arguments;
# its report is emitted through logging (see the recorded output below).
DX3_run()

2025-10-06 22:57:03,498 [INFO] DX3 — Invariance (dé)binding mémoire
2025-10-06 22:57:03,498 [INFO]   D=16384, C=500, T=200
2025-10-06 22:57:03,498 [INFO]   Erreur relative moyenne  = 0.000000
2025-10-06 22:57:03,498 [INFO]   KS: D=0.000000, p=1.000


## DD4 — `DD4_search_topK`

In [13]:
# DD4_search_topK est importé depuis hdc_project.decoder


In [14]:
# DX4_run est importé depuis hdc_project.decoder


In [15]:
# Run the DX4 campaign (imported from hdc_project.decoder) with its default arguments.
# NOTE: slow cell — the recorded run took about 3 min 23 s for 200 iterations.
DX4_run()

100%|██████████| 200/200 [03:23<00:00,  1.02s/it]


{100: 1.0, 500: 1.0, 2000: 1.0}

## DD5 — `DD5_payload`

In [16]:
# DD5_payload est importé depuis hdc_project.decoder


In [17]:
# DX5_run est importé depuis hdc_project.decoder


In [18]:
# Run the DX5 campaign (imported from hdc_project.decoder) with its default arguments;
# the returned value is displayed as the cell output.
DX5_run()

{4: 1.0, 8: 1.0, 16: 1.0}

## DD6 — `DD6_vote`

In [19]:
# def DD6_vote(
#     Z_hat: np.ndarray,
#     H_LM: np.ndarray,
#     L_fr,
#     cand_vocab: list[str],
#     lam: float = 0.0
# ) -> tuple[str, np.ndarray]:
#     """
#     Renvoie (token*, scores) sur cand_vocab.
#     """
#     D = Z_hat.shape[0]
#     hd_assert_pm1(Z_hat, D); hd_assert_pm1(H_LM, D)
#     scores = []
#     for v in cand_vocab:
#         Lv = L_fr(v).astype(np.int8, copy=False)
#         hd_assert_pm1(Lv, D)
#         s = (Z_hat.astype(np.int32) @ Lv.astype(np.int32)) \
#             + lam * (H_LM.astype(np.int32) @ Lv.astype(np.int32))
#         scores.append(float(s))
#     scores = np.asarray(scores, dtype=np.float32)
#     best = int(np.argmax(scores))
#     return cand_vocab[best], scores

In [20]:
class ToyLexFR:
    """Toy lexicon mapping each token of a fixed vocabulary to a random ±1 vector.

    One int8 vector of size `D` is drawn per token at construction time, in
    vocabulary order, from a generator seeded with `seed`. Calling the instance
    with a token returns the stored vector (the same array object each time);
    an unknown token raises `KeyError`.
    """

    def __init__(self, vocab: list[str], D: int, seed: int = 1234):
        self.vocab = vocab
        self.D = D
        self.rng = np.random.default_rng(seed)
        # Token -> ±1/int8 vector, filled in vocabulary order so that the draws
        # are reproducible for a given (vocab, D, seed).
        signs = np.array([-1, 1], dtype=np.int8)
        self.table = {}
        for token in vocab:
            self.table[token] = self.rng.choice(signs, size=D)

    def __call__(self, v: str) -> np.ndarray:
        return self.table[v]

# # -- Génération contrôlée de corrélations (flip par coordonnée) ----------------
# def flip_to_target(vec: np.ndarray, target_sim: float, rng: np.random.Generator) -> np.ndarray:
#     """
#     Retourne une copie de 'vec' dont la similarité attendue vaut 'target_sim'.
#     Pour ±1, si p_flip = (1 - target_sim)/2, alors E[sim] = 1 - 2*p_flip = target_sim.
#     """
#     D = vec.shape[0]
#     p_flip = max(0.0, min(1.0, (1.0 - float(target_sim)) / 2.0))
#     mask = (rng.random(D) < p_flip).astype(np.int8)          # 1 si on flippe
#     flips = (1 - 2 * mask).astype(np.int8, copy=False)       # 1 -> -1, 0 -> +1
#     out = (vec.astype(np.int8, copy=False) * flips).astype(np.int8, copy=False)
#     return out

# # -- Module testé (fourni) ------------------------------------------------------

# def _batch_lex(cand_vocab, L):
#     """
#     Applique le callable 'L' (v -> ±1/int8 de forme (D,)) sur tout le vocabulaire candidat
#     et empile en une matrice (V, D) en int8.
#     """
#     mats = []
#     for v in cand_vocab:
#         vec = L(v).astype(np.int8, copy=False)
#         mats.append(vec)
#     M = np.vstack(mats).astype(np.int8, copy=False)
#     return M

# def DD6_vote(
#     Z_hat: np.ndarray,
#     H_LM: np.ndarray,
#     L_mem,                  # callable: v -> ±1 int8 (D,)
#     L_lm,                   # callable: v -> ±1 int8 (D,)
#     cand_vocab: list[str],
#     lam: float = 0.0,
#     *,
#     normalize: str = "sqrtD",   # {"none","sqrtD"} ; "sqrtD" conseillé pour perplexité
#     return_probs: bool = False, # si True, renvoie aussi les probabilités softmax
#     tau: float = 1.0            # température du softmax (si return_probs=True)
# ) -> tuple[str, np.ndarray, np.ndarray | None]:
#     """
#     s(v) = <Z_hat, L_mem(v)> + lam * <H_LM, L_lm(v)>
#     Retourne: (token*, scores_raw, probs|None)
#       - scores_raw: np.float64 de taille V (non normalisés, utiles pour debug/traçage)
#       - probs:      np.float64 de taille V si return_probs=True (softmax stable)
#     Contrats:
#       Z_hat, H_LM: ±1/int8, de longueur D identique.
#       L_mem, L_lm: renvoient ±1/int8 (D,) pour tout v de cand_vocab.
#     """
#     # --- Contrats de forme et de type
#     D = int(Z_hat.shape[0])
#     hd_assert_pm1(Z_hat, D)
#     hd_assert_pm1(H_LM, D)
#     assert isinstance(cand_vocab, (list, tuple)) and len(cand_vocab) > 0, "cand_vocab vide"

#     # --- Matrices lexicales (V, D) en int8 (vectorisation)
#     M_mem = _batch_lex(cand_vocab, L_mem)   # (V, D)
#     M_lm  = _batch_lex(cand_vocab, L_lm)    # (V, D)
#     assert M_mem.shape == M_lm.shape == (len(cand_vocab), D), "Shapes (V,D) incohérents"

#     # --- Produits scalaires vectorisés (int32 pour éviter overflow)
#     z32  = Z_hat.astype(np.int32, copy=False)
#     h32  = H_LM.astype(np.int32, copy=False)
#     mem_scores = (M_mem.astype(np.int32, copy=False) @ z32)              # (V,)
#     lm_scores  = (M_lm.astype(np.int32,  copy=False) @ h32)              # (V,)
#     scores_raw = mem_scores.astype(np.float64) + float(lam) * lm_scores.astype(np.float64)

#     # --- Argmax sur scores bruts (l'échelle n'affecte pas l'argmax)
#     best_idx   = int(np.argmax(scores_raw))
#     token_star = cand_vocab[best_idx]

#     # --- Option: probabilités (softmax stable) avec normalisation choisie
#     probs = None
#     if return_probs:
#         if normalize == "sqrtD":
#             logits = scores_raw / (np.sqrt(D) * max(1e-6, float(tau)))
#         elif normalize == "none":
#             logits = scores_raw / max(1e-6, float(tau))
#         else:
#             raise ValueError("normalize ∈ {'none','sqrtD'} attendu")
#         logits = logits - np.max(logits)                  # stabilité num.
#         exps   = np.exp(logits, dtype=np.float64)
#         probs  = exps / np.sum(exps, dtype=np.float64)    # (V,)
#         probs  = probs.astype(np.float64, copy=False)

#     return token_star, scores_raw, probs

# # -- Perplexité HD: softmax sur scores normalisés par D -------------------------
# def hd_perplexity(scores: np.ndarray, true_idx: int, D: int, tau: float = 1.0) -> float:
#     """
#     Perplexité = exp( - log p(true) ), avec p ∝ exp( (scores/D)/tau ).
#     On divise par D pour éviter des logits trop grands (HD).
#     """
#     logits = scores / (D * max(1e-6, tau))
#     logits = logits - np.max(logits)               # stabilité
#     exps = np.exp(logits)
#     p = exps / np.sum(exps)
#     p_true = float(max(p[true_idx], 1e-12))
#     return float(np.exp(-np.log(p_true)))


In [21]:
# def _softmax_probs(scores: np.ndarray, D: int, tau: float = 1.0) -> np.ndarray:
#     # Normalisation par sqrt(D) pour éviter la sur-concentration à grande dimension
#     s = scores / (np.sqrt(D) * tau)
#     s = s - np.max(s)                       # stabilité numérique
#     exps = np.exp(s)
#     return exps / np.sum(exps)

# def hd_perplexity(scores: np.ndarray, true_index: int, D: int, tau: float = 1.0) -> float:
#     p = _softmax_probs(scores, D=D, tau=tau)[true_index]
#     # Perplexité = exp(-log p_y) ; bornée inférieurement par 1
#     return float(np.exp(-np.log(max(p, 1e-12))))

In [22]:
# def DX6_run_two_spaces(
#     D: int = 16384, trials: int = 400,
#     lam_grid=(0.0, 0.5, 1.0),
#     # corrélations du vrai token:
#     sim_payload: float = 0.82,   # corr(Z_hat, L_mem(y))
#     sim_lm: float      = 0.65,   # corr(H_LM, L_lm(y))
#     # confondeurs:
#     n_confounders: int = 6,
#     rho_mem_conf: float = 0.72,  # corr(Z_hat, L_mem(conf))
#     rho_lm_conf: float  = 0.05,  # corr(H_LM, L_lm(conf))
#     tau: float = 1.0,
#     rng_seed: int = 7031
# ):
#     g = np.random.default_rng(rng_seed)

#     def rademacher(D):  # ±1/int8
#         return g.choice(np.array([-1,1], dtype=np.int8), size=D)

#     def correlated_pm1(proto: np.ndarray, rho: float) -> np.ndarray:
#         noise = rademacher(proto.shape[0])
#         mix = rho * proto.astype(np.int32) + (1-rho) * noise.astype(np.int32)
#         return np.where(mix >= 0, 1, -1).astype(np.int8)

#     def make_trial():
#         Z_true  = rademacher(D)  # payload cible
#         H_true  = rademacher(D)  # LM cible
#         # Construire DEUX lexiques: L_mem (pour la mémoire) et L_lm (pour le LM)
#         V = n_confounders + 1
#         L_mem = np.empty((V, D), dtype=np.int8)
#         L_lm  = np.empty((V, D), dtype=np.int8)
#         # y (indice 0)
#         L_mem[0] = correlated_pm1(Z_true, sim_payload)
#         L_lm[0]  = correlated_pm1(H_true, sim_lm)
#         # confondeurs
#         for i in range(1, V):
#             L_mem[i] = correlated_pm1(Z_true, rho_mem_conf)
#             L_lm[i]  = correlated_pm1(H_true, rho_lm_conf)
#         return L_mem, L_lm, 0, Z_true, H_true  # (lexiques, true_id, payload, LM)

#     def vote_scores_two_lex(L_mem: np.ndarray, L_lm: np.ndarray,
#                             Z_hat: np.ndarray, H_LM: np.ndarray, lam: float) -> np.ndarray:
#         # int32 pour éviter overflow ; (V,D) @ (D,) -> (V,)
#         return (L_mem.astype(np.int32) @ Z_hat.astype(np.int32)) + \
#                lam * (L_lm.astype(np.int32)  @ H_LM.astype(np.int32))

#     def _softmax_probs(scores: np.ndarray, D: int, tau: float = 1.0) -> np.ndarray:
#         s = scores / (np.sqrt(D) * max(tau, 1e-6))
#         s = s - np.max(s)
#         exps = np.exp(s)
#         return exps / np.sum(exps)

#     def hd_perplexity(scores: np.ndarray, true_idx: int, D: int, tau: float = 1.0) -> float:
#         p_true = float(_softmax_probs(scores, D=D, tau=tau)[true_idx])
#         return float(np.exp(-np.log(max(p_true, 1e-12))))

#     stats = {lam: {"top1_hits": 0, "ppl_sum": 0.0} for lam in lam_grid}

#     for _ in range(trials):
#         L_mem, L_lm, true_idx, Z_true, H_true = make_trial()
#         Z_hat = Z_true; H_LM = H_true
#         for lam in lam_grid:
#             scores = vote_scores_two_lex(L_mem, L_lm, Z_hat, H_LM, float(lam))
#             pred = int(np.argmax(scores))
#             stats[lam]["top1_hits"] += 1 if pred == true_idx else 0
#             stats[lam]["ppl_sum"]   += hd_perplexity(scores, true_idx, D, tau)

#     results = {lam: {"top1": stats[lam]["top1_hits"]/trials,
#                      "ppl":  stats[lam]["ppl_sum"]/trials}
#                for lam in lam_grid}

#     base_top1, base_ppl = results[0.0]["top1"], results[0.0]["ppl"]
#     saturated = (abs(base_top1 - 1.0) < 1e-12)

#     log.info(("DX6(2-spaces|fixed) — D=%d, trials=%d, conf=%d, ρ_mem(conf)=%.2f, ρ_lm(conf)=%.2f, "
#               "sim_payload=%.2f, sim_lm=%.2f"),
#               D, trials, n_confounders, rho_mem_conf, rho_lm_conf, sim_payload, sim_lm)
#     for lam in lam_grid:
#         log.info("  lambda=%.2f  ->  top-1=%.3f | ppl=%.3f", lam, results[lam]["top1"], results[lam]["ppl"])

#     if saturated:
#         ok = any(results[lam]["ppl"] < base_ppl - 1e-12 for lam in lam_grid if lam != 0.0)
#         assert ok, "DX6(fixed): régime saturé — aucune baisse de perplexité vs λ=0."
#     else:
#         ok = any((results[lam]["top1"] > base_top1 + 1e-12) and (results[lam]["ppl"] < base_ppl - 1e-12)
#                  for lam in lam_grid if lam != 0.0)
#         assert ok, "DX6(fixed): aucun λ n'améliore simultanément top-1 ET perplexité vs λ=0."

#     log.info("DX6(fixed) — CA VALIDÉ (%s).", "saturé" if saturated else "non-saturé")
#     return results

In [23]:
# DD6_vote, flip_to_target et DX6_run sont importés depuis hdc_project.decoder


In [24]:
# Run the DX6 campaign (imported from hdc_project.decoder) with its default arguments;
# the returned results are displayed as the cell output.
DX6_run()

2025-10-06 23:00:52,893 [INFO] DX6 — D=16384, trials=400, conf=6, ρ_mem(conf)=0.55, ρ_lm(conf)=0.10, sim_payload=0.60, sim_lm=0.40
2025-10-06 23:00:52,893 [INFO]   lambda=0.00  ->  top-1=1.000 | ppl=1.019
2025-10-06 23:00:52,894 [INFO]   lambda=0.50  ->  top-1=1.000 | ppl=1.000
2025-10-06 23:00:52,894 [INFO]   lambda=1.00  ->  top-1=1.000 | ppl=1.000
2025-10-06 23:00:52,894 [INFO] DX6 — CA VALIDÉ (saturé).


{0.0: {'top1': 1.0, 'ppl': 1.0192223749961116},
 0.5: {'top1': 1.0, 'ppl': 1.0000000001159797},
 1.0: {'top1': 1.0, 'ppl': 1.0}}

## DD7 — `DD7_updateLM` et `DecodeOneStep`

In [41]:
# DX7_run et les constantes associées résident dans hdc_project.decoder


In [42]:
# Run the DX7 campaign (imported from hdc_project.decoder) with its default arguments;
# the returned pair is unpacked into `results` and `ell_star`.
results, ell_star = DX7_run()

2025-10-06 23:23:12,336 [INFO] DX7 — étude fenetre ell=(2, 4, 8, 12) (D=16384, trials=200, T=24, conf/step=8)
2025-10-06 23:23:19,330 [INFO]   ell= 2  ->  top-1=0.926 | p(ell)=1.000
2025-10-06 23:23:26,541 [INFO]   ell= 4  ->  top-1=0.854 | p(ell)=1.000
2025-10-06 23:23:34,829 [INFO]   ell= 8  ->  top-1=0.707 | p(ell)=1.000
2025-10-06 23:23:43,466 [INFO]   ell=12  ->  top-1=0.555 | p(ell)=1.000
2025-10-06 23:23:43,467 [INFO] DX7 — CA VALIDÉS: (i) ell*=2 maximise top-1 ; (ii) p(ell) décroît au-delà.


In [43]:
# DecodeOneStep est importé depuis hdc_project.decoder


In [44]:
def mock_L_fr(vocab_seed: int, D: int):
    """Build a lazy mock lexicon: a callable mapping a token to a ±1/int8 vector.

    Vectors are drawn on first request from a generator seeded with
    `vocab_seed` and memoised, so a token always maps to the same array while
    the value assigned to a token depends on the order in which new tokens are
    first requested.

    Args:
        vocab_seed: Seed of the NumPy generator backing the lexicon.
        D: Vector dimension.

    Returns:
        A function `get(tok: str) -> np.ndarray` returning an int8 array of
        shape (D,) with values in {-1, +1}.
    """
    rng = np.random.default_rng(vocab_seed)
    cache = {}

    def get(tok: str) -> np.ndarray:
        try:
            return cache[tok]
        except KeyError:
            # Draw bits in {0, 1}, then map them to {-1, +1}.
            bits = rng.integers(0, 2, size=D, dtype=np.int8)
            vector = (2 * bits - 1).astype(np.int8)
            cache[tok] = vector
            return vector

    return get

def test_isometry_and_flow():
    """Smoke-test one `DecodeOneStep` call on random ±1 inputs.

    Builds random ±1/int8 state vectors, keys, a permutation, a mock lexicon and
    2048 random prototypes (all from fixed seeds), runs a single decoding step
    and checks the returned token type, the rank of the score array, and the
    shape/dtype of the updated LM state. No numerical isometry property is
    asserted here.

    Raises:
        AssertionError: if one of the output checks fails.
    """
    D = 16384
    K = 128
    B = 2048
    rng = np.random.default_rng(7)

    def draw_pm1(shape):
        """Random ±1 array of the given shape, drawn from the shared generator."""
        return 2 * rng.integers(0, 2, size=shape, dtype=np.int8) - 1

    # Draw order matters for reproducibility: state vectors, keys, permutation,
    # then the prototype bank.
    Hs = draw_pm1(D)
    H_LM = draw_pm1(D)
    G_DEC = draw_pm1(D)
    G_MEM = draw_pm1(D)
    Pi = rng.permutation(D).astype(np.int64)
    Lfr = mock_L_fr(1234, D)
    prototypes = draw_pm1((B, D))

    step_inputs = dict(
        history_fr=["de", "la", "musique"],
        G_DEC=G_DEC,
        G_MEM=G_MEM,
        Pi=Pi,
        L_fr=Lfr,
        prototypes=prototypes,
        K=K,
        return_ck_scores=False,
    )
    tok, scores, c_star, CK, H_LM_next = DecodeOneStep(Hs, H_LM, **step_inputs)

    assert isinstance(tok, str)
    assert scores.ndim == 1
    assert H_LM_next.shape == (D,)
    assert H_LM_next.dtype == np.int8



In [55]:
# Run the DecodeOneStep smoke test; it is silent on success and raises AssertionError otherwise.
test_isometry_and_flow()