# explore_bis_v2

Notebook simplifié montrant comment utiliser les blocs **ENC** et **MEM**
exposés par la librairie `hdc_project.encoder`.

In [4]:
from pathlib import Path
import sys

# Locate the project root one level above the notebook's working directory
# and make its `src` folder importable.
ROOT = Path.cwd().parent
SRC = ROOT.joinpath("src")
# Prepend only when absent so re-running the cell does not stack duplicates.
if sys.path.count(str(SRC)) == 0:
    sys.path.insert(0, str(SRC))
print(f'Using src path: {SRC}')


Using src path: /Users/aymenmejri/Desktop/MyCode/experiments/hdc_v2/hdc_project/src


In [5]:
import numpy as np

from hdc_project.encoder import m4, pipeline as enc_pipeline
from hdc_project.encoder.mem import pipeline as mem_pipeline



## Chargement du sous-corpus OPUS

On réutilise `opus_load_subset` depuis la librairie pour récupérer un petit
sous-échantillon bilingue (EN/FR). En environnement hors-ligne, un jeu de
repli est utilisé pour que le notebook reste exécutable.



In [6]:
import numpy as np
from tqdm import tqdm

from hdc_project.encoder import m4, pipeline as enc_pipeline
from hdc_project.encoder.mem import pipeline as mem_pipeline

# ----------------------------
# 0) Load the OPUS data
# ----------------------------
# Try the real corpus first. Any exception raised by the loader is caught and
# reported, and a four-sentence hard-coded EN/FR toy corpus is used instead so
# the rest of the cell can still run.
try:
    ens_raw, frs_raw = enc_pipeline.opus_load_subset(
        name="opus_books",
        config="en-fr",
        split="train",
        N=10_000,
        seed=2025,
    )
    print(f"OPUS subset loaded: {len(ens_raw)} pairs")
except Exception as exc:
    print("Warning: OPUS download failed, falling back to local toy corpus.")
    print(f"Original error: {exc}")
    # Fallback corpus: ens_raw[i] and frs_raw[i] are translations of each other.
    ens_raw = [
        "hyperdimensional computing is fun",
        "vector symbolic architectures are powerful",
        "encoding words into hyperspace",
        "memory augmented networks love clean data",
    ]
    frs_raw = [
        "le calcul hyperdimensionnel est amusant",
        "les architectures symboliques vectorielles sont puissantes",
        "encoder des mots dans l'hyperspace",
        "les réseaux augmentés de mémoire aiment les données propres",
    ]

# Cap the working set at 10 000 pairs (or fewer if the corpus is smaller).
# The cap is derived from the EN side only; both sides are sliced with it.
enc_sample_size = min(10_000, len(ens_raw))
# NOTE(review): mem_sample_size is only printed in the visible cells; nothing
# here slices the MEM training pairs with it — confirm whether it is still needed.
mem_sample_size = min(10_000, len(ens_raw))
ens_sample = ens_raw[:enc_sample_size]
frs_sample = frs_raw[:enc_sample_size]
print(f"ENC sample size: {enc_sample_size}")
print(f"MEM sample size: {mem_sample_size}")

# ----------------------------
# 1) ENC encoding (M5–M7)
# ----------------------------
D = 8192  # dimension handed to the lexicons, the permutation and the encoder
n = 5  # forwarded to encode_corpus_ENC — presumably the n-gram order; TODO confirm
rng = np.random.default_rng(123)  # seeded generator; used here only to draw `pi`

# NOTE(review): the FR lexicon is built with the same M4_LexEN_new constructor
# as the EN one and differs only by its seed — confirm this is intended.
Lex_en = m4.M4_LexEN_new(seed=1, D=D)
Lex_fr = m4.M4_LexEN_new(seed=2, D=D)
# A single random permutation of range(D), shared by both languages.
pi = rng.permutation(D).astype(np.int64)

# Each side is encoded with its own lexicon and its own segment seed.
encoded_en = enc_pipeline.encode_corpus_ENC(ens_sample, Lex_en, pi, D, n, seg_seed0=999)
encoded_fr = enc_pipeline.encode_corpus_ENC(frs_sample, Lex_fr, pi, D, n, seg_seed0=1999)

# Pull the "E_seq" and "H" entries out of every encoded EN segment for the
# statistics below (the "X_seq" entry is used later to build the MEM pairs).
E_list_en = [segment["E_seq"] for segment in encoded_en]
H_list_en = [segment["H"] for segment in encoded_en]
# NOTE(review): H_list_en[0] raises IndexError if the corpus is empty.
print(f"Encoded {len(encoded_en)} sentences; signature shape = {H_list_en[0].shape}")

# A few ENC statistics
s_intra, s_inter = enc_pipeline.intra_inter_ngram_sims(E_list_en, D)
inter_seg = enc_pipeline.inter_segment_similarity(H_list_en)
# The result is indexed by eta value; only the first two entries of the
# eta=0.0 curve are printed.
maj_curves = enc_pipeline.majority_error_curve(E_list_en, pi, D, eta_list=(0.0, 0.05))
print(f"intra={s_intra:.4f}, inter(abs)={s_inter:.4f}, inter segments={inter_seg:.4f}")
print("majority curve (eta=0):", maj_curves[0.0][:2])

# -------------------------------------------------------------
# 2) Helpers de "contenu" (sans K_s) pour fabriquer les paires
#    -> on somme des X_t (déjà alignés par Pi^Δ), puis on seuille
# -------------------------------------------------------------
def content_signature_from_Xseq(X_seq, majority: str = "strict"):
    """Collapse a sequence of bipolar vectors into one sign vector by majority vote.

    The vectors are summed coordinate-wise in int32 and the sum is thresholded
    to {-1, +1}. The two modes differ only on coordinates whose sum is exactly
    zero (a tie, which can happen whenever ``len(X_seq)`` is even):

    * ``"strict"``: ties map to +1. Deterministic, but every tie is pushed
      towards the same direction.
    * ``"unbiased"``: ties are split deterministically by coordinate parity
      (+1 on even indices, -1 on odd indices), so ties do not add a common
      all-(+1) component to every signature. No randomness is involved, so
      results stay reproducible.

    Previously both modes ran the same ``S >= 0`` threshold, so the
    ``majority`` argument had no effect.

    Args:
        X_seq: non-empty sequence of 1-D NumPy arrays of identical length.
        majority: ``"strict"`` or ``"unbiased"`` (tie-breaking rule, see above).

    Returns:
        1-D ``np.int8`` array of +1/-1 values, same length as the inputs.

    Raises:
        ValueError: if ``X_seq`` is empty/None or ``majority`` is not a known mode.
    """
    # len() instead of truthiness so an array-like X_seq does not raise on `not`.
    if X_seq is None or len(X_seq) == 0:
        raise ValueError("X_seq vide")
    if majority not in ("strict", "unbiased"):
        raise ValueError("majority must be 'strict' or 'unbiased'")

    # Accumulate in int32: int8 inputs would overflow for long sequences.
    S = np.zeros((X_seq[0].shape[0],), dtype=np.int32)
    for x in X_seq:
        S += x.astype(np.int32, copy=False)

    if majority == "strict":
        return np.where(S >= 0, 1, -1).astype(np.int8, copy=False)

    # "unbiased": non-zero sums keep their sign, zero sums take the parity pattern.
    tie_sign = np.where(np.arange(S.shape[0]) % 2 == 0, 1, -1)
    signed = np.where(S > 0, 1, np.where(S < 0, -1, tie_sign))
    return signed.astype(np.int8, copy=False)

def span_signatures_from_trace(X_seq, win: int = 12, stride: int = 6, majority: str = "unbiased"):
    """Turn a trace of vectors into a list of sliding-window content signatures.

    Args:
        X_seq: sequence of per-step vectors for one segment.
        win: number of consecutive steps folded into each signature.
        stride: step between window starts; values below 1 are treated as 1.
        majority: tie-breaking mode forwarded to ``content_signature_from_Xseq``.

    Returns:
        ``[]`` for an empty trace; a single signature over the whole trace when
        it holds at most ``win`` steps; otherwise one signature per full window
        starting at 0, ``stride``, 2*``stride``, ... (no partial trailing window).
    """
    if not X_seq:
        return []

    total = len(X_seq)
    # Short trace: one signature over everything that is available.
    if total <= win:
        return [content_signature_from_Xseq(X_seq, majority)]

    step = max(1, stride)
    return [
        content_signature_from_Xseq(X_seq[begin:begin + win], majority)
        for begin in range(0, total - win + 1, step)
    ]

def build_mem_pairs_from_encoded(encoded_en, encoded_fr, win=8, stride=4, majority="strict", max_pairs=None):
    """Build (EN span, FR span) training pairs from two encoded corpora.

    Segments are matched by position. For each matched segment the "X_seq"
    traces of both sides are cut into span signatures, and the t-th EN span is
    paired with the t-th FR span. Surplus segments or spans on the longer side
    are ignored.

    Args:
        encoded_en: sequence of encoded EN segments, each exposing an "X_seq" entry.
        encoded_fr: sequence of encoded FR segments, aligned with ``encoded_en``.
        win: window length forwarded to ``span_signatures_from_trace``.
        stride: window stride forwarded to ``span_signatures_from_trace``.
        majority: tie-breaking mode forwarded to ``span_signatures_from_trace``.
        max_pairs: optional cap on the number of pairs; ``None`` means no cap,
            and a value <= 0 yields an empty list.

    Returns:
        List of ``(span_en, span_fr)`` tuples of ``np.int8`` arrays.
    """
    pairs = []
    # The cap used to be checked only after an append, so max_pairs=0 still
    # returned one pair. Settle the degenerate cap before doing any work.
    if max_pairs is not None and max_pairs <= 0:
        return pairs

    # zip stops at the shorter side, like the former min(len, len) index loops.
    for seg_en, seg_fr in zip(encoded_en, encoded_fr):
        spans_en = span_signatures_from_trace(seg_en["X_seq"], win=win, stride=stride, majority=majority)
        spans_fr = span_signatures_from_trace(seg_fr["X_seq"], win=win, stride=stride, majority=majority)
        for z_en, z_fr in zip(spans_en, spans_fr):
            pairs.append((
                z_en.astype(np.int8, copy=False),
                z_fr.astype(np.int8, copy=False),
            ))
            if max_pairs is not None and len(pairs) >= max_pairs:
                return pairs
    return pairs

# -------------------------------------------------------------
# 3) MEM pairs = EN/FR spans (content only, without K_s)
# -------------------------------------------------------------
# Windows of 8 steps with a stride of 4 and "strict" tie-breaking; no cap on
# the number of pairs (max_pairs is left at its default of None).
pairs_mem = build_mem_pairs_from_encoded(encoded_en, encoded_fr, win=8, stride=4, majority="strict")
print(f"Pairs available for MEM training: {len(pairs_mem)}")

# -------------------------------------------------------------
# 4) MEM instantiation and one-pass training
#    (rule of thumb: k ≈ log2(B) + margin; this cell uses B=128, k=16 —
#     the earlier note mentioning B=256, k=24 did not match the code)
# -------------------------------------------------------------
MEM_K = 16  # passed as `k` to MemConfig
MEM_BUCKETS = 128  # passed as `B` to MemConfig (number of buckets)
cfg = mem_pipeline.MemConfig(D=D, B=MEM_BUCKETS, k=MEM_K, seed_lsh=10, seed_gmem=11)
comp = mem_pipeline.make_mem_pipeline(cfg)
# Single pass over every (EN span, FR span) pair; `comp` holds the trained state.
mem_pipeline.train_one_pass_MEM(comp, pairs_mem)
# Show the first 64 entries of comp.mem.n as a quick sanity check.
print("Training complete; few bucket counts:", comp.mem.n[:64])

# -------------------------------------------------------------
# 5) Probe: query with Z_en (span signature) and compare the
#    selected bucket prototype with the matching Z_fr (span signature)
# -------------------------------------------------------------
# NOTE(review): the probes are the first pairs of pairs_mem, i.e. pairs that
# were also used for training — these are not held-out figures.
probe_count = min(200, len(pairs_mem))
sim_values = []
for Z_en_vec, Z_fr_vec in tqdm(pairs_mem[:probe_count]):
    # `score` is returned by the lookup but not used below.
    bucket_idx, score = mem_pipeline.infer_map_top1(comp, Z_en_vec)  # Z_en (span), not H_en
    prototype = comp.mem.H[bucket_idx].astype(np.int32, copy=False)
    # Dot product divided by D; int32 casts avoid int8 overflow in the product.
    sim = float(np.dot(prototype, Z_fr_vec.astype(np.int32, copy=False)) / D)
    sim_values.append(sim)

# NOTE(review): with zero pairs, sim_values is empty and both statistics are NaN.
print(f"Top-1 mean similarity over {probe_count} span-probes: {np.mean(sim_values):.4f}")
print(f"Top-1 median similarity: {np.median(sim_values):.4f}")

OPUS subset loaded: 10000 pairs
ENC sample size: 10000
MEM sample size: 10000


Processing: 100%|██████████| 10000/10000 [00:26<00:00, 381.24it/s]
Processing: 100%|██████████| 10000/10000 [00:26<00:00, 381.62it/s]


Encoded 10000 sentences; signature shape = (8192,)


100%|██████████| 10000/10000 [07:08<00:00, 23.35it/s]


intra=0.0009, inter(abs)=0.0253, inter segments=0.0089
majority curve (eta=0): [(1, 0.0), (2, 0.0)]
Pairs available for MEM training: 37968
Training complete; few bucket counts: [249 229 310 312 265 289 325 291 326 361 249 251 224 314 335 299 287 308
 255 302 318 347 296 319 249 297 259 312 271 270 345 281 287 257 305 318
 301 303 321 308 326 307 280 261 338 296 279 330 259 224 269 284 252 293
 394 297 332 343 256 333 336 246 293 266]


100%|██████████| 200/200 [00:00<00:00, 2307.57it/s]

Top-1 mean similarity over 200 span-probes: 0.2727
Top-1 median similarity: 0.2773





In [8]:
# Summary statistics of the per-bucket counts stored in comp.mem.n.
nb = comp.mem.n
print(
    "pop mean/median/min/max/std:",
    float(np.mean(nb)),
    float(np.median(nb)),
    int(np.min(nb)),
    int(np.max(nb)),
    float(np.std(nb)),
)
# Both upper quantiles come from one call and are truncated to int for display.
print("p90/p99:", *(int(q) for q in np.quantile(nb, [0.90, 0.99])))

pop mean/median/min/max/std: 296.625 293.0 216 477 39.670714954485
p90/p99: 345 394



> ℹ️ **Remarque pratique** : si le téléchargement OPUS échoue (exécution hors-ligne),
> le notebook bascule automatiquement sur un mini corpus embarqué afin de
> conserver une démonstration reproductible des blocs ENC et MEM.

