In [None]:
import os

# -----------------------------------------------------------------------------
# Hugging Face config (mirror + caches)
# IMPORTANT: must be set BEFORE importing anything that might import
# huggingface_hub / datasets / transformers.
# If you already imported those in this kernel, restart the kernel.
# -----------------------------------------------------------------------------

# Use HF mirror
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"

# Put *all* HF caches under /data2/ali to avoid filling the OS disk.
HF_CACHE_BASE = "/data2/ali/hf"
HF_HOME = os.path.join(HF_CACHE_BASE, "home")
HF_HUB_CACHE = os.path.join(HF_CACHE_BASE, "hub")
HF_DATASETS_CACHE = os.path.join(HF_CACHE_BASE, "datasets")
HF_TRANSFORMERS_CACHE = os.path.join(HF_CACHE_BASE, "transformers")

# Create the directories up front so a missing/unwritable path fails here,
# not in the middle of a download.
os.makedirs(HF_HOME, exist_ok=True)
os.makedirs(HF_HUB_CACHE, exist_ok=True)
os.makedirs(HF_DATASETS_CACHE, exist_ok=True)
os.makedirs(HF_TRANSFORMERS_CACHE, exist_ok=True)

# huggingface_hub
os.environ["HF_HOME"] = HF_HOME
os.environ["HF_HUB_CACHE"] = HF_HUB_CACHE
os.environ["HUGGINGFACE_HUB_CACHE"] = HF_HUB_CACHE

# datasets
# The `datasets` library documents HF_DATASETS_CACHE as its cache override; without it
# the cache falls back to a location derived from HF_HOME rather than the directory
# chosen above. DATASETS_CACHE is still exported for anything already relying on it.
os.environ["HF_DATASETS_CACHE"] = HF_DATASETS_CACHE
os.environ["DATASETS_CACHE"] = HF_DATASETS_CACHE

# transformers (and some libs that respect this)
os.environ["TRANSFORMERS_CACHE"] = HF_TRANSFORMERS_CACHE

# Do not clobber a real key exported in the shell (export DASHSCOPE_API_KEY=...);
# "sk-" is only a placeholder used when nothing is set.
os.environ.setdefault("DASHSCOPE_API_KEY", "sk-")

from vcache import VCache, VCachePolicy, VerifiedDecisionPolicy

# Smoke test: build a VCache with default configuration and send one prompt through it.
# `error_rate_bound` is a fraction (0.01), so it is annotated as float; it is passed
# to the policy as `delta`.
error_rate_bound: float = 0.01
policy: VCachePolicy = VerifiedDecisionPolicy(delta=error_rate_bound)
vcache: VCache = VCache(policy=policy)

# NOTE(review): presumably this issues a live inference call on a cache miss and
# therefore needs a valid DASHSCOPE_API_KEY — confirm against the default engine.
response: str = vcache.infer("Is the sky blue?")
print(response)

In [None]:
import os, time

# Must be set BEFORE importing anything that uses huggingface_hub/transformers/sentence-transformers.
os.environ.setdefault("HF_ENDPOINT", "https://hf-mirror.com")

# Prefer setting this in your shell instead of hard-coding secrets in notebooks:
#   export DASHSCOPE_API_KEY=...
# setdefault keeps a key that is already exported; "sk-" is only a placeholder fallback.
os.environ.setdefault("DASHSCOPE_API_KEY", "sk-")

# Local path for the BGE model; only applied when the variable is not already set.
os.environ.setdefault("BGE_MODEL_PATH", "/data2/ali/models/BAAI__bge-base-en-v1.5")

from vcache import (
    VCache, VCacheConfig,
    VerifiedDecisionPolicy, VerifiedSplitterDecisionPolicy,
    HNSWLibVectorDB, InMemoryEmbeddingMetadataStorage,
    LangChainEmbeddingEngine,
)

from vcache.vcache_core.splitter.embedding_model import EmbeddingModel
from vcache.vcache_core.splitter.MaxSimSplitter import MaxSimSplitter

# Point to the *directory*; MaxSimSplitter will auto-pick the latest epoch checkpoint.
# (It prefers epoch=*.ckpt and only falls back to last.ckpt if that's the only one.)
CHECKPOINT_PATH = "/data2/ali/checkpoints_words"

# Device string forwarded to MaxSimSplitter below. EmbeddingModel() is constructed
# without a device argument.
# NOTE(review): the output recorded for this cell reports the EmbeddingModel and the
# generator on cpu — confirm that `device` is actually honoured end to end.
device = "cuda"  # or "cpu"

# RL splitter + embedder used for MaxSim
# The same `embedder` instance is passed into the splitter via `embedding_model`.
# NOTE(review): presumably EmbeddingModel() reads BGE_MODEL_PATH set earlier in this
# cell — verify against vcache_core.splitter.embedding_model.
embedder = EmbeddingModel()
splitter = MaxSimSplitter(CHECKPOINT_PATH, device=device, embedding_model=embedder)

# Keep the embedding engine the same for both policies (this is for storing/retrieving in the cache)
# Use a local embedding model to avoid extra API cost:
# This single engine instance is shared by every run_sequence() call in this cell.
embedding_engine = LangChainEmbeddingEngine(model_name="sentence-transformers/all-mpnet-base-v2")

def run_sequence(policy, prompts):
    """Send ``prompts`` one by one, in order, through a newly built VCache.

    Each call constructs fresh ``HNSWLibVectorDB`` and
    ``InMemoryEmbeddingMetadataStorage`` instances; the module-level
    ``embedding_engine`` is reused across calls.

    Args:
        policy: Decision policy handed to ``VCache``.
        prompts: Sized iterable of prompt strings.

    Returns:
        Tuple ``(hits, total, elapsed_seconds, rows)`` where ``rows`` is a list of
        ``(hit, prompt, response)`` tuples in input order.
    """
    cache = VCache(
        config=VCacheConfig(
            vector_db=HNSWLibVectorDB(),
            embedding_metadata_storage=InMemoryEmbeddingMetadataStorage(),
            embedding_engine=embedding_engine,
            system_prompt="Please answer in a single word with the first letter capitalized. Example: London",
        ),
        policy=policy,
    )

    records = []
    started = time.time()
    for prompt in prompts:
        # The two trailing metadata objects are not needed for this comparison.
        was_hit, answer, _resp_meta, _nn_meta = cache.infer_with_cache_info(prompt=prompt)
        records.append((was_hit, prompt, answer))

    total = len(prompts)
    elapsed = time.time() - started
    hit_count = sum(int(flag) for flag, _prompt, _answer in records)
    return hit_count, total, elapsed, records

# Punctuation-heavy rewordings of one question (the capital of France), replayed in
# this order through each policy. The braces/brackets are literal characters in plain
# strings — none of these are f-strings or format templates.
punct_prompts = [
    "Question: What is the capital of France?",
    "Q: What's France's capital city?",
    "France {capital} : what is it?",
    "France's capital-city (name): what?",
    "Tell me, please: the capital of France.",
    "Which city is the capital of France? (answer: one word)",
    "The capital of France is... what?",
    "Can you tell me: what the capital of France is?",
    "France -> capital? (city name)",
    "Pick one: {Paris, Lyon, Marseille} — which is France's capital?",
    "Fill in the blank: France's capital is {_____}.",
    "Capital of France: [_____]? Give just the city.",
    "France's capital (city) = ?",
    "What is the capital of France; respond with 'CityName' only.",
    "Answer-format: {City}. Prompt: capital of France?",
]
# Original policy (baseline), using the same delta as the MaxSim policy below.
orig_policy = VerifiedDecisionPolicy(delta=0.05)

# New MaxSim policy:
# - candidate_selection="all"  => MaxSim used for choosing the NN too (slow but "MaxSim everywhere")
# - candidate_selection="top_k" => fast rerank
maxsim_policy = VerifiedSplitterDecisionPolicy(
    delta=0.05,
    splitter=splitter,
    candidate_selection="all",   # or "top_k"
    candidate_k=10,              # only used for "top_k"
)

# Compare on the punctuation-heavy prompt set
# Each call builds its own VCache, so the two runs do not share cached entries;
# `n` (prompt count) is taken from the first run and reused for both reports.
orig_hits, n, orig_t, orig_rows = run_sequence(orig_policy, punct_prompts)
ms_hits, _, ms_t, ms_rows = run_sequence(maxsim_policy, punct_prompts)

# The percentages divide by n, so this assumes a non-empty prompt list.
print(f"Original Verified hits: {orig_hits}/{n}  ({orig_hits/n:.1%})  time={orig_t:.2f}s")
print(f"MaxSim Verified hits:   {ms_hits}/{n}  ({ms_hits/n:.1%})  time={ms_t:.2f}s")

# Rows are paired by position; both runs replayed the same list in the same order.
print("\nPer-prompt hits (orig vs maxsim):")
for (h1, p, _), (h2, _, _) in zip(orig_rows, ms_rows):
    print(f"- orig={h1}  maxsim={h2}  :: {p}")

  from .autonotebook import tqdm as notebook_tqdm


INFO 12-29 02:43:56 [__init__.py:216] Automatically detected platform cuda.


2025-12-29 02:43:58,014	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


[DEVICE] EmbeddingModel loaded on cpu
正在加载分句模型: /data2/ali/checkpoints_words/epoch=9-step=540.ckpt ...
MaxSimSplitter: 复用外部传入的 EmbeddingModel
[GEN] bs=1 device=cpu lm_device=cpu max_len=512




分句模型加载完成。


  self.embeddings = HuggingFaceEmbeddings(model_name=model_name)


Original Verified hits: 9/15  (60.0%)  time=18.48s
MaxSim Verified hits:   10/15  (66.7%)  time=14.45s

Per-prompt hits (orig vs maxsim):
- orig=False  maxsim=False  :: Question: What is the capital of France?
- orig=False  maxsim=False  :: Q: What's France's capital city?
- orig=False  maxsim=False  :: France {capital} : what is it?
- orig=False  maxsim=False  :: France's capital-city (name): what?
- orig=False  maxsim=False  :: Tell me, please: the capital of France.
- orig=True  maxsim=True  :: Which city is the capital of France? (answer: one word)
- orig=True  maxsim=True  :: The capital of France is... what?
- orig=True  maxsim=True  :: Can you tell me: what the capital of France is?
- orig=True  maxsim=True  :: France -> capital? (city name)
- orig=False  maxsim=True  :: Pick one: {Paris, Lyon, Marseille} — which is France's capital?
- orig=True  maxsim=True  :: Fill in the blank: France's capital is {_____}.
- orig=True  maxsim=True  :: Capital of France: [_____]? Give just the



In [4]:
import json
import os
import time
from typing import Any

# HF mirror + caches: keep consistent even if you run this cell standalone.
# These must be in place before the `datasets` / `vcache` imports that follow.
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"

HF_CACHE_BASE = "/data2/ali/hf"
HF_HOME = os.path.join(HF_CACHE_BASE, "home")
HF_HUB_CACHE = os.path.join(HF_CACHE_BASE, "hub")
HF_DATASETS_CACHE = os.path.join(HF_CACHE_BASE, "datasets")
HF_TRANSFORMERS_CACHE = os.path.join(HF_CACHE_BASE, "transformers")

# Create the cache directories eagerly so path/permission problems surface here.
os.makedirs(HF_HOME, exist_ok=True)
os.makedirs(HF_HUB_CACHE, exist_ok=True)
os.makedirs(HF_DATASETS_CACHE, exist_ok=True)
os.makedirs(HF_TRANSFORMERS_CACHE, exist_ok=True)

os.environ["HF_HOME"] = HF_HOME
os.environ["HF_HUB_CACHE"] = HF_HUB_CACHE
os.environ["HUGGINGFACE_HUB_CACHE"] = HF_HUB_CACHE
# The `datasets` library documents HF_DATASETS_CACHE as its cache override;
# DATASETS_CACHE is still exported for anything already relying on it.
os.environ["HF_DATASETS_CACHE"] = HF_DATASETS_CACHE
os.environ["DATASETS_CACHE"] = HF_DATASETS_CACHE
os.environ["TRANSFORMERS_CACHE"] = HF_TRANSFORMERS_CACHE

from datasets import load_dataset

from vcache import (
    BenchmarkComparisonSimilarityEvaluator,
    BenchmarkEmbeddingEngine,
    BenchmarkInferenceEngine,
    HNSWLibVectorDB,
    InMemoryEmbeddingMetadataStorage,
    NoEvictionPolicy,
    SimilarityMetricType,
    VCache,
    VCacheConfig,
    VerifiedDecisionPolicy,
)



INFO 12-29 18:42:41 [__init__.py:216] Automatically detected platform cuda.


In [None]:
import os, time

# HF mirror + caches: must be set BEFORE importing datasets/huggingface_hub in this kernel.
# If you already imported those in this kernel, restart the kernel so the new endpoint takes effect.
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"

HF_CACHE_BASE = "/data2/ali/hf"
HF_HOME = os.path.join(HF_CACHE_BASE, "home")
HF_HUB_CACHE = os.path.join(HF_CACHE_BASE, "hub")
HF_DATASETS_CACHE = os.path.join(HF_CACHE_BASE, "datasets")
HF_TRANSFORMERS_CACHE = os.path.join(HF_CACHE_BASE, "transformers")

# Create the cache directories eagerly so path/permission problems surface here.
os.makedirs(HF_HOME, exist_ok=True)
os.makedirs(HF_HUB_CACHE, exist_ok=True)
os.makedirs(HF_DATASETS_CACHE, exist_ok=True)
os.makedirs(HF_TRANSFORMERS_CACHE, exist_ok=True)

os.environ["HF_HOME"] = HF_HOME
os.environ["HF_HUB_CACHE"] = HF_HUB_CACHE
os.environ["HUGGINGFACE_HUB_CACHE"] = HF_HUB_CACHE
# The `datasets` library documents HF_DATASETS_CACHE as its cache override;
# DATASETS_CACHE is still exported for anything already relying on it.
os.environ["HF_DATASETS_CACHE"] = HF_DATASETS_CACHE
os.environ["DATASETS_CACHE"] = HF_DATASETS_CACHE
os.environ["TRANSFORMERS_CACHE"] = HF_TRANSFORMERS_CACHE

# Prefer setting tokens in your shell instead of hard-coding secrets in notebooks:
#   export HF_TOKEN=...
#   export DASHSCOPE_API_KEY=...
# setdefault leaves values exported in the shell untouched; the literals below are
# only placeholders used when nothing is set.
os.environ.setdefault("DASHSCOPE_API_KEY", "sk-")
os.environ.setdefault("HF_TOKEN", "")

# Quick sanity-check (after a kernel restart, this should print https://hf-mirror.com)
# Best-effort diagnostic only: any exception raised in the block (not just a missing
# package) is caught and reported with the "not available yet" message.
try:
    import huggingface_hub

    print("HF_ENDPOINT env:", os.environ.get("HF_ENDPOINT"))
    # What the already-imported library resolved, as opposed to the env var above.
    print("huggingface_hub ENDPOINT:", getattr(huggingface_hub.constants, "ENDPOINT", None))
    print("HF_HOME:", os.environ.get("HF_HOME"))
    # NOTE(review): this reports DATASETS_CACHE; the `datasets` docs name the override
    # HF_DATASETS_CACHE — confirm which variable is meant to be checked here.
    print("DATASETS_CACHE:", os.environ.get("DATASETS_CACHE"))
except Exception as e:
    print("huggingface_hub not available yet:", e)

# Benchmark knobs. The benchmark cell further down reads DATASET_ID, EMBEDDING_COL,
# LLM_COL and DELTA from here.
DATASET_ID = "vCache/SemBenchmarkClassification"  # or: vCache/SemBenchmarkLmArena, vCache/SemBenchmarkSearchQueries, ...
EMBEDDING_COL = "emb_gte"  # from benchmarks/benchmark.py -> EmbeddingModel.<X>.value[0]
LLM_COL = "response_llama_3_8b"  # from benchmarks/benchmark.py -> LargeLanguageModel.<Y>.value[0]
MAX_SAMPLES = 500  # number of leading rows of the train split to load

DELTA = 0.02  # passed as `delta` to VerifiedDecisionPolicy in the benchmark cell

# Imported after the environment setup above so the mirror/cache settings are seen.
from datasets import load_dataset

# Load only the first MAX_SAMPLES rows of the train split; cache_dir is passed
# explicitly rather than relying on environment variables.
rows = load_dataset(
    DATASET_ID,
    split=f"train[:{MAX_SAMPLES}]",
    cache_dir=HF_DATASETS_CACHE,
)

HF_ENDPOINT env: https://hf-mirror.com
huggingface_hub ENDPOINT: https://hf-mirror.com
HF_HOME: /data2/ali/hf/home
DATASETS_CACHE: /data2/ali/hf/datasets


Generating train split: 45000 examples [00:13, 3293.75 examples/s]


In [5]:


# ------------------------------------------------------------
# Single-dataset benchmark (HuggingFace) using VerifiedDecisionPolicy
# - No live model/API calls: uses Benchmark* engines + precomputed columns in the dataset
# - To run a different dataset/model combo, change DATASET_ID, EMBEDDING_COL, LLM_COL
#   and MAX_SAMPLES in the dataset-loading cell, then re-run that cell and this one.
# - Relies on names from earlier cells: json / time / Any and the vcache imports,
#   plus rows and DELTA.
# ------------------------------------------------------------

# Build vCache with benchmark engines
config = VCacheConfig(
    inference_engine=BenchmarkInferenceEngine(),
    embedding_engine=BenchmarkEmbeddingEngine(),
    vector_db=HNSWLibVectorDB(
        similarity_metric_type=SimilarityMetricType.COSINE,
        max_capacity=200_000,
    ),
    embedding_metadata_storage=InMemoryEmbeddingMetadataStorage(),
    eviction_policy=NoEvictionPolicy(),
    # For SemBenchmark datasets, correctness is encoded via id_set, so this avoids any LLM judge.
    similarity_evaluator=BenchmarkComparisonSimilarityEvaluator(),
)

# Note: `policy` and `vcache` rebind the names used by the quick-start cell at the
# top of the notebook.
policy = VerifiedDecisionPolicy(delta=DELTA)
vcache = VCache(config=config, policy=policy)


def _to_float_list(x: Any):
    if isinstance(x, str):
        try:
            x = json.loads(x)
        except Exception:
            import ast

            x = ast.literal_eval(x)
    if hasattr(x, "tolist"):
        x = x.tolist()
    return [float(v) for v in x]


# Running tallies: cache hits, hits whose metadata id_set matches the request, rows seen.
hits = 0
hit_correct = 0
n = 0

t0 = time.time()
try:
    for r in rows:
        prompt = r["prompt"]
        system_prompt = r.get("output_format", "")

        # id_set is used by BenchmarkComparisonSimilarityEvaluator
        # Try the lower-case column first, then "ID_Set"; -1 means "not present".
        id_set = r.get("id_set", -1)
        if id_set == -1:
            id_set = r.get("ID_Set", -1)

        emb = _to_float_list(r[EMBEDDING_COL])
        label_response = r[LLM_COL]

        # Inject precomputed values for *this* request
        config.embedding_engine.set_next_embedding(emb)
        config.inference_engine.set_next_response(label_response)

        is_hit, resp, resp_meta, _nn_meta = vcache.infer_with_cache_info(
            prompt=prompt,
            system_prompt=system_prompt,
            id_set=id_set,
        )

        n += 1
        hits += int(is_hit)
        # A hit only counts as correct when the row has an id_set and the returned
        # metadata carries the same one.
        if is_hit and id_set != -1 and resp_meta is not None:
            hit_correct += int(resp_meta.id_set == id_set)

    elapsed = time.time() - t0
    print(f"Dataset: {DATASET_ID}")
    print(f"Columns: embedding={EMBEDDING_COL}  llm={LLM_COL}")
    print(f"delta={DELTA}  n={n}  time={elapsed:.2f}s")
    if n:
        print(f"hit_rate = {hits}/{n} = {hits/n:.1%}")
    else:
        # Empty dataset slice: avoid dividing by zero.
        print("hit_rate = 0/0 (no rows processed)")
    if hits:
        print(f"hit_correct_rate = {hit_correct}/{hits} = {hit_correct/hits:.1%} (requires id_set)")
finally:
    # Give background updates a moment, then stop background threads cleanly.
    # Done in `finally` so the policy is shut down even if the loop above raises.
    time.sleep(0.1)
    vcache.vcache_policy.shutdown()







Dataset: vCache/SemBenchmarkClassification
Columns: embedding=emb_gte  llm=response_llama_3_8b
delta=0.02  n=500  time=3.77s
hit_rate = 1/500 = 0.2%
hit_correct_rate = 0/1 = 0.0% (requires id_set)
