# LMCache Backend Latency Test (GPU + Disk)

This Colab-friendly notebook starts an LMCache controller, configures a local Disk tier,
runs a small vLLM model, and measures latency for repeated prompts while steering
placement between GPU (hot) and Disk (warm) using perplexity/time-variance.

## 1) Setup

- Define repo, config, log, and disk paths.
- Install dependencies and optionally unzip your repo into /content/src.
- Update LMCache config for GPU + Disk (no CPU/Remote).

In [1]:
print('=== [1] Setting up environment and paths ===')
# Environment and paths
import os, sys, textwrap
REPO_DIR = '/content/src'
CONFIG   = f'{REPO_DIR}/config.yaml'
LOGFILE  = f'{REPO_DIR}/controller.log'
DISK_DIR = '/content/lmcache_warm'
os.makedirs(REPO_DIR, exist_ok=True)
os.makedirs(DISK_DIR, exist_ok=True)
print('Paths set:', REPO_DIR, CONFIG, LOGFILE, DISK_DIR)
print('=== [1] Setup paths DONE ===')


=== [1] Setting up environment and paths ===
Paths set: /content/src /content/src/config.yaml /content/src/controller.log /content/lmcache_warm
=== [1] Setup paths DONE ===


In [2]:
print('=== [2] Installing dependencies and checking GPU ===')
# Install dependencies (rerun on fresh runtimes)
!pip -q install -U pip
!pip -q install transformers jedi pydantic
!pip show transformers
# vLLM provides the LLM runtime; install if not present
!pip -q install vllm
!pip show vllm
# LMCache Python package (provides lmcache.v1.api_server)
!pip -q install lmcache
import subprocess, importlib
gpu_rc = subprocess.run(['bash','-lc','nvidia-smi'], capture_output=True)
gpu_available = (gpu_rc.returncode == 0)
print('GPU available:', gpu_available)
try:
    importlib.import_module('lmcache.v1.api_server')
    print('lmcache.v1.api_server import OK')
except ImportError as e:
    raise RuntimeError('lmcache.v1.api_server not available; ensure pip install lmcache succeeded.') from e
print('=== [2] Dependencies/GPU check DONE ===')


=== [2] Installing dependencies and checking GPU ===
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m23.3 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hName: transformers
Version: 4.57.1
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transformers@huggingface.co
License: Apache 2.0 License
Location: /usr/local/lib/python3.12/dist-packages
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
Required-by: peft, sentence-transformers
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gradio 5.49.1 requires pydantic<2.12,>=

## Unzip files

In [3]:
print('=== [3] Checking for uploaded repo (src.zip) ===')
# If you uploaded src.zip, unzip it to /content/src
from pathlib import Path
if Path('src.zip').exists():
    !unzip -o src.zip -d /content > /dev/null
!ls -la /content/src || true
print('=== [3] Repo presence check DONE ===')


=== [3] Checking for uploaded repo (src.zip) ===
total 8
drwxr-xr-x 2 root root 4096 Nov 21 14:24 .
drwxr-xr-x 1 root root 4096 Nov 21 14:24 ..
=== [3] Repo presence check DONE ===


## 3. Check GPU status

In [None]:
import torch

if torch.cuda.is_available():
    device_name = torch.cuda.get_device_name(0)
    cc = torch.cuda.get_device_capability(0)
    mem_gb = torch.cuda.get_device_properties(0).total_memory / (1024**3)

    print(f"GPU: {device_name}")
    print(f"Compute Capability: {cc[0]}.{cc[1]}")
    print(f"Memory: {mem_gb:.1f} GB")

    if cc[0] >= 7 and cc[1] >= 5:
        print("✓ T4 or better - W4A16 quantization supported")
    else:
        print("⚠️  GPU may not support quantization kernels")
else:
    print("✗ No CUDA GPU detected!")
    raise RuntimeError("GPU required")

## 2) Initialize Model and Cache

- Initialize a small model in vLLM (Gemma 270M).
- Bind LMCache controller + MultiTierCache manager.

In [8]:
from huggingface_hub import notebook_login, whoami

# This will prompt you to paste your token
notebook_login()
user_info = whoami()

# If whoami() runs without raising an exception, the login was successful.
print("✅ Login was **successful**!")
print(f"Logged in as **{user_info['name']}** (user ID: {user_info['id']})")
print(f"Permissions: {user_info['auth']['accessTokenRole']}")
print('=== [4] Setup complete. You can now run the LMCache backend server. ===')

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

✅ Login was **successful**!
Logged in as **redbeardthedetective** (user ID: 6907daf9831f263751c64fb3)


KeyError: 'accessTokenRole'

In [13]:
%time
from huggingface_hub import snapshot_download
import os

model_id = "google/gemma-3-270m-it"
local_dir = "/content/models/gemma-3-270m-it"

if os.path.exists(local_dir):
    print(f"✓ Model already cached at {local_dir}")
else:
    print(f"Downloading {model_id}...")
    snapshot_download(
        repo_id=model_id,
        local_dir=local_dir,
        local_dir_use_symlinks=False
    )
    print(f"✓ Downloaded to {local_dir}")

# Verify download
!ls -lh {local_dir}

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.48 µs
✓ Model already cached at /content/models/gemma-3-270m-it
total 28K
-rw-r--r-- 1 root root 28K Nov 21 14:29 README.md


In [14]:
%time
from vllm import LLM, SamplingParams

model_path = "/content/models/gemma-3-270m-it"

print("Loading model with vLLM + LMCache...")
print("(This takes 30-60s on T4)\n")

# LMCache configuration
kv_cache_config = {
    "kv_connector": "LMCacheConnectorV1",
    "kv_role": "kv_both"
}

try:
    llm = LLM(
        model=model_path,
        dtype="auto",
        gpu_memory_utilization=0.8,
        max_model_len=2048,
        kv_transfer_config=kv_cache_config,
        enforce_eager=True  # Disable CUDA graphs for compatibility
    )
    print("\n✓ Model loaded successfully")
except Exception as e:
    print(f"\n✗ Model loading failed: {e}")
    raise

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 3.81 µs
Loading model with vLLM + LMCache...
(This takes 30-60s on T4)

INFO 11-21 16:02:50 [utils.py:253] non-default args: {'max_model_len': 2048, 'gpu_memory_utilization': 0.8, 'disable_log_stats': True, 'enforce_eager': True, 'kv_transfer_config': KVTransferConfig(kv_connector='LMCacheConnectorV1', engine_id='e2cf25d7-8247-46ff-8c9b-50661c32f6ef', kv_buffer_device='cuda', kv_buffer_size=1000000000.0, kv_role='kv_both', kv_rank=None, kv_parallel_size=1, kv_ip='127.0.0.1', kv_port=14579, kv_connector_extra_config={}, kv_connector_module_path=None, enable_permute_local_kv=False), 'model': '/content/models/gemma-3-270m-it'}

✗ Model loading failed: 1 validation error for ModelConfig
  Value error, Invalid repository ID or local directory specified: '/content/models/gemma-3-270m-it'.
Please verify the following requirements:
1. Provide a valid Hugging Face repository ID.
2. Specify a local directory that contains a recognized confi

ValidationError: 1 validation error for ModelConfig
  Value error, Invalid repository ID or local directory specified: '/content/models/gemma-3-270m-it'.
Please verify the following requirements:
1. Provide a valid Hugging Face repository ID.
2. Specify a local directory that contains a recognized configuration file.
   - For Hugging Face models: ensure the presence of a 'config.json'.
   - For Mistral models: ensure the presence of a 'params.json'.
3. For GGUF: pass the local path of the GGUF checkpoint.
   Loading GGUF from a remote repo directly is not yet supported.
 [type=value_error, input_value=ArgsKwargs((), {'model': ...rocessor_plugin': None}), input_type=ArgsKwargs]
    For further information visit https://errors.pydantic.dev/2.12/v/value_error

## 3) Start LMCache Controller

- Tail controller logs for quick diagnostics.
- Start via `src.notebook_bootstrap.start_controller()` (wraps the lmcache CLI for Colab).
- Verify health on http://127.0.0.1:9000/health.

In [None]:
print('=== Wiring LMCacheController + MultiTierCache ===')
# Import LMCache client + manager
sys.path.insert(0, REPO_DIR)
from src.cache_controller import LMCacheController
from src.tiered_caching import MultiTierCache
controller = LMCacheController(host='127.0.0.1', port=9000, model='google/gemma-3-270m-it')
cache_manager = MultiTierCache(controller)
cache_manager.set_llm(llm)
print('Cache manager ready')
print('=== [9] LMCacheController wiring DONE ===')

In [None]:
# Helper to tail last N lines of a file
from collections import deque
def print_last_lines(path: str, n: int = 80):
    if not os.path.exists(path):
        print('No log at', path) 
        return
    try:
        with open(path, 'rb') as f:
            last = deque(f, maxlen=n)
        print(f'--- Last {len(last)} lines of {path} ---')
        for b in last:
            print(b.decode('utf-8', errors='replace').rstrip())
    except Exception as e:
        print('log read error:', e)
print('=== [5] Helper print_last_lines() defined ===')
print('=== [5] Log helper cell DONE ===')


=== [5] Helper print_last_lines() defined ===


In [None]:
print('=== [6] Starting LMCache controller via cache_controller ===')
import time
os.environ['LMCACHE_CONFIG_FILE'] = CONFIG
if REPO_DIR not in sys.path:
    sys.path.insert(0, REPO_DIR)
try:
    pid = controller.start_controller(
        config_path=CONFIG,
        host='127.0.0.1',
        port=9000,
        log_path=LOGFILE,
    )
    print('Controller PID:', pid)
    print('Controller running?', controller.controller_running())
except Exception as e:
    print('Error starting controller via cache_controller:', e)
time.sleep(1.0)
print_last_lines(LOGFILE, 60)
print('=== [6] Controller start cell DONE ===')


=== [6] Starting LMCache controller (script or CLI) ===
Starting controller via lmcache_controller CLI
--- Last 4 lines of /content/src/controller.log ---
usage: lmcache_controller [-h] [--host HOST] [--port PORT]
                          [--monitor-ports MONITOR_PORTS]
                          [--monitor-port MONITOR_PORT]
lmcache_controller: error: unrecognized arguments: --config /content/src/config.yaml


In [None]:
print('=== [7] Health check: LMCache controller /health ===')
# Health check loop
import requests, time
ok=False
for _ in range(10):
    r = controller.health()
    print('health:', r)
    ok=True
    break
    if not r.ok:
        time.sleep(0.5)
if not ok:
    print('Error: Controller not reachable; check logs above')
print('=== [7] Health check cell DONE ===')

=== [7] Health check: LMCache controller /healthz ===
Controller not reachable; check logs above


## 4) Latency Tests (GPU vs Disk)

- Use perplexity and time-variance to steer placement.
- Run twice to observe warm-cache speedups.

In [None]:
# Helper to run a latency test with desired perplexity/time_variance
import time as _t
def latency_run(label: str, prompt: str, perplexity: float, time_variance: float, max_tokens: int = 64):
    print('Controller running?', controller.controller_running())
    
    meta = {'perplexity': perplexity, 'time_variance': time_variance}
    sp_local = SamplingParams(max_tokens=max_tokens, temperature=0.0)
    print(f'=== [10] latency_run start: {label} | perplexity={perplexity}, time_variance={time_variance}, max_tokens={max_tokens} ===')
    print('Prompt length (chars):', len(prompt))
    # Quick sanity on controller + LLM
    try:
        h = controller.health()
        print('controller.health():', h)
    except Exception as e:
        print('controller.health() failed:', repr(e))
    print('llm type:', type(llm))
    # First run
    t0 = _t.time()
    try:
        _ = cache_manager.generate_and_manage(prompt, sp_local, metadata=meta.copy())
    except Exception as e:
        t_err = _t.time()
        print(f'ERROR during first generate_and_manage for {label}:', repr(e))
        print('Elapsed before error (s):', t_err - t0)
        raise
    t1 = _t.time()
    tokens = controller.tokenize(prompt)
    layout = controller.lookup(tokens)
    print('layout after first run:', layout)
    b1 = layout.get('lmcache_default_instance', [None])[0]
    # Second run
    t2 = _t.time()
    try:
        _ = cache_manager.generate_and_manage(prompt, sp_local, metadata=meta.copy())
    except Exception as e:
        t_err2 = _t.time()
        print(f'ERROR during second generate_and_manage for {label}:', repr(e))
        print('Elapsed before error (s):', t_err2 - t2)
        raise
    t3 = _t.time()
    layout2 = controller.lookup(tokens)
    print('layout after second run:', layout2)
    b2 = layout2.get('lmcache_default_instance', [None])[0]
    print(f'[{label}] Run1 backend={b1} latency={t1-t0:.3f}s | Run2 backend={b2} latency={t3-t2:.3f}s')
    return {'label': label, 'b1': b1, 't1': t1-t0, 'b2': b2, 't2': t3-t2}
print('=== [10] latency_run helper definition DONE ===')


In [None]:
# GPU-preferred test (hot): low perplexity / low variance
print('--- GPU test: checking controller health before latency_run ---')
try:
    print('pre-run controller.health():', controller.health())
except Exception as e:
    print('pre-run controller.health() failed:', repr(e))
print('--- GPU test: calling latency_run ---')
res_gpu = latency_run('GPU', 'Hello GPU test ' * 128, perplexity=5.0, time_variance=0.1, max_tokens=64)
print('GPU test result dict:', res_gpu)
print('=== [10a] GPU latency_run invocation DONE ===')


In [None]:
# Disk-preferred test (warm): high perplexity / high variance
res_disk = latency_run('Disk', 'Move to disk test ' * 512, perplexity=100.0, time_variance=0.9, max_tokens=32)
res_disk
print('=== [10b] Disk latency_run invocation DONE ===')


In [None]:
# Custom test: tweak to steer placement
custom_prompt = 'Custom backend placement test ' * 256
res_custom = latency_run('Custom', custom_prompt, perplexity=30.0, time_variance=0.5, max_tokens=48)
res_custom
print('=== [10c] Custom latency_run invocation DONE ===')


## 5) Summary & Inspect Disk

- Review timings and backends.
- List a few files from the Disk tier directory.

In [None]:
print('=== [11] Summary and Disk inspection ===')
# Summary
from pprint import pprint
pprint({'GPU': res_gpu, 'Disk': res_disk, 'Custom': res_custom})
print('Disk dir contents (first few files):')
!find /content/lmcache_warm -type f | head -n 10
print('=== [11] Summary cell DONE ===')


## 6) TTFT Batch Benchmarks (Long Contexts)

Measure approximate TTFT by generating only 1 token.
Runs multiple prompts with long shared prefixes, comparing first vs second run.

In [None]:
print('=== [12] Running TTFT batch benchmarks (long-context prompts) ===')
import time as _t
from pprint import pprint

def make_long_context(repeats: int = 2048) -> str:
    base = 'In a distant future, advanced neural architectures collaborate with humans to solve complex problems. '
    return base * (repeats // 8)

base_ctx = make_long_context(2048)
prompts = [f'{base_ctx} Task {i}: Write a concise plan.' for i in range(5)]
sp_ttft = SamplingParams(max_tokens=1, temperature=0.0)

results = []
for i, p in enumerate(prompts):
    t0 = _t.time()
    _ = cache_manager.generate_and_manage(p, sp_ttft, metadata={'perplexity': 12.0, 'time_variance': 0.2})
    t1 = _t.time()
    tokens = controller.tokenize(p)
    layout = controller.lookup(tokens)
    b1 = layout.get('lmcache_default_instance', [None])[0]

    t2 = _t.time()
    _ = cache_manager.generate_and_manage(p, sp_ttft, metadata={'perplexity': 12.0, 'time_variance': 0.2})
    t3 = _t.time()
    layout2 = controller.lookup(tokens)
    b2 = layout2.get('lmcache_default_instance', [None])[0]

    results.append({'idx': i, 'ttft_first': t1-t0, 'ttft_second': t3-t2, 'backend_first': b1, 'backend_second': b2})

print('TTFT results (seconds):')
pprint(results)
print('Avg first:', sum(r['ttft_first'] for r in results)/len(results))
print('Avg second:', sum(r['ttft_second'] for r in results)/len(results))
print('=== [12] TTFT benchmarks cell DONE ===')
