# LMCache Backend Latency Test (GPU + Disk)

This Colab-friendly notebook starts an LMCache controller, configures a local Disk tier,
runs a small vLLM model, and measures latency for repeated prompts while steering
placement between GPU (hot) and Disk (warm) using perplexity/time-variance.

## 1) Setup

- Define repo, config, log, and disk paths.
- Install dependencies and optionally unzip your repo into /content/src.
- Update LMCache config for GPU + Disk (no CPU/Remote).

In [1]:
print('=== [1] Setting up environment and paths ===')
# Environment and paths
import os, sys, textwrap
REPO_DIR = '/src'
CONFIG   = f'{REPO_DIR}/config.yaml'
LOGFILE  = f'{REPO_DIR}/controller.log'
DISK_DIR = f'{REPO_DIR}lmcache_warm'
os.makedirs(REPO_DIR, exist_ok=True)
os.makedirs(DISK_DIR, exist_ok=True)
print('Paths set:', REPO_DIR, CONFIG, LOGFILE, DISK_DIR)


=== [1] Setting up environment and paths ===
Paths set: /src /src/config.yaml /src/controller.log /srclmcache_warm


In [2]:
print('=== [2] Installing dependencies and checking GPU ===')
# Install dependencies (rerun on fresh runtimes)
!pip -q install -U pip
!pip -q install requests transformers accelerate safetensors
# vLLM provides the LLM runtime; install if not present
!pip -q install vllm || true
# LMCache CLI (controller); if unavailable from pip in your env, use your script instead
!pip -q install lmcache || true
import subprocess, json
print('GPU available:', subprocess.run(['bash','-lc','nvidia-smi'], capture_output=True).returncode == 0)


=== [2] Installing dependencies and checking GPU ===
GPU available: True


In [None]:
print('=== [3] Checking for uploaded repo (src.zip) ===')
# If you uploaded src.zip, unzip it to /content/src
from pathlib import Path
if Path('src.zip').exists():
    !rm -rf /{}/src
    !mkdir -p /{REPO_DIR}
    !unzip -o src.zip -d /{REPO_DIR} > /dev/null
!ls -la /{REPO_DIR}/src || true


=== [3] Checking for uploaded repo (src.zip) ===
ls: cannot access '//src/src': No such file or directory


In [4]:
print('=== [4] Writing LMCache config (GPU+Disk only) ===')
# Update config: enable Disk, disable CPU/Remote, set disk root_dir to /content/lmcache_warm
import yaml
cfg = yaml.safe_load(open(CONFIG)) if os.path.exists(CONFIG) else {}
cfg.setdefault('local_disk', {})
cfg['local_disk']['enable'] = True
cfg['local_disk']['root_dir'] = DISK_DIR
cfg['local_disk']['max_disk_size_gb'] = cfg['local_disk'].get('max_disk_size_gb', 50)
cfg['local_disk']['eviction_policy'] = 'lru'
cfg['local_cpu'] = False
cfg.setdefault('remote_storage', {})
cfg['remote_storage']['enable'] = False
open(CONFIG, 'w').write(yaml.safe_dump(cfg))
print('Wrote config to', CONFIG)
print(open(CONFIG).read())


=== [4] Writing LMCache config (GPU+Disk only) ===
Wrote config to /src/config.yaml
local_cpu: false
local_disk:
  enable: true
  eviction_policy: lru
  max_disk_size_gb: 50
  root_dir: /srclmcache_warm
remote_storage:
  enable: false



## 2) Start LMCache Controller

- Tail controller logs for quick diagnostics.
- Start via repo script if present, else use lmcache_controller CLI.
- Verify health on http://127.0.0.1:9000/healthz.

In [5]:
# Helper to tail last N lines of a file
from collections import deque
def print_last_lines(path: str, n: int = 80):
    if not os.path.exists(path):
        print('No log at', path) 
        return
    try:
        with open(path, 'rb') as f:
            last = deque(f, maxlen=n)
        print(f'--- Last {len(last)} lines of {path} ---')
        for b in last:
            print(b.decode('utf-8', errors='replace').rstrip())
    except Exception as e:
        print('log read error:', e)
print('=== [5] Helper print_last_lines() defined ===')


=== [5] Helper print_last_lines() defined ===


In [6]:
print('=== [6] Starting LMCache controller (script or CLI) ===')
# Start LMCache controller (prefer repo script; fallback to CLI)
import subprocess, time, shutil
os.environ['LMCACHE_CONFIG_FILE'] = CONFIG
START_SCRIPT = f'{REPO_DIR}/start_controller_server.sh'
proc = None
if os.path.exists(START_SCRIPT):
    print('Starting controller via script:', START_SCRIPT)
    os.chmod(START_SCRIPT, 0o755)
    proc = subprocess.Popen(['bash', START_SCRIPT], cwd=REPO_DIR, stdout=open(LOGFILE, 'ab'), stderr=subprocess.STDOUT, env=os.environ.copy())
elif shutil.which('lmcache_controller'):
    print('Starting controller via lmcache_controller CLI')
    cmd = ['lmcache_controller','--host','127.0.0.1','--port','9000','--monitor-port','9001','--config', CONFIG]
    proc = subprocess.Popen(cmd, stdout=open(LOGFILE, 'ab'), stderr=subprocess.STDOUT, env=os.environ.copy())
else:
    print('No controller found. Install lmcache or provide start_controller_server.sh in /content/src')
time.sleep(1.0)
print_last_lines(LOGFILE, 60)


=== [6] Starting LMCache controller (script or CLI) ===
Starting controller via lmcache_controller CLI
--- Last 0 lines of /src/controller.log ---


In [7]:
print('=== [7] Health check: LMCache controller /healthz ===')
# Health check loop
import requests, time
ok=False
for _ in range(10):
    try:
        r = requests.get('http://127.0.0.1:9000/healthz', timeout=1.0)
        print('health:', r.status_code, r.text)
        ok=True
        break
    except Exception as e:
        time.sleep(0.5)
if not ok:
    print('Error: Controller not reachable; check logs above')


=== [7] Health check: LMCache controller /healthz ===
Error: Controller not reachable; check logs above


## 3) Initialize Model and Cache

- Initialize a small model in vLLM (Gemma 270M).
- Bind LMCache controller + MultiTierCache manager.

In [8]:
print('=== [8] Initializing vLLM LLM (Gemma 270M) ===')
# Initialize vLLM LLM (Gemma 270M)
from vllm import LLM, SamplingParams
try:
    llm = LLM(model='google/gemma-3-270m-it')
    sp = SamplingParams(max_tokens=64, temperature=0.0)
    print('LLM initialized')
except Exception as e:
    print('Failed to init vLLM. Ensure GPU runtime and permissions for the model.')
    raise


=== [8] Initializing vLLM LLM (Gemma 270M) ===
INFO 11-21 01:57:30 [utils.py:253] non-default args: {'disable_log_stats': True, 'model': 'google/gemma-3-270m-it'}
Failed to init vLLM. Ensure GPU runtime and permissions for the model.


Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


OSError: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/google/gemma-3-270m-it.
401 Client Error. (Request ID: Root=1-691fc714-79bc2e874b76a75e60ca385f;58b7dee3-4a11-46dc-8a19-7f5b1b56c3fa)

Cannot access gated repo for url https://huggingface.co/google/gemma-3-270m-it/resolve/main/config.json.
Access to model google/gemma-3-270m-it is restricted. You must have access to it and be authenticated to access it. Please log in.

In [None]:
print('=== [9] Wiring LMCacheController + MultiTierCache ===')
# Import LMCache client + manager
sys.path.insert(0, REPO_DIR)
from src.cache_controller import LMCacheController
from src.tiered_caching import MultiTierCache
controller = LMCacheController(host='127.0.0.1', port=9000, model='google/gemma-3-270m-it')
cache_manager = MultiTierCache(controller)
cache_manager.set_llm(llm)
print('Cache manager ready')


## 4) Latency Tests (GPU vs Disk)

- Use perplexity and time-variance to steer placement.
- Run twice to observe warm-cache speedups.

In [None]:
# Helper to run a latency test with desired perplexity/time_variance
import time as _t
def latency_run(label: str, prompt: str, perplexity: float, time_variance: float, max_tokens: int = 64):
    meta = {'perplexity': perplexity, 'time_variance': time_variance}
    sp_local = SamplingParams(max_tokens=max_tokens, temperature=0.0)
    print(f'=== [10] latency_run start: {label} | perplexity={perplexity}, time_variance={time_variance}, max_tokens={max_tokens} ===')
    print('Prompt length (chars):', len(prompt))
    # First run
    t0 = _t.time()
    _ = cache_manager.generate_and_manage(prompt, sp_local, metadata=meta.copy())
    t1 = _t.time()
    tokens = controller.tokenize(prompt)
    layout = controller.lookup(tokens)
    b1 = layout.get('lmcache_default_instance', [None])[0]
    # Second run
    t2 = _t.time()
    _ = cache_manager.generate_and_manage(prompt, sp_local, metadata=meta.copy())
    t3 = _t.time()
    layout2 = controller.lookup(tokens)
    b2 = layout2.get('lmcache_default_instance', [None])[0]
    print(f'[{label}] Run1 backend={b1} latency={t1-t0:.3f}s | Run2 backend={b2} latency={t3-t2:.3f}s')
    return {'label': label, 'b1': b1, 't1': t1-t0, 'b2': b2, 't2': t3-t2}


In [None]:
# GPU-preferred test (hot): low perplexity / low variance
res_gpu = latency_run('GPU', 'Hello GPU test ' * 128, perplexity=5.0, time_variance=0.1, max_tokens=64)
res_gpu


In [None]:
# Disk-preferred test (warm): high perplexity / high variance
res_disk = latency_run('Disk', 'Move to disk test ' * 512, perplexity=100.0, time_variance=0.9, max_tokens=32)
res_disk


In [None]:
# Custom test: tweak to steer placement
custom_prompt = 'Custom backend placement test ' * 256
res_custom = latency_run('Custom', custom_prompt, perplexity=30.0, time_variance=0.5, max_tokens=48)
res_custom


## 5) Summary & Inspect Disk

- Review timings and backends.
- List a few files from the Disk tier directory.

In [None]:
print('=== [11] Summary and Disk inspection ===')
# Summary
from pprint import pprint
pprint({'GPU': res_gpu, 'Disk': res_disk, 'Custom': res_custom})
print('Disk dir contents (first few files):')
!find /{REPO_DIR}/lmcache_warm -type f | head -n 10


## 6) TTFT Batch Benchmarks (Long Contexts)

Measure approximate TTFT by generating only 1 token.
Runs multiple prompts with long shared prefixes, comparing first vs second run.

In [None]:
print('=== [12] Running TTFT batch benchmarks (long-context prompts) ===')
import time as _t
from pprint import pprint

def make_long_context(repeats: int = 2048) -> str:
    base = 'In a distant future, advanced neural architectures collaborate with humans to solve complex problems. '
    return base * (repeats // 8)

base_ctx = make_long_context(2048)
prompts = [f'{base_ctx} Task {i}: Write a concise plan.' for i in range(5)]
sp_ttft = SamplingParams(max_tokens=1, temperature=0.0)

results = []
for i, p in enumerate(prompts):
    t0 = _t.time()
    _ = cache_manager.generate_and_manage(p, sp_ttft, metadata={'perplexity': 12.0, 'time_variance': 0.2})
    t1 = _t.time()
    tokens = controller.tokenize(p)
    layout = controller.lookup(tokens)
    b1 = layout.get('lmcache_default_instance', [None])[0]

    t2 = _t.time()
    _ = cache_manager.generate_and_manage(p, sp_ttft, metadata={'perplexity': 12.0, 'time_variance': 0.2})
    t3 = _t.time()
    layout2 = controller.lookup(tokens)
    b2 = layout2.get('lmcache_default_instance', [None])[0]

    results.append({'idx': i, 'ttft_first': t1-t0, 'ttft_second': t3-t2, 'backend_first': b1, 'backend_second': b2})

print('TTFT results (seconds):')
pprint(results)
print('Avg first:', sum(r['ttft_first'] for r in results)/len(results))
print('Avg second:', sum(r['ttft_second'] for r in results)/len(results))
