In [None]:
# Cell A - install packages
!pip install --upgrade pip
# core libs
!pip install vllm lmcache transformers accelerate huggingface_hub
!pip install torch

# optional: git-lfs if you need to clone on-colab (not recommended for big models)
!apt-get update && apt-get install -y git-lfs unzip
!git lfs install

# Show versions for debugging
import importlib, sys
for pkg in ("vllm","lmcache","transformers","accelerate","huggingface_hub","torch"):
    try:
        m = importlib.import_module(pkg)
        print(pkg, m.__version__)
    except Exception as e:
        print(pkg, "NOT INSTALLED or failed to import:", e)

In [None]:
# Cell B - mount Drive and unzip local copies
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# define paths (edit if you used different folder names)
DRIVE_BASE = "/content/drive/MyDrive/Tilli"
MODEL_ZIP = f"{DRIVE_BASE}/deepseek-coder-6.7B-instruct-GPTQ.zip"
#REPO_ZIP  = f"{DRIVE_BASE}/myrepo.zip"

# Copy to local ephemeral storage (faster than working on Drive)
!cp "{MODEL_ZIP}" /content/ || echo "Model zip not found at {MODEL_ZIP}"
#!cp "{REPO_ZIP}" /content/ || echo "Repo zip not found at {REPO_ZIP}"

# Unzip (overwrite if exists)
!unzip -o /content/deepseek-coder-6.7B-instruct-GPTQ.zip -d /content/deepseek_model || true
#!unzip -o /content/myrepo.zip -d /content/myrepo || true

# List files for verification
print("Model directory listing:")
!ls -lah /content/deepseek_model | sed -n '1,200p'
#print("\nRepo directory listing:")
#!ls -lah /content/myrepo | sed -n '1,200p'


In [None]:
import os, json, textwrap
from pathlib import Path

# The correct model root is directly at /content/deepseek_model/deepseek-coder-6.7B-instruct-GPTQ
model_root = "/content/deepseek_model/deepseek-coder-6.7B-instruct-GPTQ"

print("Using model_root:", model_root)
# Diagnostic prints to check path and directory status
print(f"Checking os.path.isdir({model_root}): {os.path.isdir(model_root)}")
assert os.path.isdir(model_root), f"Model folder not found at {model_root}. Check unzip step."

# Inspect files to find config / quant metadata
print("\nFiles in model root (first 200):")
!ls -lah "{model_root}" | sed -n '1,200p'

# Try to read a config.json if present (common)
cfg_paths = [os.path.join(model_root, f) for f in ("config.json", "model_index.json", "config.yaml")]
for cp in cfg_paths:
    if os.path.isfile(cp):
        print("\nFound config:", cp)
        try:
            with open(cp, "r", encoding="utf-8") as fh:
                raw = fh.read()
                preview = (raw[:1000] + "...") if len(raw) > 1000 else raw
                print(textwrap.fill(preview, width=200))
                # quick check for 'mxfp4'
                if "mxfp4" in raw.lower():
                    print("\n*** Warning: 'mxfp4' quant found in config. MXFP4 requires GPU compute capability >= 8.0 (Ampere+).")
        except Exception as e:
            print("Failed to read config:", e)
        break
else:
    print("\nNo config.json found; check model files for quant format (e.g. .bin, .gguf).")

# Check GPU capability programmatically
import torch
if torch.cuda.is_available():
    cc = torch.cuda.get_device_capability(0)
    print("\nCUDA device capability:", cc)
    if isinstance(cc, tuple) and (cc[0] >= 8):
        print("GPU seems Ampere+ (compatible with MXFP4).")
    else:
        print("GPU compute capability < 8.0. If model uses MXFP4 you will see an error loading it.")

In [None]:
# Now instantiate vLLM with LMCache. The checkpoint in model_root is the GPTQ
# build of the model, so it is loaded with GPTQ quantization.
print("\nAttempting to create LM with vLLM + LMCache (this may fail if format unsupported)")
try:
    # Imported up front so a missing/broken LMCache install fails here, before
    # the (slow) model load starts.
    import lmcache
    from vllm import LLM, SamplingParams

    # 2. Define the LMCache configuration
    # NOTE(review): connector name and role follow the LMCache/vLLM integration
    # docs; confirm they match the installed vLLM and LMCache versions.
    kv_cache_config = {
        "kv_connector": "LMCacheConnectorV1",
        "kv_role": "kv_both"
    }

    # 3. Initialize the vLLM engine with LMCache
    print("Loading model...")
    llm = LLM(
        model=model_root,
        # Lowercase "gptq" is the spelling recalled from the vLLM docs.
        quantization="gptq",
        kv_transfer_config=kv_cache_config,
        dtype="auto"
    )
    print("Model loaded.")

except Exception as e:
    # Print a short message for the notebook reader, then re-raise so the cell
    # still stops with the full traceback.
    print("Failed to instantiate vLLM LLM():\n", e)
    raise


# 4. Define sampling parameters
# `time` must be the stdlib module: `from datetime import time` binds the
# time-of-day class, which has no `time()` function and raised AttributeError
# on the first timing call.
import time

sampling_params = SamplingParams(temperature=0.7, max_tokens=100)

# 5. Define prompts to test caching
# The same prompt is sent twice on purpose: the second request is the one
# expected to benefit from the KV cache populated by the first.
prompts = [
    "What is the capital of France?",
    "What is the capital of France?",
]


def timed_generate(engine, prompt, params, label):
    """Generate a completion for one prompt and print the timing and output.

    Args:
        engine: object exposing ``generate(list_of_prompts, params)`` (the vLLM
            ``LLM`` instance created in the previous cell).
        prompt: prompt text to send.
        params: sampling parameters passed through to ``engine.generate``.
        label: heading printed before the run.

    Returns:
        Whatever ``engine.generate`` returned, so the caller can inspect it.
    """
    print(f"\n--- {label} ---")
    # perf_counter is a monotonic clock intended for measuring short durations.
    started = time.perf_counter()
    results = engine.generate([prompt], params)
    elapsed = time.perf_counter() - started

    print(f"Time taken: {elapsed:.2f} seconds")
    for result in results:
        print(f"Prompt: {result.prompt}")
        print(f"Generated: {result.outputs[0].text}\n")
    return results


# --- Run Generations ---

# Run the first prompt (will be slower and populate the cache)
outputs = timed_generate(llm, prompts[0], sampling_params, "Running first prompt (populating cache)")

# Run the second prompt (expected to be faster due to the cached prefix)
outputs = timed_generate(llm, prompts[1], sampling_params, "Running second prompt (using cache)")