In [None]:
!pip install vllm==0.11.0 lmcache
!pip install torch==2.8.0 torchvision==0.23.0 torchaudio==2.8.0 xformers==0.0.32.post1 --index-url https://download.pytorch.org/whl/cu121

: 

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hugging Face Hub repository of the checkpoint used throughout this notebook.
model_id = "TheBloke/deepseek-coder-6.7B-instruct-GPTQ"

# Load the tokenizer published alongside the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Load the model. Both loading options are left on "auto":
#   * dtype="auto"      - let the library choose the tensor data type.
#   * device_map="auto" - let the library place the layers on the available
#                         device(s) to manage memory.
# NOTE(review): this model is not used by the later cells shown here and may
# keep GPU memory that the vLLM engine needs - confirm it is still required.
load_options = {"dtype": "auto", "device_map": "auto"}
model = AutoModelForCausalLM.from_pretrained(model_id, **load_options)

print(f"Model {model_id} has been successfully downloaded and loaded onto the GPU.")

In [None]:
import os

# LMCache settings, exported as environment variables so they are in place
# before the vLLM engine (and its LMCache connector) is created.
lmcache_env = {
    "LMCACHE_CHUNK_SIZE": "256",
    "LMCACHE_LOCAL_CPU": "True",
    "LMCACHE_MAX_LOCAL_CPU_SIZE": "8.0",  # 8GB CPU cache
    "LMCACHE_CACHE_POLICY": "LRU",
    "LMCACHE_LOG_LEVEL": "INFO",
}
os.environ.update(lmcache_env)

# "\u2713" is the check mark; the original literal had been mis-decoded into
# mojibake, so it is written as an escape to survive re-encoding.
print("\u2713 LMCache environment configured")
print(f"  CPU Cache Size: {os.environ['LMCACHE_MAX_LOCAL_CPU_SIZE']}GB")
print(f"  Cache Policy: {os.environ['LMCACHE_CACHE_POLICY']}")

In [None]:
import time
from vllm import LLM, SamplingParams

# Step 1: checkpoint that the vLLM engine will serve.
model_id = "TheBloke/deepseek-coder-6.7B-instruct-GPTQ"

# Step 2: KV-transfer settings that route the KV cache through the LMCache
# connector, with this engine acting in the "kv_both" role.
kv_cache_config = dict(
    kv_connector="LMCacheConnectorV1",
    kv_role="kv_both",
)

# Step 3: build the vLLM engine with LMCache enabled.
# `quantization="gptq"` is intentionally not passed: it conflicts with the
# model's own config.
print("Loading model...")
llm = LLM(
    model=model_id,
    dtype="float16",  # set explicitly because bfloat16 is not supported by T4
    gpu_memory_utilization=0.8,
    kv_transfer_config=kv_cache_config,
)
print("Model loaded.")

# Step 4: sampling settings shared by every generation request below.
sampling_params = SamplingParams(max_tokens=100, temperature=0.7)


In [None]:

# 5. Define prompts to test caching: the same prompt twice, so the second
# request can reuse whatever the first one left in the cache.
# NOTE(review): this prompt is far shorter than LMCACHE_CHUNK_SIZE (256);
# confirm that a prompt this short is actually stored and reused.
prompts = [
    "What is the capital of France?",
    "What is the capital of France?",
]


def run_timed_generation(title, prompt, engine=None, params=None):
    """Generate a completion for one prompt and report how long it took.

    Args:
        title: Text shown in the ``--- ... ---`` banner printed before the run.
        prompt: Prompt string sent to the engine.
        engine: Object exposing ``generate(prompts, sampling_params)``;
            defaults to the notebook-level ``llm``.
        params: Sampling parameters forwarded to ``generate``; defaults to
            the notebook-level ``sampling_params``.

    Returns:
        The outputs returned by ``engine.generate``.
    """
    engine = llm if engine is None else engine
    params = sampling_params if params is None else params

    print(f"\n--- {title} ---")
    # perf_counter is a monotonic, high-resolution clock, so the measured
    # duration cannot be skewed by wall-clock adjustments.
    started = time.perf_counter()
    results = engine.generate([prompt], params)
    elapsed = time.perf_counter() - started

    print(f"Time taken: {elapsed:.2f} seconds")
    for result in results:
        print(f"Prompt: {result.prompt}")
        print(f"Generated: {result.outputs[0].text}\n")
    return results


# --- Run Generations ---

# First request: expected to be the slower one, since it populates the cache.
outputs = run_timed_generation("Running first prompt (populating cache)", prompts[0])

# Second request: identical prompt, expected to benefit from the cached prefix.
outputs = run_timed_generation("Running second prompt (using cache)", prompts[1])