# Lab-2.1 Part 2: Basic Inference with vLLM

## Objectives
- Master vLLM API usage
- Implement batch inference
- Measure performance metrics
- Analyze memory usage

## Estimated Time: 60-90 minutes

---
## 1. Setup and Model Loading

In [3]:
# Imports
from vllm import LLM, SamplingParams
import torch
import time
import numpy as np
import matplotlib.pyplot as plt
from typing import List

print(f"PyTorch: {torch.__version__}")
print(f"vLLM: {vllm.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

PyTorch: 2.8.0+cu128


NameError: name 'vllm' is not defined

### Load Llama-2-7B Model

We'll use a 7B model for more realistic benchmarks.

**Note**: This requires ~16GB GPU memory. If you don't have enough, use a smaller model like `facebook/opt-1.3b`.

In [None]:
# Configuration
MODEL_NAME = "meta-llama/Llama-2-7b-hf"  # Change if needed
# MODEL_NAME = "facebook/opt-1.3b"  # Alternative for smaller GPUs

print(f"Loading {MODEL_NAME}...")
start_time = time.time()

llm = LLM(
    model=MODEL_NAME,
    tensor_parallel_size=1,      # Single GPU
    gpu_memory_utilization=0.9,  # Use 90% of GPU memory
    max_model_len=2048,          # Context length
    trust_remote_code=True,
)

load_time = time.time() - start_time
print(f"✅ Model loaded in {load_time:.2f} seconds")

In [None]:
# Check GPU memory after loading
if torch.cuda.is_available():
    allocated = torch.cuda.memory_allocated(0) / 1e9
    reserved = torch.cuda.memory_reserved(0) / 1e9
    total = torch.cuda.get_device_properties(0).total_memory / 1e9
    
    print(f"\nGPU Memory Usage:")
    print(f"  Model size:  {allocated:.2f} GB")
    print(f"  Reserved:    {reserved:.2f} GB")
    print(f"  Available:   {total - reserved:.2f} GB")
    print(f"  Total:       {total:.2f} GB")
    print(f"  Utilization: {reserved/total*100:.1f}%")

---
## 2. Single Request Inference

In [None]:
# Define sampling parameters
sampling_params = SamplingParams(
    temperature=0.8,
    top_p=0.95,
    max_tokens=100,
    stop=["\n\n"],  # Stop at double newline
)

# Single prompt
prompt = "Explain the concept of machine learning in simple terms:"

print("Generating...")
start_time = time.time()

outputs = llm.generate([prompt], sampling_params)

inference_time = time.time() - start_time

# Display result
generated_text = outputs[0].outputs[0].text
num_tokens = len(outputs[0].outputs[0].token_ids)

print(f"\nPrompt: {prompt}")
print(f"Generated: {generated_text}")
print(f"\n⏱️  Time: {inference_time:.2f}s")
print(f"📊 Tokens: {num_tokens}")
print(f"⚡ Throughput: {num_tokens/inference_time:.1f} tokens/s")

---
## 3. Batch Inference

vLLM excels at batch processing with dynamic batching.

In [None]:
# Create multiple prompts
prompts = [
    "What is Python programming language?",
    "Explain quantum computing in simple terms:",
    "What are the benefits of electric vehicles?",
    "How does blockchain technology work?",
    "What is the difference between AI and machine learning?",
    "Explain the concept of cloud computing:",
    "What is the purpose of cryptocurrency?",
    "How do neural networks learn?",
]

print(f"Processing {len(prompts)} prompts...")
start_time = time.time()

outputs = llm.generate(prompts, sampling_params)

batch_time = time.time() - start_time

# Analyze results
total_tokens = sum(len(o.outputs[0].token_ids) for o in outputs)
avg_tokens = total_tokens / len(outputs)

print(f"\n✅ Batch processing complete!")
print(f"⏱️  Total time: {batch_time:.2f}s")
print(f"⏱️  Time per prompt: {batch_time/len(prompts):.2f}s")
print(f"📊 Total tokens: {total_tokens}")
print(f"📊 Avg tokens/prompt: {avg_tokens:.1f}")
print(f"⚡ Throughput: {total_tokens/batch_time:.1f} tokens/s")

In [None]:
# Display sample outputs
print("\n" + "="*80)
print("SAMPLE OUTPUTS")
print("="*80)

for i, output in enumerate(outputs[:3]):  # Show first 3
    print(f"\n[{i+1}] Prompt: {output.prompt}")
    print(f"    Output: {output.outputs[0].text[:150]}...")
    print(f"    Tokens: {len(output.outputs[0].token_ids)}")

---
## 4. Performance Comparison: Batch vs Sequential

In [None]:
# Sequential processing (for comparison)
print("Testing sequential processing...")
sequential_times = []

for prompt in prompts[:4]:  # Test with 4 prompts
    start = time.time()
    _ = llm.generate([prompt], sampling_params)
    sequential_times.append(time.time() - start)

sequential_total = sum(sequential_times)
print(f"Sequential total time: {sequential_total:.2f}s")

In [None]:
# Batch processing (same prompts)
print("\nTesting batch processing...")
start = time.time()
_ = llm.generate(prompts[:4], sampling_params)
batch_total = time.time() - start
print(f"Batch total time: {batch_total:.2f}s")

In [None]:
# Comparison
speedup = sequential_total / batch_total

print("\n" + "="*80)
print("BATCH vs SEQUENTIAL COMPARISON")
print("="*80)
print(f"Sequential:  {sequential_total:.2f}s")
print(f"Batch:       {batch_total:.2f}s")
print(f"Speedup:     {speedup:.2f}x faster ⚡")
print("="*80)

# Visualize
fig, ax = plt.subplots(figsize=(8, 5))
methods = ['Sequential', 'Batch']
times = [sequential_total, batch_total]
colors = ['#ff6b6b', '#51cf66']

bars = ax.bar(methods, times, color=colors)
ax.set_ylabel('Time (seconds)')
ax.set_title('Sequential vs Batch Processing (4 prompts)')
ax.set_ylim(0, max(times) * 1.2)

for i, (bar, t) in enumerate(zip(bars, times)):
    ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.1,
            f'{t:.2f}s', ha='center', fontweight='bold')

plt.tight_layout()
plt.show()

---
## 5. Throughput Scaling Test

Test how throughput scales with batch size.

In [None]:
# Generate test prompts
test_prompts = [
    f"Write a short story about topic {i}: "
    for i in range(32)
]

# Shorter generation for faster testing
test_params = SamplingParams(
    temperature=0.8,
    max_tokens=50,
)

# Test different batch sizes
batch_sizes = [1, 2, 4, 8, 16, 32]
throughputs = []

print("Testing throughput scaling...\n")

for batch_size in batch_sizes:
    prompts_subset = test_prompts[:batch_size]
    
    start = time.time()
    outputs = llm.generate(prompts_subset, test_params)
    elapsed = time.time() - start
    
    total_tokens = sum(len(o.outputs[0].token_ids) for o in outputs)
    throughput = total_tokens / elapsed
    throughputs.append(throughput)
    
    print(f"Batch {batch_size:2d}: {throughput:6.1f} tokens/s")

print("\n✅ Throughput scaling test complete!")

In [None]:
# Visualize throughput scaling
fig, ax = plt.subplots(figsize=(10, 6))

ax.plot(batch_sizes, throughputs, marker='o', linewidth=2, markersize=8)
ax.set_xlabel('Batch Size')
ax.set_ylabel('Throughput (tokens/s)')
ax.set_title('vLLM Throughput Scaling')
ax.grid(True, alpha=0.3)
ax.set_xscale('log', base=2)

# Annotate points
for bs, tp in zip(batch_sizes, throughputs):
    ax.annotate(f'{tp:.0f}', xy=(bs, tp), 
                textcoords="offset points", xytext=(0,10), ha='center')

plt.tight_layout()
plt.show()

print(f"\n📊 Throughput increased from {throughputs[0]:.0f} to {throughputs[-1]:.0f} tokens/s")
print(f"📊 Scaling factor: {throughputs[-1]/throughputs[0]:.1f}x")

---
## 6. Memory Profiling

Analyze KV cache memory usage.

In [None]:
# Calculate theoretical KV cache size
def estimate_kv_cache_size(
    num_layers=32,
    num_heads=32,
    head_dim=128,
    batch_size=1,
    seq_len=2048,
    precision=2,  # FP16 = 2 bytes
):
    """
    KV Cache size = 2 (K+V) * batch * layers * heads * seq_len * head_dim * precision
    """
    size_bytes = (
        2 * batch_size * num_layers * num_heads * seq_len * head_dim * precision
    )
    size_gb = size_bytes / (1024 ** 3)
    return size_gb

# For Llama-2-7B
print("KV Cache Size Estimation (Llama-2-7B):")
print()

for batch_size in [1, 4, 8, 16, 32]:
    cache_size = estimate_kv_cache_size(
        num_layers=32,
        num_heads=32,
        head_dim=128,
        batch_size=batch_size,
        seq_len=2048,
    )
    print(f"  Batch {batch_size:2d}: {cache_size:5.2f} GB")

In [None]:
# Monitor actual GPU memory during inference
import gc

def measure_memory_usage(batch_size):
    """Measure GPU memory before and after inference"""
    torch.cuda.reset_peak_memory_stats()
    
    # Before inference
    torch.cuda.synchronize()
    mem_before = torch.cuda.memory_allocated(0) / 1e9
    
    # Inference
    test_prompts = [f"Test prompt {i}" for i in range(batch_size)]
    outputs = llm.generate(test_prompts, test_params)
    
    # After inference
    torch.cuda.synchronize()
    mem_after = torch.cuda.memory_allocated(0) / 1e9
    mem_peak = torch.cuda.max_memory_allocated(0) / 1e9
    
    return {
        'before': mem_before,
        'after': mem_after,
        'peak': mem_peak,
        'used': mem_after - mem_before,
    }

print("Measuring GPU memory usage...\n")

memory_stats = []
test_batch_sizes = [1, 4, 8, 16]

for bs in test_batch_sizes:
    stats = measure_memory_usage(bs)
    memory_stats.append(stats)
    print(f"Batch {bs:2d}: Peak memory = {stats['peak']:.2f} GB")

print("\n✅ Memory profiling complete!")

In [None]:
# Visualize memory usage
fig, ax = plt.subplots(figsize=(10, 6))

peak_mems = [s['peak'] for s in memory_stats]

ax.plot(test_batch_sizes, peak_mems, marker='o', linewidth=2, markersize=8, color='#ff6b6b')
ax.set_xlabel('Batch Size')
ax.set_ylabel('Peak GPU Memory (GB)')
ax.set_title('GPU Memory Usage vs Batch Size')
ax.grid(True, alpha=0.3)

for bs, mem in zip(test_batch_sizes, peak_mems):
    ax.annotate(f'{mem:.2f} GB', xy=(bs, mem), 
                textcoords="offset points", xytext=(0,10), ha='center')

plt.tight_layout()
plt.show()

---
## 7. Compare with HuggingFace (Batch)

Let's compare batch inference performance with HuggingFace.

In [None]:
# Load HuggingFace model (use smaller model for memory)
from transformers import AutoModelForCausalLM, AutoTokenizer

HF_MODEL = "facebook/opt-1.3b"  # Smaller model for fair comparison

print(f"Loading HuggingFace {HF_MODEL}...")
hf_model = AutoModelForCausalLM.from_pretrained(HF_MODEL).to("cuda")
hf_tokenizer = AutoTokenizer.from_pretrained(HF_MODEL)
hf_tokenizer.pad_token = hf_tokenizer.eos_token
print("✅ HuggingFace model loaded")

In [None]:
# Load vLLM with same model
print(f"\nLoading vLLM {HF_MODEL}...")
vllm_model = LLM(
    model=HF_MODEL,
    gpu_memory_utilization=0.5,
    max_model_len=512,
)
print("✅ vLLM model loaded")

In [None]:
# Test prompts
comparison_prompts = [
    "The future of AI is",
    "Machine learning enables",
    "Deep learning networks",
    "Natural language processing",
]

comparison_params = SamplingParams(
    temperature=0.8,
    max_tokens=30,
)

print(f"Testing with {len(comparison_prompts)} prompts...\n")

In [None]:
# HuggingFace batch inference
print("Testing HuggingFace...")
hf_inputs = hf_tokenizer(comparison_prompts, return_tensors="pt", padding=True).to("cuda")

hf_start = time.time()
with torch.no_grad():
    hf_outputs = hf_model.generate(
        **hf_inputs,
        max_new_tokens=30,
        temperature=0.8,
        do_sample=True,
        pad_token_id=hf_tokenizer.eos_token_id,
    )
hf_time = time.time() - hf_start

hf_total_tokens = sum(len(ids) for ids in hf_outputs)
print(f"  Time: {hf_time:.3f}s")
print(f"  Throughput: {hf_total_tokens/hf_time:.1f} tokens/s")

In [None]:
# vLLM batch inference
print("\nTesting vLLM...")
vllm_start = time.time()
vllm_outputs = vllm_model.generate(comparison_prompts, comparison_params)
vllm_time = time.time() - vllm_start

vllm_total_tokens = sum(len(o.outputs[0].token_ids) for o in vllm_outputs)
print(f"  Time: {vllm_time:.3f}s")
print(f"  Throughput: {vllm_total_tokens/vllm_time:.1f} tokens/s")

In [None]:
# Final comparison
speedup = hf_time / vllm_time
throughput_gain = (vllm_total_tokens/vllm_time) / (hf_total_tokens/hf_time)

print("\n" + "="*80)
print("HUGGINGFACE vs vLLM COMPARISON")
print("="*80)
print(f"HuggingFace:")
print(f"  Time:       {hf_time:.3f}s")
print(f"  Throughput: {hf_total_tokens/hf_time:.1f} tokens/s")
print()
print(f"vLLM:")
print(f"  Time:       {vllm_time:.3f}s")
print(f"  Throughput: {vllm_total_tokens/vllm_time:.1f} tokens/s")
print()
print(f"Speedup:           {speedup:.2f}x faster ⚡")
print(f"Throughput gain:   {throughput_gain:.2f}x higher 📊")
print("="*80)

---
## Summary

✅ **Completed**:
1. Loaded and tested Llama-2-7B with vLLM
2. Implemented batch inference
3. Measured throughput scaling
4. Profiled GPU memory usage
5. Compared with HuggingFace baseline

📊 **Key Findings**:
- Batch processing provides 2-3x speedup vs sequential
- Throughput scales well with batch size
- vLLM is 5-15x faster than HuggingFace
- Memory usage grows linearly with batch size

➡️ **Next**: In `03-Advanced_Features.ipynb`, we'll explore:
- Continuous Batching
- Advanced sampling strategies
- Long context handling
- Multi-model management

In [None]:
# Cleanup
del llm, vllm_model, hf_model
torch.cuda.empty_cache()
gc.collect()
print("✅ Memory cleaned up")