# Lab-2.3 Part 3: Integration with vLLM

## Objectives
- Integrate vLLM with FastAPI
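- Use AsyncLLMEngine (the service below starts with the synchronous `LLM` class; `AsyncLLMEngine` is vLLM's non-blocking engine for concurrent serving)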
- Build OpenAI-compatible API
- Optimize concurrent performance

## Estimated Time: 60-90 minutes

---
## 1. vLLM + FastAPI Integration

In [None]:
%%writefile app_vllm.py
from fastapi import FastAPI, HTTPException
from fastapi.responses import StreamingResponse
from pydantic import BaseModel, Field
from typing import List, Optional, Literal
from vllm import LLM, SamplingParams
import time
import json

# ASGI application object; the route decorators below register handlers on it.
app = FastAPI(title="vLLM API Service", version="1.0.0")

# Global vLLM engine
# Stays None until startup_event() assigns the engine; /health reports
# readiness based on this value.
llm_engine: Optional[LLM] = None

class CompletionRequest(BaseModel):
    # Request body accepted by POST /v1/completions.
    #
    # NOTE(review): the default model name differs from the model that
    # startup_event() actually loads ("facebook/opt-125m") -- confirm intent.
    model: str = "meta-llama/Llama-2-7b-hf"
    # Prompt text; must contain at least one character.
    prompt: str = Field(..., min_length=1)
    # Upper bound on generated tokens (1..1000).
    max_tokens: int = Field(100, ge=1, le=1000)
    # Sampling temperature (0.0..2.0).
    temperature: float = Field(0.8, ge=0.0, le=2.0)
    # Nucleus-sampling probability mass (0.0..1.0).
    top_p: float = Field(0.95, ge=0.0, le=1.0)
    # Accepted for client compatibility; defaults to a non-streamed response.
    stream: bool = False

class Message(BaseModel):
    # One chat turn as sent in ChatCompletionRequest.messages.
    # Speaker of the turn; restricted to the three listed role names.
    role: Literal["system", "user", "assistant"]
    # Text of the turn.
    content: str

class ChatCompletionRequest(BaseModel):
    # Request body accepted by POST /v1/chat/completions.
    #
    # NOTE(review): the default model name differs from the model that
    # startup_event() actually loads ("facebook/opt-125m") -- confirm intent.
    model: str = "meta-llama/Llama-2-7b-hf"
    # Conversation so far, in order.
    messages: List[Message]
    # Upper bound on generated tokens (1..1000).
    max_tokens: int = Field(100, ge=1, le=1000)
    # Sampling temperature (0.0..2.0).
    temperature: float = Field(0.8, ge=0.0, le=2.0)
    # Nucleus-sampling probability mass (0.0..1.0).
    top_p: float = Field(0.95, ge=0.0, le=1.0)

@app.on_event("startup")
async def startup_event():
    """Create the shared vLLM engine when the application starts.

    Stores the engine in the module-level ``llm_engine`` so the request
    handlers can use it.
    """
    global llm_engine

    print("Initializing vLLM engine...")

    # Small demo model with a reduced memory budget and context length.
    engine_options = {
        "model": "facebook/opt-125m",
        "gpu_memory_utilization": 0.3,
        "max_model_len": 512,
    }
    llm_engine = LLM(**engine_options)

    print("✅ vLLM engine initialized")

@app.post("/v1/completions")
async def completions(request: CompletionRequest):
    """OpenAI-compatible completions endpoint.

    Runs ``request.prompt`` through the global vLLM engine and returns a
    single ``text_completion`` JSON document.

    Args:
        request: Validated completion parameters. ``request.stream`` is
            accepted by the schema but not acted on here; the response is
            never streamed.

    Returns:
        dict with ``id``, ``object``, ``created``, ``model``, ``choices``
        and ``usage`` keys.

    Raises:
        HTTPException: 503 if the engine has not been initialized yet,
            500 if generation fails.
    """
    import uuid

    # Checked outside the try block so the 503 is not rewritten into a 500.
    if llm_engine is None:
        raise HTTPException(status_code=503, detail="vLLM engine is not initialized")

    try:
        sampling_params = SamplingParams(
            temperature=request.temperature,
            top_p=request.top_p,
            max_tokens=request.max_tokens,
        )

        # generate() is not awaited, so this handler occupies the event loop
        # until the completion is ready.
        request_output = llm_engine.generate([request.prompt], sampling_params)[0]
        completion = request_output.outputs[0]

        # Report tokenizer tokens, not whitespace-separated words; fall back
        # to the word count only if the engine did not expose the prompt ids.
        prompt_token_ids = getattr(request_output, "prompt_token_ids", None)
        if prompt_token_ids is not None:
            prompt_tokens = len(prompt_token_ids)
        else:
            prompt_tokens = len(request.prompt.split())
        completion_tokens = len(completion.token_ids)

        return {
            # A random suffix keeps ids unique for requests in the same second.
            "id": f"cmpl-{uuid.uuid4().hex}",
            "object": "text_completion",
            "created": int(time.time()),
            "model": request.model,
            "choices": [
                {
                    "text": completion.text,
                    "index": 0,
                    # Pass through the engine's reason (e.g. "length" when
                    # max_tokens was hit) instead of always claiming "stop".
                    "finish_reason": getattr(completion, "finish_reason", None) or "stop",
                }
            ],
            "usage": {
                "prompt_tokens": prompt_tokens,
                "completion_tokens": completion_tokens,
                "total_tokens": prompt_tokens + completion_tokens,
            },
        }

    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e

@app.post("/v1/chat/completions")
async def chat_completions(request: ChatCompletionRequest):
    """OpenAI-compatible chat completions endpoint.

    Flattens the conversation into a plain-text prompt of ``Role: content``
    lines followed by ``Assistant:``, generates a continuation with the
    global vLLM engine, and returns it as a ``chat.completion`` document.

    Args:
        request: Validated chat parameters, including the ordered messages.

    Returns:
        dict with ``id``, ``object``, ``created``, ``model``, ``choices``
        and ``usage`` keys. The assistant text is stripped of surrounding
        whitespace.

    Raises:
        HTTPException: 503 if the engine has not been initialized yet,
            500 if generation fails.
    """
    import uuid

    # Checked outside the try block so the 503 is not rewritten into a 500.
    if llm_engine is None:
        raise HTTPException(status_code=503, detail="vLLM engine is not initialized")

    # One "Role: content" line per message, then the cue for the reply.
    history = "".join(
        f"{msg.role.capitalize()}: {msg.content}\n" for msg in request.messages
    )
    prompt = history + "Assistant:"

    try:
        sampling_params = SamplingParams(
            temperature=request.temperature,
            top_p=request.top_p,
            max_tokens=request.max_tokens,
        )

        # generate() is not awaited, so this handler occupies the event loop
        # until the reply is ready.
        request_output = llm_engine.generate([prompt], sampling_params)[0]
        completion = request_output.outputs[0]

        # Report tokenizer tokens, not whitespace-separated words; fall back
        # to the word count only if the engine did not expose the prompt ids.
        prompt_token_ids = getattr(request_output, "prompt_token_ids", None)
        if prompt_token_ids is not None:
            prompt_tokens = len(prompt_token_ids)
        else:
            prompt_tokens = len(prompt.split())
        completion_tokens = len(completion.token_ids)

        return {
            # A random suffix keeps ids unique for requests in the same second.
            "id": f"chatcmpl-{uuid.uuid4().hex}",
            "object": "chat.completion",
            "created": int(time.time()),
            "model": request.model,
            "choices": [
                {
                    "index": 0,
                    "message": {
                        "role": "assistant",
                        "content": completion.text.strip(),
                    },
                    # Pass through the engine's reason (e.g. "length" when
                    # max_tokens was hit) instead of always claiming "stop".
                    "finish_reason": getattr(completion, "finish_reason", None) or "stop",
                }
            ],
            "usage": {
                "prompt_tokens": prompt_tokens,
                "completion_tokens": completion_tokens,
                "total_tokens": prompt_tokens + completion_tokens,
            },
        }

    except Exception as e:
        # Same error contract as the /v1/completions handler.
        raise HTTPException(status_code=500, detail=str(e)) from e

@app.get("/v1/models")
async def list_models():
    """Return the model catalogue in the OpenAI ``list`` format.

    The catalogue holds a single fixed entry; its ``created`` value is the
    time of the request.
    """
    model_entry = {
        "id": "facebook/opt-125m",
        "object": "model",
        "created": int(time.time()),
        "owned_by": "vllm",
    }
    return {"object": "list", "data": [model_entry]}

@app.get("/health")
async def health():
    """Report service status and whether the engine object exists yet."""
    engine_ready = llm_engine is not None
    return {"status": "healthy", "engine": "vllm", "ready": engine_ready}

if __name__ == "__main__":
    # Script entry point: serve the app with uvicorn on all interfaces.
    import uvicorn

    bind_options = {"host": "0.0.0.0", "port": 8000}
    uvicorn.run(app, **bind_options)

### Test with OpenAI Client

In [None]:
# Test OpenAI compatibility
from openai import OpenAI

# Point the OpenAI SDK at the local server instead of api.openai.com.
# The server never checks the key, but the client requires one to be set.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy-key")

print("Testing OpenAI compatibility...\n")

try:
    # Step 1: ask the server which models it advertises.
    models = client.models.list()
    print(f"Available models: {[m.id for m in models.data]}\n")

    # Step 2: send a single-turn chat request.
    chat_messages = [
        {"role": "user", "content": "Explain Python programming:"},
    ]
    response = client.chat.completions.create(
        model="facebook/opt-125m",
        messages=chat_messages,
        max_tokens=100,
    )

    # Step 3: show the reply and the reported token usage.
    print("Response:")
    first_choice = response.choices[0]
    print(first_choice.message.content)
    print(f"\nTokens: {response.usage.total_tokens}")

except Exception as e:
    # Any failure is reported together with a hint to start the server.
    print(f"Error: {e}")
    print("Make sure server is running: python app_vllm.py")

---
## Summary

✅ **Completed**:
1. Integrated vLLM with FastAPI
2. Built OpenAI-compatible endpoints
3. Implemented `/v1/completions` and `/v1/chat/completions`
4. Tested with OpenAI client

📚 **Key Features**:
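- OpenAI-compatible subset (`/v1/completions`, `/v1/chat/completions`, `/v1/models`) usable from the official OpenAI client; streaming is not implemented yet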
- vLLM backend for high performance
- Standard response format
- Token usage tracking

➡️ **Next**: In `04-Monitoring_and_Deploy.ipynb`, we'll:
- Add Prometheus metrics
- Implement structured logging
- Containerize with Docker
- Deploy to production

In [None]:
# Final cell: print a marker once the notebook has been run to the end.
print("✅ Lab 2.3 Part 3 Complete!")