# vLLM Server Test

This notebook tests the vLLM server running with Llama-3.1-8B-Instruct model.

In [1]:
import requests
import json
from typing import List, Dict, Any

In [2]:
# Server configuration
BASE_URL = "http://localhost:8000/v1"

In [3]:
# Get available models
response = requests.get(f"{BASE_URL}/models")
payload = response.json()['data']
available_models = [item['id'] for item in payload if item['object']=='model']
print(available_models)

DEFAULT_MODEL = next(iter(available_models))

['meta-llama/Llama-3.1-8B-Instruct']


In [4]:
def send_chat_completion(
    messages: List[Dict[str, str]],
    temperature: float = 0.7,
    max_tokens: int = 150,
    stream: bool = False,
    model: str = DEFAULT_MODEL,
) -> Dict[str, Any]:
    """
    Send a chat completion request to the vLLM server.

    Args:
        messages: List of message dictionaries with 'role' and 'content'
        temperature: Sampling temperature (0.0 to 1.0)
        max_tokens: Maximum number of tokens to generate
        stream: Whether to stream the response

    Returns:
        Response dictionary from the server
    """
    payload = {
        "model": model,
        "messages": messages,
        "temperature": temperature,
        "max_tokens": max_tokens,
        "stream": stream,
    }

    try:
        chat_endpoint = f"{BASE_URL}/v1/chat/completions"
        response = requests.post(
            chat_endpoint,
            json=payload,
            headers={"Content-Type": "application/json"},
        )
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
        return None

## Test 1: Simple greeting

In [5]:
# Test 1: Simple greeting
messages = [
    {"role": "user", "content": "Hello! How are you today?"}
]

response = send_chat_completion(messages)
if response:
    print("Response:")
    print(json.dumps(response, indent=2))
    print("\nGenerated text:")
    print(response["choices"][0]["message"]["content"])
else:
    print("Failed to get response")

Request failed: 404 Client Error: Not Found for url: http://localhost:8000/v1/v1/chat/completions
Failed to get response


## Test 2: Multi-turn conversation

In [6]:
# Test 2: Multi-turn conversation
messages = [
    {"role": "user", "content": "What is the capital of France?"},
    {"role": "assistant", "content": "The capital of France is Paris."},
    {"role": "user", "content": "What's the population of that city?"}
]

response = send_chat_completion(messages)
if response:
    print("Multi-turn response:")
    print(response["choices"][0]["message"]["content"])
else:
    print("Failed to get response")

Request failed: 404 Client Error: Not Found for url: http://localhost:8000/v1/v1/chat/completions
Failed to get response


## Test 3: Reasoning task

In [7]:
# Test 3: Reasoning task
messages = [
    {"role": "user", "content": "If I have 3 apples and I give away 1 apple, then buy 2 more apples, how many apples do I have in total? Please explain your reasoning."}
]

response = send_chat_completion(messages, max_tokens=200)
if response:
    print("Reasoning response:")
    print(response["choices"][0]["message"]["content"])
else:
    print("Failed to get response")

Request failed: 404 Client Error: Not Found for url: http://localhost:8000/v1/v1/chat/completions
Failed to get response


## Test 4: Different temperature settings

In [8]:
# Test 4: Different temperature settings
prompt = "Write a creative short story about a robot learning to paint."
temperatures = [0.1, 0.7, 1.0]

for temp in temperatures:
    messages = [{"role": "user", "content": prompt}]
    response = send_chat_completion(messages, temperature=temp, max_tokens=100)
    
    if response:
        print(f"\n--- Temperature: {temp} ---")
        print(response["choices"][0]["message"]["content"])
    else:
        print(f"Failed to get response for temperature {temp}")

Request failed: 404 Client Error: Not Found for url: http://localhost:8000/v1/v1/chat/completions
Failed to get response for temperature 0.1
Request failed: 404 Client Error: Not Found for url: http://localhost:8000/v1/v1/chat/completions
Failed to get response for temperature 0.7
Request failed: 404 Client Error: Not Found for url: http://localhost:8000/v1/v1/chat/completions
Failed to get response for temperature 1.0


## Test 5: Server health check

In [9]:
# Test 5: Check server health and model info
try:
    # Check if server is responding
    health_response = requests.get(f"{BASE_URL}/health", timeout=5)
    print(f"Health check status: {health_response.status_code}")
    
    # Try to get model info
    models_response = requests.get(f"{BASE_URL}/v1/models", timeout=5)
    if models_response.status_code == 200:
        models_data = models_response.json()
        print("Available models:")
        for model in models_data.get("data", []):
            print(f"  - {model.get('id', 'Unknown')}")
    else:
        print(f"Models endpoint returned status: {models_response.status_code}")
        
except requests.exceptions.RequestException as e:
    print(f"Server health check failed: {e}")

Health check status: 404
Models endpoint returned status: 404


## Test 6: Performance timing

In [10]:
# Test 6: Performance timing
import time

messages = [
    {"role": "user", "content": "Explain quantum computing in simple terms."}
]

start_time = time.time()
response = send_chat_completion(messages, max_tokens=100)
end_time = time.time()

if response:
    duration = end_time - start_time
    tokens_generated = len(response["choices"][0]["message"]["content"].split())
    
    print(f"Response time: {duration:.2f} seconds")
    print(f"Approximate tokens generated: {tokens_generated}")
    print(f"Approximate tokens per second: {tokens_generated/duration:.2f}")
    print("\nResponse:")
    print(response["choices"][0]["message"]["content"])
else:
    print("Failed to get response for performance test")

Request failed: 404 Client Error: Not Found for url: http://localhost:8000/v1/v1/chat/completions
Failed to get response for performance test


## Summary

Run all the cells above to test various aspects of your vLLM server:

1. **Basic functionality** - Simple chat completion
2. **Multi-turn conversations** - Context awareness
3. **Reasoning capabilities** - Complex problem solving
4. **Temperature effects** - Creativity control
5. **Server health** - Endpoint availability
6. **Performance** - Response timing

If all tests pass successfully, your vLLM server is working correctly!