In [12]:
from openai import OpenAI
import os
from dotenv import load_dotenv
from pprint import pprint as pp

# Populate the process environment from a local .env file, if one exists.
load_dotenv()

# API key sent to the vLLM server; stays None when the variable is not set.
VLLM_API_KEY = os.environ.get("VLLM_API_KEY")

In [None]:
# Point the OpenAI SDK at the local vLLM server's OpenAI-compatible endpoint.
client = OpenAI(api_key=VLLM_API_KEY, base_url="http://localhost:8000/v1")

# Minimal single-turn chat request, capped at 100 generated tokens.
completion = client.chat.completions.create(
    model="chatbot",
    max_tokens=100,
    messages=[{"role": "user", "content": "Hello!"}],
)

# Print a header line followed by the text of the first returned choice.
print("Response:", completion.choices[0].message.content, sep="\n")

In [None]:
# Example with different parameters: a larger token budget, an explicit
# sampling temperature, and streaming explicitly disabled.
completion = client.chat.completions.create(
    model="chatbot",
    messages=[
        {"role": "user", "content": "Tell me a short joke"}
    ],
    max_tokens=150,
    temperature=0.7,
    stream=False,
)

print("Joke Response:")
print(completion.choices[0].message.content)

# The SDK types `usage` as optional, so guard against a server that omits it
# instead of failing with AttributeError on `None.total_tokens`.
if completion.usage is not None:
    print(f"Tokens used: {completion.usage.total_tokens}")
else:
    print("Tokens used: not reported by the server")

"Hello! How can I assist you today? Whether you need help with a specific topic, want to chat about something interesting, or have any questions, feel free to share what you'd like to discuss or know more about."

# vLLM Remote Inference Examples

This notebook demonstrates basic usage of the vLLM server through the OpenAI-compatible API.

## Setup
- Ensure vLLM server is running: `docker compose up -d vllm-server`
- Configure your API key in the `.env` file by setting `VLLM_API_KEY`
- The server should be accessible at `http://localhost:8000`

## Examples
Below are simple examples showing different ways to interact with the vLLM API.

In [None]:
# Health check - verify server is running
import requests

try:
    # Bound the wait: without a timeout, requests blocks indefinitely when the
    # server accepts the connection but never answers.
    health_response = requests.get("http://localhost:8000/health", timeout=5)
    if health_response.status_code == 200:
        print("✅ vLLM server is healthy and ready!")
    else:
        print(f"⚠️ Server responded with status: {health_response.status_code}")
except requests.exceptions.ConnectionError:
    # Also covers connect timeouts, which subclass ConnectionError in requests.
    print("❌ Cannot connect to vLLM server. Make sure it's running on localhost:8000")
except requests.exceptions.Timeout:
    # Connection was established but no response arrived within the timeout.
    print("❌ vLLM server did not respond within 5 seconds")
except requests.exceptions.RequestException as exc:
    # Any other requests-level failure: report it instead of a raw traceback.
    print(f"❌ Health check failed: {exc}")

In [None]:
# List the models the server exposes, with basic metadata for each one.
try:
    models_response = client.models.list()
    print("Available models:")
    for model in models_response.data:
        # One entry per model: id, creation value, owner, then a blank line.
        print(
            f"- {model.id}",
            f"  Created: {model.created}",
            f"  Owned by: {model.owned_by}",
            "",
            sep="\n",
        )
except Exception as exc:
    # Best-effort reporting: show the failure rather than aborting the notebook.
    print(f"Error getting model info: {exc}")