From 5f87889ae31eb3e7d33984cd39deefa28a156725 Mon Sep 17 00:00:00 2001 From: Alex M Date: Mon, 6 Apr 2026 19:17:18 +0000 Subject: [PATCH 1/2] feat: add Prometheus metrics and Grafana dashboard for observability Add yasha:* custom metrics (request latency, errors, model load time, per-usecase timing, client disconnects, cleanup errors) via ray.serve.metrics, gated behind YASHA_METRICS env var with zero-overhead no-op stubs when disabled. Include pre-built Grafana dashboard, /health endpoint, and documentation. --- Dockerfile.dev | 8 +- Dockerfile.prod | 8 +- README.md | 5 + docs/grafana-dashboard.json | 592 ++++++++++++++++++ docs/monitoring.md | 163 +++++ yasha/infer/diffusers/diffusers_infer.py | 4 +- yasha/infer/infer_config.py | 7 +- yasha/infer/model_deployment.py | 49 +- .../infer/transformers/transformers_infer.py | 4 +- yasha/infer/vllm/vllm_infer.py | 4 +- yasha/metrics.py | 192 ++++++ yasha/openai/api.py | 126 ++-- 12 files changed, 1105 insertions(+), 57 deletions(-) create mode 100644 docs/grafana-dashboard.json create mode 100644 docs/monitoring.md create mode 100644 yasha/metrics.py diff --git a/Dockerfile.dev b/Dockerfile.dev index ce983e9..160ba90 100644 --- a/Dockerfile.dev +++ b/Dockerfile.dev @@ -54,6 +54,8 @@ ENV RAY_CLUSTER_ADDRESS=0.0.0.0 ENV RAY_HEAD_CPU_NUM=2 ENV RAY_HEAD_GPU_NUM=1 ENV YASHA_USE_EXISTING_RAY_CLUSTER=false +ENV YASHA_METRICS=false +ENV RAY_METRICS_EXPORT_PORT=8079 RUN uv venv ARG PYTHON_VERSION @@ -78,7 +80,11 @@ uv sync --project /yasha --locked \$EXTRAS cd /yasha && uv run pre-commit install if [ "\${YASHA_USE_EXISTING_RAY_CLUSTER}" != "true" ]; then - cd /yasha && ray start --head --port=\${RAY_REDIS_PORT} --dashboard-host=0.0.0.0 --num-cpus=\${RAY_HEAD_CPU_NUM} --num-gpus=\${RAY_HEAD_GPU_NUM} --disable-usage-stats + METRICS_FLAG="" + if [ "\${YASHA_METRICS}" = "true" ]; then + METRICS_FLAG="--metrics-export-port=\${RAY_METRICS_EXPORT_PORT}" + fi + cd /yasha && ray start --head --port=\${RAY_REDIS_PORT} 
--dashboard-host=0.0.0.0 --num-cpus=\${RAY_HEAD_CPU_NUM} --num-gpus=\${RAY_HEAD_GPU_NUM} --disable-usage-stats \${METRICS_FLAG} if ! ray status --address=\${RAY_CLUSTER_ADDRESS}:\${RAY_REDIS_PORT}; then echo "ray cluster failed to start" exit 1 diff --git a/Dockerfile.prod b/Dockerfile.prod index 061c7b5..d6b51f5 100644 --- a/Dockerfile.prod +++ b/Dockerfile.prod @@ -54,6 +54,8 @@ ENV RAY_CLUSTER_ADDRESS=0.0.0.0 ENV RAY_HEAD_CPU_NUM=2 ENV RAY_HEAD_GPU_NUM=1 ENV YASHA_USE_EXISTING_RAY_CLUSTER=false +ENV YASHA_METRICS=false +ENV RAY_METRICS_EXPORT_PORT=8079 RUN uv venv ARG PYTHON_VERSION @@ -77,7 +79,11 @@ fi uv sync --project /yasha --locked \$EXTRAS if [ "\${YASHA_USE_EXISTING_RAY_CLUSTER}" != "true" ]; then - cd /yasha && ray start --head --port=\${RAY_REDIS_PORT} --dashboard-host=0.0.0.0 --num-cpus=\${RAY_HEAD_CPU_NUM} --num-gpus=\${RAY_HEAD_GPU_NUM} --disable-usage-stats + METRICS_FLAG="" + if [ "\${YASHA_METRICS}" = "true" ]; then + METRICS_FLAG="--metrics-export-port=\${RAY_METRICS_EXPORT_PORT}" + fi + cd /yasha && ray start --head --port=\${RAY_REDIS_PORT} --dashboard-host=0.0.0.0 --num-cpus=\${RAY_HEAD_CPU_NUM} --num-gpus=\${RAY_HEAD_GPU_NUM} --disable-usage-stats \${METRICS_FLAG} if ! 
ray status --address=\${RAY_CLUSTER_ADDRESS}:\${RAY_REDIS_PORT}; then echo "ray cluster failed to start" exit 1 diff --git a/README.md b/README.md index 1c712e4..ebaf649 100644 --- a/README.md +++ b/README.md @@ -125,6 +125,11 @@ For a full guide on writing your own plugin, see [Plugin Development](docs/plugi - [Architecture](docs/architecture.md) — system design, request lifecycle, plugin loading - [Plugin Development](docs/plugins.md) — writing custom TTS backends - [Home Assistant Integration](docs/home-assistant.md) — Wyoming protocol setup for voice automation +- [Monitoring](docs/monitoring.md) — Prometheus metrics, Grafana dashboard, health checks + +## Monitoring + +Yasha exposes Prometheus metrics (Ray cluster, Ray Serve, vLLM, and custom `yasha:*` metrics) through a single port. Enable with `YASHA_METRICS=true` and scrape port 8079. A pre-built Grafana dashboard is included. See [Monitoring](docs/monitoring.md) for setup details. ## Future Work diff --git a/docs/grafana-dashboard.json b/docs/grafana-dashboard.json new file mode 100644 index 0000000..eb3cfef --- /dev/null +++ b/docs/grafana-dashboard.json @@ -0,0 +1,592 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "description": "Prometheus datasource for Yasha metrics", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "annotations": { + "list": [] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "links": [], + "panels": [ + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, + "id": 100, + "title": "Overview", + "type": "row" + }, + { + "title": "Request Rate", + "type": "timeseries", + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 1 }, + "id": 1, + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "targets": [ + { + "expr": "sum(rate(yasha:request_total[5m])) by (endpoint)", + "legendFormat": "{{ endpoint }}" + } + ], + "fieldConfig": { + "defaults": { "unit": 
"reqps" } + } + }, + { + "title": "Error Rate", + "type": "timeseries", + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 1 }, + "id": 2, + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "targets": [ + { + "expr": "sum(rate(yasha:request_errors_total[5m])) by (model, error_type)", + "legendFormat": "{{ model }} - {{ error_type }}" + } + ], + "fieldConfig": { + "defaults": { "unit": "reqps" } + } + }, + { + "title": "In-Flight Requests", + "type": "timeseries", + "gridPos": { "h": 8, "w": 4, "x": 16, "y": 1 }, + "id": 3, + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "targets": [ + { + "expr": "yasha:request_in_progress", + "legendFormat": "{{ model }}" + } + ] + }, + { + "title": "Models Loaded", + "type": "stat", + "gridPos": { "h": 4, "w": 4, "x": 20, "y": 1 }, + "id": 4, + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "targets": [ + { + "expr": "yasha:models_loaded" + } + ] + }, + { + "title": "Client Disconnects", + "type": "timeseries", + "gridPos": { "h": 4, "w": 4, "x": 20, "y": 5 }, + "id": 5, + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "targets": [ + { + "expr": "sum(rate(yasha:client_disconnects_total[5m])) by (model)", + "legendFormat": "{{ model }}" + } + ], + "fieldConfig": { + "defaults": { "unit": "reqps" } + } + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 9 }, + "id": 101, + "title": "Latency", + "type": "row" + }, + { + "title": "Request Latency P50 / P95 / P99", + "type": "timeseries", + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 10 }, + "id": 10, + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "targets": [ + { + "expr": "histogram_quantile(0.50, sum(rate(yasha:request_duration_seconds_bucket[5m])) by (le, endpoint))", + "legendFormat": "p50 {{ endpoint }}" + }, + { + "expr": "histogram_quantile(0.95, sum(rate(yasha:request_duration_seconds_bucket[5m])) by (le, endpoint))", + "legendFormat": "p95 {{ endpoint 
}}" + }, + { + "expr": "histogram_quantile(0.99, sum(rate(yasha:request_duration_seconds_bucket[5m])) by (le, endpoint))", + "legendFormat": "p99 {{ endpoint }}" + } + ], + "fieldConfig": { + "defaults": { "unit": "s" } + } + }, + { + "title": "Per-Model Latency P95", + "type": "timeseries", + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 10 }, + "id": 11, + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "targets": [ + { + "expr": "histogram_quantile(0.95, sum(rate(yasha:request_duration_seconds_bucket[5m])) by (le, model))", + "legendFormat": "p95 {{ model }}" + } + ], + "fieldConfig": { + "defaults": { "unit": "s" } + } + }, + { + "title": "Inference Latency by Usecase P95", + "type": "timeseries", + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 10 }, + "id": 12, + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "targets": [ + { + "expr": "histogram_quantile(0.95, sum(rate(yasha:generation_duration_seconds_bucket[5m])) by (le, model))", + "legendFormat": "generate {{ model }}" + }, + { + "expr": "histogram_quantile(0.95, sum(rate(yasha:tts_generation_duration_seconds_bucket[5m])) by (le, model))", + "legendFormat": "tts {{ model }}" + }, + { + "expr": "histogram_quantile(0.95, sum(rate(yasha:image_generation_duration_seconds_bucket[5m])) by (le, model))", + "legendFormat": "image {{ model }}" + }, + { + "expr": "histogram_quantile(0.95, sum(rate(yasha:transcription_duration_seconds_bucket[5m])) by (le, model))", + "legendFormat": "transcription {{ model }}" + }, + { + "expr": "histogram_quantile(0.95, sum(rate(yasha:embedding_duration_seconds_bucket[5m])) by (le, model))", + "legendFormat": "embedding {{ model }}" + } + ], + "fieldConfig": { + "defaults": { "unit": "s" } + } + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 18 }, + "id": 102, + "title": "vLLM Engine", + "type": "row" + }, + { + "title": "KV Cache Usage", + "type": "gauge", + "gridPos": { "h": 8, "w": 4, "x": 0, "y": 19 }, + "id": 20, + 
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "targets": [ + { + "expr": "vllm:kv_cache_usage_perc", + "legendFormat": "{{ model_name }}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percentunit", + "min": 0, + "max": 1, + "thresholds": { + "steps": [ + { "value": 0, "color": "green" }, + { "value": 0.8, "color": "yellow" }, + { "value": 0.95, "color": "red" } + ] + } + } + } + }, + { + "title": "Time to First Token (TTFT)", + "type": "timeseries", + "gridPos": { "h": 8, "w": 5, "x": 4, "y": 19 }, + "id": 21, + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "targets": [ + { + "expr": "histogram_quantile(0.50, sum(rate(vllm:time_to_first_token_seconds_bucket[5m])) by (le))", + "legendFormat": "p50" + }, + { + "expr": "histogram_quantile(0.95, sum(rate(vllm:time_to_first_token_seconds_bucket[5m])) by (le))", + "legendFormat": "p95" + }, + { + "expr": "histogram_quantile(0.99, sum(rate(vllm:time_to_first_token_seconds_bucket[5m])) by (le))", + "legendFormat": "p99" + } + ], + "fieldConfig": { + "defaults": { "unit": "s" } + } + }, + { + "title": "Inter-Token Latency (ITL)", + "type": "timeseries", + "gridPos": { "h": 8, "w": 5, "x": 9, "y": 19 }, + "id": 22, + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "targets": [ + { + "expr": "histogram_quantile(0.50, sum(rate(vllm:inter_token_latency_seconds_bucket[5m])) by (le))", + "legendFormat": "p50" + }, + { + "expr": "histogram_quantile(0.95, sum(rate(vllm:inter_token_latency_seconds_bucket[5m])) by (le))", + "legendFormat": "p95" + } + ], + "fieldConfig": { + "defaults": { "unit": "s" } + } + }, + { + "title": "Token Throughput", + "type": "timeseries", + "gridPos": { "h": 8, "w": 5, "x": 14, "y": 19 }, + "id": 23, + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "targets": [ + { + "expr": "sum(rate(vllm:prompt_tokens_total[5m]))", + "legendFormat": "prefill tok/s" + }, + { + "expr": 
"sum(rate(vllm:generation_tokens_total[5m]))", + "legendFormat": "decode tok/s" + } + ], + "fieldConfig": { + "defaults": { "unit": "tok/s" } + } + }, + { + "title": "Queue Depth", + "type": "timeseries", + "gridPos": { "h": 8, "w": 5, "x": 19, "y": 19 }, + "id": 24, + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "targets": [ + { + "expr": "vllm:num_requests_running", + "legendFormat": "running" + }, + { + "expr": "vllm:num_requests_waiting", + "legendFormat": "waiting" + } + ] + }, + { + "title": "Preemptions", + "type": "timeseries", + "gridPos": { "h": 8, "w": 6, "x": 0, "y": 27 }, + "id": 25, + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "targets": [ + { + "expr": "sum(rate(vllm:num_preemptions_total[5m]))", + "legendFormat": "preemptions/s" + } + ] + }, + { + "title": "Prefix Cache Hit Rate", + "type": "timeseries", + "gridPos": { "h": 8, "w": 6, "x": 6, "y": 27 }, + "id": 26, + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "targets": [ + { + "expr": "sum(rate(vllm:prefix_cache_hits_total[5m])) / clamp_min(sum(rate(vllm:prefix_cache_queries_total[5m])), 1)", + "legendFormat": "hit rate" + } + ], + "fieldConfig": { + "defaults": { "unit": "percentunit" } + } + }, + { + "title": "Queue Wait Time P95", + "type": "timeseries", + "gridPos": { "h": 8, "w": 6, "x": 12, "y": 27 }, + "id": 27, + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "targets": [ + { + "expr": "histogram_quantile(0.95, sum(rate(vllm:request_queue_time_seconds_bucket[5m])) by (le))", + "legendFormat": "p95" + } + ], + "fieldConfig": { + "defaults": { "unit": "s" } + } + }, + { + "title": "E2E Request Latency P95", + "type": "timeseries", + "gridPos": { "h": 8, "w": 6, "x": 18, "y": 27 }, + "id": 28, + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "targets": [ + { + "expr": "histogram_quantile(0.95, sum(rate(vllm:e2e_request_latency_seconds_bucket[5m])) by (le))", + "legendFormat": 
"p95" + } + ], + "fieldConfig": { + "defaults": { "unit": "s" } + } + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 35 }, + "id": 103, + "title": "GPU & System Resources", + "type": "row" + }, + { + "title": "GPU Utilization", + "type": "timeseries", + "gridPos": { "h": 8, "w": 6, "x": 0, "y": 36 }, + "id": 30, + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "targets": [ + { + "expr": "ray_node_gpus_utilization", + "legendFormat": "GPU {{ GpuIndex }} ({{ GpuDeviceName }})" + } + ], + "fieldConfig": { + "defaults": { "unit": "percent", "min": 0, "max": 100 } + } + }, + { + "title": "GPU Memory", + "type": "timeseries", + "gridPos": { "h": 8, "w": 6, "x": 6, "y": 36 }, + "id": 31, + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "targets": [ + { + "expr": "ray_node_gram_used", + "legendFormat": "used GPU {{ GpuIndex }}" + }, + { + "expr": "ray_node_gram_available", + "legendFormat": "available GPU {{ GpuIndex }}" + } + ], + "fieldConfig": { + "defaults": { "unit": "decgbytes" } + } + }, + { + "title": "CPU Utilization", + "type": "timeseries", + "gridPos": { "h": 8, "w": 6, "x": 12, "y": 36 }, + "id": 32, + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "targets": [ + { + "expr": "ray_node_cpu_utilization", + "legendFormat": "{{ instance }}" + } + ], + "fieldConfig": { + "defaults": { "unit": "percent", "min": 0, "max": 100 } + } + }, + { + "title": "System Memory", + "type": "timeseries", + "gridPos": { "h": 8, "w": 6, "x": 18, "y": 36 }, + "id": 33, + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "targets": [ + { + "expr": "ray_node_mem_used / ray_node_mem_total * 100", + "legendFormat": "used %" + } + ], + "fieldConfig": { + "defaults": { "unit": "percent", "min": 0, "max": 100 } + } + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 44 }, + "id": 104, + "title": "Ray Serve Internals", + "type": "row" + }, + { + "title": 
"Replica Health", + "type": "stat", + "gridPos": { "h": 8, "w": 6, "x": 0, "y": 45 }, + "id": 40, + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "targets": [ + { + "expr": "serve_deployment_replica_health_check", + "legendFormat": "{{ deployment }}" + } + ] + }, + { + "title": "Replica Processing Queries", + "type": "timeseries", + "gridPos": { "h": 8, "w": 6, "x": 6, "y": 45 }, + "id": 41, + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "targets": [ + { + "expr": "serve_replica_processing_queries", + "legendFormat": "{{ deployment }}" + } + ] + }, + { + "title": "Deployment Processing Latency P95", + "type": "timeseries", + "gridPos": { "h": 8, "w": 6, "x": 12, "y": 45 }, + "id": 42, + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "targets": [ + { + "expr": "histogram_quantile(0.95, sum(rate(serve_deployment_processing_latency_ms_bucket[5m])) by (le, deployment))", + "legendFormat": "p95 {{ deployment }}" + } + ], + "fieldConfig": { + "defaults": { "unit": "ms" } + } + }, + { + "title": "Health Check Failures", + "type": "timeseries", + "gridPos": { "h": 8, "w": 6, "x": 18, "y": 45 }, + "id": 43, + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "targets": [ + { + "expr": "sum(rate(serve_deployment_health_check_failures_total[5m])) by (deployment)", + "legendFormat": "{{ deployment }}" + } + ] + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 53 }, + "id": 105, + "title": "Operational Health", + "type": "row" + }, + { + "title": "Model Load Time", + "type": "timeseries", + "gridPos": { "h": 8, "w": 6, "x": 0, "y": 54 }, + "id": 50, + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "targets": [ + { + "expr": "histogram_quantile(0.95, sum(rate(yasha:model_load_duration_seconds_bucket[5m])) by (le, model))", + "legendFormat": "p95 {{ model }}" + } + ], + "fieldConfig": { + "defaults": { "unit": "s" } + } + }, + { + "title": "Model 
Load Failures", + "type": "stat", + "gridPos": { "h": 8, "w": 6, "x": 6, "y": 54 }, + "id": 51, + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "targets": [ + { + "expr": "sum(yasha:model_load_failures_total) by (model)", + "legendFormat": "{{ model }}" + } + ], + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { "value": 0, "color": "green" }, + { "value": 1, "color": "red" } + ] + } + } + } + }, + { + "title": "Resource Cleanup Errors", + "type": "stat", + "gridPos": { "h": 8, "w": 6, "x": 12, "y": 54 }, + "id": 52, + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "targets": [ + { + "expr": "sum(yasha:resource_cleanup_errors_total) by (model, component)", + "legendFormat": "{{ model }} ({{ component }})" + } + ], + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { "value": 0, "color": "green" }, + { "value": 1, "color": "red" } + ] + } + } + } + }, + { + "title": "Streaming Chunks/s", + "type": "timeseries", + "gridPos": { "h": 8, "w": 6, "x": 18, "y": 54 }, + "id": 53, + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "targets": [ + { + "expr": "sum(rate(yasha:stream_chunks_total[5m])) by (model)", + "legendFormat": "{{ model }}" + } + ], + "fieldConfig": { + "defaults": { "unit": "cps" } + } + } + ], + "schemaVersion": 39, + "tags": ["yasha", "inference", "gpu", "llm"], + "templating": { + "list": [] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "Yasha Inference Server", + "uid": "yasha-overview", + "version": 1 +} diff --git a/docs/monitoring.md b/docs/monitoring.md new file mode 100644 index 0000000..a4535a9 --- /dev/null +++ b/docs/monitoring.md @@ -0,0 +1,163 @@ +# Monitoring + +Yasha exposes Prometheus metrics through a single port via Ray's metrics agent. When enabled, all metrics — Ray cluster, Ray Serve, vLLM engine, and custom Yasha metrics — are available on one scrape endpoint. 
+ +## Architecture + +``` +Prometheus ──scrape──> Ray Metrics Agent (:8079) + | + |-- ray_* Ray cluster: GPU, CPU, memory, actors + |-- serve_* Ray Serve: HTTP requests, latency, replicas + |-- vllm:* vLLM engine: KV cache, TTFT, tokens, queue + |-- yasha:* Custom: per-model latency, errors, load time +``` + +## Enabling Metrics + +Metrics are disabled by default. Set `YASHA_METRICS=true` to enable: + +```bash +docker run --rm --shm-size=8g --gpus all \ + -e HF_TOKEN=your_token \ + -e YASHA_METRICS=true \ + -v ./models.yaml:/yasha/config/models.yaml \ + -p 8000:8000 -p 8079:8079 -p 8265:8265 \ + ghcr.io/alez007/yasha:latest +``` + +| Env Var | Default | Description | +|---|---|---| +| `YASHA_METRICS` | `false` | Master toggle. Enables all metrics and the Ray metrics export port. | +| `RAY_METRICS_EXPORT_PORT` | `8079` | Port for the Ray metrics agent (only active when `YASHA_METRICS=true`). | + +When `YASHA_METRICS=false`, no metrics are collected and port 8079 is not exposed. Zero overhead. + +## Connecting to Prometheus + +Add Yasha as a scrape target in your `prometheus.yml`: + +```yaml +scrape_configs: + - job_name: yasha + scrape_interval: 15s + static_configs: + - targets: [":8079"] +``` + +For multi-node Ray clusters, use Ray's auto-generated service discovery file instead of static targets: + +```yaml +scrape_configs: + - job_name: yasha + file_sd_configs: + - files: ["/tmp/ray/prom_metrics_service_discovery.json"] +``` + +## Connecting to Grafana + +A pre-built Grafana dashboard is included at [`docs/grafana-dashboard.json`](grafana-dashboard.json). + +To import it: + +1. Open Grafana and go to **Dashboards > Import** +2. Upload `grafana-dashboard.json` or paste its contents +3. 
Select your Prometheus datasource when prompted + +The dashboard has 6 rows: + +| Row | What it shows | Metric sources | +|---|---|---| +| **Overview** | Request rate, error rate, in-flight requests, models loaded, client disconnects | `yasha:*` | +| **Latency** | Gateway P50/P95/P99, per-model latency, per-usecase latency (generate, TTS, image, STT, embed) | `yasha:*` | +| **vLLM Engine** | KV cache usage, TTFT, inter-token latency, token throughput, queue depth, preemptions, prefix cache hit rate | `vllm:*` | +| **GPU & System** | GPU utilization, GPU memory, CPU, system memory | `ray_node_*` | +| **Ray Serve** | Replica health, processing queries, deployment latency, health check failures | `serve_*` | +| **Operational** | Model load time, load failures, resource cleanup errors, streaming chunks/s | `yasha:*` | + +## Health Check + +A health endpoint is always available regardless of the metrics toggle: + +```bash +curl http://localhost:8000/health +# {"status": "ok"} +``` + +## Yasha Metrics Reference + +All custom metrics use the `yasha:` prefix and are exported via `ray.serve.metrics`. 
+ +### Gateway + +| Metric | Type | Tags | Description | +|---|---|---|---| +| `yasha:request_total` | Counter | `model`, `endpoint`, `status` | Total requests by model and API method | +| `yasha:request_duration_seconds` | Histogram | `model`, `endpoint` | End-to-end request latency | +| `yasha:request_errors_total` | Counter | `model`, `endpoint`, `error_type` | Errors: `inference_error`, `stream_error`, `unhandled` | +| `yasha:request_in_progress` | Gauge | `model`, `endpoint` | Currently processing requests | +| `yasha:client_disconnects_total` | Counter | `model`, `endpoint` | Client disconnected before response completed | +| `yasha:stream_chunks_total` | Counter | `model` | Streaming chunks emitted | + +### Model Deployment + +| Metric | Type | Tags | Description | +|---|---|---|---| +| `yasha:model_load_duration_seconds` | Histogram | `model`, `loader` | Time to initialize a model | +| `yasha:model_load_failures_total` | Counter | `model`, `loader` | Failed model initializations | +| `yasha:models_loaded` | Gauge | | Number of loaded and ready models | + +### Inference Timing + +| Metric | Type | Tags | Description | +|---|---|---|---| +| `yasha:generation_duration_seconds` | Histogram | `model` | Chat/text generation latency | +| `yasha:tts_generation_duration_seconds` | Histogram | `model` | Text-to-speech latency | +| `yasha:image_generation_duration_seconds` | Histogram | `model` | Image generation latency | +| `yasha:transcription_duration_seconds` | Histogram | `model` | Speech-to-text latency | +| `yasha:embedding_duration_seconds` | Histogram | `model` | Embedding latency | + +### Resource Cleanup + +| Metric | Type | Tags | Description | +|---|---|---|---| +| `yasha:resource_cleanup_errors_total` | Counter | `model`, `component` | Errors during engine/model cleanup | + +## Built-in Metrics from vLLM and Ray + +These are automatically available when `YASHA_METRICS=true` — no additional configuration needed. 
+ +### vLLM (`vllm:*`) + +Key metrics for LLM inference monitoring: + +- `vllm:num_requests_running` / `vllm:num_requests_waiting` — queue depth +- `vllm:kv_cache_usage_perc` — KV cache utilization (0-1) +- `vllm:time_to_first_token_seconds` — TTFT histogram +- `vllm:inter_token_latency_seconds` — ITL histogram +- `vllm:e2e_request_latency_seconds` — end-to-end latency histogram +- `vllm:request_queue_time_seconds` — time spent waiting in queue +- `vllm:prompt_tokens` / `vllm:generation_tokens` — token throughput counters +- `vllm:num_preemptions` — memory pressure signal +- `vllm:prefix_cache_hits` / `vllm:prefix_cache_queries` — cache efficiency + +Full reference: [vLLM Metrics Documentation](https://docs.vllm.ai/en/stable/design/metrics/) + +### Ray Serve (`serve_*`) + +- `serve_num_http_requests` — request count by route, method, status +- `serve_http_request_latency_ms` — request latency histogram +- `serve_num_ongoing_http_requests` — in-flight requests +- `serve_deployment_processing_latency_ms` — per-replica processing time +- `serve_deployment_replica_health_check` — replica health status + +Full reference: [Ray Serve Monitoring](https://docs.ray.io/en/latest/serve/monitoring.html) + +### Ray Cluster (`ray_*`) + +- `ray_node_gpus_utilization` — GPU utilization by device +- `ray_node_gram_used` / `ray_node_gram_available` — GPU memory +- `ray_node_cpu_utilization` — CPU usage +- `ray_node_mem_used` / `ray_node_mem_total` — system memory + +Full reference: [Ray Metrics](https://docs.ray.io/en/latest/cluster/metrics.html) diff --git a/yasha/infer/diffusers/diffusers_infer.py b/yasha/infer/diffusers/diffusers_infer.py index c2ac396..752af70 100644 --- a/yasha/infer/diffusers/diffusers_infer.py +++ b/yasha/infer/diffusers/diffusers_infer.py @@ -42,7 +42,9 @@ def __del__(self): if torch.cuda.is_available(): torch.cuda.empty_cache() except Exception: - pass + from yasha.metrics import RESOURCE_CLEANUP_ERRORS_TOTAL + + 
RESOURCE_CLEANUP_ERRORS_TOTAL.inc(tags={"model": self.model_config.name, "component": "diffusers_pipeline"}) async def start(self): from diffusers.pipelines.auto_pipeline import AutoPipelineForText2Image diff --git a/yasha/infer/infer_config.py b/yasha/infer/infer_config.py index dc5b90e..a2ef96f 100644 --- a/yasha/infer/infer_config.py +++ b/yasha/infer/infer_config.py @@ -109,14 +109,19 @@ def is_set(self) -> bool: class RequestWatcher: """Watches a FastAPI Request for client disconnect and signals via a Ray actor event.""" - def __init__(self, raw_request: Request): + def __init__(self, raw_request: Request, model: str = "", endpoint: str = ""): self._request = raw_request self._event = DisconnectEvent.remote() + self._model = model + self._endpoint = endpoint self._task = asyncio.create_task(self._watch()) async def _watch(self): + from yasha.metrics import CLIENT_DISCONNECTS_TOTAL + while True: if await self._request.is_disconnected(): + CLIENT_DISCONNECTS_TOTAL.inc(tags={"model": self._model, "endpoint": self._endpoint}) await self._event.set.remote() # type: ignore[attr-defined] break await asyncio.sleep(0.1) diff --git a/yasha/infer/model_deployment.py b/yasha/infer/model_deployment.py index 23132a3..3b8239c 100644 --- a/yasha/infer/model_deployment.py +++ b/yasha/infer/model_deployment.py @@ -1,4 +1,5 @@ import logging +import time from collections.abc import AsyncGenerator from typing import Any @@ -9,6 +10,15 @@ from yasha.infer.infer_config import DisconnectProxy, ModelLoader, YashaModelConfig from yasha.infer.transformers.transformers_infer import TransformersInfer from yasha.infer.vllm.vllm_infer import VllmInfer +from yasha.metrics import ( + EMBEDDING_DURATION_SECONDS, + GENERATION_DURATION_SECONDS, + IMAGE_GENERATION_DURATION_SECONDS, + MODEL_LOAD_DURATION_SECONDS, + MODEL_LOAD_FAILURES_TOTAL, + TRANSCRIPTION_DURATION_SECONDS, + TTS_GENERATION_DURATION_SECONDS, +) from yasha.openai.protocol import ( ChatCompletionRequest, EmbeddingRequest, @@ 
-25,20 +35,31 @@ class ModelDeployment: async def __init__(self, config: YashaModelConfig): self.config = config - if config.loader == ModelLoader.vllm: - self.infer = VllmInfer(config) - elif config.loader == ModelLoader.transformers: - self.infer = TransformersInfer(config) - elif config.loader == ModelLoader.diffusers: - self.infer = DiffusersInfer(config) - else: - self.infer = CustomInfer(config) + start = time.monotonic() + try: + if config.loader == ModelLoader.vllm: + self.infer = VllmInfer(config) + elif config.loader == ModelLoader.transformers: + self.infer = TransformersInfer(config) + elif config.loader == ModelLoader.diffusers: + self.infer = DiffusersInfer(config) + else: + self.infer = CustomInfer(config) - await self.infer.start() + await self.infer.start() + except Exception: + MODEL_LOAD_FAILURES_TOTAL.inc(tags={"model": config.name, "loader": config.loader.value}) + raise + finally: + MODEL_LOAD_DURATION_SECONDS.observe( + time.monotonic() - start, tags={"model": config.name, "loader": config.loader.value} + ) async def generate(self, request: ChatCompletionRequest, request_headers: dict[str, str], disconnect_event: Any): proxy = DisconnectProxy(disconnect_event, request_headers) + start = time.monotonic() result = await self.infer.create_chat_completion(request, proxy) + GENERATION_DURATION_SECONDS.observe(time.monotonic() - start, tags={"model": self.config.name}) if isinstance(result, AsyncGenerator): async for chunk in result: yield chunk @@ -47,7 +68,9 @@ async def generate(self, request: ChatCompletionRequest, request_headers: dict[s async def embed(self, request: EmbeddingRequest, request_headers: dict[str, str], disconnect_event: Any): proxy = DisconnectProxy(disconnect_event, request_headers) + start = time.monotonic() result = await self.infer.create_embedding(request, proxy) + EMBEDDING_DURATION_SECONDS.observe(time.monotonic() - start, tags={"model": self.config.name}) if isinstance(result, AsyncGenerator): async for chunk in result: 
yield chunk @@ -58,7 +81,9 @@ async def transcribe( self, audio_data: bytes, request: TranscriptionRequest, request_headers: dict[str, str], disconnect_event: Any ): proxy = DisconnectProxy(disconnect_event, request_headers) + start = time.monotonic() result = await self.infer.create_transcription(audio_data, request, proxy) + TRANSCRIPTION_DURATION_SECONDS.observe(time.monotonic() - start, tags={"model": self.config.name}) if isinstance(result, AsyncGenerator): async for chunk in result: yield chunk @@ -69,7 +94,9 @@ async def translate( self, audio_data: bytes, request: TranslationRequest, request_headers: dict[str, str], disconnect_event: Any ): proxy = DisconnectProxy(disconnect_event, request_headers) + start = time.monotonic() result = await self.infer.create_translation(audio_data, request, proxy) + TRANSCRIPTION_DURATION_SECONDS.observe(time.monotonic() - start, tags={"model": self.config.name}) if isinstance(result, AsyncGenerator): async for chunk in result: yield chunk @@ -78,7 +105,9 @@ async def translate( async def speak(self, request: SpeechRequest, request_headers: dict[str, str], disconnect_event: Any): proxy = DisconnectProxy(disconnect_event, request_headers) + start = time.monotonic() result = await self.infer.create_speech(request, proxy) + TTS_GENERATION_DURATION_SECONDS.observe(time.monotonic() - start, tags={"model": self.config.name}) if isinstance(result, AsyncGenerator): async for chunk in result: yield chunk @@ -87,7 +116,9 @@ async def speak(self, request: SpeechRequest, request_headers: dict[str, str], d async def imagine(self, request: ImageGenerationRequest, request_headers: dict[str, str], disconnect_event: Any): proxy = DisconnectProxy(disconnect_event, request_headers) + start = time.monotonic() result = await self.infer.create_image_generation(request, proxy) + IMAGE_GENERATION_DURATION_SECONDS.observe(time.monotonic() - start, tags={"model": self.config.name}) if isinstance(result, AsyncGenerator): async for chunk in result: 
yield chunk diff --git a/yasha/infer/transformers/transformers_infer.py b/yasha/infer/transformers/transformers_infer.py index d48a54c..23b8879 100644 --- a/yasha/infer/transformers/transformers_infer.py +++ b/yasha/infer/transformers/transformers_infer.py @@ -41,7 +41,9 @@ def __del__(self): if torch.cuda.is_available(): torch.cuda.empty_cache() except Exception: - pass + from yasha.metrics import RESOURCE_CLEANUP_ERRORS_TOTAL + + RESOURCE_CLEANUP_ERRORS_TOTAL.inc(tags={"model": self.model_config.name, "component": "transformers_model"}) async def start(self): self.serving_chat = None diff --git a/yasha/infer/vllm/vllm_infer.py b/yasha/infer/vllm/vllm_infer.py index 9816d77..b4fd9dc 100644 --- a/yasha/infer/vllm/vllm_infer.py +++ b/yasha/infer/vllm/vllm_infer.py @@ -115,7 +115,9 @@ def __del__(self): if engine := getattr(self, "engine", None): engine.shutdown() except Exception: - pass + from yasha.metrics import RESOURCE_CLEANUP_ERRORS_TOTAL + + RESOURCE_CLEANUP_ERRORS_TOTAL.inc(tags={"model": self.model_config.name, "component": "vllm_engine"}) async def start(self): logger.info("Start vllm infer for model: %s", self.model_config) diff --git a/yasha/metrics.py b/yasha/metrics.py new file mode 100644 index 0000000..04dbe11 --- /dev/null +++ b/yasha/metrics.py @@ -0,0 +1,192 @@ +"""Yasha Prometheus metrics — all exported via Ray's metrics agent. + +When YASHA_METRICS=true, metrics are defined using ray.serve.metrics so they +flow through the same Ray metrics agent port as ray_*, serve_*, and vllm:* +metrics. When disabled, no-op objects are exported so call sites need zero +conditional logic. 
+""" + +import os + +_ENABLED = os.environ.get("YASHA_METRICS", "false").lower() == "true" + +# --------------------------------------------------------------------------- +# No-op metric stubs (used when metrics are disabled) +# --------------------------------------------------------------------------- + + +class _NoOpCounter: + def inc(self, value=1.0, tags=None): + pass + + def set_default_tags(self, tags): + pass + + +class _NoOpGauge: + def set(self, value, tags=None): + pass + + def set_default_tags(self, tags): + pass + + +class _NoOpHistogram: + def observe(self, value, tags=None): + pass + + def set_default_tags(self, tags): + pass + + +# --------------------------------------------------------------------------- +# Latency bucket boundaries (in seconds) +# --------------------------------------------------------------------------- + +_REQUEST_LATENCY_BOUNDARIES = [0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30, 60] +_MODEL_LOAD_BOUNDARIES: list[float] = [1, 5, 10, 30, 60, 120, 300, 600] + + +def _build_metrics(): + """Construct real or no-op metric objects based on YASHA_METRICS.""" + + if not _ENABLED: + return { + # Gateway + "request_total": _NoOpCounter(), + "request_duration_seconds": _NoOpHistogram(), + "request_errors_total": _NoOpCounter(), + "request_in_progress": _NoOpGauge(), + "client_disconnects_total": _NoOpCounter(), + "stream_chunks_total": _NoOpCounter(), + # Model deployment + "model_load_duration_seconds": _NoOpHistogram(), + "model_load_failures_total": _NoOpCounter(), + "models_loaded": _NoOpGauge(), + # Inference timing + "generation_duration_seconds": _NoOpHistogram(), + "tts_generation_duration_seconds": _NoOpHistogram(), + "image_generation_duration_seconds": _NoOpHistogram(), + "transcription_duration_seconds": _NoOpHistogram(), + "embedding_duration_seconds": _NoOpHistogram(), + # Resource cleanup + "resource_cleanup_errors_total": _NoOpCounter(), + } + + from ray.serve.metrics import Counter, Gauge, Histogram + + # Ray's type stubs 
over-constrain tag_keys (Tuple[str] instead of variable-length + # tuples) and boundaries (List[float] vs int literals). Suppressed with type: ignore. + return { + # -- Gateway layer -- + "request_total": Counter( + "yasha:request_total", + description="Total inference requests by model and endpoint.", + tag_keys=("model", "endpoint", "status"), # type: ignore[arg-type] + ), + "request_duration_seconds": Histogram( + "yasha:request_duration_seconds", + description="End-to-end request latency (gateway to response) in seconds.", + boundaries=_REQUEST_LATENCY_BOUNDARIES, + tag_keys=("model", "endpoint"), # type: ignore[arg-type] + ), + "request_errors_total": Counter( + "yasha:request_errors_total", + description="Total inference errors by model, endpoint, and error type.", + tag_keys=("model", "endpoint", "error_type"), # type: ignore[arg-type] + ), + "request_in_progress": Gauge( + "yasha:request_in_progress", + description="Number of requests currently being processed per model.", + tag_keys=("model", "endpoint"), # type: ignore[arg-type] + ), + "client_disconnects_total": Counter( + "yasha:client_disconnects_total", + description="Total client disconnects during inference.", + tag_keys=("model", "endpoint"), # type: ignore[arg-type] + ), + "stream_chunks_total": Counter( + "yasha:stream_chunks_total", + description="Total streaming chunks emitted.", + tag_keys=("model",), + ), + # -- Model deployment layer -- + "model_load_duration_seconds": Histogram( + "yasha:model_load_duration_seconds", + description="Model initialization time in seconds.", + boundaries=_MODEL_LOAD_BOUNDARIES, + tag_keys=("model", "loader"), # type: ignore[arg-type] + ), + "model_load_failures_total": Counter( + "yasha:model_load_failures_total", + description="Total failed model deployments.", + tag_keys=("model", "loader"), # type: ignore[arg-type] + ), + "models_loaded": Gauge( + "yasha:models_loaded", + description="Number of models currently loaded.", + ), + # -- Inference timing -- + 
"generation_duration_seconds": Histogram( + "yasha:generation_duration_seconds", + description="Chat/text generation latency in seconds.", + boundaries=_REQUEST_LATENCY_BOUNDARIES, + tag_keys=("model",), + ), + "tts_generation_duration_seconds": Histogram( + "yasha:tts_generation_duration_seconds", + description="TTS inference latency in seconds.", + boundaries=_REQUEST_LATENCY_BOUNDARIES, + tag_keys=("model",), + ), + "image_generation_duration_seconds": Histogram( + "yasha:image_generation_duration_seconds", + description="Image generation latency in seconds.", + boundaries=_REQUEST_LATENCY_BOUNDARIES, + tag_keys=("model",), + ), + "transcription_duration_seconds": Histogram( + "yasha:transcription_duration_seconds", + description="Speech-to-text latency in seconds.", + boundaries=_REQUEST_LATENCY_BOUNDARIES, + tag_keys=("model",), + ), + "embedding_duration_seconds": Histogram( + "yasha:embedding_duration_seconds", + description="Embedding inference latency in seconds.", + boundaries=_REQUEST_LATENCY_BOUNDARIES, + tag_keys=("model",), + ), + # -- Resource cleanup -- + "resource_cleanup_errors_total": Counter( + "yasha:resource_cleanup_errors_total", + description="Errors during resource cleanup (engine shutdown, memory release).", + tag_keys=("model", "component"), # type: ignore[arg-type] + ), + } + + +_metrics = _build_metrics() + +# -- Gateway -- +REQUEST_TOTAL = _metrics["request_total"] +REQUEST_DURATION_SECONDS = _metrics["request_duration_seconds"] +REQUEST_ERRORS_TOTAL = _metrics["request_errors_total"] +REQUEST_IN_PROGRESS = _metrics["request_in_progress"] +CLIENT_DISCONNECTS_TOTAL = _metrics["client_disconnects_total"] +STREAM_CHUNKS_TOTAL = _metrics["stream_chunks_total"] + +# -- Model deployment -- +MODEL_LOAD_DURATION_SECONDS = _metrics["model_load_duration_seconds"] +MODEL_LOAD_FAILURES_TOTAL = _metrics["model_load_failures_total"] +MODELS_LOADED = _metrics["models_loaded"] + +# -- Inference timing -- +GENERATION_DURATION_SECONDS = 
_metrics["generation_duration_seconds"] +TTS_GENERATION_DURATION_SECONDS = _metrics["tts_generation_duration_seconds"] +IMAGE_GENERATION_DURATION_SECONDS = _metrics["image_generation_duration_seconds"] +TRANSCRIPTION_DURATION_SECONDS = _metrics["transcription_duration_seconds"] +EMBEDDING_DURATION_SECONDS = _metrics["embedding_duration_seconds"] + +# -- Resource cleanup -- +RESOURCE_CLEANUP_ERRORS_TOTAL = _metrics["resource_cleanup_errors_total"] diff --git a/yasha/openai/api.py b/yasha/openai/api.py index 4227147..d71d70b 100644 --- a/yasha/openai/api.py +++ b/yasha/openai/api.py @@ -11,6 +11,14 @@ from ray.serve.handle import DeploymentHandle from yasha.infer.infer_config import ModelUsecase, RequestWatcher +from yasha.metrics import ( + MODELS_LOADED, + REQUEST_DURATION_SECONDS, + REQUEST_ERRORS_TOTAL, + REQUEST_IN_PROGRESS, + REQUEST_TOTAL, + STREAM_CHUNKS_TOTAL, +) from yasha.openai.protocol import ( ChatCompletionRequest, ChatCompletionResponse, @@ -78,6 +86,7 @@ class YashaAPI: def __init__(self, model_handles: dict[str, tuple[DeploymentHandle, ModelUsecase]]): self.models = {name: handle for name, (handle, _) in model_handles.items()} self.model_list = [OpenAiModelCard(id=name) for name in model_handles] + MODELS_LOADED.set(len(self.models)) # all models are RUNNING by this point def _get_handle(self, model_name: str | None) -> DeploymentHandle: if model_name is None or model_name not in self.models: @@ -85,39 +94,68 @@ def _get_handle(self, model_name: str | None) -> DeploymentHandle: return self.models[model_name] async def _handle_response( - self, response_gen, watcher: RequestWatcher, stream_media_type: str = "text/event-stream" + self, + response_gen, + watcher: RequestWatcher, + model: str, + endpoint: str, + stream_media_type: str = "text/event-stream", ): - first = await response_gen.__anext__() - - if isinstance(first, ErrorResponse): - watcher.stop() - return _error_response(first) - - if isinstance(first, RawSpeechResponse): - watcher.stop() - 
return Response(content=first.audio, media_type=first.media_type) - - if isinstance( - first, - ChatCompletionResponse - | EmbeddingResponse - | TranscriptionResponse - | TranslationResponse - | ImageGenerationResponse, - ): - watcher.stop() - return JSONResponse(content=first.model_dump(mode="json")) - - # streaming — first chunk already consumed, chain it back - async def _stream(): - try: - yield first - async for chunk in response_gen: - yield chunk - finally: + start = time.monotonic() + REQUEST_IN_PROGRESS.set(1, tags={"model": model, "endpoint": endpoint}) + try: + first = await response_gen.__anext__() + + if isinstance(first, ErrorResponse): + REQUEST_ERRORS_TOTAL.inc(tags={"model": model, "endpoint": endpoint, "error_type": "inference_error"}) + REQUEST_TOTAL.inc(tags={"model": model, "endpoint": endpoint, "status": "error"}) watcher.stop() + return _error_response(first) - return StreamingResponse(content=_stream(), media_type=stream_media_type) + if isinstance(first, RawSpeechResponse): + REQUEST_TOTAL.inc(tags={"model": model, "endpoint": endpoint, "status": "ok"}) + watcher.stop() + return Response(content=first.audio, media_type=first.media_type) + + if isinstance( + first, + ChatCompletionResponse + | EmbeddingResponse + | TranscriptionResponse + | TranslationResponse + | ImageGenerationResponse, + ): + REQUEST_TOTAL.inc(tags={"model": model, "endpoint": endpoint, "status": "ok"}) + watcher.stop() + return JSONResponse(content=first.model_dump(mode="json")) + + # streaming — first chunk already consumed, chain it back + async def _stream(): + try: + STREAM_CHUNKS_TOTAL.inc(tags={"model": model}) + yield first + async for chunk in response_gen: + STREAM_CHUNKS_TOTAL.inc(tags={"model": model}) + yield chunk + REQUEST_TOTAL.inc(tags={"model": model, "endpoint": endpoint, "status": "ok"}) + except Exception: + REQUEST_ERRORS_TOTAL.inc(tags={"model": model, "endpoint": endpoint, "error_type": "stream_error"}) + raise + finally: + watcher.stop() + + 
return StreamingResponse(content=_stream(), media_type=stream_media_type) + except Exception: + REQUEST_ERRORS_TOTAL.inc(tags={"model": model, "endpoint": endpoint, "error_type": "unhandled"}) + raise + finally: + duration = time.monotonic() - start + REQUEST_DURATION_SECONDS.observe(duration, tags={"model": model, "endpoint": endpoint}) + REQUEST_IN_PROGRESS.set(0, tags={"model": model, "endpoint": endpoint}) + + @app.get("/health") + async def health(self): + return {"status": "ok"} @app.get("/v1/models", response_model=OpenaiModelList) async def list_models(self): @@ -132,8 +170,9 @@ async def model_info(self, model: str) -> OpenAiModelCard: @app.post("/v1/chat/completions") async def create_chat_completion(self, request: ChatCompletionRequest, raw_request: Request): + model = request.model or "" handle = self._get_handle(request.model) - watcher = RequestWatcher(raw_request) + watcher = RequestWatcher(raw_request, model=model, endpoint="create_chat_completion") headers = dict(raw_request.headers) # Materialize any lazy pydantic ValidatorIterators (from Iterable-typed fields # like tool_calls) in place — they can't be pickled across the Ray boundary. @@ -152,22 +191,24 @@ async def _logged_gen(): logger.info("chat_completion actor output: %s", chunk) yield chunk - return await self._handle_response(_logged_gen(), watcher) + return await self._handle_response(_logged_gen(), watcher, model, "create_chat_completion") @app.post("/v1/embeddings") async def create_embeddings(self, request: EmbeddingRequest, raw_request: Request): + model = request.model or "" handle = self._get_handle(request.model) - watcher = RequestWatcher(raw_request) + watcher = RequestWatcher(raw_request, model=model, endpoint="create_embeddings") headers = dict(raw_request.headers) # EmbeddingRequest is a UnionType — force resolution before Ray pickle boundary. 
request = type(request).model_validate_json(request.model_dump_json()) response_gen = handle.embed.options(stream=True).remote(request, headers, watcher.event) - return await self._handle_response(response_gen, watcher) + return await self._handle_response(response_gen, watcher, model, "create_embeddings") @app.post("/v1/audio/transcriptions") async def create_transcriptions(self, request: Annotated[TranscriptionRequest, Form()], raw_request: Request): + model = request.model or "" handle = self._get_handle(request.model) - watcher = RequestWatcher(raw_request) + watcher = RequestWatcher(raw_request, model=model, endpoint="create_transcriptions") headers = dict(raw_request.headers) # Read audio bytes before crossing process boundary — UploadFile is not serializable. # The bytes are passed separately; the request is reconstructed without the file field. @@ -176,34 +217,35 @@ async def create_transcriptions(self, request: Annotated[TranscriptionRequest, F response_gen = handle.transcribe.options(stream=True).remote( audio_data, request_no_file, headers, watcher.event ) - return await self._handle_response(response_gen, watcher) + return await self._handle_response(response_gen, watcher, model, "create_transcriptions") @app.post("/v1/audio/translations") async def create_translations(self, request: Annotated[TranslationRequest, Form()], raw_request: Request): + model = request.model or "" handle = self._get_handle(request.model) - watcher = RequestWatcher(raw_request) + watcher = RequestWatcher(raw_request, model=model, endpoint="create_translations") headers = dict(raw_request.headers) # Read audio bytes before crossing process boundary — UploadFile is not serializable. # The bytes are passed separately; the request is reconstructed without the file field. 
audio_data = await request.file.read() request_no_file = TranslationRequest.model_construct(**request.model_dump(exclude={"file"})) response_gen = handle.translate.options(stream=True).remote(audio_data, request_no_file, headers, watcher.event) - return await self._handle_response(response_gen, watcher) + return await self._handle_response(response_gen, watcher, model, "create_translations") @app.post("/v1/audio/speech") async def create_speech(self, request: SpeechRequest, raw_request: Request): logger.info("speech request headers: %s", dict(raw_request.headers)) logger.info("speech request body: %s", request.model_dump_json()) handle = self._get_handle(request.model) - watcher = RequestWatcher(raw_request) + watcher = RequestWatcher(raw_request, model=request.model, endpoint="create_speech") headers = dict(raw_request.headers) response_gen = handle.speak.options(stream=True).remote(request, headers, watcher.event) - return await self._handle_response(response_gen, watcher) + return await self._handle_response(response_gen, watcher, request.model, "create_speech") @app.post("/v1/images/generations") async def create_image(self, request: ImageGenerationRequest, raw_request: Request): handle = self._get_handle(request.model) - watcher = RequestWatcher(raw_request) + watcher = RequestWatcher(raw_request, model=request.model, endpoint="create_image") headers = dict(raw_request.headers) response_gen = handle.imagine.options(stream=True).remote(request, headers, watcher.event) - return await self._handle_response(response_gen, watcher) + return await self._handle_response(response_gen, watcher, request.model, "create_image") From cfdfef3b246e2262f8e427d24b998008a7e67644 Mon Sep 17 00:00:00 2001 From: Alex M Date: Tue, 7 Apr 2026 05:38:02 +0000 Subject: [PATCH 2/2] fix: correct metric names in dashboard and route vLLM metrics through Ray All metrics exported via Ray's metrics agent are prefixed with ray_, but the Grafana dashboard and docs referenced unprefixed names. 
This updates all queries to use the actual exported names (ray_yasha_*, ray_vllm_*, ray_serve_*). - Route vLLM native metrics through Ray via RayPrometheusStatLogger - Fix Ray Serve Internals panels to use metrics that exist in Ray 2.54 - Fix model load time panel to work for one-shot events (avg not rate) - Enable YASHA_METRICS=true by default in Dockerfiles and metrics.py - Expose port 8079 in devcontainer config - Update monitoring.md to reflect all metric name prefixes --- .devcontainer/devcontainer.json | 8 ++- Dockerfile.dev | 2 +- Dockerfile.prod | 2 +- docs/grafana-dashboard.json | 101 +++++++++++++++++--------------- docs/monitoring.md | 92 +++++++++++++++-------------- yasha/infer/vllm/vllm_infer.py | 8 +++ yasha/metrics.py | 32 +++++----- 7 files changed, 133 insertions(+), 112 deletions(-) diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index da06175..121e13a 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -15,9 +15,10 @@ "--shm-size=8g", "--gpus=all", "-p", "0.0.0.0:8000:8000", - "-p", "0.0.0.0:8265:8265" + "-p", "0.0.0.0:8265:8265", + "-p", "0.0.0.0:8079:8079" ], - "forwardPorts": [8000, 8265], + "forwardPorts": [8000, 8265, 8079], "portsAttributes": { "8000": { "label": "API" }, "8265": { "label": "Ray Dashboard" } @@ -31,7 +32,8 @@ }, "remoteEnv": { "HF_TOKEN": "${localEnv:HF_TOKEN}", - "YASHA_PLUGINS": "${localEnv:YASHA_PLUGINS}" + "YASHA_PLUGINS": "${localEnv:YASHA_PLUGINS}", + "YASHA_METRICS": "true" }, "customizations": { "vscode": { diff --git a/Dockerfile.dev b/Dockerfile.dev index 160ba90..346a210 100644 --- a/Dockerfile.dev +++ b/Dockerfile.dev @@ -54,7 +54,7 @@ ENV RAY_CLUSTER_ADDRESS=0.0.0.0 ENV RAY_HEAD_CPU_NUM=2 ENV RAY_HEAD_GPU_NUM=1 ENV YASHA_USE_EXISTING_RAY_CLUSTER=false -ENV YASHA_METRICS=false +ENV YASHA_METRICS=true ENV RAY_METRICS_EXPORT_PORT=8079 RUN uv venv diff --git a/Dockerfile.prod b/Dockerfile.prod index d6b51f5..9a9e2fc 100644 --- a/Dockerfile.prod +++ 
b/Dockerfile.prod @@ -54,7 +54,7 @@ ENV RAY_CLUSTER_ADDRESS=0.0.0.0 ENV RAY_HEAD_CPU_NUM=2 ENV RAY_HEAD_GPU_NUM=1 ENV YASHA_USE_EXISTING_RAY_CLUSTER=false -ENV YASHA_METRICS=false +ENV YASHA_METRICS=true ENV RAY_METRICS_EXPORT_PORT=8079 RUN uv venv diff --git a/docs/grafana-dashboard.json b/docs/grafana-dashboard.json index eb3cfef..15e3f07 100644 --- a/docs/grafana-dashboard.json +++ b/docs/grafana-dashboard.json @@ -32,7 +32,7 @@ "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "targets": [ { - "expr": "sum(rate(yasha:request_total[5m])) by (endpoint)", + "expr": "sum(rate(ray_yasha_request_total[5m])) by (endpoint)", "legendFormat": "{{ endpoint }}" } ], @@ -48,7 +48,7 @@ "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "targets": [ { - "expr": "sum(rate(yasha:request_errors_total[5m])) by (model, error_type)", + "expr": "sum(rate(ray_yasha_request_errors_total[5m])) by (model, error_type)", "legendFormat": "{{ model }} - {{ error_type }}" } ], @@ -64,7 +64,7 @@ "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "targets": [ { - "expr": "yasha:request_in_progress", + "expr": "ray_yasha_request_in_progress", "legendFormat": "{{ model }}" } ] @@ -77,7 +77,7 @@ "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "targets": [ { - "expr": "yasha:models_loaded" + "expr": "ray_yasha_models_loaded" } ] }, @@ -89,7 +89,7 @@ "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "targets": [ { - "expr": "sum(rate(yasha:client_disconnects_total[5m])) by (model)", + "expr": "sum(rate(ray_yasha_client_disconnects_total[5m])) by (model)", "legendFormat": "{{ model }}" } ], @@ -112,15 +112,15 @@ "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "targets": [ { - "expr": "histogram_quantile(0.50, sum(rate(yasha:request_duration_seconds_bucket[5m])) by (le, endpoint))", + "expr": "histogram_quantile(0.50, sum(rate(ray_yasha_request_duration_seconds_bucket[5m])) by (le, endpoint))", 
"legendFormat": "p50 {{ endpoint }}" }, { - "expr": "histogram_quantile(0.95, sum(rate(yasha:request_duration_seconds_bucket[5m])) by (le, endpoint))", + "expr": "histogram_quantile(0.95, sum(rate(ray_yasha_request_duration_seconds_bucket[5m])) by (le, endpoint))", "legendFormat": "p95 {{ endpoint }}" }, { - "expr": "histogram_quantile(0.99, sum(rate(yasha:request_duration_seconds_bucket[5m])) by (le, endpoint))", + "expr": "histogram_quantile(0.99, sum(rate(ray_yasha_request_duration_seconds_bucket[5m])) by (le, endpoint))", "legendFormat": "p99 {{ endpoint }}" } ], @@ -136,7 +136,7 @@ "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "targets": [ { - "expr": "histogram_quantile(0.95, sum(rate(yasha:request_duration_seconds_bucket[5m])) by (le, model))", + "expr": "histogram_quantile(0.95, sum(rate(ray_yasha_request_duration_seconds_bucket[5m])) by (le, model))", "legendFormat": "p95 {{ model }}" } ], @@ -152,23 +152,23 @@ "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "targets": [ { - "expr": "histogram_quantile(0.95, sum(rate(yasha:generation_duration_seconds_bucket[5m])) by (le, model))", + "expr": "histogram_quantile(0.95, sum(rate(ray_yasha_generation_duration_seconds_bucket[5m])) by (le, model))", "legendFormat": "generate {{ model }}" }, { - "expr": "histogram_quantile(0.95, sum(rate(yasha:tts_generation_duration_seconds_bucket[5m])) by (le, model))", + "expr": "histogram_quantile(0.95, sum(rate(ray_yasha_tts_generation_duration_seconds_bucket[5m])) by (le, model))", "legendFormat": "tts {{ model }}" }, { - "expr": "histogram_quantile(0.95, sum(rate(yasha:image_generation_duration_seconds_bucket[5m])) by (le, model))", + "expr": "histogram_quantile(0.95, sum(rate(ray_yasha_image_generation_duration_seconds_bucket[5m])) by (le, model))", "legendFormat": "image {{ model }}" }, { - "expr": "histogram_quantile(0.95, sum(rate(yasha:transcription_duration_seconds_bucket[5m])) by (le, model))", + "expr": 
"histogram_quantile(0.95, sum(rate(ray_yasha_transcription_duration_seconds_bucket[5m])) by (le, model))", "legendFormat": "transcription {{ model }}" }, { - "expr": "histogram_quantile(0.95, sum(rate(yasha:embedding_duration_seconds_bucket[5m])) by (le, model))", + "expr": "histogram_quantile(0.95, sum(rate(ray_yasha_embedding_duration_seconds_bucket[5m])) by (le, model))", "legendFormat": "embedding {{ model }}" } ], @@ -191,7 +191,7 @@ "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "targets": [ { - "expr": "vllm:kv_cache_usage_perc", + "expr": "ray_vllm_kv_cache_usage_perc", "legendFormat": "{{ model_name }}" } ], @@ -218,15 +218,15 @@ "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "targets": [ { - "expr": "histogram_quantile(0.50, sum(rate(vllm:time_to_first_token_seconds_bucket[5m])) by (le))", + "expr": "histogram_quantile(0.50, sum(rate(ray_vllm_time_to_first_token_seconds_bucket[5m])) by (le))", "legendFormat": "p50" }, { - "expr": "histogram_quantile(0.95, sum(rate(vllm:time_to_first_token_seconds_bucket[5m])) by (le))", + "expr": "histogram_quantile(0.95, sum(rate(ray_vllm_time_to_first_token_seconds_bucket[5m])) by (le))", "legendFormat": "p95" }, { - "expr": "histogram_quantile(0.99, sum(rate(vllm:time_to_first_token_seconds_bucket[5m])) by (le))", + "expr": "histogram_quantile(0.99, sum(rate(ray_vllm_time_to_first_token_seconds_bucket[5m])) by (le))", "legendFormat": "p99" } ], @@ -242,11 +242,11 @@ "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "targets": [ { - "expr": "histogram_quantile(0.50, sum(rate(vllm:inter_token_latency_seconds_bucket[5m])) by (le))", + "expr": "histogram_quantile(0.50, sum(rate(ray_vllm_inter_token_latency_seconds_bucket[5m])) by (le))", "legendFormat": "p50" }, { - "expr": "histogram_quantile(0.95, sum(rate(vllm:inter_token_latency_seconds_bucket[5m])) by (le))", + "expr": "histogram_quantile(0.95, sum(rate(ray_vllm_inter_token_latency_seconds_bucket[5m])) by 
(le))", "legendFormat": "p95" } ], @@ -262,11 +262,11 @@ "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "targets": [ { - "expr": "sum(rate(vllm:prompt_tokens_total[5m]))", + "expr": "sum(rate(ray_vllm_prompt_tokens_total[5m]))", "legendFormat": "prefill tok/s" }, { - "expr": "sum(rate(vllm:generation_tokens_total[5m]))", + "expr": "sum(rate(ray_vllm_generation_tokens_total[5m]))", "legendFormat": "decode tok/s" } ], @@ -282,11 +282,11 @@ "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "targets": [ { - "expr": "vllm:num_requests_running", + "expr": "ray_vllm_num_requests_running", "legendFormat": "running" }, { - "expr": "vllm:num_requests_waiting", + "expr": "ray_vllm_num_requests_waiting", "legendFormat": "waiting" } ] @@ -299,7 +299,7 @@ "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "targets": [ { - "expr": "sum(rate(vllm:num_preemptions_total[5m]))", + "expr": "sum(rate(ray_vllm_num_preemptions_total[5m]))", "legendFormat": "preemptions/s" } ] @@ -312,7 +312,7 @@ "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "targets": [ { - "expr": "sum(rate(vllm:prefix_cache_hits_total[5m])) / clamp_min(sum(rate(vllm:prefix_cache_queries_total[5m])), 1)", + "expr": "sum(rate(ray_vllm_prefix_cache_hits_total[5m])) / clamp_min(sum(rate(ray_vllm_prefix_cache_queries_total[5m])), 1)", "legendFormat": "hit rate" } ], @@ -328,7 +328,7 @@ "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "targets": [ { - "expr": "histogram_quantile(0.95, sum(rate(vllm:request_queue_time_seconds_bucket[5m])) by (le))", + "expr": "histogram_quantile(0.95, sum(rate(ray_vllm_request_queue_time_seconds_bucket[5m])) by (le))", "legendFormat": "p95" } ], @@ -344,7 +344,7 @@ "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "targets": [ { - "expr": "histogram_quantile(0.95, sum(rate(vllm:e2e_request_latency_seconds_bucket[5m])) by (le))", + "expr": "histogram_quantile(0.95, 
sum(rate(ray_vllm_e2e_request_latency_seconds_bucket[5m])) by (le))", "legendFormat": "p95" } ], @@ -435,30 +435,36 @@ "type": "row" }, { - "title": "Replica Health", - "type": "stat", + "title": "Health Check Latency P95", + "type": "timeseries", "gridPos": { "h": 8, "w": 6, "x": 0, "y": 45 }, "id": 40, "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "targets": [ { - "expr": "serve_deployment_replica_health_check", - "legendFormat": "{{ deployment }}" + "expr": "histogram_quantile(0.95, sum(rate(ray_serve_health_check_latency_ms_bucket[5m])) by (le, deployment))", + "legendFormat": "p95 {{ deployment }}" } - ] + ], + "fieldConfig": { + "defaults": { "unit": "ms" } + } }, { - "title": "Replica Processing Queries", + "title": "Request Count by Deployment", "type": "timeseries", "gridPos": { "h": 8, "w": 6, "x": 6, "y": 45 }, "id": 41, "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "targets": [ { - "expr": "serve_replica_processing_queries", + "expr": "sum(rate(ray_serve_handle_request_counter_total[5m])) by (deployment)", "legendFormat": "{{ deployment }}" } - ] + ], + "fieldConfig": { + "defaults": { "unit": "reqps" } + } }, { "title": "Deployment Processing Latency P95", @@ -468,7 +474,7 @@ "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "targets": [ { - "expr": "histogram_quantile(0.95, sum(rate(serve_deployment_processing_latency_ms_bucket[5m])) by (le, deployment))", + "expr": "histogram_quantile(0.95, sum(rate(ray_serve_deployment_processing_latency_ms_bucket[5m])) by (le, deployment))", "legendFormat": "p95 {{ deployment }}" } ], @@ -477,17 +483,20 @@ } }, { - "title": "Health Check Failures", + "title": "HTTP Request Latency P95", "type": "timeseries", "gridPos": { "h": 8, "w": 6, "x": 18, "y": 45 }, "id": 43, "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "targets": [ { - "expr": "sum(rate(serve_deployment_health_check_failures_total[5m])) by (deployment)", - "legendFormat": "{{ 
deployment }}" + "expr": "histogram_quantile(0.95, sum(rate(ray_serve_http_request_latency_ms_bucket[5m])) by (le))", + "legendFormat": "p95" } - ] + ], + "fieldConfig": { + "defaults": { "unit": "ms" } + } }, { "collapsed": false, @@ -504,8 +513,8 @@ "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "targets": [ { - "expr": "histogram_quantile(0.95, sum(rate(yasha:model_load_duration_seconds_bucket[5m])) by (le, model))", - "legendFormat": "p95 {{ model }}" + "expr": "ray_yasha_model_load_duration_seconds_sum / clamp_min(ray_yasha_model_load_duration_seconds_count, 1)", + "legendFormat": "avg {{ model }}" } ], "fieldConfig": { @@ -520,7 +529,7 @@ "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "targets": [ { - "expr": "sum(yasha:model_load_failures_total) by (model)", + "expr": "sum(ray_yasha_model_load_failures_total) by (model)", "legendFormat": "{{ model }}" } ], @@ -543,7 +552,7 @@ "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "targets": [ { - "expr": "sum(yasha:resource_cleanup_errors_total) by (model, component)", + "expr": "sum(ray_yasha_resource_cleanup_errors_total) by (model, component)", "legendFormat": "{{ model }} ({{ component }})" } ], @@ -566,7 +575,7 @@ "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "targets": [ { - "expr": "sum(rate(yasha:stream_chunks_total[5m])) by (model)", + "expr": "sum(rate(ray_yasha_stream_chunks_total[5m])) by (model)", "legendFormat": "{{ model }}" } ], diff --git a/docs/monitoring.md b/docs/monitoring.md index a4535a9..90e4f63 100644 --- a/docs/monitoring.md +++ b/docs/monitoring.md @@ -7,15 +7,17 @@ Yasha exposes Prometheus metrics through a single port via Ray's metrics agent. 
 ```
 Prometheus ──scrape──> Ray Metrics Agent (:8079)
                |
-               |-- ray_*        Ray cluster: GPU, CPU, memory, actors
-               |-- serve_*      Ray Serve: HTTP requests, latency, replicas
-               |-- vllm:*       vLLM engine: KV cache, TTFT, tokens, queue
-               |-- yasha:*      Custom: per-model latency, errors, load time
+               |-- ray_node_*   Ray cluster: GPU, CPU, memory
+               |-- ray_serve_*  Ray Serve: HTTP requests, latency, replicas
+               |-- ray_vllm_*   vLLM engine: KV cache, TTFT, tokens, queue
+               |-- ray_yasha_*  Custom: per-model latency, errors, load time
 ```
 
+> **Note:** All metrics exported through Ray's metrics agent are prefixed with `ray_`, and any `:` in a metric name is sanitized to `_`. This applies to both vLLM and Yasha metrics: the vLLM-native `vllm:kv_cache_usage_perc` becomes `ray_vllm_kv_cache_usage_perc`, and `yasha:request_total` becomes `ray_yasha_request_total`.
+
 ## Enabling Metrics
 
-Metrics are disabled by default. Set `YASHA_METRICS=true` to enable:
+Metrics are enabled by default. Set `YASHA_METRICS=false` to disable:
 
 ```bash
 docker run --rm --shm-size=8g --gpus all \
@@ -28,10 +30,10 @@ docker run --rm --shm-size=8g --gpus all \
 
 | Env Var | Default | Description |
 |---|---|---|
-| `YASHA_METRICS` | `false` | Master toggle. Enables all metrics and the Ray metrics export port. |
+| `YASHA_METRICS` | `true` | Master toggle. Enables all metrics and the Ray metrics export port. |
 | `RAY_METRICS_EXPORT_PORT` | `8079` | Port for the Ray metrics agent (only active when `YASHA_METRICS=true`). |
 
-When `YASHA_METRICS=false`, no metrics are collected and port 8079 is not exposed. Zero overhead.
+Set `YASHA_METRICS=false` to disable all metrics collection. When disabled, port 8079 is not exposed and there is zero overhead.
## Connecting to Prometheus @@ -68,12 +70,12 @@ The dashboard has 6 rows: | Row | What it shows | Metric sources | |---|---|---| -| **Overview** | Request rate, error rate, in-flight requests, models loaded, client disconnects | `yasha:*` | -| **Latency** | Gateway P50/P95/P99, per-model latency, per-usecase latency (generate, TTS, image, STT, embed) | `yasha:*` | -| **vLLM Engine** | KV cache usage, TTFT, inter-token latency, token throughput, queue depth, preemptions, prefix cache hit rate | `vllm:*` | +| **Overview** | Request rate, error rate, in-flight requests, models loaded, client disconnects | `ray_yasha_*` | +| **Latency** | Gateway P50/P95/P99, per-model latency, per-usecase latency (generate, TTS, image, STT, embed) | `ray_yasha_*` | +| **vLLM Engine** | KV cache usage, TTFT, inter-token latency, token throughput, queue depth, preemptions, prefix cache hit rate | `ray_vllm_*` | | **GPU & System** | GPU utilization, GPU memory, CPU, system memory | `ray_node_*` | -| **Ray Serve** | Replica health, processing queries, deployment latency, health check failures | `serve_*` | -| **Operational** | Model load time, load failures, resource cleanup errors, streaming chunks/s | `yasha:*` | +| **Ray Serve** | Health check latency, request count, deployment processing latency, HTTP request latency | `ray_serve_*` | +| **Operational** | Model load time, load failures, resource cleanup errors, streaming chunks/s | `ray_yasha_*` | ## Health Check @@ -86,70 +88,70 @@ curl http://localhost:8000/health ## Yasha Metrics Reference -All custom metrics use the `yasha:` prefix and are exported via `ray.serve.metrics`. +All custom metrics are defined via `ray.serve.metrics` and exported through Ray's metrics agent with a `ray_` prefix. 
### Gateway | Metric | Type | Tags | Description | |---|---|---|---| -| `yasha:request_total` | Counter | `model`, `endpoint`, `status` | Total requests by model and API method | -| `yasha:request_duration_seconds` | Histogram | `model`, `endpoint` | End-to-end request latency | -| `yasha:request_errors_total` | Counter | `model`, `endpoint`, `error_type` | Errors: `inference_error`, `stream_error`, `unhandled` | -| `yasha:request_in_progress` | Gauge | `model`, `endpoint` | Currently processing requests | -| `yasha:client_disconnects_total` | Counter | `model`, `endpoint` | Client disconnected before response completed | -| `yasha:stream_chunks_total` | Counter | `model` | Streaming chunks emitted | +| `ray_yasha_request_total` | Counter | `model`, `endpoint`, `status` | Total requests by model and API method | +| `ray_yasha_request_duration_seconds` | Histogram | `model`, `endpoint` | End-to-end request latency | +| `ray_yasha_request_errors_total` | Counter | `model`, `endpoint`, `error_type` | Errors: `inference_error`, `stream_error`, `unhandled` | +| `ray_yasha_request_in_progress` | Gauge | `model`, `endpoint` | Currently processing requests | +| `ray_yasha_client_disconnects_total` | Counter | `model`, `endpoint` | Client disconnected before response completed | +| `ray_yasha_stream_chunks_total` | Counter | `model` | Streaming chunks emitted | ### Model Deployment | Metric | Type | Tags | Description | |---|---|---|---| -| `yasha:model_load_duration_seconds` | Histogram | `model`, `loader` | Time to initialize a model | -| `yasha:model_load_failures_total` | Counter | `model`, `loader` | Failed model initializations | -| `yasha:models_loaded` | Gauge | | Number of loaded and ready models | +| `ray_yasha_model_load_duration_seconds` | Histogram | `model`, `loader` | Time to initialize a model | +| `ray_yasha_model_load_failures_total` | Counter | `model`, `loader` | Failed model initializations | +| `ray_yasha_models_loaded` | Gauge | | Number of loaded and 
ready models | ### Inference Timing | Metric | Type | Tags | Description | |---|---|---|---| -| `yasha:generation_duration_seconds` | Histogram | `model` | Chat/text generation latency | -| `yasha:tts_generation_duration_seconds` | Histogram | `model` | Text-to-speech latency | -| `yasha:image_generation_duration_seconds` | Histogram | `model` | Image generation latency | -| `yasha:transcription_duration_seconds` | Histogram | `model` | Speech-to-text latency | -| `yasha:embedding_duration_seconds` | Histogram | `model` | Embedding latency | +| `ray_yasha_generation_duration_seconds` | Histogram | `model` | Chat/text generation latency | +| `ray_yasha_tts_generation_duration_seconds` | Histogram | `model` | Text-to-speech latency | +| `ray_yasha_image_generation_duration_seconds` | Histogram | `model` | Image generation latency | +| `ray_yasha_transcription_duration_seconds` | Histogram | `model` | Speech-to-text latency | +| `ray_yasha_embedding_duration_seconds` | Histogram | `model` | Embedding latency | ### Resource Cleanup | Metric | Type | Tags | Description | |---|---|---|---| -| `yasha:resource_cleanup_errors_total` | Counter | `model`, `component` | Errors during engine/model cleanup | +| `ray_yasha_resource_cleanup_errors_total` | Counter | `model`, `component` | Errors during engine/model cleanup | ## Built-in Metrics from vLLM and Ray These are automatically available when `YASHA_METRICS=true` — no additional configuration needed. -### vLLM (`vllm:*`) +### vLLM (`ray_vllm_*`) -Key metrics for LLM inference monitoring: +vLLM metrics are routed through Ray's metrics agent via `RayPrometheusStatLogger`. The native `vllm:` prefix is sanitized to `ray_vllm_`. 
-- `vllm:num_requests_running` / `vllm:num_requests_waiting` — queue depth -- `vllm:kv_cache_usage_perc` — KV cache utilization (0-1) -- `vllm:time_to_first_token_seconds` — TTFT histogram -- `vllm:inter_token_latency_seconds` — ITL histogram -- `vllm:e2e_request_latency_seconds` — end-to-end latency histogram -- `vllm:request_queue_time_seconds` — time spent waiting in queue -- `vllm:prompt_tokens` / `vllm:generation_tokens` — token throughput counters -- `vllm:num_preemptions` — memory pressure signal -- `vllm:prefix_cache_hits` / `vllm:prefix_cache_queries` — cache efficiency +- `ray_vllm_num_requests_running` / `ray_vllm_num_requests_waiting` — queue depth +- `ray_vllm_kv_cache_usage_perc` — KV cache utilization (0-1) +- `ray_vllm_time_to_first_token_seconds` — TTFT histogram +- `ray_vllm_inter_token_latency_seconds` — ITL histogram +- `ray_vllm_e2e_request_latency_seconds` — end-to-end latency histogram +- `ray_vllm_request_queue_time_seconds` — time spent waiting in queue +- `ray_vllm_prompt_tokens_total` / `ray_vllm_generation_tokens_total` — token throughput counters +- `ray_vllm_num_preemptions_total` — memory pressure signal +- `ray_vllm_prefix_cache_hits_total` / `ray_vllm_prefix_cache_queries_total` — cache efficiency Full reference: [vLLM Metrics Documentation](https://docs.vllm.ai/en/stable/design/metrics/) -### Ray Serve (`serve_*`) +### Ray Serve (`ray_serve_*`) -- `serve_num_http_requests` — request count by route, method, status -- `serve_http_request_latency_ms` — request latency histogram -- `serve_num_ongoing_http_requests` — in-flight requests -- `serve_deployment_processing_latency_ms` — per-replica processing time -- `serve_deployment_replica_health_check` — replica health status +- `ray_serve_num_http_requests_total` — request count by route, method, status +- `ray_serve_http_request_latency_ms` — request latency histogram +- `ray_serve_handle_request_counter_total` — request count by deployment +- 
`ray_serve_deployment_processing_latency_ms` — per-replica processing time +- `ray_serve_health_check_latency_ms` — health check latency histogram Full reference: [Ray Serve Monitoring](https://docs.ray.io/en/latest/serve/monitoring.html) diff --git a/yasha/infer/vllm/vllm_infer.py b/yasha/infer/vllm/vllm_infer.py index b4fd9dc..3278c3e 100644 --- a/yasha/infer/vllm/vllm_infer.py +++ b/yasha/infer/vllm/vllm_infer.py @@ -19,6 +19,7 @@ from yasha.infer.infer_config import DisconnectProxy, ModelUsecase, VllmEngineConfig, YashaModelConfig from yasha.infer.vllm.openai.serving_speech import OpenAIServingSpeech +from yasha.metrics import _ENABLED as _METRICS_ENABLED from yasha.openai.protocol import ( ChatCompletionRequest, ChatCompletionResponse, @@ -103,11 +104,18 @@ def __init__(self, model_config: YashaModelConfig): # GPU pinning is handled by CUDA_VISIBLE_DEVICES set in ray_actor_options runtime_env. # The GPU is always visible as cuda:0 inside the actor — no device_config override needed. 
+ stat_loggers: list | None = None + if _METRICS_ENABLED: + from vllm.v1.metrics.ray_wrappers import RayPrometheusStatLogger + + stat_loggers = [RayPrometheusStatLogger] + self.engine = AsyncLLM.from_vllm_config( vllm_config=vllm_config, usage_context=usage_context, enable_log_requests=engine_args.enable_log_requests, disable_log_stats=engine_args.disable_log_stats, + stat_loggers=stat_loggers, ) def __del__(self): diff --git a/yasha/metrics.py b/yasha/metrics.py index 04dbe11..7ae3a7f 100644 --- a/yasha/metrics.py +++ b/yasha/metrics.py @@ -8,7 +8,7 @@ import os -_ENABLED = os.environ.get("YASHA_METRICS", "false").lower() == "true" +_ENABLED = os.environ.get("YASHA_METRICS", "true").lower() == "true" # --------------------------------------------------------------------------- # No-op metric stubs (used when metrics are disabled) @@ -80,86 +80,86 @@ def _build_metrics(): return { # -- Gateway layer -- "request_total": Counter( - "yasha:request_total", + "yasha_request_total", description="Total inference requests by model and endpoint.", tag_keys=("model", "endpoint", "status"), # type: ignore[arg-type] ), "request_duration_seconds": Histogram( - "yasha:request_duration_seconds", + "yasha_request_duration_seconds", description="End-to-end request latency (gateway to response) in seconds.", boundaries=_REQUEST_LATENCY_BOUNDARIES, tag_keys=("model", "endpoint"), # type: ignore[arg-type] ), "request_errors_total": Counter( - "yasha:request_errors_total", + "yasha_request_errors_total", description="Total inference errors by model, endpoint, and error type.", tag_keys=("model", "endpoint", "error_type"), # type: ignore[arg-type] ), "request_in_progress": Gauge( - "yasha:request_in_progress", + "yasha_request_in_progress", description="Number of requests currently being processed per model.", tag_keys=("model", "endpoint"), # type: ignore[arg-type] ), "client_disconnects_total": Counter( - "yasha:client_disconnects_total", + "yasha_client_disconnects_total", 
description="Total client disconnects during inference.", tag_keys=("model", "endpoint"), # type: ignore[arg-type] ), "stream_chunks_total": Counter( - "yasha:stream_chunks_total", + "yasha_stream_chunks_total", description="Total streaming chunks emitted.", tag_keys=("model",), ), # -- Model deployment layer -- "model_load_duration_seconds": Histogram( - "yasha:model_load_duration_seconds", + "yasha_model_load_duration_seconds", description="Model initialization time in seconds.", boundaries=_MODEL_LOAD_BOUNDARIES, tag_keys=("model", "loader"), # type: ignore[arg-type] ), "model_load_failures_total": Counter( - "yasha:model_load_failures_total", + "yasha_model_load_failures_total", description="Total failed model deployments.", tag_keys=("model", "loader"), # type: ignore[arg-type] ), "models_loaded": Gauge( - "yasha:models_loaded", + "yasha_models_loaded", description="Number of models currently loaded.", ), # -- Inference timing -- "generation_duration_seconds": Histogram( - "yasha:generation_duration_seconds", + "yasha_generation_duration_seconds", description="Chat/text generation latency in seconds.", boundaries=_REQUEST_LATENCY_BOUNDARIES, tag_keys=("model",), ), "tts_generation_duration_seconds": Histogram( - "yasha:tts_generation_duration_seconds", + "yasha_tts_generation_duration_seconds", description="TTS inference latency in seconds.", boundaries=_REQUEST_LATENCY_BOUNDARIES, tag_keys=("model",), ), "image_generation_duration_seconds": Histogram( - "yasha:image_generation_duration_seconds", + "yasha_image_generation_duration_seconds", description="Image generation latency in seconds.", boundaries=_REQUEST_LATENCY_BOUNDARIES, tag_keys=("model",), ), "transcription_duration_seconds": Histogram( - "yasha:transcription_duration_seconds", + "yasha_transcription_duration_seconds", description="Speech-to-text latency in seconds.", boundaries=_REQUEST_LATENCY_BOUNDARIES, tag_keys=("model",), ), "embedding_duration_seconds": Histogram( - 
"yasha:embedding_duration_seconds", + "yasha_embedding_duration_seconds", description="Embedding inference latency in seconds.", boundaries=_REQUEST_LATENCY_BOUNDARIES, tag_keys=("model",), ), # -- Resource cleanup -- "resource_cleanup_errors_total": Counter( - "yasha:resource_cleanup_errors_total", + "yasha_resource_cleanup_errors_total", description="Errors during resource cleanup (engine shutdown, memory release).", tag_keys=("model", "component"), # type: ignore[arg-type] ),