From 5f87889ae31eb3e7d33984cd39deefa28a156725 Mon Sep 17 00:00:00 2001 From: Alex M Date: Mon, 6 Apr 2026 19:17:18 +0000 Subject: [PATCH 1/2] feat: add Prometheus metrics and Grafana dashboard for observability Add yasha:* custom metrics (request latency, errors, model load time, per-usecase timing, client disconnects, cleanup errors) via ray.serve.metrics, gated behind YASHA_METRICS env var with zero-overhead no-op stubs when disabled. Include pre-built Grafana dashboard, /health endpoint, and documentation. --- Dockerfile.dev | 8 +- Dockerfile.prod | 8 +- README.md | 5 + docs/grafana-dashboard.json | 592 ++++++++++++++++++ docs/monitoring.md | 163 +++++ yasha/infer/diffusers/diffusers_infer.py | 4 +- yasha/infer/infer_config.py | 7 +- yasha/infer/model_deployment.py | 49 +- .../infer/transformers/transformers_infer.py | 4 +- yasha/infer/vllm/vllm_infer.py | 4 +- yasha/metrics.py | 192 ++++++ yasha/openai/api.py | 126 ++-- 12 files changed, 1105 insertions(+), 57 deletions(-) create mode 100644 docs/grafana-dashboard.json create mode 100644 docs/monitoring.md create mode 100644 yasha/metrics.py diff --git a/Dockerfile.dev b/Dockerfile.dev index ce983e9..160ba90 100644 --- a/Dockerfile.dev +++ b/Dockerfile.dev @@ -54,6 +54,8 @@ ENV RAY_CLUSTER_ADDRESS=0.0.0.0 ENV RAY_HEAD_CPU_NUM=2 ENV RAY_HEAD_GPU_NUM=1 ENV YASHA_USE_EXISTING_RAY_CLUSTER=false +ENV YASHA_METRICS=false +ENV RAY_METRICS_EXPORT_PORT=8079 RUN uv venv ARG PYTHON_VERSION @@ -78,7 +80,11 @@ uv sync --project /yasha --locked \$EXTRAS cd /yasha && uv run pre-commit install if [ "\${YASHA_USE_EXISTING_RAY_CLUSTER}" != "true" ]; then - cd /yasha && ray start --head --port=\${RAY_REDIS_PORT} --dashboard-host=0.0.0.0 --num-cpus=\${RAY_HEAD_CPU_NUM} --num-gpus=\${RAY_HEAD_GPU_NUM} --disable-usage-stats + METRICS_FLAG="" + if [ "\${YASHA_METRICS}" = "true" ]; then + METRICS_FLAG="--metrics-export-port=\${RAY_METRICS_EXPORT_PORT}" + fi + cd /yasha && ray start --head --port=\${RAY_REDIS_PORT} 
--dashboard-host=0.0.0.0 --num-cpus=\${RAY_HEAD_CPU_NUM} --num-gpus=\${RAY_HEAD_GPU_NUM} --disable-usage-stats \${METRICS_FLAG} if ! ray status --address=\${RAY_CLUSTER_ADDRESS}:\${RAY_REDIS_PORT}; then echo "ray cluster failed to start" exit 1 diff --git a/Dockerfile.prod b/Dockerfile.prod index 061c7b5..d6b51f5 100644 --- a/Dockerfile.prod +++ b/Dockerfile.prod @@ -54,6 +54,8 @@ ENV RAY_CLUSTER_ADDRESS=0.0.0.0 ENV RAY_HEAD_CPU_NUM=2 ENV RAY_HEAD_GPU_NUM=1 ENV YASHA_USE_EXISTING_RAY_CLUSTER=false +ENV YASHA_METRICS=false +ENV RAY_METRICS_EXPORT_PORT=8079 RUN uv venv ARG PYTHON_VERSION @@ -77,7 +79,11 @@ fi uv sync --project /yasha --locked \$EXTRAS if [ "\${YASHA_USE_EXISTING_RAY_CLUSTER}" != "true" ]; then - cd /yasha && ray start --head --port=\${RAY_REDIS_PORT} --dashboard-host=0.0.0.0 --num-cpus=\${RAY_HEAD_CPU_NUM} --num-gpus=\${RAY_HEAD_GPU_NUM} --disable-usage-stats + METRICS_FLAG="" + if [ "\${YASHA_METRICS}" = "true" ]; then + METRICS_FLAG="--metrics-export-port=\${RAY_METRICS_EXPORT_PORT}" + fi + cd /yasha && ray start --head --port=\${RAY_REDIS_PORT} --dashboard-host=0.0.0.0 --num-cpus=\${RAY_HEAD_CPU_NUM} --num-gpus=\${RAY_HEAD_GPU_NUM} --disable-usage-stats \${METRICS_FLAG} if ! 
ray status --address=\${RAY_CLUSTER_ADDRESS}:\${RAY_REDIS_PORT}; then echo "ray cluster failed to start" exit 1 diff --git a/README.md b/README.md index 1c712e4..ebaf649 100644 --- a/README.md +++ b/README.md @@ -125,6 +125,11 @@ For a full guide on writing your own plugin, see [Plugin Development](docs/plugi - [Architecture](docs/architecture.md) — system design, request lifecycle, plugin loading - [Plugin Development](docs/plugins.md) — writing custom TTS backends - [Home Assistant Integration](docs/home-assistant.md) — Wyoming protocol setup for voice automation +- [Monitoring](docs/monitoring.md) — Prometheus metrics, Grafana dashboard, health checks + +## Monitoring + +Yasha exposes Prometheus metrics (Ray cluster, Ray Serve, vLLM, and custom `yasha:*` metrics) through a single port. Enable with `YASHA_METRICS=true` and scrape port 8079. A pre-built Grafana dashboard is included. See [Monitoring](docs/monitoring.md) for setup details. ## Future Work diff --git a/docs/grafana-dashboard.json b/docs/grafana-dashboard.json new file mode 100644 index 0000000..eb3cfef --- /dev/null +++ b/docs/grafana-dashboard.json @@ -0,0 +1,592 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "description": "Prometheus datasource for Yasha metrics", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "annotations": { + "list": [] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "links": [], + "panels": [ + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, + "id": 100, + "title": "Overview", + "type": "row" + }, + { + "title": "Request Rate", + "type": "timeseries", + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 1 }, + "id": 1, + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "targets": [ + { + "expr": "sum(rate(yasha:request_total[5m])) by (endpoint)", + "legendFormat": "{{ endpoint }}" + } + ], + "fieldConfig": { + "defaults": { "unit": 
"reqps" } + } + }, + { + "title": "Error Rate", + "type": "timeseries", + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 1 }, + "id": 2, + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "targets": [ + { + "expr": "sum(rate(yasha:request_errors_total[5m])) by (model, error_type)", + "legendFormat": "{{ model }} - {{ error_type }}" + } + ], + "fieldConfig": { + "defaults": { "unit": "reqps" } + } + }, + { + "title": "In-Flight Requests", + "type": "timeseries", + "gridPos": { "h": 8, "w": 4, "x": 16, "y": 1 }, + "id": 3, + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "targets": [ + { + "expr": "yasha:request_in_progress", + "legendFormat": "{{ model }}" + } + ] + }, + { + "title": "Models Loaded", + "type": "stat", + "gridPos": { "h": 4, "w": 4, "x": 20, "y": 1 }, + "id": 4, + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "targets": [ + { + "expr": "yasha:models_loaded" + } + ] + }, + { + "title": "Client Disconnects", + "type": "timeseries", + "gridPos": { "h": 4, "w": 4, "x": 20, "y": 5 }, + "id": 5, + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "targets": [ + { + "expr": "sum(rate(yasha:client_disconnects_total[5m])) by (model)", + "legendFormat": "{{ model }}" + } + ], + "fieldConfig": { + "defaults": { "unit": "reqps" } + } + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 9 }, + "id": 101, + "title": "Latency", + "type": "row" + }, + { + "title": "Request Latency P50 / P95 / P99", + "type": "timeseries", + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 10 }, + "id": 10, + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "targets": [ + { + "expr": "histogram_quantile(0.50, sum(rate(yasha:request_duration_seconds_bucket[5m])) by (le, endpoint))", + "legendFormat": "p50 {{ endpoint }}" + }, + { + "expr": "histogram_quantile(0.95, sum(rate(yasha:request_duration_seconds_bucket[5m])) by (le, endpoint))", + "legendFormat": "p95 {{ endpoint 
}}" + }, + { + "expr": "histogram_quantile(0.99, sum(rate(yasha:request_duration_seconds_bucket[5m])) by (le, endpoint))", + "legendFormat": "p99 {{ endpoint }}" + } + ], + "fieldConfig": { + "defaults": { "unit": "s" } + } + }, + { + "title": "Per-Model Latency P95", + "type": "timeseries", + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 10 }, + "id": 11, + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "targets": [ + { + "expr": "histogram_quantile(0.95, sum(rate(yasha:request_duration_seconds_bucket[5m])) by (le, model))", + "legendFormat": "p95 {{ model }}" + } + ], + "fieldConfig": { + "defaults": { "unit": "s" } + } + }, + { + "title": "Inference Latency by Usecase P95", + "type": "timeseries", + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 10 }, + "id": 12, + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "targets": [ + { + "expr": "histogram_quantile(0.95, sum(rate(yasha:generation_duration_seconds_bucket[5m])) by (le, model))", + "legendFormat": "generate {{ model }}" + }, + { + "expr": "histogram_quantile(0.95, sum(rate(yasha:tts_generation_duration_seconds_bucket[5m])) by (le, model))", + "legendFormat": "tts {{ model }}" + }, + { + "expr": "histogram_quantile(0.95, sum(rate(yasha:image_generation_duration_seconds_bucket[5m])) by (le, model))", + "legendFormat": "image {{ model }}" + }, + { + "expr": "histogram_quantile(0.95, sum(rate(yasha:transcription_duration_seconds_bucket[5m])) by (le, model))", + "legendFormat": "transcription {{ model }}" + }, + { + "expr": "histogram_quantile(0.95, sum(rate(yasha:embedding_duration_seconds_bucket[5m])) by (le, model))", + "legendFormat": "embedding {{ model }}" + } + ], + "fieldConfig": { + "defaults": { "unit": "s" } + } + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 18 }, + "id": 102, + "title": "vLLM Engine", + "type": "row" + }, + { + "title": "KV Cache Usage", + "type": "gauge", + "gridPos": { "h": 8, "w": 4, "x": 0, "y": 19 }, + "id": 20, + 
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "targets": [ + { + "expr": "vllm:kv_cache_usage_perc", + "legendFormat": "{{ model_name }}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percentunit", + "min": 0, + "max": 1, + "thresholds": { + "steps": [ + { "value": 0, "color": "green" }, + { "value": 0.8, "color": "yellow" }, + { "value": 0.95, "color": "red" } + ] + } + } + } + }, + { + "title": "Time to First Token (TTFT)", + "type": "timeseries", + "gridPos": { "h": 8, "w": 5, "x": 4, "y": 19 }, + "id": 21, + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "targets": [ + { + "expr": "histogram_quantile(0.50, sum(rate(vllm:time_to_first_token_seconds_bucket[5m])) by (le))", + "legendFormat": "p50" + }, + { + "expr": "histogram_quantile(0.95, sum(rate(vllm:time_to_first_token_seconds_bucket[5m])) by (le))", + "legendFormat": "p95" + }, + { + "expr": "histogram_quantile(0.99, sum(rate(vllm:time_to_first_token_seconds_bucket[5m])) by (le))", + "legendFormat": "p99" + } + ], + "fieldConfig": { + "defaults": { "unit": "s" } + } + }, + { + "title": "Inter-Token Latency (ITL)", + "type": "timeseries", + "gridPos": { "h": 8, "w": 5, "x": 9, "y": 19 }, + "id": 22, + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "targets": [ + { + "expr": "histogram_quantile(0.50, sum(rate(vllm:inter_token_latency_seconds_bucket[5m])) by (le))", + "legendFormat": "p50" + }, + { + "expr": "histogram_quantile(0.95, sum(rate(vllm:inter_token_latency_seconds_bucket[5m])) by (le))", + "legendFormat": "p95" + } + ], + "fieldConfig": { + "defaults": { "unit": "s" } + } + }, + { + "title": "Token Throughput", + "type": "timeseries", + "gridPos": { "h": 8, "w": 5, "x": 14, "y": 19 }, + "id": 23, + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "targets": [ + { + "expr": "sum(rate(vllm:prompt_tokens_total[5m]))", + "legendFormat": "prefill tok/s" + }, + { + "expr": 
"sum(rate(vllm:generation_tokens_total[5m]))", + "legendFormat": "decode tok/s" + } + ], + "fieldConfig": { + "defaults": { "unit": "tok/s" } + } + }, + { + "title": "Queue Depth", + "type": "timeseries", + "gridPos": { "h": 8, "w": 5, "x": 19, "y": 19 }, + "id": 24, + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "targets": [ + { + "expr": "vllm:num_requests_running", + "legendFormat": "running" + }, + { + "expr": "vllm:num_requests_waiting", + "legendFormat": "waiting" + } + ] + }, + { + "title": "Preemptions", + "type": "timeseries", + "gridPos": { "h": 8, "w": 6, "x": 0, "y": 27 }, + "id": 25, + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "targets": [ + { + "expr": "sum(rate(vllm:num_preemptions_total[5m]))", + "legendFormat": "preemptions/s" + } + ] + }, + { + "title": "Prefix Cache Hit Rate", + "type": "timeseries", + "gridPos": { "h": 8, "w": 6, "x": 6, "y": 27 }, + "id": 26, + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "targets": [ + { + "expr": "sum(rate(vllm:prefix_cache_hits_total[5m])) / clamp_min(sum(rate(vllm:prefix_cache_queries_total[5m])), 1)", + "legendFormat": "hit rate" + } + ], + "fieldConfig": { + "defaults": { "unit": "percentunit" } + } + }, + { + "title": "Queue Wait Time P95", + "type": "timeseries", + "gridPos": { "h": 8, "w": 6, "x": 12, "y": 27 }, + "id": 27, + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "targets": [ + { + "expr": "histogram_quantile(0.95, sum(rate(vllm:request_queue_time_seconds_bucket[5m])) by (le))", + "legendFormat": "p95" + } + ], + "fieldConfig": { + "defaults": { "unit": "s" } + } + }, + { + "title": "E2E Request Latency P95", + "type": "timeseries", + "gridPos": { "h": 8, "w": 6, "x": 18, "y": 27 }, + "id": 28, + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "targets": [ + { + "expr": "histogram_quantile(0.95, sum(rate(vllm:e2e_request_latency_seconds_bucket[5m])) by (le))", + "legendFormat": 
"p95" + } + ], + "fieldConfig": { + "defaults": { "unit": "s" } + } + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 35 }, + "id": 103, + "title": "GPU & System Resources", + "type": "row" + }, + { + "title": "GPU Utilization", + "type": "timeseries", + "gridPos": { "h": 8, "w": 6, "x": 0, "y": 36 }, + "id": 30, + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "targets": [ + { + "expr": "ray_node_gpus_utilization", + "legendFormat": "GPU {{ GpuIndex }} ({{ GpuDeviceName }})" + } + ], + "fieldConfig": { + "defaults": { "unit": "percent", "min": 0, "max": 100 } + } + }, + { + "title": "GPU Memory", + "type": "timeseries", + "gridPos": { "h": 8, "w": 6, "x": 6, "y": 36 }, + "id": 31, + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "targets": [ + { + "expr": "ray_node_gram_used", + "legendFormat": "used GPU {{ GpuIndex }}" + }, + { + "expr": "ray_node_gram_available", + "legendFormat": "available GPU {{ GpuIndex }}" + } + ], + "fieldConfig": { + "defaults": { "unit": "decgbytes" } + } + }, + { + "title": "CPU Utilization", + "type": "timeseries", + "gridPos": { "h": 8, "w": 6, "x": 12, "y": 36 }, + "id": 32, + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "targets": [ + { + "expr": "ray_node_cpu_utilization", + "legendFormat": "{{ instance }}" + } + ], + "fieldConfig": { + "defaults": { "unit": "percent", "min": 0, "max": 100 } + } + }, + { + "title": "System Memory", + "type": "timeseries", + "gridPos": { "h": 8, "w": 6, "x": 18, "y": 36 }, + "id": 33, + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "targets": [ + { + "expr": "ray_node_mem_used / ray_node_mem_total * 100", + "legendFormat": "used %" + } + ], + "fieldConfig": { + "defaults": { "unit": "percent", "min": 0, "max": 100 } + } + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 44 }, + "id": 104, + "title": "Ray Serve Internals", + "type": "row" + }, + { + "title": 
"Replica Health", + "type": "stat", + "gridPos": { "h": 8, "w": 6, "x": 0, "y": 45 }, + "id": 40, + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "targets": [ + { + "expr": "serve_deployment_replica_health_check", + "legendFormat": "{{ deployment }}" + } + ] + }, + { + "title": "Replica Processing Queries", + "type": "timeseries", + "gridPos": { "h": 8, "w": 6, "x": 6, "y": 45 }, + "id": 41, + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "targets": [ + { + "expr": "serve_replica_processing_queries", + "legendFormat": "{{ deployment }}" + } + ] + }, + { + "title": "Deployment Processing Latency P95", + "type": "timeseries", + "gridPos": { "h": 8, "w": 6, "x": 12, "y": 45 }, + "id": 42, + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "targets": [ + { + "expr": "histogram_quantile(0.95, sum(rate(serve_deployment_processing_latency_ms_bucket[5m])) by (le, deployment))", + "legendFormat": "p95 {{ deployment }}" + } + ], + "fieldConfig": { + "defaults": { "unit": "ms" } + } + }, + { + "title": "Health Check Failures", + "type": "timeseries", + "gridPos": { "h": 8, "w": 6, "x": 18, "y": 45 }, + "id": 43, + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "targets": [ + { + "expr": "sum(rate(serve_deployment_health_check_failures_total[5m])) by (deployment)", + "legendFormat": "{{ deployment }}" + } + ] + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 53 }, + "id": 105, + "title": "Operational Health", + "type": "row" + }, + { + "title": "Model Load Time", + "type": "timeseries", + "gridPos": { "h": 8, "w": 6, "x": 0, "y": 54 }, + "id": 50, + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "targets": [ + { + "expr": "histogram_quantile(0.95, sum(rate(yasha:model_load_duration_seconds_bucket[5m])) by (le, model))", + "legendFormat": "p95 {{ model }}" + } + ], + "fieldConfig": { + "defaults": { "unit": "s" } + } + }, + { + "title": "Model 
Load Failures", + "type": "stat", + "gridPos": { "h": 8, "w": 6, "x": 6, "y": 54 }, + "id": 51, + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "targets": [ + { + "expr": "sum(yasha:model_load_failures_total) by (model)", + "legendFormat": "{{ model }}" + } + ], + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { "value": 0, "color": "green" }, + { "value": 1, "color": "red" } + ] + } + } + } + }, + { + "title": "Resource Cleanup Errors", + "type": "stat", + "gridPos": { "h": 8, "w": 6, "x": 12, "y": 54 }, + "id": 52, + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "targets": [ + { + "expr": "sum(yasha:resource_cleanup_errors_total) by (model, component)", + "legendFormat": "{{ model }} ({{ component }})" + } + ], + "fieldConfig": { + "defaults": { + "thresholds": { + "steps": [ + { "value": 0, "color": "green" }, + { "value": 1, "color": "red" } + ] + } + } + } + }, + { + "title": "Streaming Chunks/s", + "type": "timeseries", + "gridPos": { "h": 8, "w": 6, "x": 18, "y": 54 }, + "id": 53, + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "targets": [ + { + "expr": "sum(rate(yasha:stream_chunks_total[5m])) by (model)", + "legendFormat": "{{ model }}" + } + ], + "fieldConfig": { + "defaults": { "unit": "cps" } + } + } + ], + "schemaVersion": 39, + "tags": ["yasha", "inference", "gpu", "llm"], + "templating": { + "list": [] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "Yasha Inference Server", + "uid": "yasha-overview", + "version": 1 +} diff --git a/docs/monitoring.md b/docs/monitoring.md new file mode 100644 index 0000000..a4535a9 --- /dev/null +++ b/docs/monitoring.md @@ -0,0 +1,163 @@ +# Monitoring + +Yasha exposes Prometheus metrics through a single port via Ray's metrics agent. When enabled, all metrics — Ray cluster, Ray Serve, vLLM engine, and custom Yasha metrics — are available on one scrape endpoint. 
+ +## Architecture + +``` +Prometheus ──scrape──> Ray Metrics Agent (:8079) + | + |-- ray_* Ray cluster: GPU, CPU, memory, actors + |-- serve_* Ray Serve: HTTP requests, latency, replicas + |-- vllm:* vLLM engine: KV cache, TTFT, tokens, queue + |-- yasha:* Custom: per-model latency, errors, load time +``` + +## Enabling Metrics + +Metrics are disabled by default. Set `YASHA_METRICS=true` to enable: + +```bash +docker run --rm --shm-size=8g --gpus all \ + -e HF_TOKEN=your_token \ + -e YASHA_METRICS=true \ + -v ./models.yaml:/yasha/config/models.yaml \ + -p 8000:8000 -p 8079:8079 -p 8265:8265 \ + ghcr.io/alez007/yasha:latest +``` + +| Env Var | Default | Description | +|---|---|---| +| `YASHA_METRICS` | `false` | Master toggle. Enables all metrics and the Ray metrics export port. | +| `RAY_METRICS_EXPORT_PORT` | `8079` | Port for the Ray metrics agent (only active when `YASHA_METRICS=true`). | + +When `YASHA_METRICS=false`, no metrics are collected and port 8079 is not exposed. Zero overhead. + +## Connecting to Prometheus + +Add Yasha as a scrape target in your `prometheus.yml`: + +```yaml +scrape_configs: + - job_name: yasha + scrape_interval: 15s + static_configs: + - targets: [":8079"] +``` + +For multi-node Ray clusters, use Ray's auto-generated service discovery file instead of static targets: + +```yaml +scrape_configs: + - job_name: yasha + file_sd_configs: + - files: ["/tmp/ray/prom_metrics_service_discovery.json"] +``` + +## Connecting to Grafana + +A pre-built Grafana dashboard is included at [`docs/grafana-dashboard.json`](grafana-dashboard.json). + +To import it: + +1. Open Grafana and go to **Dashboards > Import** +2. Upload `grafana-dashboard.json` or paste its contents +3. 
Select your Prometheus datasource when prompted + +The dashboard has 6 rows: + +| Row | What it shows | Metric sources | +|---|---|---| +| **Overview** | Request rate, error rate, in-flight requests, models loaded, client disconnects | `yasha:*` | +| **Latency** | Gateway P50/P95/P99, per-model latency, per-usecase latency (generate, TTS, image, STT, embed) | `yasha:*` | +| **vLLM Engine** | KV cache usage, TTFT, inter-token latency, token throughput, queue depth, preemptions, prefix cache hit rate | `vllm:*` | +| **GPU & System** | GPU utilization, GPU memory, CPU, system memory | `ray_node_*` | +| **Ray Serve** | Replica health, processing queries, deployment latency, health check failures | `serve_*` | +| **Operational** | Model load time, load failures, resource cleanup errors, streaming chunks/s | `yasha:*` | + +## Health Check + +A health endpoint is always available regardless of the metrics toggle: + +```bash +curl http://localhost:8000/health +# {"status": "ok"} +``` + +## Yasha Metrics Reference + +All custom metrics use the `yasha:` prefix and are exported via `ray.serve.metrics`. 
+ +### Gateway + +| Metric | Type | Tags | Description | +|---|---|---|---| +| `yasha:request_total` | Counter | `model`, `endpoint`, `status` | Total requests by model and API method | +| `yasha:request_duration_seconds` | Histogram | `model`, `endpoint` | End-to-end request latency | +| `yasha:request_errors_total` | Counter | `model`, `endpoint`, `error_type` | Errors: `inference_error`, `stream_error`, `unhandled` | +| `yasha:request_in_progress` | Gauge | `model`, `endpoint` | Currently processing requests | +| `yasha:client_disconnects_total` | Counter | `model`, `endpoint` | Client disconnected before response completed | +| `yasha:stream_chunks_total` | Counter | `model` | Streaming chunks emitted | + +### Model Deployment + +| Metric | Type | Tags | Description | +|---|---|---|---| +| `yasha:model_load_duration_seconds` | Histogram | `model`, `loader` | Time to initialize a model | +| `yasha:model_load_failures_total` | Counter | `model`, `loader` | Failed model initializations | +| `yasha:models_loaded` | Gauge | | Number of loaded and ready models | + +### Inference Timing + +| Metric | Type | Tags | Description | +|---|---|---|---| +| `yasha:generation_duration_seconds` | Histogram | `model` | Chat/text generation latency | +| `yasha:tts_generation_duration_seconds` | Histogram | `model` | Text-to-speech latency | +| `yasha:image_generation_duration_seconds` | Histogram | `model` | Image generation latency | +| `yasha:transcription_duration_seconds` | Histogram | `model` | Speech-to-text latency | +| `yasha:embedding_duration_seconds` | Histogram | `model` | Embedding latency | + +### Resource Cleanup + +| Metric | Type | Tags | Description | +|---|---|---|---| +| `yasha:resource_cleanup_errors_total` | Counter | `model`, `component` | Errors during engine/model cleanup | + +## Built-in Metrics from vLLM and Ray + +These are automatically available when `YASHA_METRICS=true` — no additional configuration needed. 
+ +### vLLM (`vllm:*`) + +Key metrics for LLM inference monitoring: + +- `vllm:num_requests_running` / `vllm:num_requests_waiting` — queue depth +- `vllm:kv_cache_usage_perc` — KV cache utilization (0-1) +- `vllm:time_to_first_token_seconds` — TTFT histogram +- `vllm:inter_token_latency_seconds` — ITL histogram +- `vllm:e2e_request_latency_seconds` — end-to-end latency histogram +- `vllm:request_queue_time_seconds` — time spent waiting in queue +- `vllm:prompt_tokens` / `vllm:generation_tokens` — token throughput counters +- `vllm:num_preemptions` — memory pressure signal +- `vllm:prefix_cache_hits` / `vllm:prefix_cache_queries` — cache efficiency + +Full reference: [vLLM Metrics Documentation](https://docs.vllm.ai/en/stable/design/metrics/) + +### Ray Serve (`serve_*`) + +- `serve_num_http_requests` — request count by route, method, status +- `serve_http_request_latency_ms` — request latency histogram +- `serve_num_ongoing_http_requests` — in-flight requests +- `serve_deployment_processing_latency_ms` — per-replica processing time +- `serve_deployment_replica_health_check` — replica health status + +Full reference: [Ray Serve Monitoring](https://docs.ray.io/en/latest/serve/monitoring.html) + +### Ray Cluster (`ray_*`) + +- `ray_node_gpus_utilization` — GPU utilization by device +- `ray_node_gram_used` / `ray_node_gram_available` — GPU memory +- `ray_node_cpu_utilization` — CPU usage +- `ray_node_mem_used` / `ray_node_mem_total` — system memory + +Full reference: [Ray Metrics](https://docs.ray.io/en/latest/cluster/metrics.html) diff --git a/yasha/infer/diffusers/diffusers_infer.py b/yasha/infer/diffusers/diffusers_infer.py index c2ac396..752af70 100644 --- a/yasha/infer/diffusers/diffusers_infer.py +++ b/yasha/infer/diffusers/diffusers_infer.py @@ -42,7 +42,9 @@ def __del__(self): if torch.cuda.is_available(): torch.cuda.empty_cache() except Exception: - pass + from yasha.metrics import RESOURCE_CLEANUP_ERRORS_TOTAL + + 
RESOURCE_CLEANUP_ERRORS_TOTAL.inc(tags={"model": self.model_config.name, "component": "diffusers_pipeline"}) async def start(self): from diffusers.pipelines.auto_pipeline import AutoPipelineForText2Image diff --git a/yasha/infer/infer_config.py b/yasha/infer/infer_config.py index dc5b90e..a2ef96f 100644 --- a/yasha/infer/infer_config.py +++ b/yasha/infer/infer_config.py @@ -109,14 +109,19 @@ def is_set(self) -> bool: class RequestWatcher: """Watches a FastAPI Request for client disconnect and signals via a Ray actor event.""" - def __init__(self, raw_request: Request): + def __init__(self, raw_request: Request, model: str = "", endpoint: str = ""): self._request = raw_request self._event = DisconnectEvent.remote() + self._model = model + self._endpoint = endpoint self._task = asyncio.create_task(self._watch()) async def _watch(self): + from yasha.metrics import CLIENT_DISCONNECTS_TOTAL + while True: if await self._request.is_disconnected(): + CLIENT_DISCONNECTS_TOTAL.inc(tags={"model": self._model, "endpoint": self._endpoint}) await self._event.set.remote() # type: ignore[attr-defined] break await asyncio.sleep(0.1) diff --git a/yasha/infer/model_deployment.py b/yasha/infer/model_deployment.py index 23132a3..3b8239c 100644 --- a/yasha/infer/model_deployment.py +++ b/yasha/infer/model_deployment.py @@ -1,4 +1,5 @@ import logging +import time from collections.abc import AsyncGenerator from typing import Any @@ -9,6 +10,15 @@ from yasha.infer.infer_config import DisconnectProxy, ModelLoader, YashaModelConfig from yasha.infer.transformers.transformers_infer import TransformersInfer from yasha.infer.vllm.vllm_infer import VllmInfer +from yasha.metrics import ( + EMBEDDING_DURATION_SECONDS, + GENERATION_DURATION_SECONDS, + IMAGE_GENERATION_DURATION_SECONDS, + MODEL_LOAD_DURATION_SECONDS, + MODEL_LOAD_FAILURES_TOTAL, + TRANSCRIPTION_DURATION_SECONDS, + TTS_GENERATION_DURATION_SECONDS, +) from yasha.openai.protocol import ( ChatCompletionRequest, EmbeddingRequest, @@ 
-25,20 +35,31 @@ class ModelDeployment: async def __init__(self, config: YashaModelConfig): self.config = config - if config.loader == ModelLoader.vllm: - self.infer = VllmInfer(config) - elif config.loader == ModelLoader.transformers: - self.infer = TransformersInfer(config) - elif config.loader == ModelLoader.diffusers: - self.infer = DiffusersInfer(config) - else: - self.infer = CustomInfer(config) + start = time.monotonic() + try: + if config.loader == ModelLoader.vllm: + self.infer = VllmInfer(config) + elif config.loader == ModelLoader.transformers: + self.infer = TransformersInfer(config) + elif config.loader == ModelLoader.diffusers: + self.infer = DiffusersInfer(config) + else: + self.infer = CustomInfer(config) - await self.infer.start() + await self.infer.start() + except Exception: + MODEL_LOAD_FAILURES_TOTAL.inc(tags={"model": config.name, "loader": config.loader.value}) + raise + finally: + MODEL_LOAD_DURATION_SECONDS.observe( + time.monotonic() - start, tags={"model": config.name, "loader": config.loader.value} + ) async def generate(self, request: ChatCompletionRequest, request_headers: dict[str, str], disconnect_event: Any): proxy = DisconnectProxy(disconnect_event, request_headers) + start = time.monotonic() result = await self.infer.create_chat_completion(request, proxy) + GENERATION_DURATION_SECONDS.observe(time.monotonic() - start, tags={"model": self.config.name}) if isinstance(result, AsyncGenerator): async for chunk in result: yield chunk @@ -47,7 +68,9 @@ async def generate(self, request: ChatCompletionRequest, request_headers: dict[s async def embed(self, request: EmbeddingRequest, request_headers: dict[str, str], disconnect_event: Any): proxy = DisconnectProxy(disconnect_event, request_headers) + start = time.monotonic() result = await self.infer.create_embedding(request, proxy) + EMBEDDING_DURATION_SECONDS.observe(time.monotonic() - start, tags={"model": self.config.name}) if isinstance(result, AsyncGenerator): async for chunk in result: 
yield chunk @@ -58,7 +81,9 @@ async def transcribe( self, audio_data: bytes, request: TranscriptionRequest, request_headers: dict[str, str], disconnect_event: Any ): proxy = DisconnectProxy(disconnect_event, request_headers) + start = time.monotonic() result = await self.infer.create_transcription(audio_data, request, proxy) + TRANSCRIPTION_DURATION_SECONDS.observe(time.monotonic() - start, tags={"model": self.config.name}) if isinstance(result, AsyncGenerator): async for chunk in result: yield chunk @@ -69,7 +94,9 @@ async def translate( self, audio_data: bytes, request: TranslationRequest, request_headers: dict[str, str], disconnect_event: Any ): proxy = DisconnectProxy(disconnect_event, request_headers) + start = time.monotonic() result = await self.infer.create_translation(audio_data, request, proxy) + TRANSCRIPTION_DURATION_SECONDS.observe(time.monotonic() - start, tags={"model": self.config.name}) if isinstance(result, AsyncGenerator): async for chunk in result: yield chunk @@ -78,7 +105,9 @@ async def translate( async def speak(self, request: SpeechRequest, request_headers: dict[str, str], disconnect_event: Any): proxy = DisconnectProxy(disconnect_event, request_headers) + start = time.monotonic() result = await self.infer.create_speech(request, proxy) + TTS_GENERATION_DURATION_SECONDS.observe(time.monotonic() - start, tags={"model": self.config.name}) if isinstance(result, AsyncGenerator): async for chunk in result: yield chunk @@ -87,7 +116,9 @@ async def speak(self, request: SpeechRequest, request_headers: dict[str, str], d async def imagine(self, request: ImageGenerationRequest, request_headers: dict[str, str], disconnect_event: Any): proxy = DisconnectProxy(disconnect_event, request_headers) + start = time.monotonic() result = await self.infer.create_image_generation(request, proxy) + IMAGE_GENERATION_DURATION_SECONDS.observe(time.monotonic() - start, tags={"model": self.config.name}) if isinstance(result, AsyncGenerator): async for chunk in result: 
yield chunk diff --git a/yasha/infer/transformers/transformers_infer.py b/yasha/infer/transformers/transformers_infer.py index d48a54c..23b8879 100644 --- a/yasha/infer/transformers/transformers_infer.py +++ b/yasha/infer/transformers/transformers_infer.py @@ -41,7 +41,9 @@ def __del__(self): if torch.cuda.is_available(): torch.cuda.empty_cache() except Exception: - pass + from yasha.metrics import RESOURCE_CLEANUP_ERRORS_TOTAL + + RESOURCE_CLEANUP_ERRORS_TOTAL.inc(tags={"model": self.model_config.name, "component": "transformers_model"}) async def start(self): self.serving_chat = None diff --git a/yasha/infer/vllm/vllm_infer.py b/yasha/infer/vllm/vllm_infer.py index 9816d77..b4fd9dc 100644 --- a/yasha/infer/vllm/vllm_infer.py +++ b/yasha/infer/vllm/vllm_infer.py @@ -115,7 +115,9 @@ def __del__(self): if engine := getattr(self, "engine", None): engine.shutdown() except Exception: - pass + from yasha.metrics import RESOURCE_CLEANUP_ERRORS_TOTAL + + RESOURCE_CLEANUP_ERRORS_TOTAL.inc(tags={"model": self.model_config.name, "component": "vllm_engine"}) async def start(self): logger.info("Start vllm infer for model: %s", self.model_config) diff --git a/yasha/metrics.py b/yasha/metrics.py new file mode 100644 index 0000000..04dbe11 --- /dev/null +++ b/yasha/metrics.py @@ -0,0 +1,192 @@ +"""Yasha Prometheus metrics — all exported via Ray's metrics agent. + +When YASHA_METRICS=true, metrics are defined using ray.serve.metrics so they +flow through the same Ray metrics agent port as ray_*, serve_*, and vllm:* +metrics. When disabled, no-op objects are exported so call sites need zero +conditional logic. 
+""" + +import os + +_ENABLED = os.environ.get("YASHA_METRICS", "false").lower() == "true" + +# --------------------------------------------------------------------------- +# No-op metric stubs (used when metrics are disabled) +# --------------------------------------------------------------------------- + + +class _NoOpCounter: + def inc(self, value=1.0, tags=None): + pass + + def set_default_tags(self, tags): + pass + + +class _NoOpGauge: + def set(self, value, tags=None): + pass + + def set_default_tags(self, tags): + pass + + +class _NoOpHistogram: + def observe(self, value, tags=None): + pass + + def set_default_tags(self, tags): + pass + + +# --------------------------------------------------------------------------- +# Latency bucket boundaries (in seconds) +# --------------------------------------------------------------------------- + +_REQUEST_LATENCY_BOUNDARIES = [0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30, 60] +_MODEL_LOAD_BOUNDARIES: list[float] = [1, 5, 10, 30, 60, 120, 300, 600] + + +def _build_metrics(): + """Construct real or no-op metric objects based on YASHA_METRICS.""" + + if not _ENABLED: + return { + # Gateway + "request_total": _NoOpCounter(), + "request_duration_seconds": _NoOpHistogram(), + "request_errors_total": _NoOpCounter(), + "request_in_progress": _NoOpGauge(), + "client_disconnects_total": _NoOpCounter(), + "stream_chunks_total": _NoOpCounter(), + # Model deployment + "model_load_duration_seconds": _NoOpHistogram(), + "model_load_failures_total": _NoOpCounter(), + "models_loaded": _NoOpGauge(), + # Inference timing + "generation_duration_seconds": _NoOpHistogram(), + "tts_generation_duration_seconds": _NoOpHistogram(), + "image_generation_duration_seconds": _NoOpHistogram(), + "transcription_duration_seconds": _NoOpHistogram(), + "embedding_duration_seconds": _NoOpHistogram(), + # Resource cleanup + "resource_cleanup_errors_total": _NoOpCounter(), + } + + from ray.serve.metrics import Counter, Gauge, Histogram + + # Ray's type stubs 
over-constrain tag_keys (Tuple[str] instead of variable-length + # tuples) and boundaries (List[float] vs int literals). Suppressed with type: ignore. + return { + # -- Gateway layer -- + "request_total": Counter( + "yasha:request_total", + description="Total inference requests by model and endpoint.", + tag_keys=("model", "endpoint", "status"), # type: ignore[arg-type] + ), + "request_duration_seconds": Histogram( + "yasha:request_duration_seconds", + description="End-to-end request latency (gateway to response) in seconds.", + boundaries=_REQUEST_LATENCY_BOUNDARIES, + tag_keys=("model", "endpoint"), # type: ignore[arg-type] + ), + "request_errors_total": Counter( + "yasha:request_errors_total", + description="Total inference errors by model, endpoint, and error type.", + tag_keys=("model", "endpoint", "error_type"), # type: ignore[arg-type] + ), + "request_in_progress": Gauge( + "yasha:request_in_progress", + description="Number of requests currently being processed per model.", + tag_keys=("model", "endpoint"), # type: ignore[arg-type] + ), + "client_disconnects_total": Counter( + "yasha:client_disconnects_total", + description="Total client disconnects during inference.", + tag_keys=("model", "endpoint"), # type: ignore[arg-type] + ), + "stream_chunks_total": Counter( + "yasha:stream_chunks_total", + description="Total streaming chunks emitted.", + tag_keys=("model",), + ), + # -- Model deployment layer -- + "model_load_duration_seconds": Histogram( + "yasha:model_load_duration_seconds", + description="Model initialization time in seconds.", + boundaries=_MODEL_LOAD_BOUNDARIES, + tag_keys=("model", "loader"), # type: ignore[arg-type] + ), + "model_load_failures_total": Counter( + "yasha:model_load_failures_total", + description="Total failed model deployments.", + tag_keys=("model", "loader"), # type: ignore[arg-type] + ), + "models_loaded": Gauge( + "yasha:models_loaded", + description="Number of models currently loaded.", + ), + # -- Inference timing -- + 
"generation_duration_seconds": Histogram( + "yasha:generation_duration_seconds", + description="Chat/text generation latency in seconds.", + boundaries=_REQUEST_LATENCY_BOUNDARIES, + tag_keys=("model",), + ), + "tts_generation_duration_seconds": Histogram( + "yasha:tts_generation_duration_seconds", + description="TTS inference latency in seconds.", + boundaries=_REQUEST_LATENCY_BOUNDARIES, + tag_keys=("model",), + ), + "image_generation_duration_seconds": Histogram( + "yasha:image_generation_duration_seconds", + description="Image generation latency in seconds.", + boundaries=_REQUEST_LATENCY_BOUNDARIES, + tag_keys=("model",), + ), + "transcription_duration_seconds": Histogram( + "yasha:transcription_duration_seconds", + description="Speech-to-text latency in seconds.", + boundaries=_REQUEST_LATENCY_BOUNDARIES, + tag_keys=("model",), + ), + "embedding_duration_seconds": Histogram( + "yasha:embedding_duration_seconds", + description="Embedding inference latency in seconds.", + boundaries=_REQUEST_LATENCY_BOUNDARIES, + tag_keys=("model",), + ), + # -- Resource cleanup -- + "resource_cleanup_errors_total": Counter( + "yasha:resource_cleanup_errors_total", + description="Errors during resource cleanup (engine shutdown, memory release).", + tag_keys=("model", "component"), # type: ignore[arg-type] + ), + } + + +_metrics = _build_metrics() + +# -- Gateway -- +REQUEST_TOTAL = _metrics["request_total"] +REQUEST_DURATION_SECONDS = _metrics["request_duration_seconds"] +REQUEST_ERRORS_TOTAL = _metrics["request_errors_total"] +REQUEST_IN_PROGRESS = _metrics["request_in_progress"] +CLIENT_DISCONNECTS_TOTAL = _metrics["client_disconnects_total"] +STREAM_CHUNKS_TOTAL = _metrics["stream_chunks_total"] + +# -- Model deployment -- +MODEL_LOAD_DURATION_SECONDS = _metrics["model_load_duration_seconds"] +MODEL_LOAD_FAILURES_TOTAL = _metrics["model_load_failures_total"] +MODELS_LOADED = _metrics["models_loaded"] + +# -- Inference timing -- +GENERATION_DURATION_SECONDS = 
_metrics["generation_duration_seconds"] +TTS_GENERATION_DURATION_SECONDS = _metrics["tts_generation_duration_seconds"] +IMAGE_GENERATION_DURATION_SECONDS = _metrics["image_generation_duration_seconds"] +TRANSCRIPTION_DURATION_SECONDS = _metrics["transcription_duration_seconds"] +EMBEDDING_DURATION_SECONDS = _metrics["embedding_duration_seconds"] + +# -- Resource cleanup -- +RESOURCE_CLEANUP_ERRORS_TOTAL = _metrics["resource_cleanup_errors_total"] diff --git a/yasha/openai/api.py b/yasha/openai/api.py index 4227147..d71d70b 100644 --- a/yasha/openai/api.py +++ b/yasha/openai/api.py @@ -11,6 +11,14 @@ from ray.serve.handle import DeploymentHandle from yasha.infer.infer_config import ModelUsecase, RequestWatcher +from yasha.metrics import ( + MODELS_LOADED, + REQUEST_DURATION_SECONDS, + REQUEST_ERRORS_TOTAL, + REQUEST_IN_PROGRESS, + REQUEST_TOTAL, + STREAM_CHUNKS_TOTAL, +) from yasha.openai.protocol import ( ChatCompletionRequest, ChatCompletionResponse, @@ -78,6 +86,7 @@ class YashaAPI: def __init__(self, model_handles: dict[str, tuple[DeploymentHandle, ModelUsecase]]): self.models = {name: handle for name, (handle, _) in model_handles.items()} self.model_list = [OpenAiModelCard(id=name) for name in model_handles] + MODELS_LOADED.set(len(self.models)) # all models are RUNNING by this point def _get_handle(self, model_name: str | None) -> DeploymentHandle: if model_name is None or model_name not in self.models: @@ -85,39 +94,68 @@ def _get_handle(self, model_name: str | None) -> DeploymentHandle: return self.models[model_name] async def _handle_response( - self, response_gen, watcher: RequestWatcher, stream_media_type: str = "text/event-stream" + self, + response_gen, + watcher: RequestWatcher, + model: str, + endpoint: str, + stream_media_type: str = "text/event-stream", ): - first = await response_gen.__anext__() - - if isinstance(first, ErrorResponse): - watcher.stop() - return _error_response(first) - - if isinstance(first, RawSpeechResponse): - watcher.stop() - 
return Response(content=first.audio, media_type=first.media_type) - - if isinstance( - first, - ChatCompletionResponse - | EmbeddingResponse - | TranscriptionResponse - | TranslationResponse - | ImageGenerationResponse, - ): - watcher.stop() - return JSONResponse(content=first.model_dump(mode="json")) - - # streaming — first chunk already consumed, chain it back - async def _stream(): - try: - yield first - async for chunk in response_gen: - yield chunk - finally: + start = time.monotonic() + REQUEST_IN_PROGRESS.set(1, tags={"model": model, "endpoint": endpoint}) + try: + first = await response_gen.__anext__() + + if isinstance(first, ErrorResponse): + REQUEST_ERRORS_TOTAL.inc(tags={"model": model, "endpoint": endpoint, "error_type": "inference_error"}) + REQUEST_TOTAL.inc(tags={"model": model, "endpoint": endpoint, "status": "error"}) watcher.stop() + return _error_response(first) - return StreamingResponse(content=_stream(), media_type=stream_media_type) + if isinstance(first, RawSpeechResponse): + REQUEST_TOTAL.inc(tags={"model": model, "endpoint": endpoint, "status": "ok"}) + watcher.stop() + return Response(content=first.audio, media_type=first.media_type) + + if isinstance( + first, + ChatCompletionResponse + | EmbeddingResponse + | TranscriptionResponse + | TranslationResponse + | ImageGenerationResponse, + ): + REQUEST_TOTAL.inc(tags={"model": model, "endpoint": endpoint, "status": "ok"}) + watcher.stop() + return JSONResponse(content=first.model_dump(mode="json")) + + # streaming — first chunk already consumed, chain it back + async def _stream(): + try: + STREAM_CHUNKS_TOTAL.inc(tags={"model": model}) + yield first + async for chunk in response_gen: + STREAM_CHUNKS_TOTAL.inc(tags={"model": model}) + yield chunk + REQUEST_TOTAL.inc(tags={"model": model, "endpoint": endpoint, "status": "ok"}) + except Exception: + REQUEST_ERRORS_TOTAL.inc(tags={"model": model, "endpoint": endpoint, "error_type": "stream_error"}) + raise + finally: + watcher.stop() + + 
return StreamingResponse(content=_stream(), media_type=stream_media_type) + except Exception: + REQUEST_ERRORS_TOTAL.inc(tags={"model": model, "endpoint": endpoint, "error_type": "unhandled"}) + raise + finally: + duration = time.monotonic() - start + REQUEST_DURATION_SECONDS.observe(duration, tags={"model": model, "endpoint": endpoint}) + REQUEST_IN_PROGRESS.set(0, tags={"model": model, "endpoint": endpoint}) + + @app.get("/health") + async def health(self): + return {"status": "ok"} @app.get("/v1/models", response_model=OpenaiModelList) async def list_models(self): @@ -132,8 +170,9 @@ async def model_info(self, model: str) -> OpenAiModelCard: @app.post("/v1/chat/completions") async def create_chat_completion(self, request: ChatCompletionRequest, raw_request: Request): + model = request.model or "" handle = self._get_handle(request.model) - watcher = RequestWatcher(raw_request) + watcher = RequestWatcher(raw_request, model=model, endpoint="create_chat_completion") headers = dict(raw_request.headers) # Materialize any lazy pydantic ValidatorIterators (from Iterable-typed fields # like tool_calls) in place — they can't be pickled across the Ray boundary. @@ -152,22 +191,24 @@ async def _logged_gen(): logger.info("chat_completion actor output: %s", chunk) yield chunk - return await self._handle_response(_logged_gen(), watcher) + return await self._handle_response(_logged_gen(), watcher, model, "create_chat_completion") @app.post("/v1/embeddings") async def create_embeddings(self, request: EmbeddingRequest, raw_request: Request): + model = request.model or "" handle = self._get_handle(request.model) - watcher = RequestWatcher(raw_request) + watcher = RequestWatcher(raw_request, model=model, endpoint="create_embeddings") headers = dict(raw_request.headers) # EmbeddingRequest is a UnionType — force resolution before Ray pickle boundary. 
request = type(request).model_validate_json(request.model_dump_json()) response_gen = handle.embed.options(stream=True).remote(request, headers, watcher.event) - return await self._handle_response(response_gen, watcher) + return await self._handle_response(response_gen, watcher, model, "create_embeddings") @app.post("/v1/audio/transcriptions") async def create_transcriptions(self, request: Annotated[TranscriptionRequest, Form()], raw_request: Request): + model = request.model or "" handle = self._get_handle(request.model) - watcher = RequestWatcher(raw_request) + watcher = RequestWatcher(raw_request, model=model, endpoint="create_transcriptions") headers = dict(raw_request.headers) # Read audio bytes before crossing process boundary — UploadFile is not serializable. # The bytes are passed separately; the request is reconstructed without the file field. @@ -176,34 +217,35 @@ async def create_transcriptions(self, request: Annotated[TranscriptionRequest, F response_gen = handle.transcribe.options(stream=True).remote( audio_data, request_no_file, headers, watcher.event ) - return await self._handle_response(response_gen, watcher) + return await self._handle_response(response_gen, watcher, model, "create_transcriptions") @app.post("/v1/audio/translations") async def create_translations(self, request: Annotated[TranslationRequest, Form()], raw_request: Request): + model = request.model or "" handle = self._get_handle(request.model) - watcher = RequestWatcher(raw_request) + watcher = RequestWatcher(raw_request, model=model, endpoint="create_translations") headers = dict(raw_request.headers) # Read audio bytes before crossing process boundary — UploadFile is not serializable. # The bytes are passed separately; the request is reconstructed without the file field. 
audio_data = await request.file.read() request_no_file = TranslationRequest.model_construct(**request.model_dump(exclude={"file"})) response_gen = handle.translate.options(stream=True).remote(audio_data, request_no_file, headers, watcher.event) - return await self._handle_response(response_gen, watcher) + return await self._handle_response(response_gen, watcher, model, "create_translations") @app.post("/v1/audio/speech") async def create_speech(self, request: SpeechRequest, raw_request: Request): logger.info("speech request headers: %s", dict(raw_request.headers)) logger.info("speech request body: %s", request.model_dump_json()) handle = self._get_handle(request.model) - watcher = RequestWatcher(raw_request) + watcher = RequestWatcher(raw_request, model=request.model, endpoint="create_speech") headers = dict(raw_request.headers) response_gen = handle.speak.options(stream=True).remote(request, headers, watcher.event) - return await self._handle_response(response_gen, watcher) + return await self._handle_response(response_gen, watcher, request.model, "create_speech") @app.post("/v1/images/generations") async def create_image(self, request: ImageGenerationRequest, raw_request: Request): handle = self._get_handle(request.model) - watcher = RequestWatcher(raw_request) + watcher = RequestWatcher(raw_request, model=request.model, endpoint="create_image") headers = dict(raw_request.headers) response_gen = handle.imagine.options(stream=True).remote(request, headers, watcher.event) - return await self._handle_response(response_gen, watcher) + return await self._handle_response(response_gen, watcher, request.model, "create_image") From cfdfef3b246e2262f8e427d24b998008a7e67644 Mon Sep 17 00:00:00 2001 From: Alex M Date: Tue, 7 Apr 2026 05:38:02 +0000 Subject: [PATCH 2/2] fix: correct metric names in dashboard and route vLLM metrics through Ray All metrics exported via Ray's metrics agent are prefixed with ray_, but the Grafana dashboard and docs referenced unprefixed names. 
This updates all queries to use the actual exported names (ray_yasha_*, ray_vllm_*, ray_serve_*). - Route vLLM native metrics through Ray via RayPrometheusStatLogger - Fix Ray Serve Internals panels to use metrics that exist in Ray 2.54 - Fix model load time panel to work for one-shot events (avg not rate) - Enable YASHA_METRICS=true by default in Dockerfiles and metrics.py - Expose port 8079 in devcontainer config - Update monitoring.md to reflect all metric name prefixes --- .devcontainer/devcontainer.json | 8 ++- Dockerfile.dev | 2 +- Dockerfile.prod | 2 +- docs/grafana-dashboard.json | 101 +++++++++++++++++--------------- docs/monitoring.md | 92 +++++++++++++++-------------- yasha/infer/vllm/vllm_infer.py | 8 +++ yasha/metrics.py | 32 +++++----- 7 files changed, 133 insertions(+), 112 deletions(-) diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index da06175..121e13a 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -15,9 +15,10 @@ "--shm-size=8g", "--gpus=all", "-p", "0.0.0.0:8000:8000", - "-p", "0.0.0.0:8265:8265" + "-p", "0.0.0.0:8265:8265", + "-p", "0.0.0.0:8079:8079" ], - "forwardPorts": [8000, 8265], + "forwardPorts": [8000, 8265, 8079], "portsAttributes": { "8000": { "label": "API" }, "8265": { "label": "Ray Dashboard" } @@ -31,7 +32,8 @@ }, "remoteEnv": { "HF_TOKEN": "${localEnv:HF_TOKEN}", - "YASHA_PLUGINS": "${localEnv:YASHA_PLUGINS}" + "YASHA_PLUGINS": "${localEnv:YASHA_PLUGINS}", + "YASHA_METRICS": "true" }, "customizations": { "vscode": { diff --git a/Dockerfile.dev b/Dockerfile.dev index 160ba90..346a210 100644 --- a/Dockerfile.dev +++ b/Dockerfile.dev @@ -54,7 +54,7 @@ ENV RAY_CLUSTER_ADDRESS=0.0.0.0 ENV RAY_HEAD_CPU_NUM=2 ENV RAY_HEAD_GPU_NUM=1 ENV YASHA_USE_EXISTING_RAY_CLUSTER=false -ENV YASHA_METRICS=false +ENV YASHA_METRICS=true ENV RAY_METRICS_EXPORT_PORT=8079 RUN uv venv diff --git a/Dockerfile.prod b/Dockerfile.prod index d6b51f5..9a9e2fc 100644 --- a/Dockerfile.prod +++ 
b/Dockerfile.prod @@ -54,7 +54,7 @@ ENV RAY_CLUSTER_ADDRESS=0.0.0.0 ENV RAY_HEAD_CPU_NUM=2 ENV RAY_HEAD_GPU_NUM=1 ENV YASHA_USE_EXISTING_RAY_CLUSTER=false -ENV YASHA_METRICS=false +ENV YASHA_METRICS=true ENV RAY_METRICS_EXPORT_PORT=8079 RUN uv venv diff --git a/docs/grafana-dashboard.json b/docs/grafana-dashboard.json index eb3cfef..15e3f07 100644 --- a/docs/grafana-dashboard.json +++ b/docs/grafana-dashboard.json @@ -32,7 +32,7 @@ "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "targets": [ { - "expr": "sum(rate(yasha:request_total[5m])) by (endpoint)", + "expr": "sum(rate(ray_yasha_request_total[5m])) by (endpoint)", "legendFormat": "{{ endpoint }}" } ], @@ -48,7 +48,7 @@ "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "targets": [ { - "expr": "sum(rate(yasha:request_errors_total[5m])) by (model, error_type)", + "expr": "sum(rate(ray_yasha_request_errors_total[5m])) by (model, error_type)", "legendFormat": "{{ model }} - {{ error_type }}" } ], @@ -64,7 +64,7 @@ "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "targets": [ { - "expr": "yasha:request_in_progress", + "expr": "ray_yasha_request_in_progress", "legendFormat": "{{ model }}" } ] @@ -77,7 +77,7 @@ "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "targets": [ { - "expr": "yasha:models_loaded" + "expr": "ray_yasha_models_loaded" } ] }, @@ -89,7 +89,7 @@ "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "targets": [ { - "expr": "sum(rate(yasha:client_disconnects_total[5m])) by (model)", + "expr": "sum(rate(ray_yasha_client_disconnects_total[5m])) by (model)", "legendFormat": "{{ model }}" } ], @@ -112,15 +112,15 @@ "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "targets": [ { - "expr": "histogram_quantile(0.50, sum(rate(yasha:request_duration_seconds_bucket[5m])) by (le, endpoint))", + "expr": "histogram_quantile(0.50, sum(rate(ray_yasha_request_duration_seconds_bucket[5m])) by (le, endpoint))", 
"legendFormat": "p50 {{ endpoint }}" }, { - "expr": "histogram_quantile(0.95, sum(rate(yasha:request_duration_seconds_bucket[5m])) by (le, endpoint))", + "expr": "histogram_quantile(0.95, sum(rate(ray_yasha_request_duration_seconds_bucket[5m])) by (le, endpoint))", "legendFormat": "p95 {{ endpoint }}" }, { - "expr": "histogram_quantile(0.99, sum(rate(yasha:request_duration_seconds_bucket[5m])) by (le, endpoint))", + "expr": "histogram_quantile(0.99, sum(rate(ray_yasha_request_duration_seconds_bucket[5m])) by (le, endpoint))", "legendFormat": "p99 {{ endpoint }}" } ], @@ -136,7 +136,7 @@ "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "targets": [ { - "expr": "histogram_quantile(0.95, sum(rate(yasha:request_duration_seconds_bucket[5m])) by (le, model))", + "expr": "histogram_quantile(0.95, sum(rate(ray_yasha_request_duration_seconds_bucket[5m])) by (le, model))", "legendFormat": "p95 {{ model }}" } ], @@ -152,23 +152,23 @@ "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "targets": [ { - "expr": "histogram_quantile(0.95, sum(rate(yasha:generation_duration_seconds_bucket[5m])) by (le, model))", + "expr": "histogram_quantile(0.95, sum(rate(ray_yasha_generation_duration_seconds_bucket[5m])) by (le, model))", "legendFormat": "generate {{ model }}" }, { - "expr": "histogram_quantile(0.95, sum(rate(yasha:tts_generation_duration_seconds_bucket[5m])) by (le, model))", + "expr": "histogram_quantile(0.95, sum(rate(ray_yasha_tts_generation_duration_seconds_bucket[5m])) by (le, model))", "legendFormat": "tts {{ model }}" }, { - "expr": "histogram_quantile(0.95, sum(rate(yasha:image_generation_duration_seconds_bucket[5m])) by (le, model))", + "expr": "histogram_quantile(0.95, sum(rate(ray_yasha_image_generation_duration_seconds_bucket[5m])) by (le, model))", "legendFormat": "image {{ model }}" }, { - "expr": "histogram_quantile(0.95, sum(rate(yasha:transcription_duration_seconds_bucket[5m])) by (le, model))", + "expr": 
"histogram_quantile(0.95, sum(rate(ray_yasha_transcription_duration_seconds_bucket[5m])) by (le, model))", "legendFormat": "transcription {{ model }}" }, { - "expr": "histogram_quantile(0.95, sum(rate(yasha:embedding_duration_seconds_bucket[5m])) by (le, model))", + "expr": "histogram_quantile(0.95, sum(rate(ray_yasha_embedding_duration_seconds_bucket[5m])) by (le, model))", "legendFormat": "embedding {{ model }}" } ], @@ -191,7 +191,7 @@ "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "targets": [ { - "expr": "vllm:kv_cache_usage_perc", + "expr": "ray_vllm_kv_cache_usage_perc", "legendFormat": "{{ model_name }}" } ], @@ -218,15 +218,15 @@ "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "targets": [ { - "expr": "histogram_quantile(0.50, sum(rate(vllm:time_to_first_token_seconds_bucket[5m])) by (le))", + "expr": "histogram_quantile(0.50, sum(rate(ray_vllm_time_to_first_token_seconds_bucket[5m])) by (le))", "legendFormat": "p50" }, { - "expr": "histogram_quantile(0.95, sum(rate(vllm:time_to_first_token_seconds_bucket[5m])) by (le))", + "expr": "histogram_quantile(0.95, sum(rate(ray_vllm_time_to_first_token_seconds_bucket[5m])) by (le))", "legendFormat": "p95" }, { - "expr": "histogram_quantile(0.99, sum(rate(vllm:time_to_first_token_seconds_bucket[5m])) by (le))", + "expr": "histogram_quantile(0.99, sum(rate(ray_vllm_time_to_first_token_seconds_bucket[5m])) by (le))", "legendFormat": "p99" } ], @@ -242,11 +242,11 @@ "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "targets": [ { - "expr": "histogram_quantile(0.50, sum(rate(vllm:inter_token_latency_seconds_bucket[5m])) by (le))", + "expr": "histogram_quantile(0.50, sum(rate(ray_vllm_inter_token_latency_seconds_bucket[5m])) by (le))", "legendFormat": "p50" }, { - "expr": "histogram_quantile(0.95, sum(rate(vllm:inter_token_latency_seconds_bucket[5m])) by (le))", + "expr": "histogram_quantile(0.95, sum(rate(ray_vllm_inter_token_latency_seconds_bucket[5m])) by 
(le))", "legendFormat": "p95" } ], @@ -262,11 +262,11 @@ "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "targets": [ { - "expr": "sum(rate(vllm:prompt_tokens_total[5m]))", + "expr": "sum(rate(ray_vllm_prompt_tokens_total[5m]))", "legendFormat": "prefill tok/s" }, { - "expr": "sum(rate(vllm:generation_tokens_total[5m]))", + "expr": "sum(rate(ray_vllm_generation_tokens_total[5m]))", "legendFormat": "decode tok/s" } ], @@ -282,11 +282,11 @@ "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "targets": [ { - "expr": "vllm:num_requests_running", + "expr": "ray_vllm_num_requests_running", "legendFormat": "running" }, { - "expr": "vllm:num_requests_waiting", + "expr": "ray_vllm_num_requests_waiting", "legendFormat": "waiting" } ] @@ -299,7 +299,7 @@ "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "targets": [ { - "expr": "sum(rate(vllm:num_preemptions_total[5m]))", + "expr": "sum(rate(ray_vllm_num_preemptions_total[5m]))", "legendFormat": "preemptions/s" } ] @@ -312,7 +312,7 @@ "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "targets": [ { - "expr": "sum(rate(vllm:prefix_cache_hits_total[5m])) / clamp_min(sum(rate(vllm:prefix_cache_queries_total[5m])), 1)", + "expr": "sum(rate(ray_vllm_prefix_cache_hits_total[5m])) / clamp_min(sum(rate(ray_vllm_prefix_cache_queries_total[5m])), 1)", "legendFormat": "hit rate" } ], @@ -328,7 +328,7 @@ "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "targets": [ { - "expr": "histogram_quantile(0.95, sum(rate(vllm:request_queue_time_seconds_bucket[5m])) by (le))", + "expr": "histogram_quantile(0.95, sum(rate(ray_vllm_request_queue_time_seconds_bucket[5m])) by (le))", "legendFormat": "p95" } ], @@ -344,7 +344,7 @@ "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "targets": [ { - "expr": "histogram_quantile(0.95, sum(rate(vllm:e2e_request_latency_seconds_bucket[5m])) by (le))", + "expr": "histogram_quantile(0.95, 
sum(rate(ray_vllm_e2e_request_latency_seconds_bucket[5m])) by (le))", "legendFormat": "p95" } ], @@ -435,30 +435,36 @@ "type": "row" }, { - "title": "Replica Health", - "type": "stat", + "title": "Health Check Latency P95", + "type": "timeseries", "gridPos": { "h": 8, "w": 6, "x": 0, "y": 45 }, "id": 40, "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "targets": [ { - "expr": "serve_deployment_replica_health_check", - "legendFormat": "{{ deployment }}" + "expr": "histogram_quantile(0.95, sum(rate(ray_serve_health_check_latency_ms_bucket[5m])) by (le, deployment))", + "legendFormat": "p95 {{ deployment }}" } - ] + ], + "fieldConfig": { + "defaults": { "unit": "ms" } + } }, { - "title": "Replica Processing Queries", + "title": "Request Count by Deployment", "type": "timeseries", "gridPos": { "h": 8, "w": 6, "x": 6, "y": 45 }, "id": 41, "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "targets": [ { - "expr": "serve_replica_processing_queries", + "expr": "sum(rate(ray_serve_handle_request_counter_total[5m])) by (deployment)", "legendFormat": "{{ deployment }}" } - ] + ], + "fieldConfig": { + "defaults": { "unit": "reqps" } + } }, { "title": "Deployment Processing Latency P95", @@ -468,7 +474,7 @@ "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "targets": [ { - "expr": "histogram_quantile(0.95, sum(rate(serve_deployment_processing_latency_ms_bucket[5m])) by (le, deployment))", + "expr": "histogram_quantile(0.95, sum(rate(ray_serve_deployment_processing_latency_ms_bucket[5m])) by (le, deployment))", "legendFormat": "p95 {{ deployment }}" } ], @@ -477,17 +483,20 @@ } }, { - "title": "Health Check Failures", + "title": "HTTP Request Latency P95", "type": "timeseries", "gridPos": { "h": 8, "w": 6, "x": 18, "y": 45 }, "id": 43, "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "targets": [ { - "expr": "sum(rate(serve_deployment_health_check_failures_total[5m])) by (deployment)", - "legendFormat": "{{ 
deployment }}" + "expr": "histogram_quantile(0.95, sum(rate(ray_serve_http_request_latency_ms_bucket[5m])) by (le))", + "legendFormat": "p95" } - ] + ], + "fieldConfig": { + "defaults": { "unit": "ms" } + } }, { "collapsed": false, @@ -504,8 +513,8 @@ "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "targets": [ { - "expr": "histogram_quantile(0.95, sum(rate(yasha:model_load_duration_seconds_bucket[5m])) by (le, model))", - "legendFormat": "p95 {{ model }}" + "expr": "ray_yasha_model_load_duration_seconds_sum / clamp_min(ray_yasha_model_load_duration_seconds_count, 1)", + "legendFormat": "avg {{ model }}" } ], "fieldConfig": { @@ -520,7 +529,7 @@ "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "targets": [ { - "expr": "sum(yasha:model_load_failures_total) by (model)", + "expr": "sum(ray_yasha_model_load_failures_total) by (model)", "legendFormat": "{{ model }}" } ], @@ -543,7 +552,7 @@ "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "targets": [ { - "expr": "sum(yasha:resource_cleanup_errors_total) by (model, component)", + "expr": "sum(ray_yasha_resource_cleanup_errors_total) by (model, component)", "legendFormat": "{{ model }} ({{ component }})" } ], @@ -566,7 +575,7 @@ "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "targets": [ { - "expr": "sum(rate(yasha:stream_chunks_total[5m])) by (model)", + "expr": "sum(rate(ray_yasha_stream_chunks_total[5m])) by (model)", "legendFormat": "{{ model }}" } ], diff --git a/docs/monitoring.md b/docs/monitoring.md index a4535a9..90e4f63 100644 --- a/docs/monitoring.md +++ b/docs/monitoring.md @@ -7,15 +7,17 @@ Yasha exposes Prometheus metrics through a single port via Ray's metrics agent. 
 ```
 Prometheus ──scrape──> Ray Metrics Agent (:8079)
                |
-               |-- ray_*        Ray cluster: GPU, CPU, memory, actors
-               |-- serve_*      Ray Serve: HTTP requests, latency, replicas
-               |-- vllm:*       vLLM engine: KV cache, TTFT, tokens, queue
-               |-- yasha:*      Custom: per-model latency, errors, load time
+               |-- ray_node_*   Ray cluster: GPU, CPU, memory
+               |-- ray_serve_*  Ray Serve: HTTP requests, latency, replicas
+               |-- ray_vllm_*   vLLM engine: KV cache, TTFT, tokens, queue
+               |-- ray_yasha_*  Custom: per-model latency, errors, load time
 ```
 
+> **Note:** All metrics exported through Ray's metrics agent are prefixed with `ray_`, and any `:` in a metric name is sanitized to `_`. This applies to both vLLM and Yasha metrics: the vLLM-native `vllm:kv_cache_usage_perc` becomes `ray_vllm_kv_cache_usage_perc`, and `yasha:request_total` becomes `ray_yasha_request_total`.
+
 ## Enabling Metrics
 
-Metrics are disabled by default. Set `YASHA_METRICS=true` to enable:
+Metrics are enabled by default. Set `YASHA_METRICS=false` to disable:
 
 ```bash
 docker run --rm --shm-size=8g --gpus all \
@@ -28,10 +30,10 @@ docker run --rm --shm-size=8g --gpus all \
 
 | Env Var | Default | Description |
 |---|---|---|
-| `YASHA_METRICS` | `false` | Master toggle. Enables all metrics and the Ray metrics export port. |
+| `YASHA_METRICS` | `true` | Master toggle. Enables all metrics and the Ray metrics export port. |
 | `RAY_METRICS_EXPORT_PORT` | `8079` | Port for the Ray metrics agent (only active when `YASHA_METRICS=true`). |
 
-When `YASHA_METRICS=false`, no metrics are collected and port 8079 is not exposed. Zero overhead.
+Set `YASHA_METRICS=false` to disable all metrics collection. When disabled, port 8079 is not exposed and there is zero overhead.
## Connecting to Prometheus @@ -68,12 +70,12 @@ The dashboard has 6 rows: | Row | What it shows | Metric sources | |---|---|---| -| **Overview** | Request rate, error rate, in-flight requests, models loaded, client disconnects | `yasha:*` | -| **Latency** | Gateway P50/P95/P99, per-model latency, per-usecase latency (generate, TTS, image, STT, embed) | `yasha:*` | -| **vLLM Engine** | KV cache usage, TTFT, inter-token latency, token throughput, queue depth, preemptions, prefix cache hit rate | `vllm:*` | +| **Overview** | Request rate, error rate, in-flight requests, models loaded, client disconnects | `ray_yasha_*` | +| **Latency** | Gateway P50/P95/P99, per-model latency, per-usecase latency (generate, TTS, image, STT, embed) | `ray_yasha_*` | +| **vLLM Engine** | KV cache usage, TTFT, inter-token latency, token throughput, queue depth, preemptions, prefix cache hit rate | `ray_vllm_*` | | **GPU & System** | GPU utilization, GPU memory, CPU, system memory | `ray_node_*` | -| **Ray Serve** | Replica health, processing queries, deployment latency, health check failures | `serve_*` | -| **Operational** | Model load time, load failures, resource cleanup errors, streaming chunks/s | `yasha:*` | +| **Ray Serve** | Health check latency, request count, deployment processing latency, HTTP request latency | `ray_serve_*` | +| **Operational** | Model load time, load failures, resource cleanup errors, streaming chunks/s | `ray_yasha_*` | ## Health Check @@ -86,70 +88,70 @@ curl http://localhost:8000/health ## Yasha Metrics Reference -All custom metrics use the `yasha:` prefix and are exported via `ray.serve.metrics`. +All custom metrics are defined via `ray.serve.metrics` and exported through Ray's metrics agent with a `ray_` prefix. 
### Gateway | Metric | Type | Tags | Description | |---|---|---|---| -| `yasha:request_total` | Counter | `model`, `endpoint`, `status` | Total requests by model and API method | -| `yasha:request_duration_seconds` | Histogram | `model`, `endpoint` | End-to-end request latency | -| `yasha:request_errors_total` | Counter | `model`, `endpoint`, `error_type` | Errors: `inference_error`, `stream_error`, `unhandled` | -| `yasha:request_in_progress` | Gauge | `model`, `endpoint` | Currently processing requests | -| `yasha:client_disconnects_total` | Counter | `model`, `endpoint` | Client disconnected before response completed | -| `yasha:stream_chunks_total` | Counter | `model` | Streaming chunks emitted | +| `ray_yasha_request_total` | Counter | `model`, `endpoint`, `status` | Total requests by model and API method | +| `ray_yasha_request_duration_seconds` | Histogram | `model`, `endpoint` | End-to-end request latency | +| `ray_yasha_request_errors_total` | Counter | `model`, `endpoint`, `error_type` | Errors: `inference_error`, `stream_error`, `unhandled` | +| `ray_yasha_request_in_progress` | Gauge | `model`, `endpoint` | Currently processing requests | +| `ray_yasha_client_disconnects_total` | Counter | `model`, `endpoint` | Client disconnected before response completed | +| `ray_yasha_stream_chunks_total` | Counter | `model` | Streaming chunks emitted | ### Model Deployment | Metric | Type | Tags | Description | |---|---|---|---| -| `yasha:model_load_duration_seconds` | Histogram | `model`, `loader` | Time to initialize a model | -| `yasha:model_load_failures_total` | Counter | `model`, `loader` | Failed model initializations | -| `yasha:models_loaded` | Gauge | | Number of loaded and ready models | +| `ray_yasha_model_load_duration_seconds` | Histogram | `model`, `loader` | Time to initialize a model | +| `ray_yasha_model_load_failures_total` | Counter | `model`, `loader` | Failed model initializations | +| `ray_yasha_models_loaded` | Gauge | | Number of loaded and 
ready models | ### Inference Timing | Metric | Type | Tags | Description | |---|---|---|---| -| `yasha:generation_duration_seconds` | Histogram | `model` | Chat/text generation latency | -| `yasha:tts_generation_duration_seconds` | Histogram | `model` | Text-to-speech latency | -| `yasha:image_generation_duration_seconds` | Histogram | `model` | Image generation latency | -| `yasha:transcription_duration_seconds` | Histogram | `model` | Speech-to-text latency | -| `yasha:embedding_duration_seconds` | Histogram | `model` | Embedding latency | +| `ray_yasha_generation_duration_seconds` | Histogram | `model` | Chat/text generation latency | +| `ray_yasha_tts_generation_duration_seconds` | Histogram | `model` | Text-to-speech latency | +| `ray_yasha_image_generation_duration_seconds` | Histogram | `model` | Image generation latency | +| `ray_yasha_transcription_duration_seconds` | Histogram | `model` | Speech-to-text latency | +| `ray_yasha_embedding_duration_seconds` | Histogram | `model` | Embedding latency | ### Resource Cleanup | Metric | Type | Tags | Description | |---|---|---|---| -| `yasha:resource_cleanup_errors_total` | Counter | `model`, `component` | Errors during engine/model cleanup | +| `ray_yasha_resource_cleanup_errors_total` | Counter | `model`, `component` | Errors during engine/model cleanup | ## Built-in Metrics from vLLM and Ray These are automatically available when `YASHA_METRICS=true` — no additional configuration needed. -### vLLM (`vllm:*`) +### vLLM (`ray_vllm_*`) -Key metrics for LLM inference monitoring: +vLLM metrics are routed through Ray's metrics agent via `RayPrometheusStatLogger`. The native `vllm:` prefix is sanitized to `ray_vllm_`. 
-- `vllm:num_requests_running` / `vllm:num_requests_waiting` — queue depth -- `vllm:kv_cache_usage_perc` — KV cache utilization (0-1) -- `vllm:time_to_first_token_seconds` — TTFT histogram -- `vllm:inter_token_latency_seconds` — ITL histogram -- `vllm:e2e_request_latency_seconds` — end-to-end latency histogram -- `vllm:request_queue_time_seconds` — time spent waiting in queue -- `vllm:prompt_tokens` / `vllm:generation_tokens` — token throughput counters -- `vllm:num_preemptions` — memory pressure signal -- `vllm:prefix_cache_hits` / `vllm:prefix_cache_queries` — cache efficiency +- `ray_vllm_num_requests_running` / `ray_vllm_num_requests_waiting` — queue depth +- `ray_vllm_kv_cache_usage_perc` — KV cache utilization (0-1) +- `ray_vllm_time_to_first_token_seconds` — TTFT histogram +- `ray_vllm_inter_token_latency_seconds` — ITL histogram +- `ray_vllm_e2e_request_latency_seconds` — end-to-end latency histogram +- `ray_vllm_request_queue_time_seconds` — time spent waiting in queue +- `ray_vllm_prompt_tokens_total` / `ray_vllm_generation_tokens_total` — token throughput counters +- `ray_vllm_num_preemptions_total` — memory pressure signal +- `ray_vllm_prefix_cache_hits_total` / `ray_vllm_prefix_cache_queries_total` — cache efficiency Full reference: [vLLM Metrics Documentation](https://docs.vllm.ai/en/stable/design/metrics/) -### Ray Serve (`serve_*`) +### Ray Serve (`ray_serve_*`) -- `serve_num_http_requests` — request count by route, method, status -- `serve_http_request_latency_ms` — request latency histogram -- `serve_num_ongoing_http_requests` — in-flight requests -- `serve_deployment_processing_latency_ms` — per-replica processing time -- `serve_deployment_replica_health_check` — replica health status +- `ray_serve_num_http_requests_total` — request count by route, method, status +- `ray_serve_http_request_latency_ms` — request latency histogram +- `ray_serve_handle_request_counter_total` — request count by deployment +- 
`ray_serve_deployment_processing_latency_ms` — per-replica processing time +- `ray_serve_health_check_latency_ms` — health check latency histogram Full reference: [Ray Serve Monitoring](https://docs.ray.io/en/latest/serve/monitoring.html) diff --git a/yasha/infer/vllm/vllm_infer.py b/yasha/infer/vllm/vllm_infer.py index b4fd9dc..3278c3e 100644 --- a/yasha/infer/vllm/vllm_infer.py +++ b/yasha/infer/vllm/vllm_infer.py @@ -19,6 +19,7 @@ from yasha.infer.infer_config import DisconnectProxy, ModelUsecase, VllmEngineConfig, YashaModelConfig from yasha.infer.vllm.openai.serving_speech import OpenAIServingSpeech +from yasha.metrics import _ENABLED as _METRICS_ENABLED from yasha.openai.protocol import ( ChatCompletionRequest, ChatCompletionResponse, @@ -103,11 +104,18 @@ def __init__(self, model_config: YashaModelConfig): # GPU pinning is handled by CUDA_VISIBLE_DEVICES set in ray_actor_options runtime_env. # The GPU is always visible as cuda:0 inside the actor — no device_config override needed. 
+ stat_loggers: list | None = None + if _METRICS_ENABLED: + from vllm.v1.metrics.ray_wrappers import RayPrometheusStatLogger + + stat_loggers = [RayPrometheusStatLogger] + self.engine = AsyncLLM.from_vllm_config( vllm_config=vllm_config, usage_context=usage_context, enable_log_requests=engine_args.enable_log_requests, disable_log_stats=engine_args.disable_log_stats, + stat_loggers=stat_loggers, ) def __del__(self): diff --git a/yasha/metrics.py b/yasha/metrics.py index 04dbe11..7ae3a7f 100644 --- a/yasha/metrics.py +++ b/yasha/metrics.py @@ -8,7 +8,7 @@ import os -_ENABLED = os.environ.get("YASHA_METRICS", "false").lower() == "true" +_ENABLED = os.environ.get("YASHA_METRICS", "true").lower() == "true" # --------------------------------------------------------------------------- # No-op metric stubs (used when metrics are disabled) @@ -80,86 +80,86 @@ def _build_metrics(): return { # -- Gateway layer -- "request_total": Counter( - "yasha:request_total", + "yasha_request_total", description="Total inference requests by model and endpoint.", tag_keys=("model", "endpoint", "status"), # type: ignore[arg-type] ), "request_duration_seconds": Histogram( - "yasha:request_duration_seconds", + "yasha_request_duration_seconds", description="End-to-end request latency (gateway to response) in seconds.", boundaries=_REQUEST_LATENCY_BOUNDARIES, tag_keys=("model", "endpoint"), # type: ignore[arg-type] ), "request_errors_total": Counter( - "yasha:request_errors_total", + "yasha_request_errors_total", description="Total inference errors by model, endpoint, and error type.", tag_keys=("model", "endpoint", "error_type"), # type: ignore[arg-type] ), "request_in_progress": Gauge( - "yasha:request_in_progress", + "yasha_request_in_progress", description="Number of requests currently being processed per model.", tag_keys=("model", "endpoint"), # type: ignore[arg-type] ), "client_disconnects_total": Counter( - "yasha:client_disconnects_total", + "yasha_client_disconnects_total", 
description="Total client disconnects during inference.", tag_keys=("model", "endpoint"), # type: ignore[arg-type] ), "stream_chunks_total": Counter( - "yasha:stream_chunks_total", + "yasha_stream_chunks_total", description="Total streaming chunks emitted.", tag_keys=("model",), ), # -- Model deployment layer -- "model_load_duration_seconds": Histogram( - "yasha:model_load_duration_seconds", + "yasha_model_load_duration_seconds", description="Model initialization time in seconds.", boundaries=_MODEL_LOAD_BOUNDARIES, tag_keys=("model", "loader"), # type: ignore[arg-type] ), "model_load_failures_total": Counter( - "yasha:model_load_failures_total", + "yasha_model_load_failures_total", description="Total failed model deployments.", tag_keys=("model", "loader"), # type: ignore[arg-type] ), "models_loaded": Gauge( - "yasha:models_loaded", + "yasha_models_loaded", description="Number of models currently loaded.", ), # -- Inference timing -- "generation_duration_seconds": Histogram( - "yasha:generation_duration_seconds", + "yasha_generation_duration_seconds", description="Chat/text generation latency in seconds.", boundaries=_REQUEST_LATENCY_BOUNDARIES, tag_keys=("model",), ), "tts_generation_duration_seconds": Histogram( - "yasha:tts_generation_duration_seconds", + "yasha_tts_generation_duration_seconds", description="TTS inference latency in seconds.", boundaries=_REQUEST_LATENCY_BOUNDARIES, tag_keys=("model",), ), "image_generation_duration_seconds": Histogram( - "yasha:image_generation_duration_seconds", + "yasha_image_generation_duration_seconds", description="Image generation latency in seconds.", boundaries=_REQUEST_LATENCY_BOUNDARIES, tag_keys=("model",), ), "transcription_duration_seconds": Histogram( - "yasha:transcription_duration_seconds", + "yasha_transcription_duration_seconds", description="Speech-to-text latency in seconds.", boundaries=_REQUEST_LATENCY_BOUNDARIES, tag_keys=("model",), ), "embedding_duration_seconds": Histogram( - 
"yasha:embedding_duration_seconds", + "yasha_embedding_duration_seconds", description="Embedding inference latency in seconds.", boundaries=_REQUEST_LATENCY_BOUNDARIES, tag_keys=("model",), ), # -- Resource cleanup -- "resource_cleanup_errors_total": Counter( - "yasha:resource_cleanup_errors_total", + "yasha_resource_cleanup_errors_total", description="Errors during resource cleanup (engine shutdown, memory release).", tag_keys=("model", "component"), # type: ignore[arg-type] ),