diff --git a/docs/grafana-dashboard.json b/docs/grafana-dashboard.json index 6aba5d1..e4b9221 100644 --- a/docs/grafana-dashboard.json +++ b/docs/grafana-dashboard.json @@ -598,6 +598,189 @@ "fieldConfig": { "defaults": { "unit": "cps" } } + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 62 }, + "id": 106, + "title": "Alerts", + "type": "row" + }, + { + "title": "Error Rate %", + "type": "timeseries", + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 63 }, + "id": 60, + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "targets": [ + { + "expr": "sum(rate(ray_modelship_request_errors_total[5m])) / clamp_min(sum(rate(ray_modelship_request_total[5m])), 1) * 100", + "legendFormat": "error %" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { "value": 0, "color": "green" }, + { "value": 2, "color": "yellow" }, + { "value": 5, "color": "red" } + ] + }, + "custom": { + "thresholdsStyle": { "mode": "line+area" } + } + } + } + }, + { + "title": "KV Cache Usage", + "type": "timeseries", + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 63 }, + "id": 61, + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "targets": [ + { + "expr": "ray_vllm_kv_cache_usage_perc * 100", + "legendFormat": "{{ model_name }}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "min": 0, + "max": 100, + "thresholds": { + "mode": "absolute", + "steps": [ + { "value": 0, "color": "green" }, + { "value": 80, "color": "yellow" }, + { "value": 95, "color": "red" } + ] + }, + "custom": { + "thresholdsStyle": { "mode": "line+area" } + } + } + } + }, + { + "title": "Queue Depth", + "type": "timeseries", + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 63 }, + "id": 62, + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "targets": [ + { + "expr": "ray_vllm_num_requests_waiting", + "legendFormat": "waiting" + } + ], + "fieldConfig": { + 
"defaults": { + "thresholds": { + "mode": "absolute", + "steps": [ + { "value": 0, "color": "green" }, + { "value": 5, "color": "yellow" }, + { "value": 10, "color": "red" } + ] + }, + "custom": { + "thresholdsStyle": { "mode": "line+area" } + } + } + } + }, + { + "title": "TTFT P99", + "type": "timeseries", + "gridPos": { "h": 8, "w": 6, "x": 0, "y": 71 }, + "id": 63, + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(ray_vllm_time_to_first_token_seconds_bucket[5m])) by (le))", + "legendFormat": "p99" + } + ], + "fieldConfig": { + "defaults": { + "unit": "s", + "thresholds": { + "mode": "absolute", + "steps": [ + { "value": 0, "color": "green" }, + { "value": 3, "color": "yellow" }, + { "value": 5, "color": "red" } + ] + }, + "custom": { + "thresholdsStyle": { "mode": "line+area" } + } + } + } + }, + { + "title": "Client Disconnects", + "type": "timeseries", + "gridPos": { "h": 8, "w": 6, "x": 6, "y": 71 }, + "id": 64, + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "targets": [ + { + "expr": "sum(rate(ray_modelship_client_disconnects_total[5m]))", + "legendFormat": "disconnects/s" + } + ], + "fieldConfig": { + "defaults": { "unit": "reqps" } + } + }, + { + "title": "Preemptions", + "type": "timeseries", + "gridPos": { "h": 8, "w": 6, "x": 12, "y": 71 }, + "id": 65, + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "targets": [ + { + "expr": "sum(rate(ray_vllm_num_preemptions_total[5m]))", + "legendFormat": "preemptions/s" + } + ] + }, + { + "title": "GPU Memory Available", + "type": "timeseries", + "gridPos": { "h": 8, "w": 6, "x": 18, "y": 71 }, + "id": 66, + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "targets": [ + { + "expr": "ray_node_gram_available", + "legendFormat": "GPU {{ GpuIndex }}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "decmbytes", + "thresholds": { + "mode": "absolute", + "steps": [ + 
{ "value": 0, "color": "red" }, + { "value": 1024, "color": "yellow" }, + { "value": 4096, "color": "green" } + ] + }, + "custom": { + "thresholdsStyle": { "mode": "line+area" } + } + } + } } ], "schemaVersion": 39, diff --git a/docs/monitoring.md b/docs/monitoring.md index bcb798e..6147cb8 100644 --- a/docs/monitoring.md +++ b/docs/monitoring.md @@ -114,7 +114,7 @@ To import it: 2. Upload `grafana-dashboard.json` or paste its contents 3. Select your Prometheus datasource when prompted -The dashboard has 6 rows: +The dashboard has 7 rows: | Row | What it shows | Metric sources | |---|---|---| @@ -124,6 +124,55 @@ The dashboard has 6 rows: | **GPU & System** | GPU utilization, GPU memory, CPU, system memory | `ray_node_*` | | **Ray Serve** | Health check latency, request count, deployment processing latency, HTTP request latency | `ray_serve_*` | | **Operational** | Model load time, load failures, resource cleanup errors, streaming chunks/s | `ray_modelship_*` | +| **Alerts** | Error rate %, KV cache usage, queue depth, TTFT P99, client disconnects, preemptions, GPU memory | `ray_modelship_*`, `ray_vllm_*`, `ray_node_*` | + +## Alerting + +A standalone Prometheus alerting rules file is included at [`docs/prometheus-alerts.yml`](prometheus-alerts.yml). The Grafana dashboard also has a dedicated **Alerts** row with threshold lines on the key panels. + +### Importing Alert Rules + +Add the rules file to your Prometheus config: + +```yaml +rule_files: + - /path/to/prometheus-alerts.yml +``` + +Then reload Prometheus (`kill -HUP <pid>` or `POST /-/reload` if `--web.enable-lifecycle` is set). 
+ +### Alert Reference + +#### Critical (page-worthy) + +| Alert | Condition | For | Description | +|---|---|---|---| +| `ModelshipHighErrorRate` | Error rate > 5% of traffic | 5m | Significant portion of requests are failing | +| `ModelshipNoModelsLoaded` | `models_loaded` == 0 | 2m | Server is running but cannot serve requests | +| `ModelshipModelLoadFailure` | Any increase in `model_load_failures_total` | 0m | A model failed to initialize | +| `ModelshipKVCacheExhausted` | KV cache usage > 95% | 5m | Requests will queue or be preempted | + +#### Warning (investigate) + +| Alert | Condition | For | Description | +|---|---|---|---| +| `ModelshipHighP99Latency` | Gateway P99 > 30s | 5m | End-to-end latency is very high | +| `ModelshipHighQueueDepth` | Waiting requests > 10 | 5m | vLLM engine is falling behind | +| `ModelshipPreemptions` | Preemption rate > 0 | 5m | GPU memory pressure causing request eviction | +| `ModelshipClientDisconnects` | Disconnect rate > 1/min | 5m | Clients timing out or dropping connections | +| `ModelshipGPUMemoryPressure` | Available GPU memory < 1 GB | 5m | GPU is nearly out of memory | +| `ModelshipHighTTFT` | TTFT P99 > 5s | 5m | Users waiting too long for first token | + +### Tuning Thresholds + +All thresholds are starting points. Adjust based on your deployment: + +- **Error rate**: 5% is aggressive — if you run small models that occasionally OOM, raise to 10%. +- **P99 latency**: 30s works for chat completions with long outputs. For embeddings or TTS, consider lowering to 5-10s by adding per-endpoint rules. +- **Queue depth**: 10 assumes a single vLLM instance. Scale proportionally with replicas. +- **KV cache**: 95% is the danger zone. If you use prefix caching heavily, 90% may be more appropriate. +- **TTFT**: 5s is generous. For interactive chat, consider 2-3s. +- **GPU memory**: 1 GB threshold assumes you're not running anything else on the GPU. Raise if you have shared workloads. 
## Health Check diff --git a/docs/production-readiness.md b/docs/production-readiness.md index dd269a4..eb400d1 100644 --- a/docs/production-readiness.md +++ b/docs/production-readiness.md @@ -39,7 +39,7 @@ Future development priorities for making Modelship production-ready, organized b ### Alerting & Observability -- [ ] **Prometheus alerting rules** — error rate thresholds, latency P99 breaches, model load failures, GPU memory pressure, Ray actor crashes +- [x] **Prometheus alerting rules** — error rate thresholds, latency P99 breaches, model load failures, GPU memory pressure (see `docs/prometheus-alerts.yml`) - [ ] **SLO/SLI definitions** — define target availability and latency for each endpoint type - [x] **Structured logging (JSON)** — `MSHIP_LOG_FORMAT=json` for log aggregation (ELK/Loki/Splunk) - [x] **Request-ID correlation** — trace a request from gateway through Ray actor boundaries via `contextvars` diff --git a/docs/prometheus-alerts.yml b/docs/prometheus-alerts.yml new file mode 100644 index 0000000..04d200c --- /dev/null +++ b/docs/prometheus-alerts.yml @@ -0,0 +1,127 @@ +# Modelship Prometheus Alerting Rules +# +# Import into Prometheus: +# rule_files: +# - /path/to/prometheus-alerts.yml +# +# All thresholds are starting points — tune per deployment based on +# your model sizes, traffic patterns, and hardware. + +groups: + - name: modelship-critical + rules: + - alert: ModelshipHighErrorRate + expr: | + sum(rate(ray_modelship_request_errors_total[5m])) + / clamp_min(sum(rate(ray_modelship_request_total[5m])), 1) + > 0.05 + for: 5m + labels: + severity: critical + annotations: + summary: "High request error rate (> 5%)" + description: >- + Error rate is {{ $value | humanizePercentage }} of total traffic + over the last 5 minutes. 
+ + - alert: ModelshipNoModelsLoaded + expr: ray_modelship_models_loaded == 0 + for: 2m + labels: + severity: critical + annotations: + summary: "No models loaded" + description: >- + ray_modelship_models_loaded has been 0 for 2 minutes. + The server is running but cannot serve any requests. + + - alert: ModelshipModelLoadFailure + expr: increase(ray_modelship_model_load_failures_total[5m]) > 0 + for: 0m + labels: + severity: critical + annotations: + summary: "Model load failure detected" + description: >- + {{ $labels.model }} ({{ $labels.loader }}) failed to load. + + - alert: ModelshipKVCacheExhausted + expr: ray_vllm_kv_cache_usage_perc > 0.95 + for: 5m + labels: + severity: critical + annotations: + summary: "vLLM KV cache near exhaustion (> 95%)" + description: >- + KV cache usage is {{ $value | humanizePercentage }}. + Requests will queue or be preempted. + + - name: modelship-warning + rules: + - alert: ModelshipHighP99Latency + expr: | + histogram_quantile(0.99, + sum(rate(ray_modelship_request_duration_seconds_bucket[5m])) by (le) + ) > 30 + for: 5m + labels: + severity: warning + annotations: + summary: "Request latency P99 > 30s" + description: >- + P99 gateway latency is {{ $value | humanizeDuration }}. + + - alert: ModelshipHighQueueDepth + expr: ray_vllm_num_requests_waiting > 10 + for: 5m + labels: + severity: warning + annotations: + summary: "vLLM request queue depth > 10" + description: >- + {{ $value }} requests waiting in the vLLM queue. + + - alert: ModelshipPreemptions + expr: rate(ray_vllm_num_preemptions_total[5m]) > 0 + for: 5m + labels: + severity: warning + annotations: + summary: "vLLM preemptions occurring" + description: >- + Preemptions indicate GPU memory pressure — requests are being + evicted and recomputed. 
+ + - alert: ModelshipClientDisconnects + expr: sum(rate(ray_modelship_client_disconnects_total[5m])) > 0.0167 + for: 5m + labels: + severity: warning + annotations: + summary: "Client disconnects > 1/min" + description: >- + {{ $value | humanize }} disconnects/s — clients may be timing out. + + - alert: ModelshipGPUMemoryPressure + expr: ray_node_gram_available < 1024 + for: 5m + labels: + severity: warning + annotations: + summary: "GPU memory available < 1 GB" + description: >- + GPU {{ $labels.GpuIndex }} has only {{ $value | humanize }}MB free. + + - alert: ModelshipHighTTFT + expr: | + histogram_quantile(0.99, + sum(rate(ray_vllm_time_to_first_token_seconds_bucket[5m])) by (le) + ) > 5 + for: 5m + labels: + severity: warning + annotations: + summary: "Time to first token P99 > 5s" + description: >- + TTFT P99 is {{ $value | humanizeDuration }} — users are waiting + too long for the first token.