Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
183 changes: 183 additions & 0 deletions docs/grafana-dashboard.json
Original file line number Diff line number Diff line change
Expand Up @@ -598,6 +598,189 @@
"fieldConfig": {
"defaults": { "unit": "cps" }
}
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 62 },
"id": 106,
"title": "Alerts",
"type": "row"
},
{
"title": "Error Rate %",
"type": "timeseries",
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 63 },
"id": 60,
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"targets": [
{
"expr": "sum(rate(ray_modelship_request_errors_total[5m])) / clamp_min(sum(rate(ray_modelship_request_total[5m])), 1) * 100",
"legendFormat": "error %"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{ "value": 0, "color": "green" },
{ "value": 2, "color": "yellow" },
{ "value": 5, "color": "red" }
]
},
"custom": {
"thresholdsStyle": { "mode": "line+area" }
}
}
}
},
{
"title": "KV Cache Usage",
"type": "timeseries",
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 63 },
"id": 61,
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"targets": [
{
"expr": "ray_vllm_kv_cache_usage_perc * 100",
"legendFormat": "{{ model_name }}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"min": 0,
"max": 100,
"thresholds": {
"mode": "absolute",
"steps": [
{ "value": 0, "color": "green" },
{ "value": 80, "color": "yellow" },
{ "value": 95, "color": "red" }
]
},
"custom": {
"thresholdsStyle": { "mode": "line+area" }
}
}
}
},
{
"title": "Queue Depth",
"type": "timeseries",
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 63 },
"id": 62,
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"targets": [
{
"expr": "ray_vllm_num_requests_waiting",
"legendFormat": "waiting"
}
],
"fieldConfig": {
"defaults": {
"thresholds": {
"mode": "absolute",
"steps": [
{ "value": 0, "color": "green" },
{ "value": 5, "color": "yellow" },
{ "value": 10, "color": "red" }
]
},
"custom": {
"thresholdsStyle": { "mode": "line+area" }
}
}
}
},
{
"title": "TTFT P99",
"type": "timeseries",
"gridPos": { "h": 8, "w": 6, "x": 0, "y": 71 },
"id": 63,
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"targets": [
{
"expr": "histogram_quantile(0.99, sum(rate(ray_vllm_time_to_first_token_seconds_bucket[5m])) by (le))",
"legendFormat": "p99"
}
],
"fieldConfig": {
"defaults": {
"unit": "s",
"thresholds": {
"mode": "absolute",
"steps": [
{ "value": 0, "color": "green" },
{ "value": 3, "color": "yellow" },
{ "value": 5, "color": "red" }
]
},
"custom": {
"thresholdsStyle": { "mode": "line+area" }
}
}
}
},
{
"title": "Client Disconnects",
"type": "timeseries",
"gridPos": { "h": 8, "w": 6, "x": 6, "y": 71 },
"id": 64,
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"targets": [
{
"expr": "sum(rate(ray_modelship_client_disconnects_total[5m]))",
"legendFormat": "disconnects/s"
}
],
"fieldConfig": {
"defaults": { "unit": "reqps" }
}
},
{
"title": "Preemptions",
"type": "timeseries",
"gridPos": { "h": 8, "w": 6, "x": 12, "y": 71 },
"id": 65,
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"targets": [
{
"expr": "sum(rate(ray_vllm_num_preemptions_total[5m]))",
"legendFormat": "preemptions/s"
}
]
},
{
"title": "GPU Memory Available",
"type": "timeseries",
"gridPos": { "h": 8, "w": 6, "x": 18, "y": 71 },
"id": 66,
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"targets": [
{
"expr": "ray_node_gram_available",
"legendFormat": "GPU {{ GpuIndex }}"
}
],
"fieldConfig": {
"defaults": {
"unit": "decmbytes",
"thresholds": {
"mode": "absolute",
"steps": [
{ "value": 0, "color": "red" },
{ "value": 1024, "color": "yellow" },
{ "value": 4096, "color": "green" }
]
},
"custom": {
"thresholdsStyle": { "mode": "line+area" }
}
}
}
}
],
"schemaVersion": 39,
Expand Down
51 changes: 50 additions & 1 deletion docs/monitoring.md
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ To import it:
2. Upload `grafana-dashboard.json` or paste its contents
3. Select your Prometheus datasource when prompted

The dashboard has 6 rows:
The dashboard has 7 rows:

| Row | What it shows | Metric sources |
|---|---|---|
Expand All @@ -124,6 +124,55 @@ The dashboard has 6 rows:
| **GPU & System** | GPU utilization, GPU memory, CPU, system memory | `ray_node_*` |
| **Ray Serve** | Health check latency, request count, deployment processing latency, HTTP request latency | `ray_serve_*` |
| **Operational** | Model load time, load failures, resource cleanup errors, streaming chunks/s | `ray_modelship_*` |
| **Alerts** | Error rate %, KV cache usage, queue depth, TTFT P99, client disconnects, preemptions, GPU memory | `ray_modelship_*`, `ray_vllm_*`, `ray_node_*` |

## Alerting

A standalone Prometheus alerting rules file is included at [`docs/prometheus-alerts.yml`](prometheus-alerts.yml). The Grafana dashboard also has a dedicated **Alerts** row with threshold lines on the key panels.

### Importing Alert Rules

Add the rules file to your Prometheus config:

```yaml
rule_files:
- /path/to/prometheus-alerts.yml
```

Then reload Prometheus (`kill -HUP <pid>` or `POST /-/reload` if `--web.enable-lifecycle` is set).

### Alert Reference

#### Critical (page-worthy)

| Alert | Condition | For | Description |
|---|---|---|---|
| `ModelshipHighErrorRate` | Error rate > 5% of traffic | 5m | Significant portion of requests are failing |
| `ModelshipNoModelsLoaded` | `models_loaded` == 0 | 2m | Server is running but cannot serve requests |
| `ModelshipModelLoadFailure` | Any increase in `model_load_failures_total` | 0m | A model failed to initialize |
| `ModelshipKVCacheExhausted` | KV cache usage > 95% | 5m | Requests will queue or be preempted |

#### Warning (investigate)

| Alert | Condition | For | Description |
|---|---|---|---|
| `ModelshipHighP99Latency` | Gateway P99 > 30s | 5m | End-to-end latency is very high |
| `ModelshipHighQueueDepth` | Waiting requests > 10 | 5m | vLLM engine is falling behind |
| `ModelshipPreemptions` | Preemption rate > 0 | 5m | GPU memory pressure causing request eviction |
| `ModelshipClientDisconnects` | Disconnect rate > 1/min | 5m | Clients timing out or dropping connections |
| `ModelshipGPUMemoryPressure` | Available GPU memory < 1 GB | 5m | GPU is nearly out of memory |
| `ModelshipHighTTFT` | TTFT P99 > 5s | 5m | Users waiting too long for first token |

### Tuning Thresholds

All thresholds are starting points. Adjust based on your deployment:

- **Error rate**: 5% is aggressive — if you run small models that occasionally OOM, raise to 10%. Note that the expression clamps the denominator at 1 req/s to avoid division by zero, so at very low traffic (< 1 req/s) the computed ratio under-reports the true error rate; factor that in when tuning.
- **P99 latency**: 30s works for chat completions with long outputs. For embeddings or TTS, consider lowering to 5-10s by adding per-endpoint rules.
- **Queue depth**: 10 assumes a single vLLM instance. Scale proportionally with replicas.
- **KV cache**: 95% is the danger zone. If you use prefix caching heavily, 90% may be more appropriate.
- **TTFT**: 5s is generous. For interactive chat, consider 2-3s.
- **GPU memory**: 1 GB threshold assumes you're not running anything else on the GPU. Raise if you have shared workloads.

## Health Check

Expand Down
2 changes: 1 addition & 1 deletion docs/production-readiness.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ Future development priorities for making Modelship production-ready, organized b

### Alerting & Observability

- [ ] **Prometheus alerting rules** — error rate thresholds, latency P99 breaches, model load failures, GPU memory pressure, Ray actor crashes
- [x] **Prometheus alerting rules** — error rate thresholds, latency P99 breaches, model load failures, GPU memory pressure (see `docs/prometheus-alerts.yml`)
- [ ] **SLO/SLI definitions** — define target availability and latency for each endpoint type
- [x] **Structured logging (JSON)** — `MSHIP_LOG_FORMAT=json` for log aggregation (ELK/Loki/Splunk)
- [x] **Request-ID correlation** — trace a request from gateway through Ray actor boundaries via `contextvars`
Expand Down
127 changes: 127 additions & 0 deletions docs/prometheus-alerts.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
# Modelship Prometheus Alerting Rules
#
# Import into Prometheus:
# rule_files:
# - /path/to/prometheus-alerts.yml
#
# All thresholds are starting points — tune per deployment based on
# your model sizes, traffic patterns, and hardware.

groups:
  # -------------------------------------------------------------------
  # Critical: page-worthy — the service cannot (or soon cannot) serve
  # requests correctly.
  # -------------------------------------------------------------------
  - name: modelship-critical
    rules:
      # Failed/total request ratio over a 5m window, alerting above 5%.
      # clamp_min pins the denominator at 1 req/s: it prevents
      # divide-by-zero, but it also UNDER-reports the ratio whenever
      # total traffic is below 1 req/s (e.g. 0.04 req/s of pure
      # failures reads as 4% and will not fire).
      # NOTE(review): acceptable for busy deployments; revisit the
      # clamp floor for low-traffic ones.
      - alert: ModelshipHighErrorRate
        expr: |
          sum(rate(ray_modelship_request_errors_total[5m]))
          / clamp_min(sum(rate(ray_modelship_request_total[5m])), 1)
          > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High request error rate (> 5%)"
          description: >-
            Error rate is {{ $value | humanizePercentage }} of total traffic
            over the last 5 minutes.

      # Server process is up but has zero models available to serve.
      # NOTE(review): if the exporter itself is down this series is
      # absent and `== 0` never matches — a companion
      # absent(ray_modelship_models_loaded) rule would close that gap.
      - alert: ModelshipNoModelsLoaded
        expr: ray_modelship_models_loaded == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "No models loaded"
          description: >-
            ray_modelship_models_loaded has been 0 for 2 minutes.
            The server is running but cannot serve any requests.

      # Any load failure in the last 5m fires immediately (for: 0m).
      # increase() over counters tolerates restarts/resets, unlike a
      # raw delta on the counter value.
      - alert: ModelshipModelLoadFailure
        expr: increase(ray_modelship_model_load_failures_total[5m]) > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: "Model load failure detected"
          description: >-
            {{ $labels.model }} ({{ $labels.loader }}) failed to load.

      # Compared against 0.95, and humanizePercentage is applied to the
      # raw value, so this metric is treated as a 0-1 fraction despite
      # the "_perc" suffix — presumably matching the vLLM exporter;
      # TODO(review): confirm the metric really is 0-1, not 0-100.
      - alert: ModelshipKVCacheExhausted
        expr: ray_vllm_kv_cache_usage_perc > 0.95
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "vLLM KV cache near exhaustion (> 95%)"
          description: >-
            KV cache usage is {{ $value | humanizePercentage }}.
            Requests will queue or be preempted.

  # -------------------------------------------------------------------
  # Warning: investigate during working hours — degradation signals
  # that precede user-visible failures.
  # -------------------------------------------------------------------
  - name: modelship-warning
    rules:
      # End-to-end gateway latency, P99 over 5m buckets.
      - alert: ModelshipHighP99Latency
        expr: |
          histogram_quantile(0.99,
            sum(rate(ray_modelship_request_duration_seconds_bucket[5m])) by (le)
          ) > 30
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Request latency P99 > 30s"
          description: >-
            P99 gateway latency is {{ $value | humanizeDuration }}.

      # Gauge of requests queued inside the vLLM engine. No sum(), so
      # this fires once per exported series (i.e. per engine/replica).
      - alert: ModelshipHighQueueDepth
        expr: ray_vllm_num_requests_waiting > 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "vLLM request queue depth > 10"
          description: >-
            {{ $value }} requests waiting in the vLLM queue.

      # Any sustained nonzero preemption rate is a GPU-memory-pressure
      # signal. NOTE(review): unlike ClientDisconnects below, this is
      # not wrapped in sum(), so it alerts per series — deliberate if
      # per-replica label context is wanted; otherwise inconsistent.
      - alert: ModelshipPreemptions
        expr: rate(ray_vllm_num_preemptions_total[5m]) > 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "vLLM preemptions occurring"
          description: >-
            Preemptions indicate GPU memory pressure — requests are being
            evicted and recomputed.

      # 0.0167/s ≈ 1/60 → roughly one disconnect per minute.
      - alert: ModelshipClientDisconnects
        expr: sum(rate(ray_modelship_client_disconnects_total[5m])) > 0.0167
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Client disconnects > 1/min"
          description: >-
            {{ $value | humanize }} disconnects/s — clients may be timing out.

      # Threshold 1024 and the "MB" suffix in the description assume
      # ray_node_gram_available is reported in megabytes.
      # TODO(review): confirm the unit against the Ray exporter.
      - alert: ModelshipGPUMemoryPressure
        expr: ray_node_gram_available < 1024
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "GPU memory available < 1 GB"
          description: >-
            GPU {{ $labels.GpuIndex }} has only {{ $value | humanize }}MB free.

      # Time-to-first-token P99 — the user-perceived "dead air" before
      # streaming starts. 5s threshold matches the dashboard's red line.
      - alert: ModelshipHighTTFT
        expr: |
          histogram_quantile(0.99,
            sum(rate(ray_vllm_time_to_first_token_seconds_bucket[5m])) by (le)
          ) > 5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Time to first token P99 > 5s"
          description: >-
            TTFT P99 is {{ $value | humanizeDuration }} — users are waiting
            too long for the first token.
Loading