Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
183 changes: 183 additions & 0 deletions docs/grafana-dashboard.json
Original file line number Diff line number Diff line change
Expand Up @@ -598,6 +598,189 @@
"fieldConfig": {
"defaults": { "unit": "cps" }
}
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 62 },
"id": 106,
"title": "Alerts",
"type": "row"
},
{
"title": "Error Rate %",
"type": "timeseries",
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 63 },
"id": 60,
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"targets": [
{
"expr": "sum(rate(ray_modelship_request_errors_total[5m])) / clamp_min(sum(rate(ray_modelship_request_total[5m])), 1) * 100",
"legendFormat": "error %"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"min": 0,
"thresholds": {
"mode": "absolute",
"steps": [
{ "value": 0, "color": "green" },
{ "value": 2, "color": "yellow" },
{ "value": 5, "color": "red" }
]
},
"custom": {
"thresholdsStyle": { "mode": "line+area" }
}
}
}
},
{
"title": "KV Cache Usage",
"type": "timeseries",
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 63 },
"id": 61,
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"targets": [
{
"expr": "ray_vllm_kv_cache_usage_perc * 100",
"legendFormat": "{{ model_name }}"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"min": 0,
"max": 100,
"thresholds": {
"mode": "absolute",
"steps": [
{ "value": 0, "color": "green" },
{ "value": 80, "color": "yellow" },
{ "value": 95, "color": "red" }
]
},
"custom": {
"thresholdsStyle": { "mode": "line+area" }
}
}
}
},
{
"title": "Queue Depth",
"type": "timeseries",
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 63 },
"id": 62,
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"targets": [
{
"expr": "ray_vllm_num_requests_waiting",
"legendFormat": "waiting"
}
],
"fieldConfig": {
"defaults": {
"thresholds": {
"mode": "absolute",
"steps": [
{ "value": 0, "color": "green" },
{ "value": 5, "color": "yellow" },
{ "value": 10, "color": "red" }
]
},
"custom": {
"thresholdsStyle": { "mode": "line+area" }
}
}
}
},
{
"title": "TTFT P99",
"type": "timeseries",
"gridPos": { "h": 8, "w": 6, "x": 0, "y": 71 },
"id": 63,
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"targets": [
{
"expr": "histogram_quantile(0.99, sum(rate(ray_vllm_time_to_first_token_seconds_bucket[5m])) by (le))",
"legendFormat": "p99"
}
],
"fieldConfig": {
"defaults": {
"unit": "s",
"thresholds": {
"mode": "absolute",
"steps": [
{ "value": 0, "color": "green" },
{ "value": 3, "color": "yellow" },
{ "value": 5, "color": "red" }
]
},
"custom": {
"thresholdsStyle": { "mode": "line+area" }
}
}
}
},
{
"title": "Client Disconnects",
"type": "timeseries",
"gridPos": { "h": 8, "w": 6, "x": 6, "y": 71 },
"id": 64,
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"targets": [
{
"expr": "sum(rate(ray_modelship_client_disconnects_total[5m]))",
"legendFormat": "disconnects/s"
}
],
"fieldConfig": {
"defaults": { "unit": "reqps" }
}
},
{
"title": "Preemptions",
"type": "timeseries",
"gridPos": { "h": 8, "w": 6, "x": 12, "y": 71 },
"id": 65,
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"targets": [
{
"expr": "sum(rate(ray_vllm_num_preemptions_total[5m]))",
"legendFormat": "preemptions/s"
}
]
},
{
"title": "GPU Memory Available",
"type": "timeseries",
"gridPos": { "h": 8, "w": 6, "x": 18, "y": 71 },
"id": 66,
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"targets": [
{
"expr": "ray_node_gram_available",
"legendFormat": "GPU {{ GpuIndex }}"
}
],
"fieldConfig": {
"defaults": {
"unit": "decmbytes",
"thresholds": {
"mode": "absolute",
"steps": [
{ "value": 0, "color": "red" },
{ "value": 1024, "color": "yellow" },
{ "value": 4096, "color": "green" }
]
},
"custom": {
"thresholdsStyle": { "mode": "line+area" }
}
}
}
}
],
"schemaVersion": 39,
Expand Down
51 changes: 50 additions & 1 deletion docs/monitoring.md
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ To import it:
2. Upload `grafana-dashboard.json` or paste its contents
3. Select your Prometheus datasource when prompted

The dashboard has 6 rows:
The dashboard has 7 rows:

| Row | What it shows | Metric sources |
|---|---|---|
Expand All @@ -124,6 +124,55 @@ The dashboard has 6 rows:
| **GPU & System** | GPU utilization, GPU memory, CPU, system memory | `ray_node_*` |
| **Ray Serve** | Health check latency, request count, deployment processing latency, HTTP request latency | `ray_serve_*` |
| **Operational** | Model load time, load failures, resource cleanup errors, streaming chunks/s | `ray_modelship_*` |
| **Alerts** | Error rate %, KV cache usage, queue depth, TTFT P99, client disconnects, preemptions, GPU memory | `ray_modelship_*`, `ray_vllm_*`, `ray_node_*` |

## Alerting

A standalone Prometheus alerting rules file is included at [`docs/prometheus-alerts.yml`](prometheus-alerts.yml). The Grafana dashboard also has a dedicated **Alerts** row with threshold lines on the key panels.

### Importing Alert Rules

Add the rules file to your Prometheus config:

```yaml
rule_files:
- /path/to/prometheus-alerts.yml
```

Then reload Prometheus (`kill -HUP <pid>` or `POST /-/reload` if `--web.enable-lifecycle` is set).

### Alert Reference

#### Critical (page-worthy)

| Alert | Condition | For | Description |
|---|---|---|---|
| `ModelshipHighErrorRate` | Error rate > 5% of traffic | 5m | Significant portion of requests are failing |
| `ModelshipNoModelsLoaded` | `models_loaded` == 0 | 2m | Server is running but cannot serve requests |
| `ModelshipModelLoadFailure` | Any increase in `model_load_failures_total` | 0m | A model failed to initialize |
| `ModelshipKVCacheExhausted` | KV cache usage > 95% | 5m | Requests will queue or be preempted |

#### Warning (investigate)

| Alert | Condition | For | Description |
|---|---|---|---|
| `ModelshipHighP99Latency` | Gateway P99 > 30s | 5m | End-to-end latency is very high |
| `ModelshipHighQueueDepth` | Waiting requests > 10 | 5m | vLLM engine is falling behind |
| `ModelshipPreemptions` | Preemption rate > 0 | 5m | GPU memory pressure causing request eviction |
| `ModelshipClientDisconnects` | Disconnect rate > 1/min | 5m | Clients timing out or dropping connections |
| `ModelshipGPUMemoryPressure` | Available GPU memory < 1 GB | 5m | GPU is nearly out of memory |
| `ModelshipHighTTFT` | TTFT P99 > 5s | 5m | Users waiting too long for first token |

### Tuning Thresholds

All thresholds are starting points. Adjust based on your deployment:

- **Error rate**: 5% is aggressive — if you run small models that occasionally OOM, raise to 10%. Note that the expression clamps the denominator at 1 req/s to avoid division by zero, so at very low traffic (< 1 req/s) the computed ratio under-reports the true error rate; factor that in when tuning.
- **P99 latency**: 30s works for chat completions with long outputs. For embeddings or TTS, consider lowering to 5-10s by adding per-endpoint rules.
- **Queue depth**: 10 assumes a single vLLM instance. Scale proportionally with replicas.
- **KV cache**: 95% is the danger zone. If you use prefix caching heavily, 90% may be more appropriate.
- **TTFT**: 5s is generous. For interactive chat, consider 2-3s.
- **GPU memory**: 1 GB threshold assumes you're not running anything else on the GPU. Raise if you have shared workloads.

## Health Check

Expand Down
2 changes: 1 addition & 1 deletion docs/production-readiness.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ Future development priorities for making Modelship production-ready, organized b

### Alerting & Observability

- [ ] **Prometheus alerting rules** — error rate thresholds, latency P99 breaches, model load failures, GPU memory pressure, Ray actor crashes
- [x] **Prometheus alerting rules** — error rate thresholds, latency P99 breaches, model load failures, GPU memory pressure (see `docs/prometheus-alerts.yml`)
- [ ] **SLO/SLI definitions** — define target availability and latency for each endpoint type
- [x] **Structured logging (JSON)** — `MSHIP_LOG_FORMAT=json` for log aggregation (ELK/Loki/Splunk)
- [x] **Request-ID correlation** — trace a request from gateway through Ray actor boundaries via `contextvars`
Expand Down
127 changes: 127 additions & 0 deletions docs/prometheus-alerts.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
# Modelship Prometheus Alerting Rules
#
# Import into Prometheus:
# rule_files:
# - /path/to/prometheus-alerts.yml
#
# All thresholds are starting points — tune per deployment based on
# your model sizes, traffic patterns, and hardware.

groups:
  # -------------------------------------------------------------------
  # Critical: page-worthy — the service cannot (or soon cannot) serve
  # requests correctly.
  # -------------------------------------------------------------------
  - name: modelship-critical
    rules:
      # Failed/total request ratio over a 5m window, alerting above 5%.
      # clamp_min pins the denominator at 1 req/s: it prevents
      # divide-by-zero, but it also UNDER-reports the ratio whenever
      # total traffic is below 1 req/s (e.g. 0.04 req/s of pure
      # failures reads as 4% and will not fire).
      # NOTE(review): acceptable for busy deployments; revisit the
      # clamp floor for low-traffic ones.
      - alert: ModelshipHighErrorRate
        expr: |
          sum(rate(ray_modelship_request_errors_total[5m]))
          / clamp_min(sum(rate(ray_modelship_request_total[5m])), 1)
          > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High request error rate (> 5%)"
          description: >-
            Error rate is {{ $value | humanizePercentage }} of total traffic
            over the last 5 minutes.

      # Server process is up but has zero models available to serve.
      # NOTE(review): if the exporter itself is down this series is
      # absent and `== 0` never matches — a companion
      # absent(ray_modelship_models_loaded) rule would close that gap.
      - alert: ModelshipNoModelsLoaded
        expr: ray_modelship_models_loaded == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "No models loaded"
          description: >-
            ray_modelship_models_loaded has been 0 for 2 minutes.
            The server is running but cannot serve any requests.

      # Any load failure in the last 5m fires immediately (for: 0m).
      # increase() over counters tolerates restarts/resets, unlike a
      # raw delta on the counter value.
      - alert: ModelshipModelLoadFailure
        expr: increase(ray_modelship_model_load_failures_total[5m]) > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: "Model load failure detected"
          description: >-
            {{ $labels.model }} ({{ $labels.loader }}) failed to load.

      # Compared against 0.95, and humanizePercentage is applied to the
      # raw value, so this metric is treated as a 0-1 fraction despite
      # the "_perc" suffix — presumably matching the vLLM exporter;
      # TODO(review): confirm the metric really is 0-1, not 0-100.
      - alert: ModelshipKVCacheExhausted
        expr: ray_vllm_kv_cache_usage_perc > 0.95
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "vLLM KV cache near exhaustion (> 95%)"
          description: >-
            KV cache usage is {{ $value | humanizePercentage }}.
            Requests will queue or be preempted.

  # -------------------------------------------------------------------
  # Warning: investigate during working hours — degradation signals
  # that precede user-visible failures.
  # -------------------------------------------------------------------
  - name: modelship-warning
    rules:
      # End-to-end gateway latency, P99 over 5m buckets.
      - alert: ModelshipHighP99Latency
        expr: |
          histogram_quantile(0.99,
            sum(rate(ray_modelship_request_duration_seconds_bucket[5m])) by (le)
          ) > 30
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Request latency P99 > 30s"
          description: >-
            P99 gateway latency is {{ $value | humanizeDuration }}.

      # Gauge of requests queued inside the vLLM engine. No sum(), so
      # this fires once per exported series (i.e. per engine/replica).
      - alert: ModelshipHighQueueDepth
        expr: ray_vllm_num_requests_waiting > 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "vLLM request queue depth > 10"
          description: >-
            {{ $value }} requests waiting in the vLLM queue.

      # Any sustained nonzero preemption rate is a GPU-memory-pressure
      # signal. NOTE(review): unlike ClientDisconnects below, this is
      # not wrapped in sum(), so it alerts per series — deliberate if
      # per-replica label context is wanted; otherwise inconsistent.
      - alert: ModelshipPreemptions
        expr: rate(ray_vllm_num_preemptions_total[5m]) > 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "vLLM preemptions occurring"
          description: >-
            Preemptions indicate GPU memory pressure — requests are being
            evicted and recomputed.

      # 0.0167/s ≈ 1/60 → roughly one disconnect per minute.
      - alert: ModelshipClientDisconnects
        expr: sum(rate(ray_modelship_client_disconnects_total[5m])) > 0.0167
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Client disconnects > 1/min"
          description: >-
            {{ $value | humanize }} disconnects/s — clients may be timing out.

      # Threshold 1024 and the "MB" suffix in the description assume
      # ray_node_gram_available is reported in megabytes.
      # TODO(review): confirm the unit against the Ray exporter.
      - alert: ModelshipGPUMemoryPressure
        expr: ray_node_gram_available < 1024
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "GPU memory available < 1 GB"
          description: >-
            GPU {{ $labels.GpuIndex }} has only {{ $value | humanize }}MB free.

      # Time-to-first-token P99 — the user-perceived "dead air" before
      # streaming starts. 5s threshold matches the dashboard's red line.
      - alert: ModelshipHighTTFT
        expr: |
          histogram_quantile(0.99,
            sum(rate(ray_vllm_time_to_first_token_seconds_bucket[5m])) by (le)
          ) > 5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Time to first token P99 > 5s"
          description: >-
            TTFT P99 is {{ $value | humanizeDuration }} — users are waiting
            too long for the first token.
Loading