observability: Concurrent Users list + Synapse panel polish #4817
Merged
Conversation
- Split the Concurrent Users stat into two stacked panels: the existing big-number stat (with sparkline) on top, and a headerless list of the Matrix user IDs in the rolling 5-minute window below. Each user is a data link to the Users dashboard with `var-matrix_user_id` pre-set, so clicking a name lands on that user's credit/permission view.
- Retarget both Concurrent Users panel links from the Synapse dashboard to the Users dashboard.
- Stretch the Synapse process bar gauge (CPU / Mem) from h:4 to h:7 so it matches the neighbouring Postgres DB stat and closes the gap below it.
- Shrink the Synapse process bar gauge title / value font sizes (~25% smaller than auto) so they sit more proportionally next to the rest of the bottom row.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Contributor
Observability diff (vs staging)
diff --git a/tmp/remote-canon.O5Zcn7/dashboards/boxel-status/overview.json b/tmp/committed-canon.Bmx9aC/dashboards/boxel-status/overview.json
index 1802adb..de8c574 100644
--- a/tmp/remote-canon.O5Zcn7/dashboards/boxel-status/overview.json
+++ b/tmp/committed-canon.Bmx9aC/dashboards/boxel-status/overview.json
@@ -247,8 +247,8 @@
"links": [
{
"targetBlank": false,
- "title": "Open Synapse dashboard",
- "url": "/d/000000012"
+ "title": "Open Users dashboard",
+ "url": "/d/boxelusers0001?${__url_time_range}"
}
],
"mappings": [],
@@ -285,7 +285,7 @@
]
},
"gridPos": {
- "h": 8,
+ "h": 5,
"w": 8,
"x": 16,
"y": 0
@@ -295,9 +295,9 @@
{
"icon": "external link",
"targetBlank": false,
- "title": "Open Synapse dashboard",
+ "title": "Open Users dashboard",
"type": "link",
- "url": "/d/000000012"
+ "url": "/d/boxelusers0001?${__url_time_range}"
}
],
"options": {
@@ -322,7 +322,7 @@
"type": "loki",
"uid": "loki"
},
- "expr": "count(count by (user) (count_over_time({service=\"synapse\"} |~ \"Processed request.*simplified_msc3575/sync\" | regexp `\\{(?P<user>@[^}]+:[^}]+)\\}` [5m]))) or vector(0)",
+ "expr": "count(count by (user) (count_over_time({service=\"synapse\"} |= \"Processed request\" |= \"simplified_msc3575/sync\" | regexp `\\{(?P<user>@[^}]+:[^}]+)\\}` [5m]))) or vector(0)",
"legendFormat": "Concurrent Users",
"queryType": "range",
"refId": "A"
@@ -331,6 +331,125 @@
"title": "Concurrent Users",
"type": "stat"
},
+ {
+ "datasource": {
+ "type": "loki",
+ "uid": "loki"
+ },
+ "description": "Matrix user IDs seen on the Synapse Sliding Sync endpoint within the last 5 minutes. Click a user to open the Users dashboard focused on them.",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "thresholds"
+ },
+ "custom": {
+ "align": "left",
+ "cellOptions": {
+ "type": "auto"
+ },
+ "filterable": true,
+ "inspect": false,
+ "minWidth": 120
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "transparent"
+ }
+ ]
+ }
+ },
+ "overrides": [
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "user"
+ },
+ "properties": [
+ {
+ "id": "displayName",
+ "value": "Matrix User"
+ },
+ {
+ "id": "color",
+ "value": {
+ "fixedColor": "#ff66c4",
+ "mode": "fixed"
+ }
+ },
+ {
+ "id": "links",
+ "value": [
+ {
+ "targetBlank": false,
+ "title": "Open in Users dashboard",
+ "url": "/d/boxelusers0001?${__url_time_range}&var-matrix_user_id=${__value.text:percentencode}"
+ }
+ ]
+ }
+ ]
+ }
+ ]
+ },
+ "gridPos": {
+ "h": 3,
+ "w": 8,
+ "x": 16,
+ "y": 5
+ },
+ "id": 31,
+ "options": {
+ "cellHeight": "sm",
+ "footer": {
+ "countRows": false,
+ "fields": "",
+ "reducer": [
+ "sum"
+ ],
+ "show": false
+ },
+ "showHeader": false,
+ "sortBy": [
+ {
+ "desc": false,
+ "displayName": "Matrix User"
+ }
+ ]
+ },
+ "pluginVersion": "12.4.3",
+ "targets": [
+ {
+ "datasource": {
+ "type": "loki",
+ "uid": "loki"
+ },
+ "expr": "sum by (user) (count_over_time({service=\"synapse\"} |= \"Processed request\" |= \"simplified_msc3575/sync\" | regexp `\\{(?P<user>@[^}]+:[^}]+)\\}` [5m]))",
+ "legendFormat": "{{user}}",
+ "queryType": "instant",
+ "refId": "A"
+ }
+ ],
+ "title": "",
+ "transformations": [
+ {
+ "id": "labelsToFields",
+ "options": {
+ "mode": "columns"
+ }
+ },
+ {
+ "id": "filterFieldsByName",
+ "options": {
+ "include": {
+ "pattern": "^user$"
+ }
+ }
+ }
+ ],
+ "type": "table"
+ },
{
"datasource": {
"type": "loki",
@@ -1001,6 +1120,128 @@
"title": "Realm Server",
"type": "bargauge"
},
+ {
+ "datasource": {
+ "type": "cloudwatch",
+ "uid": "cef5x9o3yzawwf"
+ },
+ "description": "ECS task count for `boxel-realm-server-${env}` — `Run` is RunningTaskCount, `Need` is DesiredTaskCount (both from ECS/ContainerInsights, Maximum over the panel refresh window). Run < Need means the service is under-scaled or tasks are crashing.",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "thresholds"
+ },
+ "decimals": 2,
+ "mappings": [
+ {
+ "options": {
+ "pattern": "^(\\d+)\\.0*(\\d+)$",
+ "result": {
+ "text": "$1 / $2"
+ }
+ },
+ "type": "regex"
+ }
+ ],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "red"
+ },
+ {
+ "color": "green",
+ "value": 1
+ }
+ ]
+ },
+ "unit": "none"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 3,
+ "w": 4,
+ "x": 0,
+ "y": 20
+ },
+ "id": 23,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "none",
+ "justifyMode": "center",
+ "orientation": "horizontal",
+ "reduceOptions": {
+ "calcs": [
+ "lastNotNull"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "textMode": "value",
+ "wideLayout": true
+ },
+ "pluginVersion": "12.4.3",
+ "targets": [
+ {
+ "datasource": {
+ "type": "cloudwatch",
+ "uid": "cef5x9o3yzawwf"
+ },
+ "dimensions": {
+ "ClusterName": "${env}",
+ "ServiceName": "boxel-realm-server-${env}"
+ },
+ "hide": true,
+ "metricEditorMode": 0,
+ "metricName": "RunningTaskCount",
+ "metricQueryType": 0,
+ "namespace": "ECS/ContainerInsights",
+ "period": "",
+ "refId": "A",
+ "region": "default",
+ "statistic": "Maximum"
+ },
+ {
+ "datasource": {
+ "type": "cloudwatch",
+ "uid": "cef5x9o3yzawwf"
+ },
+ "dimensions": {
+ "ClusterName": "${env}",
+ "ServiceName": "boxel-realm-server-${env}"
+ },
+ "hide": true,
+ "metricEditorMode": 0,
+ "metricName": "DesiredTaskCount",
+ "metricQueryType": 0,
+ "namespace": "ECS/ContainerInsights",
+ "period": "",
+ "refId": "B",
+ "region": "default",
+ "statistic": "Maximum"
+ },
+ {
+ "datasource": {
+ "type": "cloudwatch",
+ "uid": "cef5x9o3yzawwf"
+ },
+ "dimensions": {},
+ "expression": "A + B/100",
+ "id": "e1",
+ "metricEditorMode": 1,
+ "metricName": "",
+ "metricQueryType": 0,
+ "namespace": "",
+ "period": "",
+ "refId": "e1",
+ "region": "default",
+ "statistic": ""
+ }
+ ],
+ "title": "Tasks",
+ "type": "stat"
+ },
{
"datasource": {
"type": "cloudwatch",
@@ -1184,125 +1425,51 @@
"type": "cloudwatch",
"uid": "cef5x9o3yzawwf"
},
- "description": "ECS CPU and memory utilisation for `boxel-prerender-manager-${env}` on cluster `${env}` — averages over the panel refresh window. Task counts (Run / Need) appear in the stat row below this bargauge. Locally these show 'No data' — CloudWatch is staging/production only.",
+ "description": "ECS task count for `boxel-prerender-server-${env}` — `Run` is RunningTaskCount, `Need` is DesiredTaskCount (both from ECS/ContainerInsights, Maximum over the panel refresh window). Run < Need means the service is under-scaled or tasks are crashing.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
- "decimals": 1,
- "links": [
+ "decimals": 2,
+ "mappings": [
{
- "targetBlank": false,
- "title": "Open Prerender Manager dashboard",
- "url": "/d/boxel-svc-prerender-manager"
+ "options": {
+ "pattern": "^(\\d+)\\.0*(\\d+)$",
+ "result": {
+ "text": "$1 / $2"
+ }
+ },
+ "type": "regex"
}
],
- "mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
- "color": "transparent"
+ "color": "red"
+ },
+ {
+ "color": "green",
+ "value": 1
}
]
},
- "unit": "short"
+ "unit": "none"
},
- "overrides": [
- {
- "matcher": {
- "id": "byFrameRefID",
- "options": "A"
- },
- "properties": [
- {
- "id": "displayName",
- "value": "CPU"
- },
- {
- "id": "unit",
- "value": "percent"
- },
- {
- "id": "thresholds",
- "value": {
- "mode": "absolute",
- "steps": [
- {
- "color": "transparent"
- },
- {
- "color": "green",
- "value": 0
- },
- {
- "color": "yellow",
- "value": 70
- },
- {
- "color": "red",
- "value": 90
- }
- ]
- }
- }
- ]
- },
- {
- "matcher": {
- "id": "byFrameRefID",
- "options": "B"
- },
- "properties": [
- {
- "id": "displayName",
- "value": "Mem"
- },
- {
- "id": "unit",
- "value": "percent"
- },
- {
- "id": "thresholds",
- "value": {
- "mode": "absolute",
- "steps": [
- {
- "color": "transparent"
- },
- {
- "color": "green",
- "value": 0
- },
- {
- "color": "yellow",
- "value": 70
- },
- {
- "color": "red",
- "value": 90
- }
- ]
- }
- }
- ]
- }
- ]
+ "overrides": []
},
"gridPos": {
- "h": 4,
+ "h": 3,
"w": 4,
- "x": 8,
- "y": 16
+ "x": 4,
+ "y": 20
},
- "id": 15,
+ "id": 24,
"options": {
- "displayMode": "gradient",
- "maxVizHeight": 300,
- "minVizHeight": 10,
- "minVizWidth": 0,
- "namePlacement": "auto",
+ "colorMode": "value",
+ "graphMode": "none",
+ "justifyMode": "center",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
@@ -1311,9 +1478,8 @@
"fields": "",
"values": false
},
- "showUnfilled": true,
- "sizing": "auto",
- "valueMode": "color"
+ "textMode": "value",
+ "wideLayout": true
},
"pluginVersion": "12.4.3",
"targets": [
@@ -1324,16 +1490,17 @@
},
"dimensions": {
"ClusterName": "${env}",
- "ServiceName": "boxel-prerender-manager-${env}"
+ "ServiceName": "boxel-prerender-server-${env}"
},
+ "hide": true,
"metricEditorMode": 0,
- "metricName": "CPUUtilization",
+ "metricName": "RunningTaskCount",
"metricQueryType": 0,
- "namespace": "AWS/ECS",
+ "namespace": "ECS/ContainerInsights",
"period": "",
"refId": "A",
"region": "default",
- "statistic": "Average"
+ "statistic": "Maximum"
},
{
"datasource": {
@@ -1342,27 +1509,45 @@
},
"dimensions": {
"ClusterName": "${env}",
- "ServiceName": "boxel-prerender-manager-${env}"
+ "ServiceName": "boxel-prerender-server-${env}"
},
+ "hide": true,
"metricEditorMode": 0,
- "metricName": "MemoryUtilization",
+ "metricName": "DesiredTaskCount",
"metricQueryType": 0,
- "namespace": "AWS/ECS",
+ "namespace": "ECS/ContainerInsights",
"period": "",
"refId": "B",
"region": "default",
- "statistic": "Average"
+ "statistic": "Maximum"
+ },
+ {
+ "datasource": {
+ "type": "cloudwatch",
+ "uid": "cef5x9o3yzawwf"
+ },
+ "dimensions": {},
+ "expression": "A + B/100",
+ "id": "e1",
+ "metricEditorMode": 1,
+ "metricName": "",
+ "metricQueryType": 0,
+ "namespace": "",
+ "period": "",
+ "refId": "e1",
+ "region": "default",
+ "statistic": ""
}
],
- "title": "Prerender Mgr",
- "type": "bargauge"
+ "title": "Tasks",
+ "type": "stat"
},
{
"datasource": {
"type": "cloudwatch",
"uid": "cef5x9o3yzawwf"
},
- "description": "ECS CPU and memory utilisation for `boxel-worker-${env}` on cluster `${env}` — averages over the panel refresh window. Task counts (Run / Need) appear in the stat row below this bargauge. Locally these show 'No data' — CloudWatch is staging/production only.",
+ "description": "ECS CPU and memory utilisation for `boxel-prerender-manager-${env}` on cluster `${env}` — averages over the panel refresh window. Task counts (Run / Need) appear in the stat row below this bargauge. Locally these show 'No data' — CloudWatch is staging/production only.",
"fieldConfig": {
"defaults": {
"color": {
@@ -1372,8 +1557,8 @@
"links": [
{
"targetBlank": false,
- "title": "Open Worker dashboard",
- "url": "/d/boxel-svc-worker"
+ "title": "Open Prerender Manager dashboard",
+ "url": "/d/boxel-svc-prerender-manager"
}
],
"mappings": [],
@@ -1471,10 +1656,10 @@
"gridPos": {
"h": 4,
"w": 4,
- "x": 12,
+ "x": 8,
"y": 16
},
- "id": 16,
+ "id": 15,
"options": {
"displayMode": "gradient",
"maxVizHeight": 300,
@@ -1502,7 +1687,7 @@
},
"dimensions": {
"ClusterName": "${env}",
- "ServiceName": "boxel-worker-${env}"
+ "ServiceName": "boxel-prerender-manager-${env}"
},
"metricEditorMode": 0,
"metricName": "CPUUtilization",
@@ -1520,7 +1705,7 @@
},
"dimensions": {
"ClusterName": "${env}",
- "ServiceName": "boxel-worker-${env}"
+ "ServiceName": "boxel-prerender-manager-${env}"
},
"metricEditorMode": 0,
"metricName": "MemoryUtilization",
@@ -1532,138 +1717,59 @@
"statistic": "Average"
}
],
- "title": "Worker",
+ "title": "Prerender Mgr",
"type": "bargauge"
},
{
"datasource": {
- "type": "prometheus",
- "uid": "bes7ustjf8w74b"
+ "type": "cloudwatch",
+ "uid": "cef5x9o3yzawwf"
},
- "description": "Synapse process metrics scraped from /\\_synapse/metrics via the synapse-prometheus datasource (local Prometheus locally; AMP in staging/production). CPU is rate over 5m as a percentage of one core; Mem is process_resident_memory_bytes.",
+ "description": "ECS task count for `boxel-prerender-manager-${env}` — `Run` is RunningTaskCount, `Need` is DesiredTaskCount (both from ECS/ContainerInsights, Maximum over the panel refresh window). Run < Need means the service is under-scaled or tasks are crashing.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
- "decimals": 1,
- "links": [
+ "decimals": 2,
+ "mappings": [
{
- "targetBlank": false,
- "title": "Open Synapse dashboard",
- "url": "/d/000000012"
+ "options": {
+ "pattern": "^(\\d+)\\.0*(\\d+)$",
+ "result": {
+ "text": "$1 / $2"
+ }
+ },
+ "type": "regex"
}
],
- "mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
- "color": "transparent"
- }
- ]
- },
- "unit": "short"
- },
- "overrides": [
- {
- "matcher": {
- "id": "byFrameRefID",
- "options": "A"
- },
- "properties": [
- {
- "id": "displayName",
- "value": "CPU"
- },
- {
- "id": "unit",
- "value": "percent"
+ "color": "red"
},
{
- "id": "thresholds",
- "value": {
- "mode": "absolute",
- "steps": [
- {
- "color": "transparent"
- },
- {
- "color": "green",
- "value": 0
- },
- {
- "color": "yellow",
- "value": 70
- },
- {
- "color": "red",
- "value": 90
- }
- ]
- }
+ "color": "green",
+ "value": 1
}
]
},
- {
- "matcher": {
- "id": "byFrameRefID",
- "options": "B"
- },
- "properties": [
- {
- "id": "displayName",
- "value": "Mem"
- },
- {
- "id": "unit",
- "value": "decbytes"
- },
- {
- "id": "thresholds",
- "value": {
- "mode": "absolute",
- "steps": [
- {
- "color": "transparent"
- },
- {
- "color": "green",
- "value": 0
- }
- ]
- }
- },
- {
- "id": "max",
- "value": 8589934592
- }
- ]
- }
- ]
+ "unit": "none"
+ },
+ "overrides": []
},
"gridPos": {
- "h": 4,
+ "h": 3,
"w": 4,
- "x": 16,
- "y": 16
+ "x": 8,
+ "y": 20
},
- "id": 17,
- "links": [
- {
- "icon": "external link",
- "targetBlank": false,
- "title": "Open Synapse dashboard",
- "type": "link",
- "url": "/d/000000012"
- }
- ],
+ "id": 25,
"options": {
- "displayMode": "gradient",
- "maxVizHeight": 300,
- "minVizHeight": 10,
- "minVizWidth": 0,
- "namePlacement": "auto",
+ "colorMode": "value",
+ "graphMode": "none",
+ "justifyMode": "center",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
@@ -1672,53 +1778,87 @@
"fields": "",
"values": false
},
- "showUnfilled": true,
- "sizing": "auto",
- "valueMode": "color"
+ "textMode": "value",
+ "wideLayout": true
},
"pluginVersion": "12.4.3",
"targets": [
{
"datasource": {
- "type": "prometheus",
- "uid": "bes7ustjf8w74b"
+ "type": "cloudwatch",
+ "uid": "cef5x9o3yzawwf"
},
- "expr": "rate(process_cpu_seconds_total{job=\"synapse\"}[5m]) * 100",
- "instant": true,
- "legendFormat": "CPU",
- "refId": "A"
+ "dimensions": {
+ "ClusterName": "${env}",
+ "ServiceName": "boxel-prerender-manager-${env}"
+ },
+ "hide": true,
+ "metricEditorMode": 0,
+ "metricName": "RunningTaskCount",
+ "metricQueryType": 0,
+ "namespace": "ECS/ContainerInsights",
+ "period": "",
+ "refId": "A",
+ "region": "default",
+ "statistic": "Maximum"
},
{
"datasource": {
- "type": "prometheus",
- "uid": "bes7ustjf8w74b"
+ "type": "cloudwatch",
+ "uid": "cef5x9o3yzawwf"
},
- "expr": "process_resident_memory_bytes{job=\"synapse\"}",
- "instant": true,
- "legendFormat": "Mem",
- "refId": "B"
+ "dimensions": {
+ "ClusterName": "${env}",
+ "ServiceName": "boxel-prerender-manager-${env}"
+ },
+ "hide": true,
+ "metricEditorMode": 0,
+ "metricName": "DesiredTaskCount",
+ "metricQueryType": 0,
+ "namespace": "ECS/ContainerInsights",
+ "period": "",
+ "refId": "B",
+ "region": "default",
+ "statistic": "Maximum"
+ },
+ {
+ "datasource": {
+ "type": "cloudwatch",
+ "uid": "cef5x9o3yzawwf"
+ },
+ "dimensions": {},
+ "expression": "A + B/100",
+ "id": "e1",
+ "metricEditorMode": 1,
+ "metricName": "",
+ "metricQueryType": 0,
+ "namespace": "",
+ "period": "",
+ "refId": "e1",
+ "region": "default",
+ "statistic": ""
}
],
- "title": "Synapse Process",
- "type": "bargauge"
+ "title": "Tasks",
+ "type": "stat"
},
{
"datasource": {
- "type": "grafana-postgresql-datasource",
- "uid": "cef5v5sl9k7i8f"
+ "type": "cloudwatch",
+ "uid": "cef5x9o3yzawwf"
},
- "description": "Boxel application database transaction rate (xact_commit + xact_rollback) averaged since the postmaster started. Not a rolling window — recent spikes will be smoothed. Drill into the DB-specific dashboard for detail.",
+ "description": "ECS CPU and memory utilisation for `boxel-worker-${env}` on cluster `${env}` — averages over the panel refresh window. Task counts (Run / Need) appear in the stat row below this bargauge. Locally these show 'No data' — CloudWatch is staging/production only.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
- "decimals": 0,
+ "decimals": 1,
"links": [
{
"targetBlank": false,
- "title": "Open Database dashboard",
- "url": "/d/boxeldatabase1"
+ "title": "Open Worker dashboard",
+ "url": "/d/boxel-svc-worker"
}
],
"mappings": [],
@@ -1727,105 +1867,105 @@
"steps": [
{
"color": "transparent"
- },
- {
- "color": "green",
- "value": 0
}
]
},
"unit": "short"
},
- "overrides": []
- },
- "gridPos": {
- "h": 7,
- "w": 4,
- "x": 20,
- "y": 16
- },
- "id": 18,
- "options": {
- "colorMode": "background_solid",
- "graphMode": "none",
- "justifyMode": "auto",
- "orientation": "vertical",
- "reduceOptions": {
- "calcs": [
- "lastNotNull"
- ],
- "fields": "",
- "values": false
- },
- "textMode": "value_and_name"
- },
- "pluginVersion": "12.4.3",
- "targets": [
- {
- "datasource": {
- "type": "grafana-postgresql-datasource",
- "uid": "cef5v5sl9k7i8f"
- },
- "editorMode": "code",
- "format": "table",
- "rawQuery": true,
- "rawSql": "SELECT 60.0 * (xact_commit + xact_rollback) / NULLIF(EXTRACT(EPOCH FROM (NOW() - pg_postmaster_start_time())), 0) AS \"txn/min\" FROM pg_stat_database WHERE datname = 'boxel';",
- "refId": "A"
- }
- ],
- "title": "Postgres DB",
- "type": "stat"
- },
- {
- "datasource": {
- "type": "cloudwatch",
- "uid": "cef5x9o3yzawwf"
- },
- "description": "ECS task count for `boxel-realm-server-${env}` — `Run` is RunningTaskCount, `Need` is DesiredTaskCount (both from ECS/ContainerInsights, Maximum over the panel refresh window). Run < Need means the service is under-scaled or tasks are crashing.",
- "fieldConfig": {
- "defaults": {
- "color": {
- "mode": "thresholds"
- },
- "decimals": 2,
- "mappings": [
- {
- "options": {
- "pattern": "^(\\d+)\\.0*(\\d+)$",
- "result": {
- "text": "$1 / $2"
- }
+ "overrides": [
+ {
+ "matcher": {
+ "id": "byFrameRefID",
+ "options": "A"
+ },
+ "properties": [
+ {
+ "id": "displayName",
+ "value": "CPU"
},
- "type": "regex"
- }
- ],
- "thresholds": {
- "mode": "absolute",
- "steps": [
{
- "color": "red"
+ "id": "unit",
+ "value": "percent"
},
{
- "color": "green",
- "value": 1
+ "id": "thresholds",
+ "value": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "transparent"
+ },
+ {
+ "color": "green",
+ "value": 0
+ },
+ {
+ "color": "yellow",
+ "value": 70
+ },
+ {
+ "color": "red",
+ "value": 90
+ }
+ ]
+ }
}
]
},
- "unit": "none"
- },
- "overrides": []
+ {
+ "matcher": {
+ "id": "byFrameRefID",
+ "options": "B"
+ },
+ "properties": [
+ {
+ "id": "displayName",
+ "value": "Mem"
+ },
+ {
+ "id": "unit",
+ "value": "percent"
+ },
+ {
+ "id": "thresholds",
+ "value": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "transparent"
+ },
+ {
+ "color": "green",
+ "value": 0
+ },
+ {
+ "color": "yellow",
+ "value": 70
+ },
+ {
+ "color": "red",
+ "value": 90
+ }
+ ]
+ }
+ }
+ ]
+ }
+ ]
},
"gridPos": {
- "h": 3,
+ "h": 4,
"w": 4,
- "x": 0,
- "y": 20
+ "x": 12,
+ "y": 16
},
- "id": 23,
+ "id": 16,
"options": {
- "colorMode": "value",
- "graphMode": "none",
- "justifyMode": "center",
+ "displayMode": "gradient",
+ "maxVizHeight": 300,
+ "minVizHeight": 10,
+ "minVizWidth": 0,
+ "namePlacement": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
@@ -1834,8 +1974,9 @@
"fields": "",
"values": false
},
- "textMode": "value",
- "wideLayout": true
+ "showUnfilled": true,
+ "sizing": "auto",
+ "valueMode": "color"
},
"pluginVersion": "12.4.3",
"targets": [
@@ -1846,17 +1987,16 @@
},
"dimensions": {
"ClusterName": "${env}",
- "ServiceName": "boxel-realm-server-${env}"
+ "ServiceName": "boxel-worker-${env}"
},
- "hide": true,
"metricEditorMode": 0,
- "metricName": "RunningTaskCount",
+ "metricName": "CPUUtilization",
"metricQueryType": 0,
- "namespace": "ECS/ContainerInsights",
+ "namespace": "AWS/ECS",
"period": "",
"refId": "A",
"region": "default",
- "statistic": "Maximum"
+ "statistic": "Average"
},
{
"datasource": {
@@ -1865,45 +2005,27 @@
},
"dimensions": {
"ClusterName": "${env}",
- "ServiceName": "boxel-realm-server-${env}"
+ "ServiceName": "boxel-worker-${env}"
},
- "hide": true,
"metricEditorMode": 0,
- "metricName": "DesiredTaskCount",
+ "metricName": "MemoryUtilization",
"metricQueryType": 0,
- "namespace": "ECS/ContainerInsights",
+ "namespace": "AWS/ECS",
"period": "",
"refId": "B",
"region": "default",
- "statistic": "Maximum"
- },
- {
- "datasource": {
- "type": "cloudwatch",
- "uid": "cef5x9o3yzawwf"
- },
- "dimensions": {},
- "expression": "A + B/100",
- "id": "e1",
- "metricEditorMode": 1,
- "metricName": "",
- "metricQueryType": 0,
- "namespace": "",
- "period": "",
- "refId": "e1",
- "region": "default",
- "statistic": ""
+ "statistic": "Average"
}
],
- "title": "Tasks",
- "type": "stat"
+ "title": "Worker",
+ "type": "bargauge"
},
{
"datasource": {
"type": "cloudwatch",
"uid": "cef5x9o3yzawwf"
},
- "description": "ECS task count for `boxel-prerender-server-${env}` — `Run` is RunningTaskCount, `Need` is DesiredTaskCount (both from ECS/ContainerInsights, Maximum over the panel refresh window). Run < Need means the service is under-scaled or tasks are crashing.",
+ "description": "ECS task count for `boxel-worker-${env}` — `Run` is RunningTaskCount, `Need` is DesiredTaskCount (both from ECS/ContainerInsights, Maximum over the panel refresh window). Run < Need means the service is under-scaled or tasks are crashing.",
"fieldConfig": {
"defaults": {
"color": {
@@ -1940,10 +2062,10 @@
"gridPos": {
"h": 3,
"w": 4,
- "x": 4,
+ "x": 12,
"y": 20
},
- "id": 24,
+ "id": 26,
"options": {
"colorMode": "value",
"graphMode": "none",
@@ -1968,7 +2090,7 @@
},
"dimensions": {
"ClusterName": "${env}",
- "ServiceName": "boxel-prerender-server-${env}"
+ "ServiceName": "boxel-worker-${env}"
},
"hide": true,
"metricEditorMode": 0,
@@ -1987,7 +2109,7 @@
},
"dimensions": {
"ClusterName": "${env}",
- "ServiceName": "boxel-prerender-server-${env}"
+ "ServiceName": "boxel-worker-${env}"
},
"hide": true,
"metricEditorMode": 0,
@@ -2022,54 +2144,133 @@
},
{
"datasource": {
- "type": "cloudwatch",
- "uid": "cef5x9o3yzawwf"
+ "type": "prometheus",
+ "uid": "bes7ustjf8w74b"
},
- "description": "ECS task count for `boxel-prerender-manager-${env}` — `Run` is RunningTaskCount, `Need` is DesiredTaskCount (both from ECS/ContainerInsights, Maximum over the panel refresh window). Run < Need means the service is under-scaled or tasks are crashing.",
+ "description": "Synapse process metrics scraped from /\\_synapse/metrics via the synapse-prometheus datasource (local Prometheus locally; AMP in staging/production). CPU is rate over 5m as a percentage of one core; Mem is process_resident_memory_bytes.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
- "decimals": 2,
- "mappings": [
+ "decimals": 1,
+ "links": [
{
- "options": {
- "pattern": "^(\\d+)\\.0*(\\d+)$",
- "result": {
- "text": "$1 / $2"
- }
- },
- "type": "regex"
+ "targetBlank": false,
+ "title": "Open Synapse dashboard",
+ "url": "/d/000000012"
}
],
+ "mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
- "color": "red"
+ "color": "transparent"
+ }
+ ]
+ },
+ "unit": "short"
+ },
+ "overrides": [
+ {
+ "matcher": {
+ "id": "byFrameRefID",
+ "options": "A"
+ },
+ "properties": [
+ {
+ "id": "displayName",
+ "value": "CPU"
},
{
- "color": "green",
- "value": 1
+ "id": "unit",
+ "value": "percent"
+ },
+ {
+ "id": "thresholds",
+ "value": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "transparent"
+ },
+ {
+ "color": "green",
+ "value": 0
+ },
+ {
+ "color": "yellow",
+ "value": 70
+ },
+ {
+ "color": "red",
+ "value": 90
+ }
+ ]
+ }
}
]
},
- "unit": "none"
- },
- "overrides": []
+ {
+ "matcher": {
+ "id": "byFrameRefID",
+ "options": "B"
+ },
+ "properties": [
+ {
+ "id": "displayName",
+ "value": "Mem"
+ },
+ {
+ "id": "unit",
+ "value": "decbytes"
+ },
+ {
+ "id": "thresholds",
+ "value": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "transparent"
+ },
+ {
+ "color": "green",
+ "value": 0
+ }
+ ]
+ }
+ },
+ {
+ "id": "max",
+ "value": 8589934592
+ }
+ ]
+ }
+ ]
},
"gridPos": {
- "h": 3,
+ "h": 7,
"w": 4,
- "x": 8,
- "y": 20
+ "x": 16,
+ "y": 16
},
- "id": 25,
+ "id": 17,
+ "links": [
+ {
+ "icon": "external link",
+ "targetBlank": false,
+ "title": "Open Synapse dashboard",
+ "type": "link",
+ "url": "/d/000000012"
+ }
+ ],
"options": {
- "colorMode": "value",
- "graphMode": "none",
- "justifyMode": "center",
+ "displayMode": "gradient",
+ "maxVizHeight": 300,
+ "minVizHeight": 10,
+ "minVizWidth": 0,
+ "namePlacement": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
@@ -2078,121 +2279,88 @@
"fields": "",
"values": false
},
- "textMode": "value",
- "wideLayout": true
+ "showUnfilled": true,
+ "sizing": "auto",
+ "text": {
+ "titleSize": 22,
+ "valueSize": 21
+ },
+ "valueMode": "color"
},
"pluginVersion": "12.4.3",
"targets": [
{
"datasource": {
- "type": "cloudwatch",
- "uid": "cef5x9o3yzawwf"
- },
- "dimensions": {
- "ClusterName": "${env}",
- "ServiceName": "boxel-prerender-manager-${env}"
- },
- "hide": true,
- "metricEditorMode": 0,
- "metricName": "RunningTaskCount",
- "metricQueryType": 0,
- "namespace": "ECS/ContainerInsights",
- "period": "",
- "refId": "A",
- "region": "default",
- "statistic": "Maximum"
- },
- {
- "datasource": {
- "type": "cloudwatch",
- "uid": "cef5x9o3yzawwf"
- },
- "dimensions": {
- "ClusterName": "${env}",
- "ServiceName": "boxel-prerender-manager-${env}"
+ "type": "prometheus",
+ "uid": "bes7ustjf8w74b"
},
- "hide": true,
- "metricEditorMode": 0,
- "metricName": "DesiredTaskCount",
- "metricQueryType": 0,
- "namespace": "ECS/ContainerInsights",
- "period": "",
- "refId": "B",
- "region": "default",
- "statistic": "Maximum"
+ "expr": "rate(process_cpu_seconds_total{job=\"synapse\"}[5m]) * 100",
+ "instant": true,
+ "legendFormat": "CPU",
+ "refId": "A"
},
{
"datasource": {
- "type": "cloudwatch",
- "uid": "cef5x9o3yzawwf"
+ "type": "prometheus",
+ "uid": "bes7ustjf8w74b"
},
- "dimensions": {},
- "expression": "A + B/100",
- "id": "e1",
- "metricEditorMode": 1,
- "metricName": "",
- "metricQueryType": 0,
- "namespace": "",
- "period": "",
- "refId": "e1",
- "region": "default",
- "statistic": ""
+ "expr": "process_resident_memory_bytes{job=\"synapse\"}",
+ "instant": true,
+ "legendFormat": "Mem",
+ "refId": "B"
}
],
- "title": "Tasks",
- "type": "stat"
+ "title": "Synapse Process",
+ "type": "bargauge"
},
{
"datasource": {
- "type": "cloudwatch",
- "uid": "cef5x9o3yzawwf"
+ "type": "grafana-postgresql-datasource",
+ "uid": "cef5v5sl9k7i8f"
},
- "description": "ECS task count for `boxel-worker-${env}` — `Run` is RunningTaskCount, `Need` is DesiredTaskCount (both from ECS/ContainerInsights, Maximum over the panel refresh window). Run < Need means the service is under-scaled or tasks are crashing.",
+ "description": "Boxel application database transaction rate (xact_commit + xact_rollback) averaged since the postmaster started. Not a rolling window — recent spikes will be smoothed. Drill into the DB-specific dashboard for detail.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
- "decimals": 2,
- "mappings": [
+ "decimals": 0,
+ "links": [
{
- "options": {
- "pattern": "^(\\d+)\\.0*(\\d+)$",
- "result": {
- "text": "$1 / $2"
- }
- },
- "type": "regex"
+ "targetBlank": false,
+ "title": "Open Database dashboard",
+ "url": "/d/boxeldatabase1"
}
],
+ "mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
- "color": "red"
+ "color": "transparent"
},
{
"color": "green",
- "value": 1
+ "value": 0
}
]
},
- "unit": "none"
+ "unit": "short"
},
"overrides": []
},
"gridPos": {
- "h": 3,
+ "h": 7,
"w": 4,
- "x": 12,
- "y": 20
+ "x": 20,
+ "y": 16
},
- "id": 26,
+ "id": 18,
"options": {
- "colorMode": "value",
+ "colorMode": "background_solid",
"graphMode": "none",
- "justifyMode": "center",
- "orientation": "horizontal",
+ "justifyMode": "auto",
+ "orientation": "vertical",
"reduceOptions": {
"calcs": [
"lastNotNull"
@@ -2200,70 +2368,54 @@
"fields": "",
"values": false
},
- "textMode": "value",
- "wideLayout": true
+ "textMode": "value_and_name"
},
"pluginVersion": "12.4.3",
"targets": [
{
"datasource": {
- "type": "cloudwatch",
- "uid": "cef5x9o3yzawwf"
- },
- "dimensions": {
- "ClusterName": "${env}",
- "ServiceName": "boxel-worker-${env}"
- },
- "hide": true,
- "metricEditorMode": 0,
- "metricName": "RunningTaskCount",
- "metricQueryType": 0,
- "namespace": "ECS/ContainerInsights",
- "period": "",
- "refId": "A",
- "region": "default",
- "statistic": "Maximum"
- },
- {
- "datasource": {
- "type": "cloudwatch",
- "uid": "cef5x9o3yzawwf"
- },
- "dimensions": {
- "ClusterName": "${env}",
- "ServiceName": "boxel-worker-${env}"
- },
- "hide": true,
- "metricEditorMode": 0,
- "metricName": "DesiredTaskCount",
- "metricQueryType": 0,
- "namespace": "ECS/ContainerInsights",
- "period": "",
- "refId": "B",
- "region": "default",
- "statistic": "Maximum"
- },
- {
- "datasource": {
- "type": "cloudwatch",
- "uid": "cef5x9o3yzawwf"
+ "type": "grafana-postgresql-datasource",
+ "uid": "cef5v5sl9k7i8f"
},
- "dimensions": {},
- "expression": "A + B/100",
- "id": "e1",
- "metricEditorMode": 1,
- "metricName": "",
- "metricQueryType": 0,
- "namespace": "",
- "period": "",
- "refId": "e1",
- "region": "default",
- "statistic": ""
+ "editorMode": "code",
+ "format": "table",
+ "rawQuery": true,
+ "rawSql": "SELECT 60.0 * (xact_commit + xact_rollback) / NULLIF(EXTRACT(EPOCH FROM (NOW() - pg_postmaster_start_time())), 0) AS \"txn/min\" FROM pg_stat_database WHERE datname = 'boxel';",
+ "refId": "A"
}
],
- "title": "Tasks",
+ "title": "Postgres DB",
"type": "stat"
},
+ {
+ "description": "All firing and pending Grafana alert rules. Drill into a rule for details and silences. Replaces the Worker Status alertlist (which was a single-panel dashboard).",
+ "gridPos": {
+ "h": 8,
+ "w": 24,
+ "x": 0,
+ "y": 33
+ },
+ "id": 6,
+ "options": {
+ "alertInstanceLabelFilter": "",
+ "alertName": "",
+ "dashboardAlerts": false,
+ "groupBy": [],
+ "groupMode": "default",
+ "maxItems": 30,
+ "sortOrder": 1,
+ "stateFilter": {
+ "error": true,
+ "firing": true,
+ "noData": true,
+ "normal": false,
+ "pending": true
+ },
+ "viewMode": "list"
+ },
+ "title": "Active Alerts",
+ "type": "alertlist"
+ },
{
"datasource": {
"type": "grafana-postgresql-datasource",
@@ -2299,8 +2451,7 @@
}
]
}
- },
- "overrides": []
+ }
},
"gridPos": {
"h": 4,
@@ -2367,8 +2518,7 @@
}
]
}
- },
- "overrides": []
+ }
},
"gridPos": {
"h": 4,
@@ -2444,8 +2594,7 @@
]
},
"unit": "s"
- },
- "overrides": []
+ }
},
"gridPos": {
"h": 4,
@@ -2516,8 +2665,7 @@
}
]
}
- },
- "overrides": []
+ }
},
"gridPos": {
"h": 4,
@@ -2720,35 +2868,6 @@
"title": "Indexing throughput",
"type": "timeseries"
},
- {
- "description": "All firing and pending Grafana alert rules. Drill into a rule for details and silences. Replaces the Worker Status alertlist (which was a single-panel dashboard).",
- "gridPos": {
- "h": 8,
- "w": 24,
- "x": 0,
- "y": 33
- },
- "id": 6,
- "options": {
- "alertInstanceLabelFilter": "",
- "alertName": "",
- "dashboardAlerts": false,
- "groupBy": [],
- "groupMode": "default",
- "maxItems": 30,
- "sortOrder": 1,
- "stateFilter": {
- "error": true,
- "firing": true,
- "noData": true,
- "normal": false,
- "pending": true
- },
- "viewMode": "list"
- },
- "title": "Active Alerts",
- "type": "alertlist"
- },
{
"gridPos": {
"h": 8,
@@ -2781,7 +2900,7 @@
{
"hide": 2,
"name": "env",
- "query": "staging",
+ "query": "__ENV__",
"skipUrlSync": true,
"type": "constant"
},
(Run: https://github.com/cardstack/boxel/actions/runs/25819007010) |
Contributor
There was a problem hiding this comment.
Pull request overview
Note
Copilot was unable to run its full agentic suite in this review.
Updates the Boxel Status Overview Grafana dashboard to improve observability of active Matrix users and refine panel layout/typography for Synapse process gauges.
Changes:
- Split “Concurrent Users” into a stat panel plus a stacked, headerless table listing user IDs seen in the last 5 minutes (with deep-links).
- Retarget “Concurrent Users” panel links from the Synapse dashboard to the Users dashboard.
- Adjust Synapse CPU/Mem bargauge height and text sizing to better align with neighboring panels.
💡 Add Copilot custom instructions for smarter, more guided reviews. Learn how to get started.
- URL-encode the Matrix user ID in the per-user data link using the `:percentencode` formatter, so users with `@`, `:`, `+` or other reserved chars produce a valid query string.
- Preserve the dashboard time range on the two panel-level "Open Users dashboard" links, matching the per-user data link behaviour.
- Swap the `|~ "Processed request.*simplified_msc3575/sync"` regex filter in both Loki queries for two `|=` substring filters, which Loki evaluates much faster than a regex match.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
lukemelia
approved these changes
May 13, 2026
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Add this suggestion to a batch that can be applied as a single commit.This suggestion is invalid because no changes were made to the code.Suggestions cannot be applied while the pull request is closed.Suggestions cannot be applied while viewing a subset of changes.Only one suggestion per line can be applied in a batch.Add this suggestion to a batch that can be applied as a single commit.Applying suggestions on deleted lines is not supported.You must change the existing code in this line in order to create a valid suggestion.Outdated suggestions cannot be applied.This suggestion has been applied or marked resolved.Suggestions cannot be applied from pending reviews.Suggestions cannot be applied on multi-line comments.Suggestions cannot be applied while the pull request is queued to merge.Suggestion cannot be applied right now. Please check back later.
Summary
`var-matrix_user_id` pre-set, so clicking a user lands on that user's credit/permission view.
`h:4` → `h:7` to match the neighbouring Postgres DB stat and close the gap underneath it; shrank title / value font sizes (~25% off auto) so it sits proportionally with the rest of the bottom row.

Test plan
`cd packages/observability && ./scripts/apply.sh` and check the Overview dashboard top-right column shows a "Concurrent Users" stat with sparkline above a list of user IDs.
`User` filter.

🤖 Generated with Claude Code