Grafana subgraph alerts: compose wiring, alert rules, config script and docs.

Notes on this copy of the patch:
- Line structure and indentation were reconstructed from a whitespace-collapsed
  copy; apply with `git apply --ignore-whitespace` if context lines do not match.
- update-grafana-alerts-config.sh now skips the substitution when the rules file
  is absent; the blob id on its index line predates that change.

diff --git a/stack_orchestrator/data/compose/docker-compose-grafana.yml b/stack_orchestrator/data/compose/docker-compose-grafana.yml
index d559b246..5054a0db 100644
--- a/stack_orchestrator/data/compose/docker-compose-grafana.yml
+++ b/stack_orchestrator/data/compose/docker-compose-grafana.yml
@@ -6,10 +6,16 @@ services:
     restart: always
     environment:
       GF_SERVER_ROOT_URL: ${GF_SERVER_ROOT_URL}
+      CERC_GRAFANA_ALERTS_SUBGRAPH_IDS: ${CERC_GRAFANA_ALERTS_SUBGRAPH_IDS}
     volumes:
       - ../config/monitoring/grafana/provisioning:/etc/grafana/provisioning
       - ../config/monitoring/grafana/dashboards:/etc/grafana/dashboards
+      - ../config/monitoring/update-grafana-alerts-config.sh:/update-grafana-alerts-config.sh
       - grafana_storage:/var/lib/grafana
+    user: root
+    entrypoint: ["bash", "-c"]
+    command: |
+      "/update-grafana-alerts-config.sh && /run.sh"
     ports:
       - "3000"
     healthcheck:
diff --git a/stack_orchestrator/data/config/monitoring/subgraph-alert-rules.yml b/stack_orchestrator/data/config/monitoring/subgraph-alert-rules.yml
new file mode 100644
index 00000000..ed59e8ef
--- /dev/null
+++ b/stack_orchestrator/data/config/monitoring/subgraph-alert-rules.yml
@@ -0,0 +1,64 @@
+apiVersion: 1
+groups:
+  - orgId: 1
+    name: subgraph
+    folder: SubgraphAlerts
+    interval: 30s
+    rules:
+      - uid: b2a9144b-6104-46fc-92b5-352f4e643c4c
+        title: subgraph_head_tracking
+        condition: condition
+        data:
+          - refId: diff
+            relativeTimeRange:
+              from: 600
+              to: 0
+            datasourceUid: PBFA97CFB590B2093
+            model:
+              datasource:
+                type: prometheus
+                uid: PBFA97CFB590B2093
+              editorMode: code
+              expr: ethereum_chain_head_number - on(network) group_right deployment_head{deployment=~"REPLACE_WITH_SUBGRAPH_IDS"}
+              instant: true
+              intervalMs: 1000
+              legendFormat: __auto
+              maxDataPoints: 43200
+              range: false
+              refId: diff
+          - refId: condition
+            relativeTimeRange:
+              from: 600
+              to: 0
+            datasourceUid: __expr__
+            model:
+              conditions:
+                - evaluator:
+                    params:
+                      - 15
+                      - 0
+                    type: gt
+                  operator:
+                    type: and
+                  query:
+                    params: []
+                  reducer:
+                    params: []
+                    type: avg
+                  type: query
+              datasource:
+                name: Expression
+                type: __expr__
+                uid: __expr__
+              expression: diff
+              intervalMs: 1000
+              maxDataPoints: 43200
+              refId: condition
+              type: threshold
+        noDataState: OK
+        execErrState: Alerting
+        for: 5m
+        annotations:
+          summary: Subgraph deployment {{ index $labels "deployment" }} is falling behind head by {{ index $values "diff" }}
+        labels: {}
+        isPaused: false
diff --git a/stack_orchestrator/data/config/monitoring/update-grafana-alerts-config.sh b/stack_orchestrator/data/config/monitoring/update-grafana-alerts-config.sh
new file mode 100755
index 00000000..b6ec932c
--- /dev/null
+++ b/stack_orchestrator/data/config/monitoring/update-grafana-alerts-config.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+
+echo "Using CERC_GRAFANA_ALERTS_SUBGRAPH_IDS ${CERC_GRAFANA_ALERTS_SUBGRAPH_IDS}"
+
+ALERT_RULES_FILE=/etc/grafana/provisioning/alerting/subgraph-alert-rules.yml
+
+# The subgraph rules file is copied into the provisioning directory manually,
+# so it may be absent; skip the substitution instead of failing, because the
+# compose command chains this script to /run.sh with && and a non-zero exit
+# would keep Grafana from starting.
+if [ ! -f "$ALERT_RULES_FILE" ]; then
+  echo "$ALERT_RULES_FILE not found, skipping subgraph alerts config"
+  exit 0
+fi
+
+# Replace subgraph ids in subgraph alerting config
+# Note: Requires the grafana container to be run with user root
+# Note: sed -i rewrites the file in place, so once the placeholder has been
+# replaced, later changes to the variable have no effect until the file is
+# copied again
+sed -i "s/REPLACE_WITH_SUBGRAPH_IDS/$CERC_GRAFANA_ALERTS_SUBGRAPH_IDS/g" "$ALERT_RULES_FILE"
diff --git a/stack_orchestrator/data/stacks/monitoring/monitoring-watchers.md b/stack_orchestrator/data/stacks/monitoring/monitoring-watchers.md
index 2f057c3c..c4704f58 100644
--- a/stack_orchestrator/data/stacks/monitoring/monitoring-watchers.md
+++ b/stack_orchestrator/data/stacks/monitoring/monitoring-watchers.md
@@ -113,21 +113,31 @@ Add the following scrape configs to prometheus config file (`monitoring-watchers
           labels:
             instance: 'ajna'
             chain: 'filecoin'
+
+    - job_name: graph-node
+      metrics_path: /metrics
+      scrape_interval: 30s
+      static_configs:
+        - targets: ['GRAPH_NODE_HOST:GRAPH_NODE_HOST_METRICS_PORT']
   ```
 
 Add scrape config as done above for any additional watcher to add it to the Watchers dashboard.
 
 ### Grafana alerts config
 
-Place the pre-configured watcher alerts rules in Grafana provisioning directory:
+Place the pre-configured alerts rules in Grafana provisioning directory:
 
   ```bash
+  # watcher alert rules
   cp monitoring-watchers-deployment/config/monitoring/watcher-alert-rules.yml monitoring-watchers-deployment/config/monitoring/grafana/provisioning/alerting/
+
+  # subgraph alert rules
+  cp monitoring-watchers-deployment/config/monitoring/subgraph-alert-rules.yml monitoring-watchers-deployment/config/monitoring/grafana/provisioning/alerting/
   ```
 
 Update the alerting contact points config (`monitoring-watchers-deployment/config/monitoring/grafana/provisioning/alerting/contactpoints.yml`) with desired contact points
 
-Add corresponding routes to the notification policies config (`monitoring-watchers-deployment/monitoring/grafana/provisioning/alerting/policies.yaml`) with appropriate object-matchers:
+Add corresponding routes to the notification policies config (`monitoring-watchers-deployment/config/monitoring/grafana/provisioning/alerting/policies.yml`) with appropriate object-matchers:
 
   ```yml
   ...
@@ -135,7 +145,7 @@ Add corresponding routes to the notification policies config (`monitoring-watche
     - receiver: SlackNotifier
       object_matchers:
         # Add matchers below
-        - ['grafana_folder', '=', 'WatcherAlerts']
+        - ['grafana_folder', '=~', 'WatcherAlerts|SubgraphAlerts']
   ```
 
 ### Env
@@ -149,6 +159,9 @@ Set the following env variables in the deployment env config file (`monitoring-w
   # Grafana server host URL to be used
   # (Optional, default: http://localhost:3000)
   GF_SERVER_ROOT_URL=
+
+  # List of subgraph ids to configure alerts for (separated by |)
+  CERC_GRAFANA_ALERTS_SUBGRAPH_IDS=
   ```
 
 ## Start the stack