diff --git a/monitoring/ceph-mixin/config.libsonnet b/monitoring/ceph-mixin/config.libsonnet index 7c3216b36d489..7ee1210b04331 100644 --- a/monitoring/ceph-mixin/config.libsonnet +++ b/monitoring/ceph-mixin/config.libsonnet @@ -4,5 +4,8 @@ clusterLabel: 'cluster', showMultiCluster: false, + + CephNodeNetworkPacketDropsThreshold: 0.005, + CephNodeNetworkPacketDropsPerSec: 10, }, } diff --git a/monitoring/ceph-mixin/prometheus_alerts.libsonnet b/monitoring/ceph-mixin/prometheus_alerts.libsonnet index a00d621c15bd6..bed89a8790648 100644 --- a/monitoring/ceph-mixin/prometheus_alerts.libsonnet +++ b/monitoring/ceph-mixin/prometheus_alerts.libsonnet @@ -513,16 +513,38 @@ }, { alert: 'CephNodeNetworkPacketDrops', - expr: '( increase(node_network_receive_drop_total{device!="lo"}[1m]) + increase(node_network_transmit_drop_total{device!="lo"}[1m])) / ( increase(node_network_receive_packets_total{device!="lo"}[1m]) + increase(node_network_transmit_packets_total{device!="lo"}[1m])) >= 0.0001 or ( increase(node_network_receive_drop_total{device!="lo"}[1m]) + increase(node_network_transmit_drop_total{device!="lo"}[1m])) >= 10', + expr: ||| + ( + rate(node_network_receive_drop_total{device!="lo"}[1m]) + + rate(node_network_transmit_drop_total{device!="lo"}[1m]) + ) / ( + rate(node_network_receive_packets_total{device!="lo"}[1m]) + + rate(node_network_transmit_packets_total{device!="lo"}[1m]) + ) >= %(CephNodeNetworkPacketDropsThreshold)s and ( + rate(node_network_receive_drop_total{device!="lo"}[1m]) + + rate(node_network_transmit_drop_total{device!="lo"}[1m]) + ) >= %(CephNodeNetworkPacketDropsPerSec)s + ||| % $._config, labels: { severity: 'warning', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.8.2' }, annotations: { summary: 'One or more NICs reports packet drops%(cluster)s' % $.MultiClusterSummary(), - description: 'Node {{ $labels.instance }} experiences packet drop > 0.01% or > 10 packets/s on interface {{ $labels.device }}.', + description: 'Node {{ 
$labels.instance }} experiences packet drop > %(CephNodeNetworkPacketDropsThreshold)s%% and > %(CephNodeNetworkPacketDropsPerSec)s packets/s on interface {{ $labels.device }}.' % { CephNodeNetworkPacketDropsThreshold: $._config.CephNodeNetworkPacketDropsThreshold * 100, CephNodeNetworkPacketDropsPerSec: $._config.CephNodeNetworkPacketDropsPerSec }, }, }, { alert: 'CephNodeNetworkPacketErrors', - expr: '( increase(node_network_receive_errs_total{device!="lo"}[1m]) + increase(node_network_transmit_errs_total{device!="lo"}[1m])) / ( increase(node_network_receive_packets_total{device!="lo"}[1m]) + increase(node_network_transmit_packets_total{device!="lo"}[1m])) >= 0.0001 or ( increase(node_network_receive_errs_total{device!="lo"}[1m]) + increase(node_network_transmit_errs_total{device!="lo"}[1m])) >= 10', + expr: ||| + ( + rate(node_network_receive_errs_total{device!="lo"}[1m]) + + rate(node_network_transmit_errs_total{device!="lo"}[1m]) + ) / ( + rate(node_network_receive_packets_total{device!="lo"}[1m]) + + rate(node_network_transmit_packets_total{device!="lo"}[1m]) + ) >= 0.0001 or ( + rate(node_network_receive_errs_total{device!="lo"}[1m]) + + rate(node_network_transmit_errs_total{device!="lo"}[1m]) + ) >= 10 + |||, labels: { severity: 'warning', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.8.3' }, annotations: { summary: 'One or more NICs reports packet errors%(cluster)s' % $.MultiClusterSummary(), diff --git a/monitoring/ceph-mixin/prometheus_alerts.yml b/monitoring/ceph-mixin/prometheus_alerts.yml index 7602d48a414d7..a544d41eb0ee0 100644 --- a/monitoring/ceph-mixin/prometheus_alerts.yml +++ b/monitoring/ceph-mixin/prometheus_alerts.yml @@ -459,9 +459,19 @@ groups: type: "ceph_default" - alert: "CephNodeNetworkPacketDrops" annotations: - description: "Node {{ $labels.instance }} experiences packet drop > 0.01% or > 10 packets/s on interface {{ $labels.device }}." 
+ description: "Node {{ $labels.instance }} experiences packet drop > 0.5% and > 10 packets/s on interface {{ $labels.device }}." summary: "One or more NICs reports packet drops" - expr: "( increase(node_network_receive_drop_total{device!=\"lo\"}[1m]) + increase(node_network_transmit_drop_total{device!=\"lo\"}[1m])) / ( increase(node_network_receive_packets_total{device!=\"lo\"}[1m]) + increase(node_network_transmit_packets_total{device!=\"lo\"}[1m])) >= 0.0001 or ( increase(node_network_receive_drop_total{device!=\"lo\"}[1m]) + increase(node_network_transmit_drop_total{device!=\"lo\"}[1m])) >= 10" + expr: | + ( + rate(node_network_receive_drop_total{device!="lo"}[1m]) + + rate(node_network_transmit_drop_total{device!="lo"}[1m]) + ) / ( + rate(node_network_receive_packets_total{device!="lo"}[1m]) + + rate(node_network_transmit_packets_total{device!="lo"}[1m]) + ) >= 0.0050000000000000001 and ( + rate(node_network_receive_drop_total{device!="lo"}[1m]) + + rate(node_network_transmit_drop_total{device!="lo"}[1m]) + ) >= 10 labels: oid: "1.3.6.1.4.1.50495.1.2.1.8.2" severity: "warning" @@ -470,7 +480,17 @@ groups: annotations: description: "Node {{ $labels.instance }} experiences packet errors > 0.01% or > 10 packets/s on interface {{ $labels.device }}." 
summary: "One or more NICs reports packet errors" - expr: "( increase(node_network_receive_errs_total{device!=\"lo\"}[1m]) + increase(node_network_transmit_errs_total{device!=\"lo\"}[1m])) / ( increase(node_network_receive_packets_total{device!=\"lo\"}[1m]) + increase(node_network_transmit_packets_total{device!=\"lo\"}[1m])) >= 0.0001 or ( increase(node_network_receive_errs_total{device!=\"lo\"}[1m]) + increase(node_network_transmit_errs_total{device!=\"lo\"}[1m])) >= 10" + expr: | + ( + rate(node_network_receive_errs_total{device!="lo"}[1m]) + + rate(node_network_transmit_errs_total{device!="lo"}[1m]) + ) / ( + rate(node_network_receive_packets_total{device!="lo"}[1m]) + + rate(node_network_transmit_packets_total{device!="lo"}[1m]) + ) >= 0.0001 or ( + rate(node_network_receive_errs_total{device!="lo"}[1m]) + + rate(node_network_transmit_errs_total{device!="lo"}[1m]) + ) >= 10 labels: oid: "1.3.6.1.4.1.50495.1.2.1.8.3" severity: "warning" diff --git a/monitoring/ceph-mixin/tests_alerts/test_alerts.yml b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml index 847cf3b7e3d34..7b7e7db7301bd 100644 --- a/monitoring/ceph-mixin/tests_alerts/test_alerts.yml +++ b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml @@ -375,32 +375,38 @@ tests: description: "Root volume is dangerously full: 4.811% free." 
# network packets dropped - - interval: 1s + - interval: 1m input_series: - series: 'node_network_receive_drop_total{device="eth0", instance="node-exporter",job="node-exporter"}' - values: '1+1x500' + values: '0+600x10' - series: 'node_network_transmit_drop_total{device="eth0", instance="node-exporter",job="node-exporter"}' - values: '1+1x500' + values: '0+600x10' + - series: 'node_network_receive_packets_total{device="eth0", + instance="node-exporter",job="node-exporter"}' + values: '0+750x10' + - series: 'node_network_transmit_packets_total{device="eth0", + instance="node-exporter",job="node-exporter"}' + values: '0+750x10' promql_expr_test: - expr: | ( - increase(node_network_receive_drop_total{device!="lo"}[1m]) + - increase(node_network_transmit_drop_total{device!="lo"}[1m]) + rate(node_network_receive_drop_total{device!="lo"}[1m]) + + rate(node_network_transmit_drop_total{device!="lo"}[1m]) ) / ( - increase(node_network_receive_packets_total{device!="lo"}[1m]) + - increase(node_network_transmit_packets_total{device!="lo"}[1m]) - ) >= 0.0001 or ( - increase(node_network_receive_drop_total{device!="lo"}[1m]) + - increase(node_network_transmit_drop_total{device!="lo"}[1m]) + rate(node_network_receive_packets_total{device!="lo"}[1m]) + + rate(node_network_transmit_packets_total{device!="lo"}[1m]) + ) >= 0.0050000000000000001 and ( + rate(node_network_receive_drop_total{device!="lo"}[1m]) + + rate(node_network_transmit_drop_total{device!="lo"}[1m]) ) >= 10 eval_time: 5m exp_samples: - labels: '{device="eth0", instance="node-exporter", job="node-exporter"}' - value: 1.2E+02 + value: 8E-1 alert_rule_test: - eval_time: 5m alertname: CephNodeNetworkPacketDrops @@ -414,35 +420,41 @@ tests: type: ceph_default exp_annotations: summary: One or more NICs reports packet drops - description: "Node node-exporter experiences packet drop > 0.01% or > 10 packets/s on interface eth0." 
+ description: "Node node-exporter experiences packet drop > 0.5% and > 10 packets/s on interface eth0." # network packets errors - - interval: 1s + - interval: 1m input_series: - series: 'node_network_receive_errs_total{device="eth0", instance="node-exporter",job="node-exporter"}' - values: '1+1x500' + values: '0+600x10' - series: 'node_network_transmit_errs_total{device="eth0", instance="node-exporter",job="node-exporter"}' - values: '1+1x500' + values: '0+600x10' + - series: 'node_network_transmit_packets_total{device="eth0", + instance="node-exporter",job="node-exporter"}' + values: '0+750x10' + - series: 'node_network_receive_packets_total{device="eth0", + instance="node-exporter",job="node-exporter"}' + values: '0+750x10' promql_expr_test: - expr: | ( - increase(node_network_receive_errs_total{device!="lo"}[1m]) + - increase(node_network_transmit_errs_total{device!="lo"}[1m]) + rate(node_network_receive_errs_total{device!="lo"}[1m]) + + rate(node_network_transmit_errs_total{device!="lo"}[1m]) ) / ( - increase(node_network_receive_packets_total{device!="lo"}[1m]) + - increase(node_network_transmit_packets_total{device!="lo"}[1m]) + rate(node_network_receive_packets_total{device!="lo"}[1m]) + + rate(node_network_transmit_packets_total{device!="lo"}[1m]) ) >= 0.0001 or ( - increase(node_network_receive_errs_total{device!="lo"}[1m]) + - increase(node_network_transmit_errs_total{device!="lo"}[1m]) + rate(node_network_receive_errs_total{device!="lo"}[1m]) + + rate(node_network_transmit_errs_total{device!="lo"}[1m]) ) >= 10 eval_time: 5m exp_samples: - labels: '{device="eth0", instance="node-exporter", job="node-exporter"}' - value: 1.2E+02 + value: 8E-01 alert_rule_test: - eval_time: 5m alertname: CephNodeNetworkPacketErrors