Skip to content

Commit

Permalink
ceph-mixin: fix CephNodeNetworkPacket alerts
Browse files (browse the repository at this point in the history)
Signed-off-by: Aswin Toni <aswin.toni@cern.ch>
(cherry picked from commit 351e1ac)
  • Loading branch information
Aswin Toni authored and nizamial09 committed Sep 20, 2022
1 parent d081732 commit 7ee9653
Show file tree
Hide file tree
Showing 4 changed files with 85 additions and 28 deletions.
3 changes: 3 additions & 0 deletions monitoring/ceph-mixin/config.libsonnet
Expand Up @@ -4,5 +4,8 @@

clusterLabel: 'cluster',
showMultiCluster: false,

CephNodeNetworkPacketDropsThreshold: 0.005,
CephNodeNetworkPacketDropsPerSec: 10,
},
}
28 changes: 25 additions & 3 deletions monitoring/ceph-mixin/prometheus_alerts.libsonnet
Expand Up @@ -513,16 +513,38 @@
},
{
alert: 'CephNodeNetworkPacketDrops',
expr: '( increase(node_network_receive_drop_total{device!="lo"}[1m]) + increase(node_network_transmit_drop_total{device!="lo"}[1m])) / ( increase(node_network_receive_packets_total{device!="lo"}[1m]) + increase(node_network_transmit_packets_total{device!="lo"}[1m])) >= 0.0001 or ( increase(node_network_receive_drop_total{device!="lo"}[1m]) + increase(node_network_transmit_drop_total{device!="lo"}[1m])) >= 10',
expr: |||
(
rate(node_network_receive_drop_total{device!="lo"}[1m]) +
rate(node_network_transmit_drop_total{device!="lo"}[1m])
) / (
rate(node_network_receive_packets_total{device!="lo"}[1m]) +
rate(node_network_transmit_packets_total{device!="lo"}[1m])
) >= %(CephNodeNetworkPacketDropsThreshold)s and (
rate(node_network_receive_drop_total{device!="lo"}[1m]) +
rate(node_network_transmit_drop_total{device!="lo"}[1m])
) >= %(CephNodeNetworkPacketDropsPerSec)s
||| % $._config,
labels: { severity: 'warning', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.8.2' },
annotations: {
summary: 'One or more NICs reports packet drops%(cluster)s' % $.MultiClusterSummary(),
description: 'Node {{ $labels.instance }} experiences packet drop > 0.01% or > 10 packets/s on interface {{ $labels.device }}.',
description: 'Node {{ $labels.instance }} experiences packet drop > %(CephNodeNetworkPacketDropsThreshold)s%% or > %(CephNodeNetworkPacketDropsPerSec)s packets/s on interface {{ $labels.device }}.' % { CephNodeNetworkPacketDropsThreshold: $._config.CephNodeNetworkPacketDropsThreshold * 100, CephNodeNetworkPacketDropsPerSec: $._config.CephNodeNetworkPacketDropsPerSec },
},
},
{
alert: 'CephNodeNetworkPacketErrors',
expr: '( increase(node_network_receive_errs_total{device!="lo"}[1m]) + increase(node_network_transmit_errs_total{device!="lo"}[1m])) / ( increase(node_network_receive_packets_total{device!="lo"}[1m]) + increase(node_network_transmit_packets_total{device!="lo"}[1m])) >= 0.0001 or ( increase(node_network_receive_errs_total{device!="lo"}[1m]) + increase(node_network_transmit_errs_total{device!="lo"}[1m])) >= 10',
expr: |||
(
rate(node_network_receive_errs_total{device!="lo"}[1m]) +
rate(node_network_transmit_errs_total{device!="lo"}[1m])
) / (
rate(node_network_receive_packets_total{device!="lo"}[1m]) +
rate(node_network_transmit_packets_total{device!="lo"}[1m])
) >= 0.0001 or (
rate(node_network_receive_errs_total{device!="lo"}[1m]) +
rate(node_network_transmit_errs_total{device!="lo"}[1m])
) >= 10
|||,
labels: { severity: 'warning', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.8.3' },
annotations: {
summary: 'One or more NICs reports packet errors%(cluster)s' % $.MultiClusterSummary(),
Expand Down
26 changes: 23 additions & 3 deletions monitoring/ceph-mixin/prometheus_alerts.yml
Expand Up @@ -459,9 +459,19 @@ groups:
type: "ceph_default"
- alert: "CephNodeNetworkPacketDrops"
annotations:
description: "Node {{ $labels.instance }} experiences packet drop > 0.01% or > 10 packets/s on interface {{ $labels.device }}."
description: "Node {{ $labels.instance }} experiences packet drop > 0.5% or > 10 packets/s on interface {{ $labels.device }}."
summary: "One or more NICs reports packet drops"
expr: "( increase(node_network_receive_drop_total{device!=\"lo\"}[1m]) + increase(node_network_transmit_drop_total{device!=\"lo\"}[1m])) / ( increase(node_network_receive_packets_total{device!=\"lo\"}[1m]) + increase(node_network_transmit_packets_total{device!=\"lo\"}[1m])) >= 0.0001 or ( increase(node_network_receive_drop_total{device!=\"lo\"}[1m]) + increase(node_network_transmit_drop_total{device!=\"lo\"}[1m])) >= 10"
expr: |
(
rate(node_network_receive_drop_total{device!="lo"}[1m]) +
rate(node_network_transmit_drop_total{device!="lo"}[1m])
) / (
rate(node_network_receive_packets_total{device!="lo"}[1m]) +
rate(node_network_transmit_packets_total{device!="lo"}[1m])
) >= 0.0050000000000000001 and (
rate(node_network_receive_drop_total{device!="lo"}[1m]) +
rate(node_network_transmit_drop_total{device!="lo"}[1m])
) >= 10
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.8.2"
severity: "warning"
Expand All @@ -470,7 +480,17 @@ groups:
annotations:
description: "Node {{ $labels.instance }} experiences packet errors > 0.01% or > 10 packets/s on interface {{ $labels.device }}."
summary: "One or more NICs reports packet errors"
expr: "( increase(node_network_receive_errs_total{device!=\"lo\"}[1m]) + increase(node_network_transmit_errs_total{device!=\"lo\"}[1m])) / ( increase(node_network_receive_packets_total{device!=\"lo\"}[1m]) + increase(node_network_transmit_packets_total{device!=\"lo\"}[1m])) >= 0.0001 or ( increase(node_network_receive_errs_total{device!=\"lo\"}[1m]) + increase(node_network_transmit_errs_total{device!=\"lo\"}[1m])) >= 10"
expr: |
(
rate(node_network_receive_errs_total{device!="lo"}[1m]) +
rate(node_network_transmit_errs_total{device!="lo"}[1m])
) / (
rate(node_network_receive_packets_total{device!="lo"}[1m]) +
rate(node_network_transmit_packets_total{device!="lo"}[1m])
) >= 0.0001 or (
rate(node_network_receive_errs_total{device!="lo"}[1m]) +
rate(node_network_transmit_errs_total{device!="lo"}[1m])
) >= 10
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.8.3"
severity: "warning"
Expand Down
56 changes: 34 additions & 22 deletions monitoring/ceph-mixin/tests_alerts/test_alerts.yml
Expand Up @@ -375,32 +375,38 @@ tests:
description: "Root volume is dangerously full: 4.811% free."

# network packets dropped
- interval: 1s
- interval: 1m
input_series:
- series: 'node_network_receive_drop_total{device="eth0",
instance="node-exporter",job="node-exporter"}'
values: '1+1x500'
values: '0+600x10'
- series: 'node_network_transmit_drop_total{device="eth0",
instance="node-exporter",job="node-exporter"}'
values: '1+1x500'
values: '0+600x10'
- series: 'node_network_receive_packets_total{device="eth0",
instance="node-exporter",job="node-exporter"}'
values: '0+750x10'
- series: 'node_network_transmit_packets_total{device="eth0",
instance="node-exporter",job="node-exporter"}'
values: '0+750x10'
promql_expr_test:
- expr: |
(
increase(node_network_receive_drop_total{device!="lo"}[1m]) +
increase(node_network_transmit_drop_total{device!="lo"}[1m])
rate(node_network_receive_drop_total{device!="lo"}[1m]) +
rate(node_network_transmit_drop_total{device!="lo"}[1m])
) / (
increase(node_network_receive_packets_total{device!="lo"}[1m]) +
increase(node_network_transmit_packets_total{device!="lo"}[1m])
) >= 0.0001 or (
increase(node_network_receive_drop_total{device!="lo"}[1m]) +
increase(node_network_transmit_drop_total{device!="lo"}[1m])
rate(node_network_receive_packets_total{device!="lo"}[1m]) +
rate(node_network_transmit_packets_total{device!="lo"}[1m])
) >= 0.0050000000000000001 and (
rate(node_network_receive_drop_total{device!="lo"}[1m]) +
rate(node_network_transmit_drop_total{device!="lo"}[1m])
) >= 10
eval_time: 5m
exp_samples:
- labels: '{device="eth0", instance="node-exporter",
job="node-exporter"}'
value: 1.2E+02
value: 8E-1
alert_rule_test:
- eval_time: 5m
alertname: CephNodeNetworkPacketDrops
Expand All @@ -414,35 +420,41 @@ tests:
type: ceph_default
exp_annotations:
summary: One or more NICs reports packet drops
description: "Node node-exporter experiences packet drop > 0.01% or > 10 packets/s on interface eth0."
description: "Node node-exporter experiences packet drop > 0.5% or > 10 packets/s on interface eth0."

# network packet errors
- interval: 1s
- interval: 1m
input_series:
- series: 'node_network_receive_errs_total{device="eth0",
instance="node-exporter",job="node-exporter"}'
values: '1+1x500'
values: '0+600x10'
- series: 'node_network_transmit_errs_total{device="eth0",
instance="node-exporter",job="node-exporter"}'
values: '1+1x500'
values: '0+600x10'
- series: 'node_network_transmit_packets_total{device="eth0",
instance="node-exporter",job="node-exporter"}'
values: '0+750x10'
- series: 'node_network_receive_packets_total{device="eth0",
instance="node-exporter",job="node-exporter"}'
values: '0+750x10'
promql_expr_test:
- expr: |
(
increase(node_network_receive_errs_total{device!="lo"}[1m]) +
increase(node_network_transmit_errs_total{device!="lo"}[1m])
rate(node_network_receive_errs_total{device!="lo"}[1m]) +
rate(node_network_transmit_errs_total{device!="lo"}[1m])
) / (
increase(node_network_receive_packets_total{device!="lo"}[1m]) +
increase(node_network_transmit_packets_total{device!="lo"}[1m])
rate(node_network_receive_packets_total{device!="lo"}[1m]) +
rate(node_network_transmit_packets_total{device!="lo"}[1m])
) >= 0.0001 or (
increase(node_network_receive_errs_total{device!="lo"}[1m]) +
increase(node_network_transmit_errs_total{device!="lo"}[1m])
rate(node_network_receive_errs_total{device!="lo"}[1m]) +
rate(node_network_transmit_errs_total{device!="lo"}[1m])
) >= 10
eval_time: 5m
exp_samples:
- labels: '{device="eth0", instance="node-exporter",
job="node-exporter"}'
value: 1.2E+02
value: 8E-01
alert_rule_test:
- eval_time: 5m
alertname: CephNodeNetworkPacketErrors
Expand Down

0 comments on commit 7ee9653

Please sign in to comment.