Skip to content

Commit

Permalink
Substrate alerts rules update (paritytech#10642)
Browse files Browse the repository at this point in the history
* .maintain/monitoring: Update substrate prometheus alert rules

* match the `substrate_` metrics prefix in alerts instead of `polkadot_`, following changes in paritytech#9543
* remove the filtering on polkadot|kusama domain for NumberOfFileDescriptorsHigh alert

* .maintain/monitoring: Update substrate Grafana dashboards

* match the `substrate_` metrics prefix instead of `polkadot_` in dashboards, following changes in paritytech#9543

* .maintain/monitoring: make the NumberOfFileDescriptorsHigh alert apply only to metrics tagged with 'chain'
  • Loading branch information
PierreBesson authored and agryaznov committed Feb 4, 2022
1 parent 1881cc5 commit 634f8b3
Show file tree
Hide file tree
Showing 4 changed files with 99 additions and 99 deletions.
144 changes: 72 additions & 72 deletions .maintain/monitoring/alerting-rules/alerting-rule-tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,39 +6,39 @@ evaluation_interval: 1m
tests:
- interval: 1m
input_series:
- series: 'polkadot_sub_libp2p_peers_count{
job="polkadot",
pod="polkadot-abcdef01234-abcdef",
instance="polkadot-abcdef01234-abcdef",
- series: 'substrate_sub_libp2p_peers_count{
job="substrate",
pod="substrate-abcdef01234-abcdef",
instance="substrate-abcdef01234-abcdef",
}'
values: '3 2+0x4 1+0x9' # 3 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1

- series: 'polkadot_sub_txpool_validations_scheduled{
job="polkadot",
pod="polkadot-abcdef01234-abcdef",
instance="polkadot-abcdef01234-abcdef",
- series: 'substrate_sub_txpool_validations_scheduled{
job="substrate",
pod="substrate-abcdef01234-abcdef",
instance="substrate-abcdef01234-abcdef",
}'
values: '11+1x10 22+2x30 10043x5'

- series: 'polkadot_sub_txpool_validations_finished{
job="polkadot",
pod="polkadot-abcdef01234-abcdef",
instance="polkadot-abcdef01234-abcdef",
- series: 'substrate_sub_txpool_validations_finished{
job="substrate",
pod="substrate-abcdef01234-abcdef",
instance="substrate-abcdef01234-abcdef",
}'
values: '0+1x42 42x5'

- series: 'polkadot_block_height{
status="best", job="polkadot",
pod="polkadot-abcdef01234-abcdef",
instance="polkadot-abcdef01234-abcdef",
- series: 'substrate_block_height{
status="best", job="substrate",
pod="substrate-abcdef01234-abcdef",
instance="substrate-abcdef01234-abcdef",
}'
values: '1+1x3 4+0x13' # 1 2 3 4 4 4 4 4 4 4 4 4 ...

- series: 'polkadot_block_height{
- series: 'substrate_block_height{
status="finalized",
job="polkadot",
pod="polkadot-abcdef01234-abcdef",
instance="polkadot-abcdef01234-abcdef",
job="substrate",
pod="substrate-abcdef01234-abcdef",
instance="substrate-abcdef01234-abcdef",
}'
values: '1+1x3 4+0x13' # 1 2 3 4 4 4 4 4 4 4 4 4 ...

Expand All @@ -56,37 +56,37 @@ tests:
exp_alerts:
- exp_labels:
severity: warning
pod: polkadot-abcdef01234-abcdef
instance: polkadot-abcdef01234-abcdef
job: polkadot
pod: substrate-abcdef01234-abcdef
instance: substrate-abcdef01234-abcdef
job: substrate
status: best
exp_annotations:
message: "Best block on instance
polkadot-abcdef01234-abcdef increases by less than 1 per
substrate-abcdef01234-abcdef increases by less than 1 per
minute for more than 3 minutes."

- eval_time: 14m
alertname: BlockProductionSlow
exp_alerts:
- exp_labels:
severity: warning
pod: polkadot-abcdef01234-abcdef
instance: polkadot-abcdef01234-abcdef
job: polkadot
pod: substrate-abcdef01234-abcdef
instance: substrate-abcdef01234-abcdef
job: substrate
status: best
exp_annotations:
message: "Best block on instance
polkadot-abcdef01234-abcdef increases by less than 1 per
substrate-abcdef01234-abcdef increases by less than 1 per
minute for more than 3 minutes."
- exp_labels:
severity: critical
pod: polkadot-abcdef01234-abcdef
instance: polkadot-abcdef01234-abcdef
job: polkadot
pod: substrate-abcdef01234-abcdef
instance: substrate-abcdef01234-abcdef
job: substrate
status: best
exp_annotations:
message: "Best block on instance
polkadot-abcdef01234-abcdef increases by less than 1 per
substrate-abcdef01234-abcdef increases by less than 1 per
minute for more than 10 minutes."

######################################################################
Expand All @@ -101,37 +101,37 @@ tests:
exp_alerts:
- exp_labels:
severity: warning
pod: polkadot-abcdef01234-abcdef
instance: polkadot-abcdef01234-abcdef
job: polkadot
pod: substrate-abcdef01234-abcdef
instance: substrate-abcdef01234-abcdef
job: substrate
status: finalized
exp_annotations:
message: "Finalized block on instance
polkadot-abcdef01234-abcdef increases by less than 1 per
substrate-abcdef01234-abcdef increases by less than 1 per
minute for more than 3 minutes."

- eval_time: 14m
alertname: BlockFinalizationSlow
exp_alerts:
- exp_labels:
severity: warning
pod: polkadot-abcdef01234-abcdef
instance: polkadot-abcdef01234-abcdef
job: polkadot
pod: substrate-abcdef01234-abcdef
instance: substrate-abcdef01234-abcdef
job: substrate
status: finalized
exp_annotations:
message: "Finalized block on instance
polkadot-abcdef01234-abcdef increases by less than 1 per
substrate-abcdef01234-abcdef increases by less than 1 per
minute for more than 3 minutes."
- exp_labels:
severity: critical
pod: polkadot-abcdef01234-abcdef
instance: polkadot-abcdef01234-abcdef
job: polkadot
pod: substrate-abcdef01234-abcdef
instance: substrate-abcdef01234-abcdef
job: substrate
status: finalized
exp_annotations:
message: "Finalized block on instance
polkadot-abcdef01234-abcdef increases by less than 1 per
substrate-abcdef01234-abcdef increases by less than 1 per
minute for more than 10 minutes."

######################################################################
Expand All @@ -152,12 +152,12 @@ tests:
exp_alerts:
- exp_labels:
severity: warning
pod: polkadot-abcdef01234-abcdef
instance: polkadot-abcdef01234-abcdef
job: polkadot
pod: substrate-abcdef01234-abcdef
instance: substrate-abcdef01234-abcdef
job: substrate
exp_annotations:
message: "The transaction pool size on node
polkadot-abcdef01234-abcdef has been monotonically
substrate-abcdef01234-abcdef has been monotonically
increasing for more than 10 minutes."
- eval_time: 43m
alertname: TransactionQueueSizeIncreasing
Expand All @@ -167,21 +167,21 @@ tests:
exp_alerts:
- exp_labels:
severity: warning
pod: polkadot-abcdef01234-abcdef
instance: polkadot-abcdef01234-abcdef
job: polkadot
pod: substrate-abcdef01234-abcdef
instance: substrate-abcdef01234-abcdef
job: substrate
exp_annotations:
message: "The transaction pool size on node
polkadot-abcdef01234-abcdef has been monotonically
substrate-abcdef01234-abcdef has been monotonically
increasing for more than 10 minutes."
- exp_labels:
severity: warning
pod: polkadot-abcdef01234-abcdef
instance: polkadot-abcdef01234-abcdef
job: polkadot
pod: substrate-abcdef01234-abcdef
instance: substrate-abcdef01234-abcdef
job: substrate
exp_annotations:
message: "The transaction pool size on node
polkadot-abcdef01234-abcdef has been monotonically
substrate-abcdef01234-abcdef has been monotonically
increasing for more than 30 minutes."
- eval_time: 49m
alertname: TransactionQueueSizeHigh
Expand All @@ -191,12 +191,12 @@ tests:
exp_alerts:
- exp_labels:
severity: warning
pod: polkadot-abcdef01234-abcdef
instance: polkadot-abcdef01234-abcdef
job: polkadot
pod: substrate-abcdef01234-abcdef
instance: substrate-abcdef01234-abcdef
job: substrate
exp_annotations:
message: "The transaction pool size on node
polkadot-abcdef01234-abcdef has been above 10_000 for more
substrate-abcdef01234-abcdef has been above 10_000 for more
than 5 minutes."

######################################################################
Expand All @@ -211,29 +211,29 @@ tests:
exp_alerts:
- exp_labels:
severity: warning
pod: polkadot-abcdef01234-abcdef
instance: polkadot-abcdef01234-abcdef
job: polkadot
pod: substrate-abcdef01234-abcdef
instance: substrate-abcdef01234-abcdef
job: substrate
exp_annotations:
message: "The node polkadot-abcdef01234-abcdef has less
message: "The node substrate-abcdef01234-abcdef has less
than 3 peers for more than 3 minutes"

- eval_time: 16m # Values: 3 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1
alertname: NumberOfPeersLow
exp_alerts:
- exp_labels:
severity: warning
pod: polkadot-abcdef01234-abcdef
instance: polkadot-abcdef01234-abcdef
job: polkadot
pod: substrate-abcdef01234-abcdef
instance: substrate-abcdef01234-abcdef
job: substrate
exp_annotations:
message: "The node polkadot-abcdef01234-abcdef has less
message: "The node substrate-abcdef01234-abcdef has less
than 3 peers for more than 3 minutes"
- exp_labels:
severity: critical
pod: polkadot-abcdef01234-abcdef
instance: polkadot-abcdef01234-abcdef
job: polkadot
pod: substrate-abcdef01234-abcdef
instance: substrate-abcdef01234-abcdef
job: substrate
exp_annotations:
message: "The node polkadot-abcdef01234-abcdef has less
message: "The node substrate-abcdef01234-abcdef has less
than 3 peers for more than 15 minutes"
Loading

0 comments on commit 634f8b3

Please sign in to comment.