Skip to content

Commit

Permalink
Identifying the nodes/hosts in alerts
Browse files Browse the repository at this point in the history
  • Loading branch information
bvis committed Aug 30, 2017
1 parent f5710b0 commit 2d522af
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 20 deletions.
28 changes: 14 additions & 14 deletions rootfs/etc/prometheus/alert.rules_nodes
@@ -1,48 +1,48 @@

ALERT high_cpu_usage_on_node
IF sum(rate(process_cpu_seconds_total[5m])) by (instance) * 100 > 70
IF (sum(rate(process_cpu_seconds_total[5m])) by (instance) * 100 > 70) * ON(instance) GROUP_LEFT(host) host * ON(instance) GROUP_LEFT(node) host
FOR 5m
ANNOTATIONS {
summary = "HIGH CPU USAGE WARNING ON '{{ $labels.instance }}'",
description = "{{ $labels.instance }} is using a LOT of CPU. CPU usage is {{ humanize $value}}%.",
summary = "HIGH CPU USAGE WARNING ON '{{ $labels.host }}'",
description = "{{ $labels.host }} ({{ $labels.node }}) is using a LOT of CPU. CPU usage is {{ humanize $value}}%.",
}

ALERT high_memory_usage_on_node
IF ((node_memory_MemTotal-node_memory_MemAvailable)/node_memory_MemTotal)*100 > 80
IF (((node_memory_MemTotal-node_memory_MemAvailable)/node_memory_MemTotal)*100 > 80) * ON(instance) GROUP_LEFT(host) host * ON(instance) GROUP_LEFT(node) host
FOR 5m
ANNOTATIONS {
summary = "HIGH MEMORY USAGE WARNING TASK ON '{{ $labels.host }}'",
description = "{{ $labels.instance }} is using a LOT of MEMORY. MEMORY usage is over {{ humanize $value}}%.",
description = "{{ $labels.host }} ({{ $labels.node }}) is using a LOT of MEMORY. MEMORY usage is over {{ humanize $value}}%.",
}

ALERT high_la_usage_on_node
IF node_load5 > 5
IF (node_load5 > 5) * ON(instance) GROUP_LEFT(host) host * ON(instance) GROUP_LEFT(node) host
FOR 5m
ANNOTATIONS {
summary = "HIGH LOAD AVERAGE WARNING ON '{{ $labels.instance }}'",
description = "{{ $labels.instance }} has a high load average. Load Average 5m is {{ humanize $value}}.",
summary = "HIGH LOAD AVERAGE WARNING ON '{{ $labels.host }}'",
description = "{{ $labels.host }} ({{ $labels.node }}) has a high load average. Load Average 5m is {{ humanize $value}}.",
}

ALERT monitoring_service_down
IF up == 0
IF (up == 0) * ON(instance) GROUP_LEFT(host) host * ON(instance) GROUP_LEFT(node) host
FOR 5m
ANNOTATIONS {
summary = "MONITORING SERVICE DOWN WARNING: NODE '{{ $labels.host }}'",
description = "The monitoring service '{{ $labels.job }}' is down.",
description = "The monitoring service '{{ $labels.job }}' is down in host {{ $labels.host }} ({{ $labels.node }}).",
}

ALERT node_running_out_of_disk_space
IF (node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"}) * 100/ node_filesystem_size{mountpoint="/"} > 80
IF ((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"}) * 100/ node_filesystem_size{mountpoint="/"} > 80) * ON(instance) GROUP_LEFT(host) host * ON(instance) GROUP_LEFT(node) host
FOR 5m
ANNOTATIONS {
summary = "LOW DISK SPACE WARING: NODE '{{ $labels.host }}'",
description = "More than 80% of disk used. Disk usage {{ humanize $value }}%.",
description = "More than 80% of disk used in {{ $labels.host }} ({{ $labels.node }}). Disk usage {{ humanize $value }}%.",
}

ALERT disk_will_fill_in_8_hours
IF predict_linear(node_filesystem_free{mountpoint="/"}[1h], 8*3600) < 0
IF (predict_linear(node_filesystem_free{mountpoint="/"}[1h], 8*3600) < 0) * ON(instance) GROUP_LEFT(host) host * ON(instance) GROUP_LEFT(node) host
FOR 5m
ANNOTATIONS {
summary = "DISK SPACE FULL IN 8 HOURS: NODE '{{ $labels.host }}'",
description = "{{ $labels.instance }} is writing a lot.",
description = "{{ $labels.host }} ({{ $labels.node }}) is writing a lot.",
}
12 changes: 6 additions & 6 deletions rootfs/etc/prometheus/alert.rules_tasks
@@ -1,16 +1,16 @@

ALERT high_cpu_usage_on_container
IF sum(rate(container_cpu_usage_seconds_total{container_label_com_docker_swarm_task_name=~".+"}[5m])) by (container_label_com_docker_swarm_task_name,instance) * 100 > 200
IF sum(rate(container_cpu_usage_seconds_total{container_label_com_docker_swarm_task_name=~".+"}[5m])) by (container_label_com_docker_swarm_task_name,container_label_com_docker_swarm_node_id) * 100 > 200
FOR 5m
ANNOTATIONS {
summary = "HIGH CPU USAGE WARNING: TASK '{{ $labels.container_label_com_docker_swarm_task_name }}' on '{{ $labels.instance }}'",
description = "{{ $labels.container_label_com_docker_swarm_task_name }} is using a LOT of CPU. CPU usage is {{ humanize $value}}%.",
summary = "HIGH CPU USAGE WARNING: TASK '{{ $labels.container_label_com_docker_swarm_task_name }}' on '{{ $labels.container_label_com_docker_swarm_node_id }}'",
description = "{{ $labels.container_label_com_docker_swarm_task_name }} is using a LOT of CPU in node {{ $labels.container_label_com_docker_swarm_node_id }}. CPU usage is {{ humanize $value}}%.",
}

ALERT container_eating_memory
IF sum(container_memory_rss{container_label_com_docker_swarm_task_name=~".+"}) by (instance,name) > 2500000000
IF sum(container_memory_rss{container_label_com_docker_swarm_task_name=~".+"}) by (container_label_com_docker_swarm_task_name,container_label_com_docker_swarm_node_id) > 2500000000
FOR 5m
ANNOTATIONS {
summary = "HIGH MEMORY USAGE WARNING: TASK '{{ $labels.container_label_com_docker_swarm_task_name }}' on '{{ $labels.instance }}'",
description = "{{ $labels.container_label_com_docker_swarm_task_name }} is eating up a LOT of memory. Memory consumption of {{ $labels.container_label_com_docker_swarm_task_name }} is at {{ humanize $value}}.",
summary = "HIGH MEMORY USAGE WARNING: TASK '{{ $labels.container_label_com_docker_swarm_task_name }}' on '{{ $labels.container_label_com_docker_swarm_node_id }}'",
description = "{{ $labels.container_label_com_docker_swarm_task_name }} is eating up a LOT of memory. Memory consumption in node {{ $labels.container_label_com_docker_swarm_node_id }} is at {{ humanize $value}}.",
}

0 comments on commit 2d522af

Please sign in to comment.