# Default values for Prometheus.
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.
images:
  init:
    repository: astronomerinc/ap-base
    tag: 0.9.0-alpha.1
    pullPolicy: IfNotPresent
  prometheus:
    repository: astronomerinc/ap-prometheus
    tag: 0.9.0-alpha.1
    pullPolicy: IfNotPresent
resources: {}
  # limits:
  #   cpu: 100m
  #   memory: 128Mi
  # requests:
  #   cpu: 100m
  #   memory: 128Mi
# Directory where Prometheus data should be stored
dataDir: "/prometheus"
# How long Prometheus should keep data before removing it
retention: 15d
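# dataDir and retention presumably map to Prometheus's --storage.tsdb.path and
# --storage.tsdb.retention flags in the chart's statefulset template (an assumption
# about the template, not confirmed from this file).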
# Enable persistence for Prometheus
persistence:
  enabled: true
  size: 100Gi
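  # Leaving storageClassName as ~ (null) typically lets the PVC fall back to the
  # cluster's default StorageClass (assumes the template omits the field when null).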
  storageClassName: ~
ports:
  http: 9090
# Define alerts as values. This map gets toYaml'ed into the prometheus alerts
# configmap. This is required because Helm would otherwise try to parse the
# {{ $labels }} template strings in these rules and error out.
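#
# A minimal sketch of how a configmap template might consume this map (the file
# layout and key names below are assumptions, not taken from this chart):
#
#   data:
#     alerts: |
# {{ toYaml .Values.alerts | indent 4 }}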
alerts:
  groups:
    - name: platform
      rules:
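        # The disk-usage alerts below rely on kubelet_volume_stats_* metrics, which
        # the kubelet exposes for mounted PVCs and which Prometheus must be scraping.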
        - alert: PrometheusDiskUsage
          expr: (kubelet_volume_stats_available_bytes{persistentvolumeclaim=~"prometheus-data-volume-.*"} / kubelet_volume_stats_capacity_bytes{persistentvolumeclaim=~"prometheus-data-volume-.*"} * 100) < 10
          for: 5m
          labels:
            tier: platform
            component: prometheus
          annotations:
            summary: "Prometheus High Disk Usage"
            description: "Prometheus has less than 10% disk space available."
        - alert: RegistryDiskUsage
          expr: (kubelet_volume_stats_available_bytes{persistentvolumeclaim=~"registry-data-volume-.*"} / kubelet_volume_stats_capacity_bytes{persistentvolumeclaim=~"registry-data-volume-.*"} * 100) < 10
          for: 5m
          labels:
            tier: platform
            component: registry
          annotations:
            summary: "Docker Registry High Disk Usage"
            description: "Docker Registry has less than 10% disk space available."
        - alert: ElasticsearchDiskUsage
          expr: (sum(kubelet_volume_stats_available_bytes{persistentvolumeclaim=~"es-data-volume-.*"}) / sum(kubelet_volume_stats_capacity_bytes{persistentvolumeclaim=~"es-data-volume-.*"}) * 100) < 10
          for: 5m
          labels:
            tier: platform
            component: elasticsearch
          annotations:
            summary: "Elasticsearch High Disk Usage"
            description: "Elasticsearch has less than 10% disk space available."
        - alert: IngressCertificateExpiration
          expr: avg(nginx_ingress_controller_ssl_expire_time_seconds) by (host) - time() < 604800
          for: 10s
          labels:
            tier: platform
            component: ingress
          annotations:
            summary: "TLS Certificate Expiring Soon"
            description: "The TLS Certificate for {{ $labels.host }} is expiring in less than a week."
    - name: airflow
      rules:
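        # The namespace regex used in the expressions below (.*-.*-.*-[0-9]{4}) appears
        # intended to match Airflow deployment namespaces: a release name followed by a
        # four-digit suffix.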
        - alert: AirflowDeploymentUnhealthy
          expr: sum by (release) (kube_pod_container_status_running{namespace=~".*-.*-.*-[0-9]{4}"}) - count by (release) (kube_pod_container_status_running{namespace=~".*-.*-.*-[0-9]{4}"}) < 0
          for: 15m # Rough number, but it should be enough to clear deployments with a reasonable number of workers
          labels:
            tier: airflow
            component: deployment
            deployment: "{{ $labels.release }}"
          annotations:
            summary: "{{ $labels.release }} deployment is unhealthy"
            description: "The {{ $labels.release }} deployment is not completely available."
        - alert: AirflowFailureRate
          expr: rate(airflow_operator_failures[1h]) > rate(airflow_operator_failures[2d])
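          # Fires when the operator failure rate over the last hour exceeds the two-day baseline rate.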
          for: 5m
          labels:
            tier: airflow
            component: scheduler
            deployment: "{{ $labels.deployment }}"
          annotations:
            summary: "{{ $labels.deployment }} tasks are failing"
            description: "The {{ $labels.deployment }} deployment has a high task failure rate."
        # This alert depends on the scheduler_heartbeat metric being a counter.
        # The type filter here was introduced in 0.7.0, so we don't trigger alerts for
        # older deployments.
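        # Assuming the scheduler heartbeats roughly every 5 seconds (the Airflow default),
        # rate() * 5 approximates heartbeats per 5s interval, so a healthy scheduler rounds to >= 1.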
        - alert: AirflowSchedulerUnhealthy
          expr: round(rate(airflow_scheduler_heartbeat{type="counter"}[1m]) * 5) == 0
          for: 3m # Scheduler should reboot quickly
          labels:
            tier: airflow
            component: scheduler
            deployment: "{{ $labels.deployment }}"
          annotations:
            summary: "{{ $labels.deployment }} scheduler is unhealthy"
            description: "The {{ $labels.deployment }} scheduler's heartbeat has dropped below the acceptable rate."
        - alert: AirflowPodQuota
          expr: (sum by (release) (kube_resourcequota{resource="pods", type="used", namespace=~".*-.*-.*-[0-9]{4}"}) / sum by (release) (kube_resourcequota{resource="pods", type="hard", namespace=~".*-.*-.*-[0-9]{4}"})*100) > 95
          for: 10m
          labels:
            tier: airflow
            component: deployment
            deployment: "{{ $labels.release }}"
          annotations:
            summary: "{{ $labels.release }} is near its pod quota"
            description: "{{ $labels.release }} has been using over 95% of its pod quota for over 10 minutes."
        # - alert: AirflowCPUQuota
        #   expr: (sum by (release) (kube_resourcequota{resource="limits.cpu", type="used", namespace=~".*-.*-.*-[0-9]{4}"}) / sum by (release) (kube_resourcequota{resource="limits.cpu", type="hard", namespace=~".*-.*-.*-[0-9]{4}"})*100) > 95
        #   for: 10m
        #   labels:
        #     tier: airflow
        #     component: deployment
        #     deployment: "{{ $labels.release }}"
        #   annotations:
        #     summary: "{{ $labels.release }} is near its cpu quota"
        #     description: "{{ $labels.release }} has been using over 95% of its cpu quota for over 10 minutes."
        # - alert: AirflowMemoryQuota
        #   expr: (sum by (release) (kube_resourcequota{resource="limits.memory", type="used", namespace=~".*-.*-.*-[0-9]{4}"}) / sum by (release) (kube_resourcequota{resource="limits.memory", type="hard", namespace=~".*-.*-.*-[0-9]{4}"})*100) > 95
        #   for: 10m
        #   labels:
        #     tier: airflow
        #     component: deployment
        #     deployment: "{{ $labels.release }}"
        #   annotations:
        #     summary: "{{ $labels.release }} is near its memory quota"
        #     description: "{{ $labels.release }} has been using over 95% of its memory quota for over 10 minutes."