Skip to content

Commit

Permalink
allow disabling default platform and airflow alerts (#1943)
Browse files Browse the repository at this point in the history
  • Loading branch information
danielhoherd authored and pgvishnuram committed Aug 21, 2023
1 parent 9f9255d commit 66e09bf
Show file tree
Hide file tree
Showing 4 changed files with 102 additions and 14 deletions.
10 changes: 10 additions & 0 deletions charts/prometheus/templates/prometheus-alerts-configmap.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,13 @@ data:
groups:
- name: airflow
rules:
{{- if and (not .Values.additionalAlerts.airflow) (not .Values.defaultAlerts.airflow.enabled) }}
[]
{{ end }}
{{- if .Values.additionalAlerts.airflow }}
{{- tpl .Values.additionalAlerts.airflow $ | nindent 8 }}
{{- end }}
{{- if .Values.defaultAlerts.airflow.enabled }}
- alert: AirflowDeploymentUnhealthy
expr: sum by(release) (kube_pod_container_status_running{container=~".*(scheduler|scheduler-gc|webserver|worker|statsd|pgbouncer|metrics-exporter|redis|flower|triggerer)"}) - count by(release) (kube_pod_container_status_running{container=~".*(scheduler|scheduler-gc|webserver|worker|statsd|pgbouncer|metrics-exporter|redis|flower|triggerer)"}) < 0
for: 15m # Rough number but should be enough to clear deployments with a reasonable amount of workers
Expand Down Expand Up @@ -121,12 +125,17 @@ data:
annotations:
summary: {{ printf "%q" "{{ $labels.pod_name }} ({{ $labels.container_name }}) in namespace {{ $labels.namespace }} is getting throttled {{ $value }}% of the time" }}
description: "In the past 5 minutes, one or more components in the deployment are experiencing CPU throttling."
{{- end }}
- name: platform
rules:
{{- if and (not .Values.additionalAlerts.platform) (not .Values.defaultAlerts.platform.enabled) }}
[]
{{- end }}
{{- if .Values.additionalAlerts.platform }}
{{- tpl .Values.additionalAlerts.platform $ | nindent 8 }}
{{- end }}
{{- if .Values.defaultAlerts.platform.enabled }}
- alert: AirflowOperatorFailureRate
expr: 100 * (sum by (operator) (increase(airflow_operator_failures{operator=~"([A-Z][a-z0-9]+)(([0-9])|([A-Z0-9][a-z0-9]+))*([A-Z])?Operator"}[1h])) / (sum by (operator) (increase(airflow_operator_successes{operator=~"([A-Z][a-z0-9]+)(([0-9])|([A-Z0-9][a-z0-9]+))*([A-Z])?Operator"}[1h])) + sum by (operator) (increase(airflow_operator_failures{operator=~"([A-Z][a-z0-9]+)(([0-9])|([A-Z0-9][a-z0-9]+))*([A-Z])?Operator"}[1h])))) > 50
for: 2h
Expand Down Expand Up @@ -158,6 +167,7 @@ data:
annotations:
summary: "Half or more of schedulers do not have a heartbeat"
description: {{ printf "%q" "{{ $value }} schedulers do not have a heartbeat in the last five minutes" }}
{{- end }}
- name: node-exporter-recording.rules
rules:
Expand Down
13 changes: 11 additions & 2 deletions charts/prometheus/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -124,11 +124,20 @@ tcpProbe:
# Enable prometheus lifecycle api
enableLifecycle: true

# This section allows you to disable parts of the default set of alerts. Use
# this in combination with additionalAlerts if you want prometheus to only
# use the alerts you have configured.
defaultAlerts:
airflow:
enabled: true
platform:
enabled: true

additionalAlerts:
# Additional rules for the 'platform' alert group
# Additional rules appended to the default 'platform' alert group
# Provide as a block string in yaml list form
platform: ~
# Additional rules for the 'airflow' alert group
# Additional rules appended to the default 'airflow' alert group
# Provide as a block string in yaml list form
airflow: ~
# Example:
Expand Down
29 changes: 17 additions & 12 deletions tests/chart_tests/helm_template_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from tempfile import NamedTemporaryFile
from typing import Any, Optional
from pathlib import Path
import os

import jsonschema
import requests
Expand All @@ -32,6 +33,7 @@

# Base location of the published Kubernetes JSON schemas.
BASE_URL_SPEC = "https://raw.githubusercontent.com/yannh/kubernetes-json-schema/master"
# Three directory levels above this file.
GIT_ROOT = Path(__file__).parent.parent.parent
# True when the DEBUG environment variable is set to yes/true/1 (case-insensitive).
DEBUG = os.environ.get("DEBUG", "").lower() in ("yes", "true", "1")


def get_schema_k8s(api_version, kind, kube_version="1.24.0"):
Expand Down Expand Up @@ -89,7 +91,7 @@ def render_chart(
"""
values = values or {}
chart_dir = chart_dir or sys.path[0]
with NamedTemporaryFile(delete=True) as tmp_file: # use delete=False when debugging
with NamedTemporaryFile(delete=not DEBUG) as tmp_file: # export DEBUG=true to keep
content = yaml.dump(values)
tmp_file.write(content.encode())
tmp_file.flush()
Expand Down Expand Up @@ -117,19 +119,22 @@ def render_chart(
if not templates:
return None
except subprocess.CalledProcessError as error:
print("ERROR: subprocess.CalledProcessError:")
print(f"helm command: {' '.join(command)}")
print(f"Values file contents:\n{'-' * 21}\n{yaml.dump(values)}{'-' * 21}")
print(f"{error.output=}\n{error.stderr=}")

if "could not find template" in error.stderr.decode("utf-8"):
if DEBUG:
print("ERROR: subprocess.CalledProcessError:")
print(f"helm command: {' '.join(command)}")
print(
"ERROR: command is probably using templates with null output, which "
+ "usually means there is a helm value that needs to be set to render "
+ "the content of the chart.\n"
+ "command: "
+ " ".join(command)
f"Values file contents:\n{'-' * 21}\n{yaml.dump(values)}{'-' * 21}"
)
print(f"{error.output=}\n{error.stderr=}")

if "could not find template" in error.stderr.decode("utf-8"):
print(
"ERROR: command is probably using templates with null output, which "
+ "usually means there is a helm value that needs to be set to render "
+ "the content of the chart.\n"
+ "command: "
+ " ".join(command)
)
raise
k8s_objects = yaml.full_load_all(templates)
k8s_objects = [k8s_object for k8s_object in k8s_objects if k8s_object] # type: ignore
Expand Down
64 changes: 64 additions & 0 deletions tests/chart_tests/test_prometheus_alerts_configmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from tests import supported_k8s_versions
import re
import pytest
from textwrap import dedent


@pytest.mark.parametrize(
Expand Down Expand Up @@ -53,6 +54,7 @@ def test_prometheus_alerts_configmap(self, kube_version):

# Validate the contents of an embedded yaml doc
groups = yaml.safe_load(doc["data"]["alerts"])["groups"]
assert len(groups) == 22
for group in groups:
assert isinstance(group.get("name"), str)
assert isinstance(group.get("rules"), list)
Expand Down Expand Up @@ -104,3 +106,65 @@ def test_prometheus_alerts_configmap_with_addition_alerts(self, kube_version):
r".*If more than 2 Airflow Schedulers are not heartbeating for more than 5 minutes, this alarm fires..*",
config_yaml,
)

@pytest.mark.parametrize("section", ["airflow", "platform"])
def test_default_alerts_section_disabled(self, kube_version, section):
    """Render with one section's default alerts disabled.

    Checks that the group named ``section`` ends up with an empty rules
    list while the total group count (22) and the number of remaining
    groups (21) are unchanged.
    """
    disabled_section = {section: {"enabled": False}}
    docs = render_chart(
        kube_version=kube_version,
        show_only=self.show_only,
        values={"prometheus": {"defaultAlerts": disabled_section}},
    )

    alert_groups = yaml.safe_load(docs[0]["data"]["alerts"])["groups"]
    section_rules = [g["rules"] for g in alert_groups if g["name"] == section]
    other_rules = [g["rules"] for g in alert_groups if g["name"] != section]

    assert len(alert_groups) == 22
    assert section_rules == [[]]
    assert len(other_rules) == 21

@pytest.mark.parametrize("section", ["airflow", "platform"])
def test_default_alerts_section_disabled_with_additional_alerts(
    self, kube_version, section
):
    """Should only show the additional alert rules for the given section.

    Renders the chart with the section's default alerts disabled and a
    single additional alert supplied for that same section, then checks
    that the section's group holds exactly that alert and that no other
    group's rules equal it.
    """
    expected_rules = [{"alert": "some-happy-alert", "expr": "sum(all-happiness)"}]
    values = {
        "prometheus": {
            "defaultAlerts": {
                section: {
                    "enabled": False,
                }
            },
            "additionalAlerts": {
                section: dedent(
                    """
                    - alert: some-happy-alert
                      expr: sum(all-happiness)
                    """
                )
            },
        }
    }
    docs = render_chart(
        kube_version=kube_version,
        show_only=self.show_only,
        values=values,
    )

    groups = yaml.safe_load(docs[0]["data"]["alerts"])["groups"]
    assert len(groups) == 22
    assert [x["rules"] for x in groups if x["name"] == section] == [expected_rules]
    # Asserting a bare list of comparison results passes for any non-empty
    # list, even if every element is False; all() makes each comparison count.
    assert all(x["rules"] != expected_rules for x in groups if x["name"] != section)

0 comments on commit 66e09bf

Please sign in to comment.