-
Notifications
You must be signed in to change notification settings - Fork 37
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #140 from aws-observability/feature/istioMonitoring
Istio Monitoring Pattern with AMP and AMG
- Loading branch information
Showing
7 changed files
with
443 additions
and
8 deletions.
There are no files selected for viewing
113 changes: 113 additions & 0 deletions
113
lib/common/resources/amp-config/istio/alerting-rules.yml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,113 @@ | ||
groups: | ||
- name: "istio.basic.alerting-rules" | ||
rules: | ||
- alert: IngressTrafficMissing | ||
annotations: | ||
summary: 'ingress gateway traffic missing' | ||
description: '[Critical]: ingress gateway traffic missing, likely other monitors are misleading, check client logs' | ||
expr: > | ||
absent(istio_requests_total{destination_service_namespace=~"service-graph.*",reporter="source",source_workload="istio-ingressgateway"})==1 | ||
for: 5m | ||
# Fires when either core Istio telemetry series has been absent for 5 minutes. | ||
# The request counter is istio_requests_total (plural), as used by the other rules | ||
# in this file; a misspelled name makes absent() return 1 permanently. | ||
- alert: IstioMetricsMissing | ||
annotations: | ||
summary: 'Istio Metrics missing' | ||
description: '[Critical]: Check prometheus deployment or whether the prometheus filters are applied correctly' | ||
expr: > | ||
absent(istio_requests_total)==1 or absent(istio_request_duration_milliseconds_bucket)==1 | ||
for: 5m | ||
- name: "istio.workload.alerting-rules" | ||
rules: | ||
- alert: HTTP5xxRateHigh | ||
annotations: | ||
summary: '5xx rate too high' | ||
description: 'The HTTP 5xx errors rate higher than 0.05 in 5 mins' | ||
expr: > | ||
sum(irate(istio_requests_total{reporter="destination", response_code=~"5.*"}[5m])) / sum(irate(istio_requests_total{reporter="destination"}[5m])) > 0.05 | ||
for: 5m | ||
# P99 request latency (ms) per source workload matching "svc.*", held above 160 for 10 minutes. | ||
# Annotations use plain Prometheus templating ({{ $labels.x }}) like the other rules in this | ||
# file; the Helm-escaped form would be emitted verbatim instead of being expanded. | ||
- alert: WorkloadLatencyP99High | ||
expr: histogram_quantile(0.99, sum(irate(istio_request_duration_milliseconds_bucket{source_workload=~"svc.*"}[5m])) by (source_workload,namespace, le)) > 160 | ||
for: 10m | ||
annotations: | ||
description: 'The workload request latency P99 > 160ms ' | ||
message: "Request duration has slowed down for workload: {{ $labels.source_workload }} in namespace: {{ $labels.namespace }}. Response duration is {{ $value }} milliseconds" | ||
# P99 request latency (ms) per source workload matching "istio.*", held above 250 for 10 minutes. | ||
# Annotations use plain Prometheus templating ({{ $labels.x }}) like the other rules in this | ||
# file; the Helm-escaped form would be emitted verbatim instead of being expanded. | ||
- alert: IngressLatencyP99High | ||
expr: histogram_quantile(0.99, sum(irate(istio_request_duration_milliseconds_bucket{source_workload=~"istio.*"}[5m])) by (source_workload,namespace, le)) > 250 | ||
for: 10m | ||
annotations: | ||
description: 'The ingress latency P99 > 250ms ' | ||
message: "Request duration has slowed down for ingress: {{ $labels.source_workload }} in namespace: {{ $labels.namespace }}. Response duration is {{ $value }} milliseconds" | ||
- name: "istio.infra.alerting-rules" | ||
rules: | ||
# Per (namespace, pod, container) CPU usage of istio-proxy sidecars outside kube-system: | ||
# the 5m rate of container_cpu_usage_seconds_total, multiplied by 100, compared to 80. | ||
# NOTE(review): the expression is not divided by any CPU limit or request, so the value | ||
# is CPU-seconds per second x 100 rather than a share of the container's limit - confirm | ||
# that this matches the "above 80%" wording in the description. | ||
- alert: ProxyContainerCPUUsageHigh | ||
expr: (sum(rate(container_cpu_usage_seconds_total{namespace!="kube-system", container=~"istio-proxy", namespace!=""}[5m])) BY (namespace, pod, container) * 100) > 80 | ||
for: 5m | ||
annotations: | ||
summary: "Proxy Container CPU usage (namespace {{ $labels.namespace }}) (pod {{ $labels.pod }}) (container {{ $labels.container }}) VALUE = {{ $value }}\n" | ||
description: "Proxy Container CPU usage is above 80%" | ||
# Working-set memory of istio-proxy sidecars as a percentage of the container memory limit, | ||
# per (container, pod, namespace), compared to 80 for 5 minutes. | ||
# The "> 0" filter on the denominator drops series whose summed limit is 0, so those | ||
# containers produce no result instead of a division by zero. | ||
- alert: ProxyContainerMemoryUsageHigh | ||
expr: (sum(container_memory_working_set_bytes{namespace!="kube-system", container=~"istio-proxy", namespace!=""}) BY (container, pod, namespace) / (sum(container_spec_memory_limit_bytes{namespace!="kube-system", container!="POD"}) BY (container, pod, namespace) > 0)* 100) > 80 | ||
for: 5m | ||
annotations: | ||
summary: "Proxy Container Memory usage (namespace {{ $labels.namespace }}) (pod {{ $labels.pod }}) (container {{ $labels.container }}) VALUE = {{ $value }}\n" | ||
description: "Proxy Container Memory usage is above 80%" | ||
- alert: IngressMemoryUsageIncreaseRateHigh | ||
expr: avg(deriv(container_memory_working_set_bytes{container=~"istio-proxy",namespace="istio-system"}[60m])) > 200 | ||
for: 180m | ||
annotations: | ||
summary: "Ingress proxy Memory change rate, VALUE = {{ $value }}\n" | ||
description: "Ingress proxy Memory Usage increases more than 200 Bytes/sec" | ||
# CPU usage of the istiod "discovery" container in istio-system: the 5m rate of | ||
# container_cpu_usage_seconds_total, multiplied by 100, compared to 80. | ||
# Aggregation keeps namespace, pod and container because the summary template reads all | ||
# three labels; grouping by pod alone would drop the other two and render them empty. | ||
- alert: IstiodContainerCPUUsageHigh | ||
expr: (sum(rate(container_cpu_usage_seconds_total{namespace="istio-system", container="discovery"}[5m])) BY (namespace, pod, container) * 100) > 80 | ||
for: 5m | ||
annotations: | ||
summary: "Istiod Container CPU usage (namespace {{ $labels.namespace }}) (pod {{ $labels.pod }}) (container {{ $labels.container }}) VALUE = {{ $value }}\n" | ||
description: "Istiod Container CPU usage is above 80%" | ||
# Working-set memory of the istiod "discovery" container as a percentage of its memory | ||
# limit, compared to 80 for 5 minutes. The "> 0" filter skips series with no limit set. | ||
# Both sides of the ratio keep namespace, pod and container so the labels read by the | ||
# summary template survive the aggregation and the division still matches one-to-one. | ||
- alert: IstiodMemoryUsageHigh | ||
expr: (sum(container_memory_working_set_bytes{namespace="istio-system", container="discovery"}) BY (namespace, pod, container) / (sum(container_spec_memory_limit_bytes{namespace="istio-system", container="discovery"}) BY (namespace, pod, container) > 0)* 100) > 80 | ||
for: 5m | ||
annotations: | ||
summary: "Istiod Container Memory usage (namespace {{ $labels.namespace }}) (pod {{ $labels.pod }}) (container {{ $labels.container }}) VALUE = {{ $value }}\n" | ||
description: "Istiod Container Memory usage is above 80%" | ||
- alert: IstiodMemoryUsageIncreaseRateHigh | ||
expr: sum(deriv(container_memory_working_set_bytes{namespace="istio-system",pod=~"istiod-.*"}[60m])) > 1000 | ||
for: 300m | ||
annotations: | ||
summary: "Istiod Container Memory usage increase rate high, VALUE = {{ $value }}\n" | ||
description: "Istiod Container Memory usage increases more than 1k Bytes/sec" | ||
- name: "istio.controlplane.alerting-rules" | ||
rules: | ||
- alert: IstiodxdsPushErrorsHigh | ||
annotations: | ||
summary: 'istiod push errors is too high' | ||
description: 'istiod push error rate is higher than 0.05' | ||
expr: > | ||
sum(irate(pilot_xds_push_errors{app="istiod"}[5m])) / sum(irate(pilot_xds_pushes{app="istiod"}[5m])) > 0.05 | ||
for: 5m | ||
- alert: IstiodxdsRejectHigh | ||
annotations: | ||
summary: 'istiod rejects rate is too high' | ||
description: 'istiod rejects rate is higher than 0.05' | ||
expr: > | ||
sum(irate(pilot_total_xds_rejects{app="istiod"}[5m])) / sum(irate(pilot_xds_pushes{app="istiod"}[5m])) > 0.05 | ||
for: 5m | ||
- alert: IstiodContainerNotReady | ||
annotations: | ||
summary: 'istiod container not ready' | ||
description: 'container: discovery not running' | ||
expr: > | ||
kube_pod_container_status_running{namespace="istio-system", container="discovery", component=""} == 0 | ||
for: 5m | ||
- alert: IstiodUnavailableReplica | ||
annotations: | ||
summary: 'Istiod unavailable pod' | ||
description: 'Istiod unavailable replica > 0' | ||
expr: > | ||
kube_deployment_status_replicas_unavailable{deployment="istiod", component=""} > 0 | ||
for: 5m | ||
- alert: Ingress200RateLow | ||
annotations: | ||
summary: 'ingress gateway 200 rate drops' | ||
description: 'The expected rate is 100 per ns, the limit is set based on 15ns' | ||
expr: > | ||
sum(rate(istio_requests_total{reporter="source", source_workload="istio-ingressgateway",response_code="200",destination_service_namespace=~"service-graph.*"}[5m])) < 1490 | ||
for: 30m |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
groups: | ||
- name: "istio.recording-rules" | ||
interval: 5s | ||
rules: | ||
- record: "workload:istio_requests_total" | ||
expr: | | ||
sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_requests_total) | ||
- record: "workload:istio_request_duration_milliseconds_count" | ||
expr: | | ||
sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_request_duration_milliseconds_count) | ||
- record: "workload:istio_request_duration_milliseconds_sum" | ||
expr: | | ||
sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_request_duration_milliseconds_sum) | ||
- record: "workload:istio_request_duration_milliseconds_bucket" | ||
expr: | | ||
sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_request_duration_milliseconds_bucket) | ||
- record: "workload:istio_request_bytes_count" | ||
expr: | | ||
sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_request_bytes_count) | ||
- record: "workload:istio_request_bytes_sum" | ||
expr: | | ||
sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_request_bytes_sum) | ||
- record: "workload:istio_request_bytes_bucket" | ||
expr: | | ||
sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_request_bytes_bucket) | ||
- record: "workload:istio_response_bytes_count" | ||
expr: | | ||
sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_response_bytes_count) | ||
- record: "workload:istio_response_bytes_sum" | ||
expr: | | ||
sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_response_bytes_sum) | ||
- record: "workload:istio_response_bytes_bucket" | ||
expr: | | ||
sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_response_bytes_bucket) | ||
- record: "workload:istio_tcp_sent_bytes_total" | ||
expr: | | ||
sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_tcp_sent_bytes_total) | ||
- record: "workload:istio_tcp_received_bytes_total" | ||
expr: | | ||
sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_tcp_received_bytes_total) | ||
- record: "workload:istio_tcp_connections_opened_total" | ||
expr: | | ||
sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_tcp_connections_opened_total) | ||
- record: "workload:istio_tcp_connections_closed_total" | ||
expr: | | ||
sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_tcp_connections_closed_total) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.