Skip to content

Commit

Permalink
Merge pull request #140 from aws-observability/feature/istioMonitoring
Browse files Browse the repository at this point in the history
Istio Monitoring Pattern with AMP and AMG
  • Loading branch information
freschri authored Jan 25, 2024
2 parents 70b4baf + 213f48a commit 043ab04
Show file tree
Hide file tree
Showing 7 changed files with 443 additions and 8 deletions.
113 changes: 113 additions & 0 deletions lib/common/resources/amp-config/istio/alerting-rules.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
# Alerting rule groups for Istio monitoring.
groups:
# Basic health: alerts that fire when expected Istio telemetry is absent.
- name: "istio.basic.alerting-rules"
  rules:
  # Fires when no request series reported by the ingress gateway (as source)
  # towards namespaces matching "service-graph.*" has existed for 5 minutes.
  - alert: IngressTrafficMissing
    annotations:
      summary: 'ingress gateway traffic missing'
      description: '[Critical]: ingress gateway traffic missing, likely other monitors are misleading, check client logs'
    expr: >
      absent(istio_requests_total{destination_service_namespace=~"service-graph.*",reporter="source",source_workload="istio-ingressgateway"})==1
    for: 5m
  # Fires when either core Istio metric has no series at all for 5 minutes.
  # The request counter is `istio_requests_total` (plural), matching every
  # other rule in this file; the singular spelling never matches a series,
  # which would make absent() return 1 permanently.
  - alert: IstioMetricsMissing
    annotations:
      summary: 'Istio Metrics missing'
      description: '[Critical]: Check prometheus deployment or whether the prometheus filters are applied correctly'
    expr: >
      absent(istio_requests_total)==1 or absent(istio_request_duration_milliseconds_bucket)==1
    for: 5m
# Workload-level alerts: error ratio and P99 latency derived from Istio
# request metrics.
- name: "istio.workload.alerting-rules"
  rules:
  # Share of destination-reported requests with a 5xx response code, compared
  # against a 0.05 threshold, sustained for 5 minutes.
  - alert: HTTP5xxRateHigh
    annotations:
      summary: '5xx rate too high'
      description: 'The HTTP 5xx errors rate higher than 0.05 in 5 mins'
    expr: >
      sum(irate(istio_requests_total{reporter="destination", response_code=~"5.*"}[5m])) / sum(irate(istio_requests_total{reporter="destination"}[5m])) > 0.05
    for: 5m
  # P99 request duration (milliseconds) per source workload/namespace for
  # workloads whose name matches "svc.*".
  # Annotation templates use plain Prometheus syntax ({{ $labels.x }}); the
  # Helm-style backtick escape would be rendered by Prometheus as literal text.
  - alert: WorkloadLatencyP99High
    expr: histogram_quantile(0.99, sum(irate(istio_request_duration_milliseconds_bucket{source_workload=~"svc.*"}[5m])) by (source_workload,namespace, le)) > 160
    for: 10m
    annotations:
      description: 'The workload request latency P99 > 160ms '
      message: "Request duration has slowed down for workload: {{ $labels.source_workload }} in namespace: {{ $labels.namespace }}. Response duration is {{ $value }} milliseconds"
  # Same P99 computation for source workloads whose name matches "istio.*",
  # with a 250 ms threshold.
  - alert: IngressLatencyP99High
    expr: histogram_quantile(0.99, sum(irate(istio_request_duration_milliseconds_bucket{source_workload=~"istio.*"}[5m])) by (source_workload,namespace, le)) > 250
    for: 10m
    annotations:
      description: 'The ingress latency P99 > 250ms '
      message: "Request duration has slowed down for ingress: {{ $labels.source_workload }} in namespace: {{ $labels.namespace }}. Response duration is {{ $value }} milliseconds"
# Infrastructure alerts: CPU and memory of the istio-proxy sidecars and of the
# istiod "discovery" container, based on container_* metrics.
- name: "istio.infra.alerting-rules"
  rules:
  # CPU seconds consumed per second (x100) by each istio-proxy container
  # outside kube-system.
  - alert: ProxyContainerCPUUsageHigh
    expr: (sum(rate(container_cpu_usage_seconds_total{namespace!="kube-system", container=~"istio-proxy", namespace!=""}[5m])) BY (namespace, pod, container) * 100) > 80
    for: 5m
    annotations:
      summary: "Proxy Container CPU usage (namespace {{ $labels.namespace }}) (pod {{ $labels.pod }}) (container {{ $labels.container }}) VALUE = {{ $value }}\n"
      description: "Proxy Container CPU usage is above 80%"
  # Working-set memory as a percentage of the container memory limit; the
  # "> 0" filter drops containers whose limit is 0 to avoid dividing by zero.
  - alert: ProxyContainerMemoryUsageHigh
    expr: (sum(container_memory_working_set_bytes{namespace!="kube-system", container=~"istio-proxy", namespace!=""}) BY (container, pod, namespace) / (sum(container_spec_memory_limit_bytes{namespace!="kube-system", container!="POD"}) BY (container, pod, namespace) > 0)* 100) > 80
    for: 5m
    annotations:
      summary: "Proxy Container Memory usage (namespace {{ $labels.namespace }}) (pod {{ $labels.pod }}) (container {{ $labels.container }}) VALUE = {{ $value }}\n"
      description: "Proxy Container Memory usage is above 80%"
  # Average per-second growth of istio-proxy working-set memory in
  # istio-system, estimated over a 60m window, sustained for 3 hours.
  - alert: IngressMemoryUsageIncreaseRateHigh
    expr: avg(deriv(container_memory_working_set_bytes{container=~"istio-proxy",namespace="istio-system"}[60m])) > 200
    for: 180m
    annotations:
      summary: "Ingress proxy Memory change rate, VALUE = {{ $value }}\n"
      description: "Ingress proxy Memory Usage increases more than 200 Bytes/sec"
  # Aggregates by (namespace, pod, container) rather than (pod) alone so the
  # namespace/container labels referenced in the summary are not dropped.
  # Both labels are pinned by the selector, so the set of series is unchanged.
  - alert: IstiodContainerCPUUsageHigh
    expr: (sum(rate(container_cpu_usage_seconds_total{namespace="istio-system", container="discovery"}[5m])) BY (namespace, pod, container) * 100) > 80
    for: 5m
    annotations:
      summary: "Istiod Container CPU usage (namespace {{ $labels.namespace }}) (pod {{ $labels.pod }}) (container {{ $labels.container }}) VALUE = {{ $value }}\n"
      description: "Istiod Container CPU usage is above 80%"
  # Same label-preserving aggregation on both sides of the division so the
  # operands still match one-to-one.
  - alert: IstiodMemoryUsageHigh
    expr: (sum(container_memory_working_set_bytes{namespace="istio-system", container="discovery"}) BY (namespace, pod, container) / (sum(container_spec_memory_limit_bytes{namespace="istio-system", container="discovery"}) BY (namespace, pod, container) > 0)* 100) > 80
    for: 5m
    annotations:
      summary: "Istiod Container Memory usage (namespace {{ $labels.namespace }}) (pod {{ $labels.pod }}) (container {{ $labels.container }}) VALUE = {{ $value }}\n"
      description: "Istiod Container Memory usage is above 80%"
  # Summed per-second memory growth across pods named "istiod-*", estimated
  # over a 60m window, sustained for 5 hours.
  - alert: IstiodMemoryUsageIncreaseRateHigh
    expr: sum(deriv(container_memory_working_set_bytes{namespace="istio-system",pod=~"istiod-.*"}[60m])) > 1000
    for: 300m
    annotations:
      summary: "Istiod Container Memory usage increase rate high, VALUE = {{ $value }}\n"
      description: "Istiod Container Memory usage increases more than 1k Bytes/sec"
# Control-plane alerts: istiod xDS push health, istiod availability, and the
# rate of successful responses seen at the ingress gateway.
- name: "istio.controlplane.alerting-rules"
  rules:
  # Ratio of xDS push errors to xDS pushes reported by istiod.
  - alert: IstiodxdsPushErrorsHigh
    expr: >
      sum(irate(pilot_xds_push_errors{app="istiod"}[5m])) / sum(irate(pilot_xds_pushes{app="istiod"}[5m])) > 0.05
    for: 5m
    annotations:
      summary: "istiod push errors is too high"
      description: "istiod push error rate is higher than 0.05"
  # Ratio of xDS rejects to xDS pushes reported by istiod.
  - alert: IstiodxdsRejectHigh
    expr: >
      sum(irate(pilot_total_xds_rejects{app="istiod"}[5m])) / sum(irate(pilot_xds_pushes{app="istiod"}[5m])) > 0.05
    for: 5m
    annotations:
      summary: "istiod rejects rate is too high"
      description: "istiod rejects rate is higher than 0.05"
  # The "discovery" container in istio-system reports a running status of 0.
  - alert: IstiodContainerNotReady
    expr: >
      kube_pod_container_status_running{namespace="istio-system", container="discovery", component=""} == 0
    for: 5m
    annotations:
      summary: "istiod container not ready"
      description: "container: discovery not running"
  # The istiod deployment reports at least one unavailable replica.
  - alert: IstiodUnavailableReplica
    expr: >
      kube_deployment_status_replicas_unavailable{deployment="istiod", component=""} > 0
    for: 5m
    annotations:
      summary: "Istiod unavailable pod"
      description: "Istiod unavailable replica > 0"
  # Rate of HTTP 200 responses from the ingress gateway towards
  # "service-graph.*" namespaces falls below 1490/s for 30 minutes.
  # NOTE(review): the threshold appears tied to the 15-namespace test setup
  # mentioned in the description - confirm before reusing elsewhere.
  - alert: Ingress200RateLow
    expr: >
      sum(rate(istio_requests_total{reporter="source", source_workload="istio-ingressgateway",response_code="200",destination_service_namespace=~"service-graph.*"}[5m])) < 1490
    for: 30m
    annotations:
      summary: "ingress gateway 200 rate drops"
      description: "The expected rate is 100 per ns, the limit is set based on 15ns"
59 changes: 59 additions & 0 deletions lib/common/resources/amp-config/istio/recording-rules.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# Recording rules that pre-aggregate Istio metrics under a "workload:" prefix.
# Every rule sums the source metric while dropping the instance,
# kubernetes_namespace and kubernetes_pod_name labels.
groups:
- name: 'istio.recording-rules'
  # Evaluation interval for this group.
  interval: 5s
  rules:
  # --- HTTP/gRPC request count ---
  - record: 'workload:istio_requests_total'
    expr: >
      sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_requests_total)
  # --- Request duration histogram (count / sum / buckets) ---
  - record: 'workload:istio_request_duration_milliseconds_count'
    expr: >
      sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_request_duration_milliseconds_count)
  - record: 'workload:istio_request_duration_milliseconds_sum'
    expr: >
      sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_request_duration_milliseconds_sum)
  - record: 'workload:istio_request_duration_milliseconds_bucket'
    expr: >
      sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_request_duration_milliseconds_bucket)
  # --- Request size histogram ---
  - record: 'workload:istio_request_bytes_count'
    expr: >
      sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_request_bytes_count)
  - record: 'workload:istio_request_bytes_sum'
    expr: >
      sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_request_bytes_sum)
  - record: 'workload:istio_request_bytes_bucket'
    expr: >
      sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_request_bytes_bucket)
  # --- Response size histogram ---
  - record: 'workload:istio_response_bytes_count'
    expr: >
      sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_response_bytes_count)
  - record: 'workload:istio_response_bytes_sum'
    expr: >
      sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_response_bytes_sum)
  - record: 'workload:istio_response_bytes_bucket'
    expr: >
      sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_response_bytes_bucket)
  # --- TCP byte and connection counters ---
  - record: 'workload:istio_tcp_sent_bytes_total'
    expr: >
      sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_tcp_sent_bytes_total)
  - record: 'workload:istio_tcp_received_bytes_total'
    expr: >
      sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_tcp_received_bytes_total)
  - record: 'workload:istio_tcp_connections_opened_total'
    expr: >
      sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_tcp_connections_opened_total)
  - record: 'workload:istio_tcp_connections_closed_total'
    expr: >
      sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_tcp_connections_closed_total)
107 changes: 105 additions & 2 deletions lib/common/resources/otel-collector-config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ spec:
regex: (.+)
target_label: __metrics_path__
replacement: /api/v1/nodes/$${1}/proxy/metrics/cadvisor
{{ if enableAPIserverJob }}
{{ start enableAPIserverJob }}
- job_name: 'apiserver'
scheme: https
tls_config:
Expand All @@ -94,7 +94,7 @@ spec:
regex: apiserver_request_duration_seconds_bucket;(0.15|0.2|0.3|0.35|0.4|0.45|0.6|0.7|0.8|0.9|1.25|1.5|1.75|2|3|3.5|4|4.5|6|7|8|9|15|25|40|50)
replacement: $1
action: drop
{{ end }}
{{ stop enableAPIserverJob }}
- job_name: serviceMonitor/default/kube-prometheus-stack-prometheus-node-exporter/0
honor_timestamps: true
scrape_interval: 30s
Expand Down Expand Up @@ -1607,6 +1607,15 @@ spec:
# Scrape job for node-exporter, discovered through Kubernetes endpoints.
# The stray, empty `ec2_sd_configs:` key was removed: a bare key parses to
# null, and every relabel rule here relies on Kubernetes metadata, not EC2.
- job_name: 'node-exporter'
  kubernetes_sd_configs:
  - role: endpoints
  relabel_configs:
  # Keep only targets whose address ends with port 9100.
  - source_labels: [ __address__ ]
    action: keep
    regex: '.*:9100$'
  # Copy the endpoint's node name into a `nodename` label.
  - action: replace
    source_labels: [__meta_kubernetes_endpoint_node_name]
    target_label: nodename
{{ start enableJavaMonJob }}
- job_name: 'kubernetes-java-jmx'
sample_limit: {{javaScrapeSampleLimit}}
metrics_path: {{javaPrometheusMetricsEndpoint}}
Expand Down Expand Up @@ -1637,6 +1646,100 @@ spec:
- source_labels: [ __name__ ]
regex: 'jvm_gc_collection_seconds.*'
action: drop
{{ stop enableJavaMonJob }}
{{ start enableNginxMonJob }}
# Scrape job for NGINX pods. The {{...}} placeholders are filled in by the
# template processing applied to this file before it is parsed as YAML.
- job_name: 'kubernetes-nginx'
  sample_limit: {{nginxScrapeSampleLimit}}
  metrics_path: {{nginxPrometheusMetricsEndpoint}}
  kubernetes_sd_configs:
  - role: pod
  relabel_configs:
  # Keep only targets whose address ends with port 10254.
  - action: keep
    source_labels:
    - __address__
    regex: '.*:10254$'
  # Expose pod metadata as container / host / namespace labels.
  - action: replace
    source_labels:
    - __meta_kubernetes_pod_container_name
    target_label: container
  - action: replace
    source_labels:
    - __meta_kubernetes_pod_node_name
    target_label: host
  - action: replace
    source_labels:
    - __meta_kubernetes_namespace
    target_label: namespace
  metric_relabel_configs:
  # Drop Go runtime metrics matched by name.
  - action: drop
    source_labels:
    - __name__
    regex: 'go_memstats.*'
  - action: drop
    source_labels:
    - __name__
    regex: 'go_gc.*'
  - action: drop
    source_labels:
    - __name__
    regex: 'go_threads'
  # Remove the exported_host label from every scraped series.
  - action: labeldrop
    regex: exported_host
{{ stop enableNginxMonJob }}
{{ start enableIstioMonJob }}
# Scrape job for Istio pods, driven by the prometheus.io/* pod annotations.
# Every literal `$` in this job is written as `$$`: the OpenTelemetry
# Collector performs environment-variable expansion on its configuration, so
# an unescaped `$1` would be replaced by the (empty) env var "1".
- honor_labels: true
  job_name: kubernetes-istio
  kubernetes_sd_configs:
  - role: pod
  relabel_configs:
  # Keep pods annotated prometheus.io/scrape=true. The regex is quoted so it
  # stays a string rather than a YAML boolean.
  - action: keep
    regex: "true"
    source_labels:
    - __meta_kubernetes_pod_annotation_prometheus_io_scrape
  # Drop pods annotated prometheus.io/scrape_slow=true.
  - action: drop
    regex: "true"
    source_labels:
    - __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow
  # Honour the prometheus.io/scheme annotation (http or https).
  - action: replace
    regex: (https?)
    source_labels:
    - __meta_kubernetes_pod_annotation_prometheus_io_scheme
    target_label: __scheme__
  # Honour the prometheus.io/path annotation.
  - action: replace
    regex: (.+)
    source_labels:
    - __meta_kubernetes_pod_annotation_prometheus_io_path
    target_label: __metrics_path__
  # Build the scrape address from annotation port + pod IP (IPv6 form).
  - action: replace
    regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4})
    replacement: '[$$2]:$$1'
    source_labels:
    - __meta_kubernetes_pod_annotation_prometheus_io_port
    - __meta_kubernetes_pod_ip
    target_label: __address__
  # Build the scrape address from annotation port + pod IP (IPv4 form).
  # The end-of-string anchor inside the group is escaped as `$$`.
  - action: replace
    regex: (\d+);((([0-9]+?)(\.|$$)){4})
    replacement: $$2:$$1
    source_labels:
    - __meta_kubernetes_pod_annotation_prometheus_io_port
    - __meta_kubernetes_pod_ip
    target_label: __address__
  # Map prometheus.io/param_<name> annotations to __param_<name>.
  # Escaped as `$$1` so the capture group reaches Prometheus intact.
  - action: labelmap
    regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+)
    replacement: __param_$$1
  # Copy every pod label onto the scraped series.
  - action: labelmap
    regex: __meta_kubernetes_pod_label_(.+)
  - action: replace
    source_labels:
    - __meta_kubernetes_namespace
    target_label: namespace
  - action: replace
    source_labels:
    - __meta_kubernetes_pod_name
    target_label: pod
  # Keep only targets whose final address ends with port 15020.
  - action: keep
    source_labels: [ __address__ ]
    regex: '.*:15020$$'
  # Skip pods that are not in a running phase.
  - action: drop
    regex: Pending|Succeeded|Failed|Completed
    source_labels:
    - __meta_kubernetes_pod_phase
{{ stop enableIstioMonJob }}
exporters:
prometheusremotewrite:
endpoint: "{{remoteWriteEndpoint}}"
Expand Down
Loading

0 comments on commit 043ab04

Please sign in to comment.