diff --git a/ara/cli/prometheus.py b/ara/cli/prometheus.py index 5d06798f..8326b62f 100644 --- a/ara/cli/prometheus.py +++ b/ara/cli/prometheus.py @@ -39,7 +39,7 @@ DEFAULT_HOST_LABELS = ["name", "playbook", "updated"] -# TODO: This method should be more flexible and live in a library +# TODO: This could be made more flexible and live in a library def get_search_results(client, kind, limit, created_after): """ kind: string, one of ["playbooks", "hosts", "tasks"] @@ -72,18 +72,15 @@ def __init__(self, client, log, limit, labels=DEFAULT_PLAYBOOK_LABELS): self.labels = labels self.metrics = { - "completed": Gauge("ara_playbooks_completed", "Completed Ansible playbooks", labels), - "expired": Gauge("ara_playbooks_expired", "Expired Ansible playbooks", labels), - "failed": Gauge("ara_playbooks_failed", "Failed Ansible playbooks", labels), "range": Gauge("ara_playbooks_range", "Limit metric collection to the N most recent playbooks"), - "running": Gauge("ara_playbooks_running", "Running Ansible playbooks", labels), "total": Gauge("ara_playbooks_total", "Total number of playbooks recorded by ara"), - "duration": Summary("ara_playbooks_duration", "Duration (in seconds) of playbooks recorded by ara", labels), + "playbooks": Summary( + "ara_playbooks", "Labels and duration (in seconds) of playbooks recorded by ara", labels + ), } + self.metrics["range"].set(self.limit) - def collect_metrics(self, created_after=None, limit=1000): - self.metrics["range"].set(limit) - + def collect_metrics(self, created_after=None): playbooks = get_search_results(self.client, "playbooks", self.limit, created_after) # Save the most recent timestamp so we only scrape beyond it next time if playbooks: @@ -91,12 +88,6 @@ def collect_metrics(self, created_after=None, limit=1000): self.log.info(f"updating metrics for {len(playbooks)} playbooks...") for playbook in playbooks: - self.metrics["total"].inc() - - # Gather the values of each label so we can attach them to our metrics - labels = {label: 
playbook[label] for label in self.labels} - self.metrics[playbook["status"]].labels(**labels).inc() - # The API returns a duration in string format, convert it back to seconds # so we can use it as a value for the metric. if playbook["duration"] is not None: @@ -108,7 +99,12 @@ def collect_metrics(self, created_after=None, limit=1000): seconds = 0 else: seconds = 0 - self.metrics["duration"].labels(**labels).observe(seconds) + + # Gather the values of each label so we can attach them to our metrics + labels = {label: playbook[label] for label in self.labels} + + self.metrics["playbooks"].labels(**labels).observe(seconds) + self.metrics["total"].inc() return created_after @@ -121,20 +117,13 @@ def __init__(self, client, log, limit, labels=DEFAULT_TASK_LABELS): self.labels = labels self.metrics = { - "completed": Gauge("ara_tasks_completed", "Completed Ansible tasks", labels), - "expired": Gauge("ara_tasks_expired", "Expired Ansible tasks", labels), - "failed": Gauge("ara_tasks_failed", "Failed Ansible tasks", labels), "range": Gauge("ara_tasks_range", "Limit metric collection to the N most recent tasks"), - "running": Gauge("ara_tasks_running", "Running Ansible tasks", labels), "total": Gauge("ara_tasks_total", "Number of tasks recorded by ara in prometheus"), - "duration": Summary( - "ara_tasks_duration", "Duration, in seconds, of playbook tasks recorded by ara", labels - ), + "tasks": Summary("ara_tasks", "Labels and duration, in seconds, of playbook tasks recorded by ara", labels), } - - def collect_metrics(self, created_after=None): self.metrics["range"].set(self.limit) + def collect_metrics(self, created_after=None): tasks = get_search_results(self.client, "tasks", self.limit, created_after) # Save the most recent timestamp so we only scrape beyond it next time if tasks: @@ -142,12 +131,6 @@ def collect_metrics(self, created_after=None): self.log.info(f"updating metrics for {len(tasks)} tasks...") for task in tasks: - self.metrics["total"].inc() - - # Gather 
the values of each label so we can attach them to our metrics - labels = {label: task[label] for label in self.labels} - self.metrics[task["status"]].labels(**labels).inc() - # The API returns a duration in string format, convert it back to seconds # so we can use it as a value for the metric. if task["duration"] is not None: @@ -159,7 +142,12 @@ def collect_metrics(self, created_after=None): seconds = 0 else: seconds = 0 - self.metrics["duration"].labels(**labels).observe(seconds) + + # Gather the values of each label so we can attach them to our metrics + labels = {label: task[label] for label in self.labels} + + self.metrics["tasks"].labels(**labels).observe(seconds) + self.metrics["total"].inc() return created_after @@ -180,10 +168,9 @@ def __init__(self, client, log, limit, labels=DEFAULT_HOST_LABELS): "total": Gauge("ara_hosts_total", "Hosts recorded by ara"), "unreachable": Gauge("ara_hosts_unreachable", "Number of unreachable errors on a host", labels), } - - def collect_metrics(self, created_after=None): self.metrics["range"].set(self.limit) + def collect_metrics(self, created_after=None): hosts = get_search_results(self.client, "hosts", self.limit, created_after) # Save the most recent timestamp so we only scrape beyond it next time if hosts: diff --git a/contrib/grafana/ara-dashboard.json b/contrib/grafana/ara-dashboard.json new file mode 100644 index 00000000..56f1bc1f --- /dev/null +++ b/contrib/grafana/ara-dashboard.json @@ -0,0 +1,1843 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__elements": {}, + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "9.5.3" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series", + "version": "" + } + ], + 
"annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 13, + "title": "Playbooks", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 4, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 4, + "scaleDistribution": { + "log": 2, + "type": "log" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 1 + }, + "id": 14, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum by (status) (ara_playbooks_count)", + "legendFormat": 
"__auto", + "range": true, + "refId": "A" + } + ], + "title": "Playbook results by status", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-GrYlRd" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 2, + "gradientMode": "scheme", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 1, + "pointSize": 3, + "scaleDistribution": { + "log": 2, + "type": "log" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 8 + }, + "id": 15, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "avg(ara_playbooks_sum) by (path)", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Average playbook duration by path (in seconds)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-GrYlRd", + "seriesBy": "last" + }, + "custom": { + "axisCenteredZero": false, + 
"axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "left", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 4, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 4, + "scaleDistribution": { + "log": 2, + "type": "log" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 18 + }, + "id": 19, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum by (user) (ara_playbooks_count)", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Playbooks by user", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-GrYlRd", + "seriesBy": "last" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "left", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 4, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 4, + "scaleDistribution": { + "log": 2, + "type": "log" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + 
"group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 18 + }, + "id": 20, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum by (controller) (ara_playbooks_count)", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Playbooks by controller", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-GrYlRd", + "seriesBy": "last" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "left", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 4, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 4, + "scaleDistribution": { + "log": 2, + "type": "log" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 8, + "x": 0, + "y": 28 + }, + "id": 16, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + 
"displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum by (ansible_version) (ara_playbooks_count)", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Playbooks by version of Ansible", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-GrYlRd", + "seriesBy": "last" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "left", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 4, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 4, + "scaleDistribution": { + "log": 2, + "type": "log" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 8, + "x": 8, + "y": 28 + }, + "id": 17, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum by (python_version) (ara_playbooks_count)", + "legendFormat": "__auto", + "range": true, + 
"refId": "A" + } + ], + "title": "Playbooks by version of Python", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-GrYlRd", + "seriesBy": "last" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "left", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 4, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 4, + "scaleDistribution": { + "log": 2, + "type": "log" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 8, + "x": 16, + "y": 28 + }, + "id": 18, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum by (client_version) (ara_playbooks_count)", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Playbooks by version of ara client", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 38 + }, + "id": 7, + "panels": [], + "title": "Tasks", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + 
}, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 4, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 4, + "scaleDistribution": { + "log": 2, + "type": "log" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 39 + }, + "id": 2, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum by (status) (ara_tasks_count)", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Task results by status", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-GrYlRd" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 2, + "gradientMode": "scheme", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 1, + "pointSize": 3, + "scaleDistribution": { + "log": 2, + "type": "log" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + 
"group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 15, + "w": 8, + "x": 0, + "y": 46 + }, + "id": 10, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "avg(ara_tasks_sum) by (name)", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Average task duration by name (in seconds)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-GrYlRd" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 2, + "gradientMode": "scheme", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 1, + "pointSize": 3, + "scaleDistribution": { + "log": 2, + "type": "log" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 15, + "w": 8, + "x": 8, + "y": 46 + }, + "id": 11, + "options": { + "legend": { + 
"calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "avg(ara_tasks_sum) by (action)", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Average task duration by action (in seconds)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-GrYlRd" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 2, + "gradientMode": "scheme", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 1, + "pointSize": 3, + "scaleDistribution": { + "log": 2, + "type": "log" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 15, + "w": 8, + "x": 16, + "y": 46 + }, + "id": 12, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "avg(ara_tasks_sum) by (path)", + "legendFormat": 
"__auto", + "range": true, + "refId": "A" + } + ], + "title": "Average task duration by path (in seconds)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-GrYlRd", + "seriesBy": "last" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "left", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 4, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 4, + "scaleDistribution": { + "log": 2, + "type": "log" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 13, + "w": 8, + "x": 0, + "y": 61 + }, + "id": 4, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum by (name) (ara_tasks_count{status=\"failed\"})", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Task failures by name", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-GrYlRd" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + 
"axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 4, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 4, + "scaleDistribution": { + "log": 2, + "type": "log" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 13, + "w": 8, + "x": 8, + "y": 61 + }, + "id": 5, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.5.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum by (action) (ara_tasks_count{status=\"failed\"})", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Task failures by action", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-GrYlRd" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 4, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 4, + "scaleDistribution": { + "log": 2, + "type": "log" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": 
"none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 13, + "w": 8, + "x": 16, + "y": 61 + }, + "id": 8, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum by (path) (ara_tasks_count{status=\"failed\"})", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Task failures by path", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 74 + }, + "id": 6, + "panels": [], + "title": "Hosts", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 4, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 4, + "scaleDistribution": { + "log": 2, + "type": "log" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 24, + "x": 0, + "y": 
75 + }, + "id": 3, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum(ara_hosts_ok)", + "legendFormat": "ok", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum(ara_hosts_failed)", + "hide": false, + "legendFormat": "failed", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum(ara_hosts_changed)", + "hide": false, + "legendFormat": "changed", + "range": true, + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum(ara_hosts_skipped)", + "hide": false, + "legendFormat": "skipped", + "range": true, + "refId": "D" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum (ara_hosts_unreachable)", + "hide": false, + "legendFormat": "unreachable", + "range": true, + "refId": "E" + } + ], + "title": "Host results by status", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 4, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 4, + "scaleDistribution": { + "log": 2, + "type": "log" + }, + "showPoints": 
"auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 13, + "w": 24, + "x": 0, + "y": 87 + }, + "id": 1, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum by (name) (ara_hosts_changed)", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Host changed results by name", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 4, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 4, + "scaleDistribution": { + "log": 2, + "type": "log" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 24, + "x": 0, + "y": 100 + }, + "id": 9, + "options": { + "legend": { + "calcs": [ + 
"lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum by (name) (ara_hosts_failed)", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Host failed results by name", + "type": "timeseries" + } + ], + "refresh": "", + "schemaVersion": 38, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-30m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Ansible metrics (by ara)", + "uid": "e0717f1a-4bb5-4373-b177-a9f5a498962d", + "version": 4, + "weekStart": "" +} \ No newline at end of file diff --git a/doc/requirements.txt b/doc/requirements.txt index fc5d33de..d40f396f 100644 --- a/doc/requirements.txt +++ b/doc/requirements.txt @@ -16,3 +16,5 @@ dynaconf[yaml] tzlocal whitenoise pygments +# for the "ara prometheus" exporter CLI command +prometheus_client \ No newline at end of file diff --git a/doc/source/_static/grafana-hosts.png b/doc/source/_static/grafana-hosts.png new file mode 100644 index 00000000..fb22d7f6 Binary files /dev/null and b/doc/source/_static/grafana-hosts.png differ diff --git a/doc/source/_static/grafana-playbooks.png b/doc/source/_static/grafana-playbooks.png new file mode 100644 index 00000000..211f27c6 Binary files /dev/null and b/doc/source/_static/grafana-playbooks.png differ diff --git a/doc/source/_static/grafana-tasks.png b/doc/source/_static/grafana-tasks.png new file mode 100644 index 00000000..00921f91 Binary files /dev/null and b/doc/source/_static/grafana-tasks.png differ diff --git a/doc/source/cli.rst b/doc/source/cli.rst index 0ad2a3b2..88bac7ce 100644 --- a/doc/source/cli.rst +++ b/doc/source/cli.rst @@ -474,6 +474,26 @@ Examples: # Aggregate metrics by task file rather 
than action ara task metrics --aggregate path +ara prometheus +-------------- + +.. command-output:: ara prometheus --help + +Also read: :ref:`documentation on prometheus <prometheus>`. + +Examples: + +.. code-block:: bash + + # Start a prometheus exporter on the default address (http://0.0.0.0:8001/metrics) + # Then, backfill metrics from the last 365 days until now + # Then, poll every 30s for new metrics + ara prometheus --max-days 365 --poll 30 + + # When gathering metrics, customize the number of items returned in each page of results + # from the API based on instance size and performance expectations + ara prometheus --playbook-limit 500 --task-limit 1000 --host-limit 1000 + CLI: ara-manage (django API server) =================================== diff --git a/doc/source/contributing.rst b/doc/source/contributing.rst index 2184aace..f9b43ed9 100644 --- a/doc/source/contributing.rst +++ b/doc/source/contributing.rst @@ -1,3 +1,5 @@ +.. _contributing: + Contributing to ARA =================== diff --git a/doc/source/index.rst b/doc/source/index.rst index e943d49e..f50b5b54 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -20,5 +20,6 @@ Table of Contents CLI: ara and ara-manage Contributing to ARA Frequently asked questions + Ansible metrics with Prometheus <prometheus> Troubleshooting Changelog and release notes diff --git a/doc/source/prometheus.rst b/doc/source/prometheus.rst new file mode 100644 index 00000000..c4320ab3 --- /dev/null +++ b/doc/source/prometheus.rst @@ -0,0 +1,119 @@ +.. _prometheus: + +Ansible metrics with Prometheus +=============================== + +ara doesn't provide monitoring or alerting out of the box (they are out of scope) but it records a number of granular metrics about Ansible playbooks, tasks and hosts, amongst other things. + +Starting with version 1.6.2, ara provides an integration of `prometheus_client <https://github.com/prometheus/client_python>`_ that queries the ara API and then exposes these metrics for prometheus to scrape. 
+ +Once these metrics are in `Prometheus <https://prometheus.io>`_, they're available for queries, alerts and pretty graphs in `Grafana <https://grafana.com>`_. + +The source code for the exporter can be found in the `git repository <https://github.com/ansible-community/ara/blob/master/ara/cli/prometheus.py>`_. + +Getting started +--------------- + +The ara prometheus exporter implementation is not an API server component and does not require an API server to run by default. + +It can run out of an installation from distribution packages, PyPI packages in a virtual environment or from a container. + +For example: + +.. code-block:: bash + + # Install ara, ansible and prometheus_client in a virtual environment + python3 -m venv ~/venv/ara + source ~/venv/ara/bin/activate + pip install ansible ara[server,prometheus] + + # Run and record a playbook + export ANSIBLE_CALLBACK_PLUGINS=$(python3 -m ara.setup.callback_plugins) + ansible-playbook ~/docs.yml + + # Start the prometheus exporter + ara prometheus + # http://127.0.0.1:8001/metrics available with metrics from running ~/docs.yml + +When running an ara API server, the prometheus exporter can be installed and run from wherever it is able to query the API. + +For example: + +.. code-block:: bash + + # Install ara and prometheus_client in a virtual environment + # (ansible and the ara API server dependencies are not required here) + python3 -m venv ~/venv/ara + source ~/venv/ara/bin/activate + pip install ara[prometheus] + + # Configure the prometheus exporter to query a running instance + export ARA_API_CLIENT=http + export ARA_API_SERVER=http://ara.example.org + + # Start the prometheus exporter, backfilling metrics from the last 30 days + # and then polling for new data every 30 seconds + ara prometheus --max-days 30 --poll 30 + # http://127.0.0.1:8001/metrics available with metrics from ara.example.org + +Once the exporter is running, Prometheus must be configured to scrape it: + +.. 
code-block:: yaml + + global: + scrape_interval: 30s + + scrape_configs: + - job_name: 'ara' + static_configs: + # Replace with wherever the exporter is available and listening relative to prometheus + - targets: ['10.0.0.10:8001'] + +Metrics should then be available as soon as Prometheus successfully scrapes at least once. + +Available metrics +----------------- + +The ``ara prometheus`` command queries the ara API and then makes the following metrics available: + +- **ara_playbooks** (`Summary <https://prometheus.io/docs/concepts/metric_types/#summary>`_) provides labels based on playbook properties and duration in seconds + +.. code-block:: + + ara_playbooks_count{ansible_version="2.15.0",client_version="1.6.2.dev10",controller="fedora",name="docs",path="/home/user/docs.yml",python_version="3.11.3",server_version="1.6.2.dev10",status="completed",updated="2023-06-08T02:43:29.910977Z",user="ansible"} 1.0 + ara_playbooks_sum{ansible_version="2.15.0",client_version="1.6.2.dev10",controller="fedora",name="docs",path="/home/user/docs.yml",python_version="3.11.3",server_version="1.6.2.dev10",status="completed",updated="2023-06-08T02:43:29.910977Z",user="ansible"} 14.161331 + +- **ara_tasks** (`Summary <https://prometheus.io/docs/concepts/metric_types/#summary>`_) provides labels based on task properties and duration in seconds + +.. code-block:: + + ara_tasks_count{action="command",name="docs",path="/home/user/docs.yml",playbook="30",status="completed",updated="2023-06-08T02:43:29.665787Z"} 1.0 + ara_tasks_sum{action="command",name="docs",path="/home/user/docs.yml",playbook="30",status="completed",updated="2023-06-08T02:43:29.665787Z"} 0.29482 + +- **ara_hosts_{ok,failed,changed,skipped,unreachable}** (`Gauge <https://prometheus.io/docs/concepts/metric_types/#gauge>`_) provide labels based on host properties and number of results for each status + +.. 
code-block:: + + ara_hosts_ok{name="localhost",playbook="30",updated="2023-06-08T02:43:29.848077Z"} 36.0 + ara_hosts_failed{name="localhost",playbook="24",updated="2023-06-08T02:32:18.773096Z"} 1.0 + ara_hosts_changed{name="localhost",playbook="30",updated="2023-06-08T02:43:29.848077Z"} 10.0 + ara_hosts_skipped{name="host3",playbook="15",updated="2023-06-08T01:24:59.210984Z"} 2.0 + ara_hosts_unreachable{name="localhost",playbook="24",updated="2023-06-08T02:32:18.773096Z"} 1.0 + +Grafana dashboard +----------------- + +While everyone is encouraged to create and tweak their own dashboards according to their needs and preferences, the community maintains a Grafana dashboard to help users get started. + +It is in `contrib <https://github.com/ansible-community/ara/tree/master/contrib/grafana>`_ inside the git repository. +Feel free to :ref:`open a pull request <contributing>` if you'd like to contribute! + +The dashboard contains a high level overview of available metrics for playbooks, tasks and hosts. + +Open these screenshots in a new tab to view them in full resolution: + +.. image:: ../source/_static/grafana-playbooks.png + +.. image:: ../source/_static/grafana-tasks.png + +.. image:: ../source/_static/grafana-hosts.png diff --git a/setup.cfg b/setup.cfg index 38f94e72..9410427e 100644 --- a/setup.cfg +++ b/setup.cfg @@ -81,6 +81,8 @@ postgresql= psycopg2 mysql= mysqlclient +prometheus= + prometheus_client [build_sphinx] source-dir = doc/source