diff --git a/doc/mgr/prometheus.rst b/doc/mgr/prometheus.rst
index 87296be391471..f547bc36af362 100644
--- a/doc/mgr/prometheus.rst
+++ b/doc/mgr/prometheus.rst
@@ -168,11 +168,11 @@ drive statistics, special series are output like this:

 ::

-    ceph_disk_occupation{ceph_daemon="osd.0",device="sdd", exported_instance="myhost"}
+    ceph_disk_occupation_human{ceph_daemon="osd.0", device="sdd", exported_instance="myhost"}

 To use this to get disk statistics by OSD ID, use either the ``and`` operator
 or the ``*`` operator in your Prometheus query. All metadata metrics (like ``
-ceph_disk_occupation`` have the value 1 so they act neutral with ``*``. Using ``*``
+ceph_disk_occupation_human``) have the value 1, so they are neutral with ``*``. Using ``*``
 allows one to use the ``group_left`` and ``group_right`` grouping modifiers, so that
 the resulting metric has additional labels from one side of the query.

@@ -185,13 +185,24 @@ The goal is to run a query like

 ::

-    rate(node_disk_bytes_written[30s]) and on (device,instance) ceph_disk_occupation{ceph_daemon="osd.0"}
+    rate(node_disk_bytes_written[30s]) and
+    on (device,instance) ceph_disk_occupation_human{ceph_daemon="osd.0"}

 Out of the box, the above query will not return any metrics since the ``instance`` labels of
-both metrics don't match. The ``instance`` label of ``ceph_disk_occupation``
+both metrics don't match. The ``instance`` label of ``ceph_disk_occupation_human``
 will be the currently active MGR node.

- The following two section outline two approaches to remedy this.
+The following two sections outline two approaches to remedy this.
+
+.. note::
+
+   If you need to group on the ``ceph_daemon`` label instead of the ``device``
+   and ``instance`` labels, using ``ceph_disk_occupation_human`` may not work
+   reliably. It is advised that you use ``ceph_disk_occupation`` instead.
+
+   The difference is that ``ceph_disk_occupation_human`` may group several
+   OSDs into the value of a single ``ceph_daemon`` label in cases where
+   multiple OSDs share a disk.
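[Editorial aside: to build intuition for why the value 1 matters here, a minimal Python sketch of the vector matching described above. It is illustrative only; the toy dicts and the join_group_left helper are inventions of this note, not Ceph or Prometheus code. Multiplying by a metadata sample of 1 leaves the rate unchanged, while group_left copies the extra ceph_daemon label onto the result.]

    # Toy model of:
    #   rate(node_disk_bytes_written[30s])
    #     * on (device, instance) group_left(ceph_daemon) ceph_disk_occupation_human
    rates = {("sdd", "myhost"): 4096.0}           # rate samples, keyed by (device, instance)
    metadata = {("sdd", "myhost", "osd.0"): 1.0}  # metadata series; the value is always 1

    def join_group_left(rates, metadata):
        out = {}
        for (device, instance, ceph_daemon), meta_value in metadata.items():
            rate = rates.get((device, instance))
            if rate is not None:
                # 1 is neutral under multiplication; ceph_daemon is carried over
                out[(device, instance, ceph_daemon)] = rate * meta_value
        return out

    print(join_group_left(rates, metadata))
    # {('sdd', 'myhost', 'osd.0'): 4096.0}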

 Use label_replace
 =================

@@ -204,7 +215,13 @@ To correlate an OSD and its disks write rate, the following query can be used:

 ::

-    label_replace(rate(node_disk_bytes_written[30s]), "exported_instance", "$1", "instance", "(.*):.*") and on (device,exported_instance) ceph_disk_occupation{ceph_daemon="osd.0"}
+    label_replace(
+        rate(node_disk_bytes_written[30s]),
+        "exported_instance",
+        "$1",
+        "instance",
+        "(.*):.*"
+    ) and on (device, exported_instance) ceph_disk_occupation_human{ceph_daemon="osd.0"}

 Configuring Prometheus server
 =============================
diff --git a/monitoring/grafana/dashboards/host-details.json b/monitoring/grafana/dashboards/host-details.json
index 88185a95a1555..d2ac2304f31e6 100644
--- a/monitoring/grafana/dashboards/host-details.json
+++ b/monitoring/grafana/dashboards/host-details.json
@@ -798,7 +798,7 @@
         "steppedLine": false,
         "targets": [
           {
-            "expr": "label_replace(\n (\n irate(node_disk_writes_completed{instance=~\"($ceph_hosts)([\\\\.:].*)?\"}[5m]) or\n irate(node_disk_writes_completed_total{instance=~\"($ceph_hosts)([\\\\.:].*)?\"}[5m])\n ),\n \"instance\",\n \"$1\",\n \"instance\",\n \"([^:.]*).*\"\n)\n* on(instance, device, ceph_daemon) group_left\n label_replace(\n label_replace(\n ceph_disk_occupation,\n \"device\",\n \"$1\",\n \"device\",\n \"/dev/(.*)\"\n ),\n \"instance\",\n \"$1\",\n \"instance\",\n \"([^:.]*).*\"\n )",
+            "expr": "label_replace(\n (\n irate(node_disk_writes_completed{instance=~\"($ceph_hosts)([\\\\.:].*)?\"}[5m]) or\n irate(node_disk_writes_completed_total{instance=~\"($ceph_hosts)([\\\\.:].*)?\"}[5m])\n ),\n \"instance\",\n \"$1\",\n \"instance\",\n \"([^:.]*).*\"\n)\n* on(instance, device) group_left(ceph_daemon)\n label_replace(\n label_replace(\n ceph_disk_occupation_human,\n \"device\",\n \"$1\",\n \"device\",\n \"/dev/(.*)\"\n ),\n \"instance\",\n \"$1\",\n \"instance\",\n \"([^:.]*).*\"\n )",
             "format": "time_series",
             "intervalFactor": 1,
             "legendFormat": "{{device}}({{ceph_daemon}}) writes",
@@ -807,7 +807,7 @@
             "textEditor": true
           },
           {
-            "expr": "label_replace(\n (irate(node_disk_reads_completed{instance=~\"($ceph_hosts)([\\\\.:].*)?\"}[5m]) or irate(node_disk_reads_completed_total{instance=~\"($ceph_hosts)([\\\\.:].*)?\"}[5m])),\n \"instance\",\n \"$1\",\n \"instance\",\n \"([^:.]*).*\"\n)\n* on(instance, device, ceph_daemon) group_left\n label_replace(\n label_replace(\n ceph_disk_occupation,\n \"device\",\n \"$1\",\n \"device\",\n \"/dev/(.*)\"\n ),\n \"instance\",\n \"$1\",\n \"instance\",\n \"([^:.]*).*\"\n )",
+            "expr": "label_replace(\n (irate(node_disk_reads_completed{instance=~\"($ceph_hosts)([\\\\.:].*)?\"}[5m]) or irate(node_disk_reads_completed_total{instance=~\"($ceph_hosts)([\\\\.:].*)?\"}[5m])),\n \"instance\",\n \"$1\",\n \"instance\",\n \"([^:.]*).*\"\n)\n* on(instance, device) group_left(ceph_daemon)\n label_replace(\n label_replace(\n ceph_disk_occupation_human,\n \"device\",\n \"$1\",\n \"device\",\n \"/dev/(.*)\"\n ),\n \"instance\",\n \"$1\",\n \"instance\",\n \"([^:.]*).*\"\n )",
             "format": "time_series",
             "hide": false,
             "intervalFactor": 1,
@@ -899,14 +899,14 @@
         "steppedLine": false,
         "targets": [
           {
-            "expr": "label_replace((irate(node_disk_bytes_written{instance=~\"($ceph_hosts)([\\\\.:].*)?\"}[5m]) or irate(node_disk_written_bytes_total{instance=~\"($ceph_hosts)([\\\\.:].*)?\"}[5m])), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\") * on(instance, device, ceph_daemon) group_left label_replace(label_replace(ceph_disk_occupation, \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\")",
+            "expr": "label_replace((irate(node_disk_bytes_written{instance=~\"($ceph_hosts)([\\\\.:].*)?\"}[5m]) or irate(node_disk_written_bytes_total{instance=~\"($ceph_hosts)([\\\\.:].*)?\"}[5m])), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\") * on(instance, device) group_left(ceph_daemon) label_replace(label_replace(ceph_disk_occupation_human, \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\")",
             "format": "time_series",
             "intervalFactor": 1,
             "legendFormat": "{{device}}({{ceph_daemon}}) write",
             "refId": "B"
           },
           {
-            "expr": "label_replace((irate(node_disk_bytes_read{instance=~\"($ceph_hosts)([\\\\.:].*)?\"}[5m]) or irate(node_disk_read_bytes_total{instance=~\"($ceph_hosts)([\\\\.:].*)?\"}[5m])), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\") * on(instance, device, ceph_daemon) group_left label_replace(label_replace(ceph_disk_occupation, \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\")",
+            "expr": "label_replace((irate(node_disk_bytes_read{instance=~\"($ceph_hosts)([\\\\.:].*)?\"}[5m]) or irate(node_disk_read_bytes_total{instance=~\"($ceph_hosts)([\\\\.:].*)?\"}[5m])), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\") * on(instance, device) group_left(ceph_daemon) label_replace(label_replace(ceph_disk_occupation_human, \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\")",
             "format": "time_series",
             "intervalFactor": 1,
             "legendFormat": "{{device}}({{ceph_daemon}}) read",
@@ -992,7 +992,7 @@
         "steppedLine": false,
         "targets": [
           {
-            "expr": "max by(instance,device) (label_replace((irate(node_disk_write_time_seconds_total{ instance=~\"($ceph_hosts)([\\\\.:].*)?\"}[5m]) ) / clamp_min(irate(node_disk_writes_completed_total{ instance=~\"($ceph_hosts)([\\\\.:].*)?\"}[5m]), 0.001) or (irate(node_disk_read_time_seconds_total{ instance=~\"($ceph_hosts)([\\\\.:].*)?\"}[5m]) ) / clamp_min(irate(node_disk_reads_completed_total{ instance=~\"($ceph_hosts)([\\\\.:].*)?\"}[5m]), 0.001), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\")) * on(instance, device, ceph_daemon) group_left label_replace(label_replace(ceph_disk_occupation{instance=~\"($ceph_hosts)([\\\\.:].*)?\"}, \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\")",
+            "expr": "max by(instance,device) (label_replace((irate(node_disk_write_time_seconds_total{ instance=~\"($ceph_hosts)([\\\\.:].*)?\"}[5m]) ) / clamp_min(irate(node_disk_writes_completed_total{ instance=~\"($ceph_hosts)([\\\\.:].*)?\"}[5m]), 0.001) or (irate(node_disk_read_time_seconds_total{ instance=~\"($ceph_hosts)([\\\\.:].*)?\"}[5m]) ) / clamp_min(irate(node_disk_reads_completed_total{ instance=~\"($ceph_hosts)([\\\\.:].*)?\"}[5m]), 0.001), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\")) * on(instance, device) group_left(ceph_daemon) label_replace(label_replace(ceph_disk_occupation_human{instance=~\"($ceph_hosts)([\\\\.:].*)?\"}, \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\")",
             "format": "time_series",
             "hide": false,
             "intervalFactor": 1,
@@ -1083,7 +1083,7 @@
         "steppedLine": false,
         "targets": [
           {
-            "expr": "label_replace(((irate(node_disk_io_time_ms{instance=~\"($ceph_hosts)([\\\\.:].*)?\"}[5m]) / 10 ) or irate(node_disk_io_time_seconds_total{instance=~\"($ceph_hosts)([\\\\.:].*)?\"}[5m]) * 100), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\") * on(instance, device, ceph_daemon) group_left label_replace(label_replace(ceph_disk_occupation{instance=~\"($ceph_hosts)([\\\\.:].*)?\"}, \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\")",
+            "expr": "label_replace(((irate(node_disk_io_time_ms{instance=~\"($ceph_hosts)([\\\\.:].*)?\"}[5m]) / 10 ) or irate(node_disk_io_time_seconds_total{instance=~\"($ceph_hosts)([\\\\.:].*)?\"}[5m]) * 100), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\") * on(instance, device) group_left(ceph_daemon) label_replace(label_replace(ceph_disk_occupation_human{instance=~\"($ceph_hosts)([\\\\.:].*)?\"}, \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\")",
             "format": "time_series",
             "hide": false,
             "intervalFactor": 1,
diff --git a/monitoring/grafana/dashboards/hosts-overview.json b/monitoring/grafana/dashboards/hosts-overview.json
index b179d5717d536..83929af984177 100644
--- a/monitoring/grafana/dashboards/hosts-overview.json
+++ b/monitoring/grafana/dashboards/hosts-overview.json
@@ -431,7 +431,7 @@
       "tableColumn": "",
       "targets": [
         {
-          "expr" : "avg (\n label_replace((irate(node_disk_io_time_ms[5m]) / 10 ) or\n (irate(node_disk_io_time_seconds_total[5m]) * 100), \"instance\", \"$1\", \"instance\", \"([^.:]*).*\"\n ) *\n on(instance, device, ceph_daemon) label_replace(label_replace(ceph_disk_occupation{instance=~\"($osd_hosts).*\"}, \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^.:]*).*\")\n)",
+          "expr": "avg (\n label_replace((irate(node_disk_io_time_ms[5m]) / 10 ) or\n (irate(node_disk_io_time_seconds_total[5m]) * 100), \"instance\", \"$1\", \"instance\", \"([^.:]*).*\"\n ) *\n on(instance, device) group_left(ceph_daemon) label_replace(label_replace(ceph_disk_occupation_human{instance=~\"($osd_hosts).*\"}, \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^.:]*).*\")\n)",
          "format": "time_series",
          "instant": true,
          "intervalFactor": 1,
diff --git a/monitoring/grafana/dashboards/osd-device-details.json b/monitoring/grafana/dashboards/osd-device-details.json
index eefb591257917..cb0bfb626371f 100644
--- a/monitoring/grafana/dashboards/osd-device-details.json
+++ b/monitoring/grafana/dashboards/osd-device-details.json
@@ -390,14 +390,14 @@
         "steppedLine": false,
         "targets": [
           {
-            "expr": "(label_replace(irate(node_disk_read_time_seconds_total[1m]) / irate(node_disk_reads_completed_total[1m]), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\") and on (instance, device) label_replace(label_replace(ceph_disk_occupation{ceph_daemon=~\"$osd\"}, \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"))",
+            "expr": "(label_replace(irate(node_disk_read_time_seconds_total[1m]) / irate(node_disk_reads_completed_total[1m]), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\") and on (instance, device) label_replace(label_replace(ceph_disk_occupation_human{ceph_daemon=~\"$osd\"}, \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"))",
             "format": "time_series",
             "intervalFactor": 1,
             "legendFormat": "{{instance}}/{{device}} Reads",
             "refId": "A"
           },
           {
-            "expr": "(label_replace(irate(node_disk_write_time_seconds_total[1m]) / irate(node_disk_writes_completed_total[1m]), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\") and on (instance, device) label_replace(label_replace(ceph_disk_occupation{ceph_daemon=~\"$osd\"}, \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"))",
+            "expr": "(label_replace(irate(node_disk_write_time_seconds_total[1m]) / irate(node_disk_writes_completed_total[1m]), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\") and on (instance, device) label_replace(label_replace(ceph_disk_occupation_human{ceph_daemon=~\"$osd\"}, \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"))",
             "format": "time_series",
             "intervalFactor": 1,
             "legendFormat": "{{instance}}/{{device}} Writes",
@@ -486,14 +486,14 @@
         "steppedLine": false,
         "targets": [
           {
-            "expr": "label_replace(irate(node_disk_writes_completed_total[1m]), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\") and on (instance, device) label_replace(label_replace(ceph_disk_occupation{ceph_daemon=~\"$osd\"}, \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\")",
+            "expr": "label_replace(irate(node_disk_writes_completed_total[1m]), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\") and on (instance, device) label_replace(label_replace(ceph_disk_occupation_human{ceph_daemon=~\"$osd\"}, \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\")",
             "format": "time_series",
             "intervalFactor": 1,
             "legendFormat": "{{device}} on {{instance}} Writes",
             "refId": "A"
           },
           {
-            "expr": "label_replace(irate(node_disk_reads_completed_total[1m]), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\") and on (instance, device) label_replace(label_replace(ceph_disk_occupation{ceph_daemon=~\"$osd\"}, \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\")",
+            "expr": "label_replace(irate(node_disk_reads_completed_total[1m]), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\") and on (instance, device) label_replace(label_replace(ceph_disk_occupation_human{ceph_daemon=~\"$osd\"}, \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\")",
             "format": "time_series",
             "intervalFactor": 1,
             "legendFormat": "{{device}} on {{instance}} Reads",
@@ -582,14 +582,14 @@
         "steppedLine": false,
         "targets": [
           {
-            "expr": "label_replace(irate(node_disk_read_bytes_total[1m]), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\") and on (instance, device) label_replace(label_replace(ceph_disk_occupation{ceph_daemon=~\"$osd\"}, \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\")",
+            "expr": "label_replace(irate(node_disk_read_bytes_total[1m]), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\") and on (instance, device) label_replace(label_replace(ceph_disk_occupation_human{ceph_daemon=~\"$osd\"}, \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\")",
             "format": "time_series",
             "intervalFactor": 1,
             "legendFormat": "{{instance}} {{device}} Reads",
             "refId": "A"
           },
           {
-            "expr": "label_replace(irate(node_disk_written_bytes_total[1m]), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\") and on (instance, device) label_replace(label_replace(ceph_disk_occupation{ceph_daemon=~\"$osd\"}, \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\")",
+            "expr": "label_replace(irate(node_disk_written_bytes_total[1m]), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\") and on (instance, device) label_replace(label_replace(ceph_disk_occupation_human{ceph_daemon=~\"$osd\"}, \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\")",
             "format": "time_series",
"intervalFactor": 1, "legendFormat": "{{instance}} {{device}} Writes", @@ -673,7 +673,7 @@ "steppedLine": false, "targets": [ { - "expr": "label_replace(irate(node_disk_io_time_seconds_total[1m]), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\") and on (instance, device) label_replace(label_replace(ceph_disk_occupation{ceph_daemon=~\"$osd\"}, \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\")", + "expr": "label_replace(irate(node_disk_io_time_seconds_total[1m]), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\") and on (instance, device) label_replace(label_replace(ceph_disk_occupation_human{ceph_daemon=~\"$osd\"}, \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\")", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{device}} on {{instance}}", diff --git a/src/pybind/mgr/dashboard/cherrypy_backports.py b/src/pybind/mgr/dashboard/cherrypy_backports.py index dd614609a8dc4..66b23ee80c36f 100644 --- a/src/pybind/mgr/dashboard/cherrypy_backports.py +++ b/src/pybind/mgr/dashboard/cherrypy_backports.py @@ -75,7 +75,7 @@ def patch_builtin_ssl_wrap(v, new_wrap): if v < StrictVersion("9.0.0"): from cherrypy.wsgiserver.ssl_builtin import BuiltinSSLAdapter as builtin_ssl else: - from cheroot.ssl.builtin import BuiltinSSLAdapter as builtin_ssl + from cheroot.ssl.builtin import BuiltinSSLAdapter as builtin_ssl # type: ignore builtin_ssl.wrap = new_wrap(builtin_ssl.wrap) diff --git a/src/pybind/mgr/prometheus/module.py b/src/pybind/mgr/prometheus/module.py index c150b11efa53c..5c00f78f55900 100644 --- a/src/pybind/mgr/prometheus/module.py +++ b/src/pybind/mgr/prometheus/module.py @@ -14,7 +14,10 @@ from rbd import RBD from collections import namedtuple try: - from typing import DefaultDict, Optional, Dict, Any, Set + from typing import DefaultDict, Optional, Dict, Any, Set, Tuple, Union, List, Callable + LabelValues = Tuple[str, ...] + Number = Union[int, float] + MetricValue = Dict[LabelValues, Number] except ImportError: pass @@ -186,6 +189,98 @@ def floatstr(value): ) return expfmt + def group_by( + self, + keys: List[str], + joins: Dict[str, Callable[[List[str]], str]], + name: Optional[str] = None, + ) -> "Metric": + """ + Groups data by label names. + + Label names not passed are being removed from the resulting metric but + by providing a join function, labels of metrics can be grouped. + + The purpose of this method is to provide a version of a metric that can + be used in matching where otherwise multiple results would be returned. + + As grouping is possible in Prometheus, the only additional value of this + method is the possibility to join labels when grouping. For that reason, + passing joins is required. Please use PromQL expressions in all other + cases. + + >>> m = Metric('type', 'name', '', labels=('label1', 'id')) + >>> m.value = { + ... ('foo', 'x'): 1, + ... ('foo', 'y'): 1, + ... } + >>> m.group_by(['label1'], {'id': lambda ids: ','.join(ids)}).value + {('foo', 'x,y'): 1} + + The functionality of group by could roughly be compared with Prometheus' + + group (ceph_disk_occupation) by (device, instance) + + with the exception that not all labels which aren't used as a condition + to group a metric are discarded, but their values can are joined and the + label is thereby preserved. + + This function takes the value of the first entry of a found group to be + used for the resulting value of the grouping operation. 
+
+        >>> m = Metric('type', 'name', '', labels=('label1', 'id'))
+        >>> m.value = {
+        ...     ('foo', 'x'): 555,
+        ...     ('foo', 'y'): 10,
+        ... }
+        >>> m.group_by(['label1'], {'id': lambda ids: ','.join(ids)}).value
+        {('foo', 'x,y'): 555}
+        """
+        assert self.labelnames, "cannot match keys without label names"
+        for key in keys:
+            assert key in self.labelnames, "unknown key: {}".format(key)
+        assert joins, "joins must not be empty"
+        assert all(callable(c) for c in joins.values()), "joins must be callable"

+        # group the values by the given keys
+        grouped = defaultdict(list)  # type: Dict[LabelValues, List[Tuple[Dict[str, str], Number]]]
+        for label_values, metric_value in self.value.items():
+            labels = dict(zip(self.labelnames, label_values))
+            if not all(k in labels for k in keys):
+                continue
+            group_key = tuple(labels[k] for k in keys)
+            grouped[group_key].append((labels, metric_value))

+        # as nothing is specified on how to join labels that are not equal,
+        # and since Prometheus' `group` aggregation behaves the same way, we
+        # simply drop those labels.
+        labelnames = tuple(
+            label for label in self.labelnames if label in keys or label in joins
+        )
+        superfluous_labelnames = [
+            label for label in self.labelnames if label not in labelnames
+        ]

+        # iterate and convert groups with more than one member into a single
+        # entry
+        values = {}  # type: MetricValue
+        for group in grouped.values():
+            labels, metric_value = group[0]

+            for label in superfluous_labelnames:
+                del labels[label]

+            if len(group) > 1:
+                for key, fn in joins.items():
+                    labels[key] = fn(list(labels[key] for labels, _ in group))

+            values[tuple(labels.values())] = metric_value

+        new_metric = Metric(self.mtype, name if name else self.name, self.desc, labelnames)
+        new_metric.value = values

+        return new_metric
+

 class MetricCollectionThread(threading.Thread):
     def __init__(self, module):
@@ -364,6 +459,14 @@ def _setup_static_metrics(self):
             DISK_OCCUPATION
         )

+        metrics['disk_occupation_human'] = Metric(
+            'untyped',
+            'disk_occupation_human',
+            'Associate Ceph daemon with disk used for displaying to humans,'
+            ' not for joining tables (vector matching)',
+            DISK_OCCUPATION,  # label names are automatically reduced on grouping
+        )
+
         metrics['pool_metadata'] = Metric(
             'untyped',
             'pool_metadata',
@@ -770,6 +873,17 @@ def get_metadata_and_osd_status(self):
                 self.log.info("Missing dev node metadata for osd {0}, skipping "
                               "occupation record for this osd".format(id_))

+        if 'disk_occupation' in self.metrics:
+            try:
+                self.metrics['disk_occupation_human'] = \
+                    self.metrics['disk_occupation'].group_by(
+                        ['device', 'instance'],
+                        {'ceph_daemon': lambda daemons: ', '.join(daemons)},
+                        name='disk_occupation_human',
+                    )
+            except Exception as e:
+                self.log.error(e)
+
         for pool in osd_map['pools']:
             self.metrics['pool_metadata'].set(
                 1, (pool['pool'], pool['pool_name']))
diff --git a/src/pybind/mgr/prometheus/test_module.py b/src/pybind/mgr/prometheus/test_module.py
new file mode 100644
index 0000000000000..0647cb658c75d
--- /dev/null
+++ b/src/pybind/mgr/prometheus/test_module.py
@@ -0,0 +1,93 @@
+from typing import Dict
+from unittest import TestCase
+
+from prometheus.module import Metric, LabelValues, Number
+
+
+class MetricGroupTest(TestCase):
+    def setUp(self):
+        self.DISK_OCCUPATION = (
+            "ceph_daemon",
+            "device",
+            "db_device",
+            "wal_device",
+            "instance",
+        )
+        self.metric_value: Dict[LabelValues, Number] = {
+            ("osd.0", "/dev/dm-0", "", "", "node1"): 1,
+            ("osd.1", "/dev/dm-0", "", "", "node3"): 1,
+            ("osd.2", "/dev/dm-0", "", "", "node2"): 1,
("osd.3", "/dev/dm-1", "", "", "node1"): 1, + ("osd.4", "/dev/dm-1", "", "", "node3"): 1, + ("osd.5", "/dev/dm-1", "", "", "node2"): 1, + ("osd.6", "/dev/dm-1", "", "", "node2"): 1, + } + + def test_metric_group_by(self): + m = Metric("untyped", "disk_occupation", "", self.DISK_OCCUPATION) + m.value = self.metric_value + grouped_metric = m.group_by( + ["device", "instance"], + {"ceph_daemon": lambda xs: "+".join(xs)}, + name="disk_occupation_display", + ) + self.assertEqual( + grouped_metric.value, + { + ("osd.0", "/dev/dm-0", "node1"): 1, + ("osd.1", "/dev/dm-0", "node3"): 1, + ("osd.2", "/dev/dm-0", "node2"): 1, + ("osd.3", "/dev/dm-1", "node1"): 1, + ("osd.4", "/dev/dm-1", "node3"): 1, + ("osd.5+osd.6", "/dev/dm-1", "node2"): 1, + }, + ) + self.maxDiff = None + self.assertEqual( + grouped_metric.str_expfmt(), + """ +# HELP ceph_disk_occupation_display +# TYPE ceph_disk_occupation_display untyped +ceph_disk_occupation_display{ceph_daemon="osd.0",device="/dev/dm-0",instance="node1"} 1.0 +ceph_disk_occupation_display{ceph_daemon="osd.1",device="/dev/dm-0",instance="node3"} 1.0 +ceph_disk_occupation_display{ceph_daemon="osd.2",device="/dev/dm-0",instance="node2"} 1.0 +ceph_disk_occupation_display{ceph_daemon="osd.3",device="/dev/dm-1",instance="node1"} 1.0 +ceph_disk_occupation_display{ceph_daemon="osd.4",device="/dev/dm-1",instance="node3"} 1.0 +ceph_disk_occupation_display{ceph_daemon="osd.5+osd.6",device="/dev/dm-1",instance="node2"} 1.0""", # noqa: W291 + ) + self.assertEqual( + grouped_metric.labelnames, ("ceph_daemon", "device", "instance") + ) + + def test_metric_group_by__no_value(self): + m = Metric("metric_type", "name", "desc", labels=('foo', 'bar')) + grouped = m.group_by(['foo'], {'bar': lambda bars: ', '.join(bars)}) + self.assertEqual(grouped.value, {}) + self.assertEqual(grouped.str_expfmt(), + '\n# HELP ceph_name desc\n# TYPE ceph_name metric_type') + + def test_metric_group_by__no_labels(self): + m = Metric("metric_type", "name", "desc", labels=None) + with self.assertRaises(AssertionError) as cm: + m.group_by([], {}) + self.assertEqual(str(cm.exception), "cannot match keys without label names") + + def test_metric_group_by__key_not_in_labels(self): + m = Metric("metric_type", "name", "desc", labels=("foo", "bar")) + m.value = self.metric_value + with self.assertRaises(AssertionError) as cm: + m.group_by(["baz"], {}) + self.assertEqual(str(cm.exception), "unknown key: baz") + + def test_metric_group_by__empty_joins(self): + m = Metric("", "", "", ("foo", "bar")) + with self.assertRaises(AssertionError) as cm: + m.group_by(["foo"], joins={}) + self.assertEqual(str(cm.exception), "joins must not be empty") + + def test_metric_group_by__joins_not_callable(self): + m = Metric("", "", "", ("foo", "bar")) + m.value = self.metric_value + with self.assertRaises(AssertionError) as cm: + m.group_by(["foo"], {"bar": "not callable str"}) + self.assertEqual(str(cm.exception), "joins must be callable") diff --git a/src/pybind/mgr/requirements.txt b/src/pybind/mgr/requirements.txt index b7c8a12a5895d..7fdd0f7926b3b 100644 --- a/src/pybind/mgr/requirements.txt +++ b/src/pybind/mgr/requirements.txt @@ -1,3 +1,4 @@ +cherrypy pytest-cov==2.7.1 mock; python_version <= '3.3' ipaddress; python_version < '3.3' diff --git a/src/pybind/mgr/tox.ini b/src/pybind/mgr/tox.ini index 3e129ba64eb16..1744179d14ae7 100644 --- a/src/pybind/mgr/tox.ini +++ b/src/pybind/mgr/tox.ini @@ -40,7 +40,9 @@ commands = cephadm/ \ orchestrator/ \ pg_autoscaler/ \ - progress/} + progress/ \ + prometheus/ \ + } 

 [testenv:mypy]
 basepython = python3
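[Editorial aside: to close the loop, a short standalone sketch of the new Metric.group_by API, mirroring the doctests and the disk_occupation_human derivation in module.py above. It assumes prometheus.module is importable, as in the new test module; the variable names are only illustrative.]

    from prometheus.module import Metric

    # Two OSDs sharing one disk, the case called out in the note added to
    # doc/mgr/prometheus.rst.
    m = Metric('untyped', 'disk_occupation', '', ('ceph_daemon', 'device', 'instance'))
    m.value = {
        ('osd.5', '/dev/dm-1', 'node2'): 1,
        ('osd.6', '/dev/dm-1', 'node2'): 1,
    }

    # Group by (device, instance) and join the colliding ceph_daemon values,
    # as get_metadata_and_osd_status() does for disk_occupation_human.
    human = m.group_by(
        ['device', 'instance'],
        {'ceph_daemon': lambda daemons: ', '.join(daemons)},
        name='disk_occupation_human',
    )

    print(human.value)
    # {('osd.5, osd.6', '/dev/dm-1', 'node2'): 1}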