Skip to content

Commit

Permalink
Tools 2679 show stop writes bug (#210)
Browse files: browse the repository at this point in the history
* fix: TOOLS-2679 show stop-writes does not report storage-engine device or pmem metrics
  • Loading branch information
jdogmcsteezy committed Oct 6, 2023
1 parent dc10b8f commit 0e51659
Show file tree
Hide file tree
Showing 5 changed files with 261 additions and 102 deletions.
4 changes: 2 additions & 2 deletions lib/health/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -407,13 +407,13 @@
stop_writes = select "stop_writes" from NAMESPACE.STATISTICS;
stop_writes = group by CLUSTER, NAMESPACE stop_writes;
ASSERT(stop_writes, False, "Namespace has hit stop-writes (stop_writes = true)", "OPERATIONS" , CRITICAL,
"Listed namespace(s) have hit stop-write. Please run 'show statistics namespace like stop_writes' for details.",
"Listed namespace(s) have hit stop-write. Please run 'show stop-writes' for details.",
"Namespace stop-writes flag check.");
clock_skew_stop_writes = select "clock_skew_stop_writes" from NAMESPACE.STATISTICS;
clock_skew_stop_writes = group by CLUSTER, NAMESPACE clock_skew_stop_writes;
ASSERT(clock_skew_stop_writes, False, "Namespace has hit clock-skew-stop-writes (clock_skew_stop_writes = true)", "OPERATIONS" , CRITICAL,
"Listed namespace(s) have hit clock-skew-stop-writes. Please run 'show statistics namespace like clock_skew_stop_writes' for details.",
"Listed namespace(s) have hit clock-skew-stop-writes. Please run 'show stop-writes' for details.",
"Namespace clock-skew-stop-writes flag check.");
SET CONSTRAINT VERSION < 4.3;
Expand Down
168 changes: 111 additions & 57 deletions lib/utils/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -1647,41 +1647,80 @@ def _create_stop_writes_entry(
node_sw_metrics[(namespace, set_, metric)] = entry


@staticmethod
def _is_stop_writes_cause(
    usage: int | float,
    threshold: int | float,
    stop_writes: str | None = None,
    invert: bool = False,
):
    """Decide whether a metric has crossed its stop-writes threshold.

    Args:
        usage: Current value of the metric.
        threshold: Limit the metric is compared against. A threshold of
            ``0`` always yields ``False``.
        stop_writes: Optional stop-writes flag as a string. When given, the
            metric only counts as a cause if the flag equals ``"true"``
            (case-insensitive). ``None`` skips the flag check.
        invert: When ``True`` the metric trips at or below the threshold
            (``usage <= threshold``) instead of at or above it.

    Returns:
        ``True`` when the threshold is crossed and the flag check passes,
        otherwise ``False``.
    """
    # NOTE(review): this is called unqualified elsewhere in the file, so it
    # looks like a module-level function; a bare staticmethod object is only
    # directly callable on Python 3.10+ -- confirm the decorator is intended.
    if threshold == 0:
        return False

    crossed = usage <= threshold if invert else usage >= threshold

    # The comparison is evaluated first and the flag is only inspected when
    # the threshold was crossed, matching the original short-circuit order.
    return bool(
        crossed and (stop_writes is None or stop_writes.lower() == "true")
    )


@staticmethod
def _get_first_value_from_dict_with_key(
    dict_: dict[str, Any],
    key: str | tuple,
    default_value: Any = None,
    return_type: type = str,
) -> tuple[Any, Any]:
    """Look up the first of several candidate keys that exists in ``dict_``.

    Args:
        dict_: Mapping to search.
        key: A single key, or a tuple of keys tried in order.
        default_value: Forwarded to ``util.get_value_from_dict`` for the
            matching key.
        return_type: Forwarded to ``util.get_value_from_dict`` for the
            matching key.

    Returns:
        ``(matched_key, value)`` for the first candidate present in
        ``dict_``, where ``value`` is whatever ``util.get_value_from_dict``
        returns for it. If no candidate is present the result is
        ``(None, None)``; ``default_value`` is not returned in that case.
    """
    candidates = (key,) if isinstance(key, str) else key

    # A distinct loop variable avoids rebinding the ``key`` parameter while
    # it is being iterated.
    for candidate in candidates:
        if candidate in dict_:
            return candidate, util.get_value_from_dict(
                dict_,
                candidate,
                default_value=default_value,
                return_type=return_type,
            )

    return None, None


def _format_ns_stop_writes_metrics(
stop_writes_metrics: StopWritesDict,
service_stats,
ns_stats,
):
for node in service_stats:
cluster_clock_skew_ms = service_stats[node].get("cluster_clock_skew_ms", None)
cluster_clock_skew_stop_writes_sec = service_stats[node].get(
"cluster_clock_skew_stop_writes_sec", None
cluster_clock_skew_ms: int | None = util.get_value_from_dict(
service_stats[node],
"cluster_clock_skew_ms",
None,
return_type=int,
)
cluster_clock_skew_stop_writes_sec: int | None = util.get_value_from_dict(
service_stats[node],
"cluster_clock_skew_stop_writes_sec",
None,
return_type=int,
)
system_free_mem_pct: int | None = util.get_value_from_dict(
service_stats[node],
"system_free_mem_pct",
None,
return_type=int,
)
system_free_mem_pct = service_stats[node].get("system_free_mem_pct", None)

for ns, stats in ns_stats.get(node, {}).items():
# There is no config for this trigger
strong_consistency: str | None = stats.get("strong-consistency", None)
nsup_period: str | None = stats.get("nsup-period", None)
stop_writes: str | None = stats.get("clock_skew_stop_writes", None)
metric: str = "cluster_clock_skew_ms"
usage = cluster_clock_skew_ms
threshold = cluster_clock_skew_stop_writes_sec
metric = "cluster_clock_skew_ms"
usage: int | float | None = cluster_clock_skew_ms
threshold: int | float | None = cluster_clock_skew_stop_writes_sec

"""
For Available mode (AP) namespaces running versions 4.5.1 or above and where
Expand All @@ -1694,34 +1733,32 @@ def _format_ns_stop_writes_metrics(
and nsup_period is not None # nsup-period was added in 4.5.1.
and nsup_period != "0"
):
thresh = 40000
threshold = 40000
else:
thresh = (
int(cluster_clock_skew_stop_writes_sec) * 1000
) # convert to ms
use = int(usage)
sw = _is_stop_writes_cause(use, thresh, stop_writes)
threshold = int(threshold) * 1000 # convert to ms

sw = _is_stop_writes_cause(usage, threshold, stop_writes)
_create_stop_writes_entry(
stop_writes_metrics[node],
metric,
use,
usage,
sw,
thresh,
threshold,
namespace=ns,
)

stop_writes: str | None = stats.get("stop_writes", None)
metric = "system_free_mem_pct"
config = "stop-writes-sys-memory-pct"
threshold: str | None = stats.get(config, None)
threshold = util.get_value_from_dict(stats, config, None, return_type=int)

if (
threshold is not None
and system_free_mem_pct is not None
and stop_writes is not None
):
thresh = int(threshold)
use = 100 - int(system_free_mem_pct)
thresh = threshold
use = 100 - system_free_mem_pct
sw = _is_stop_writes_cause(use, thresh, stop_writes)
_create_stop_writes_entry(
stop_writes_metrics[node],
Expand All @@ -1740,70 +1777,87 @@ def _format_ns_stop_writes_metrics(
if stop_writes is None:
continue

metric = "device_avail_pct"
config = "min-avail-pct"
usage: str | None = stats.get(metric, None)
threshold: str | None = stats.get(config, None)

if usage is None:
metric = "pmem_avail_pct"
usage = stats.get(metric, None)
metric, usage = _get_first_value_from_dict_with_key(
stats,
("data_avail_pct", "device_available_pct", "pmem_available_pct"),
default_value=None,
return_type=int,
)
config, threshold = _get_first_value_from_dict_with_key(
stats,
(
"storage-engine.stop-writes-avail-pct",
"storage-engine.min-avail-pct",
),
default_value=None,
return_type=int,
)

if usage is not None and threshold is not None:
use = int(usage)
thresh = int(threshold)
sw = _is_stop_writes_cause(use, thresh, stop_writes)
sw = _is_stop_writes_cause(usage, threshold, stop_writes, invert=True)
_create_stop_writes_entry(
stop_writes_metrics[node],
metric,
use,
usage,
sw,
thresh,
threshold,
config=config,
namespace=ns,
)

metric = "device_used_bytes"
config = "max-used-pct"
usage: str | None = stats.get(metric, None)
bytes_total: str | None = stats.get("device_total_bytes", None)
threshold: str | None = stats.get(config, None)

if usage is None:
metric = "pmem_used_bytes"
usage = stats.get(metric, None)
bytes_total = stats.get("pmem_total_bytes", None)
metric, usage = _get_first_value_from_dict_with_key(
stats,
("data_used_bytes", "device_used_bytes", "pmem_used_bytes"),
default_value=None,
return_type=int,
)
config, threshold = _get_first_value_from_dict_with_key(
stats,
("storage-engine.stop-writes-used-pct", "storage-engine.max-used-pct"),
default_value=None,
return_type=int,
)
bytes_total: int | float | None = util.get_value_from_dict(
stats,
("data_total_bytes", "device_total_bytes", "pmem_total_bytes"),
None,
return_type=int,
)

if usage is not None and threshold is not None and bytes_total is not None:
use = int(usage)
thresh = int(bytes_total) * (int(threshold) / 100)
sw = _is_stop_writes_cause(use, thresh, stop_writes)
threshold = bytes_total * (threshold / 100)
sw = _is_stop_writes_cause(usage, threshold, stop_writes)
_create_stop_writes_entry(
stop_writes_metrics[node],
metric,
use,
usage,
sw,
thresh,
threshold,
config=config,
namespace=ns,
)

metric = "memory_used_bytes"
config = "stop-writes-pct"
usage: str | None = stats.get(metric, None)
bytes_total: str | None = stats.get("memory-size", None)
threshold: str | None = stats.get(config, None)
usage = util.get_value_from_dict(
stats, metric, default_value=None, return_type=int
)
bytes_total = util.get_value_from_dict(
stats, "memory-size", default_value=None, return_type=int
)
threshold = util.get_value_from_dict(
stats, config, default_value=None, return_type=int
)

if usage is not None and threshold is not None and bytes_total is not None:
use = int(usage)
thresh = int(bytes_total) * (int(threshold) / 100)
sw = _is_stop_writes_cause(use, thresh, stop_writes)
threshold = bytes_total * (threshold / 100)
sw = _is_stop_writes_cause(usage, threshold, stop_writes)
_create_stop_writes_entry(
stop_writes_metrics[node],
metric,
use,
usage,
sw,
thresh,
threshold,
config=config,
namespace=ns,
)
Expand Down
13 changes: 8 additions & 5 deletions lib/view/sheet/decleration.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,16 +147,19 @@ def fun(edata: EntryValue):
return fun

@staticmethod
def _fmt_pct_type(val: float, invert: bool = False):
    """Format a percentage value as text with a trailing ``" %"``.

    Args:
        val: The percentage; anything ``float()`` accepts.
        invert: When ``True`` the formatted number is ``100 - val``.

    Returns:
        The value rounded to two decimal places followed by ``" %"``.
    """
    # Coerce before subtracting so inputs that float() accepts (such as
    # numeric strings) behave the same with and without ``invert``.
    pct_value = float(val)

    if invert:
        pct_value = 100 - pct_value

    return str(round(pct_value, 2)) + " %"

@staticmethod
def ratio_to_pct(edata: EntryValue, invert: bool = False):
    """Render a ratio held in ``edata.value`` as percentage text.

    The ratio is multiplied by 100 and passed to
    ``Converters._fmt_pct_type``.

    Args:
        edata: Entry whose ``value`` attribute holds the ratio.
        invert: Forwarded to ``Converters._fmt_pct_type``.

    Returns:
        The string produced by ``Converters._fmt_pct_type``.
    """
    return Converters._fmt_pct_type(edata.value * 100, invert)

@staticmethod
def pct(edata: EntryValue, invert: bool = False):
    """Render the percentage held in ``edata.value`` as text.

    Args:
        edata: Entry whose ``value`` attribute holds the percentage.
        invert: Forwarded to ``Converters._fmt_pct_type``.

    Returns:
        The string produced by ``Converters._fmt_pct_type``.
    """
    return Converters._fmt_pct_type(edata.value, invert)


FormatterPredicateFnType = Callable[[EntryData], bool]
Expand Down
33 changes: 27 additions & 6 deletions lib/view/templates.py
Original file line number Diff line number Diff line change
Expand Up @@ -1677,8 +1677,14 @@ def stop_writes_converter_selector(edata: EntryData):
return None

metric = edata.record["Metric"]
val = ""

if "pct" in metric:
if "avail" in metric:
val = Converters.pct(edata, invert=True)
val = "(inverted) " + val
return val

return Converters.pct(edata)
if "bytes" in metric:
return Converters.byte(edata)
Expand All @@ -1688,6 +1694,25 @@ def stop_writes_converter_selector(edata: EntryData):
return Converters.scientific_units(edata)


class StopWritesUsagePctProjector(Projectors.Number):
    """Number projector that reports ``100 - value`` for "avail" metrics.

    The projected number is flipped when the ``stop_writes`` source record
    has a ``metric`` entry containing ``"avail"``, or when the projector was
    constructed with ``invert=True``.
    """

    def __init__(self, source, *keys, **kwargs):
        """
        Keyword Arguments:
        invert -- False by default, if True will return 100 - value.
        """
        # NOTE(review): ``invert`` is also forwarded to the base constructor
        # via **kwargs, as before -- confirm the base class tolerates it.
        super().__init__(source, *keys, **kwargs)
        self.invert = kwargs.get("invert", False)

    def do_project(self, sheet, sources):
        """Project the number and flip it to ``100 - value`` when applicable.

        The record is always read from the ``"stop_writes"`` entry of
        ``sources`` (second element of that entry), regardless of the
        ``source`` name given to the constructor. The result is passed
        through ``_ignore_zero`` before being returned.
        """
        data = sources.get("stop_writes", ((), {}))[1]
        val = super().do_project(sheet, sources)

        # Honour the documented ``invert`` keyword in addition to the
        # metric-name check; with the default of False behaviour is unchanged.
        if self.invert or ("metric" in data and "avail" in data["metric"]):
            val = 100 - val

        return _ignore_zero(val)


sw_row_yellow_format = (
Formatters.yellow_alert(lambda edata: edata.record["Stop-Writes"] == True),
)
Expand Down Expand Up @@ -1738,12 +1763,8 @@ def stop_writes_converter_selector(edata: EntryData):
Field(
"Usage%",
Projectors.Div(
Projectors.Number("stop_writes", "metric_usage"),
Projectors.Func(
FieldType.number,
_ignore_zero,
Projectors.Number("stop_writes", "metric_threshold"),
),
StopWritesUsagePctProjector("stop_writes", "metric_usage"),
StopWritesUsagePctProjector("stop_writes", "metric_threshold"),
),
converter=Converters.ratio_to_pct,
formatters=sw_val_red_format + sw_val_yellow_format + sw_row_yellow_format,
Expand Down
Loading

0 comments on commit 0e51659

Please sign in to comment.