From d43c351912c96fd44eb5377b06b0717ce0628f4c Mon Sep 17 00:00:00 2001 From: Adam King Date: Wed, 17 Apr 2024 09:36:13 -0400 Subject: [PATCH 1/2] mgr/cephadm: don't mark daemons created/removed in the last minute as stray There is sometimes a slight delay between when the core mgr knows a daemon has been created/removed and when cephadm knows it as been created/removed. This can cause stray daemon warnings to pop up for a few seconds at a time. This patch tries to avoid that by not marking daemons as stray that it knows it just created/removed in the past minute. Signed-off-by: Adam King (cherry picked from commit 2f31a481ffc7cf6322b957f52bbfaa6ebbd96da2) --- src/pybind/mgr/cephadm/module.py | 5 +++++ src/pybind/mgr/cephadm/serve.py | 12 ++++++++++++ 2 files changed, 17 insertions(+) diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py index 49eab78fe70e6..2a75a08e0f60e 100644 --- a/src/pybind/mgr/cephadm/module.py +++ b/src/pybind/mgr/cephadm/module.py @@ -730,6 +730,11 @@ def __init__(self, *args: Any, **kwargs: Any): self.offline_watcher = OfflineHostWatcher(self) self.offline_watcher.start() + # Maps daemon names to timestamps (creation/removal time) for recently created or + # removed daemons. Daemons are added to the dict upon creation or removal and cleared + # as part of the handling of stray daemons + self.recently_altered_daemons: Dict[str, datetime.datetime] = {} + def shutdown(self) -> None: self.log.debug('shutdown') self._worker_pool.close() diff --git a/src/pybind/mgr/cephadm/serve.py b/src/pybind/mgr/cephadm/serve.py index 4c7889bd18fe0..1094ecb83137a 100644 --- a/src/pybind/mgr/cephadm/serve.py +++ b/src/pybind/mgr/cephadm/serve.py @@ -466,6 +466,11 @@ def _check_for_strays(self) -> None: for k in ['CEPHADM_STRAY_HOST', 'CEPHADM_STRAY_DAEMON']: self.mgr.remove_health_warning(k) + # clear recently altered daemons that were created/removed more than 60 seconds ago + self.mgr.recently_altered_daemons = { + d: t for (d, t) in self.mgr.recently_altered_daemons.items() + if ((datetime_now() - t).total_seconds() < 60) + } if self.mgr.warn_on_stray_hosts or self.mgr.warn_on_stray_daemons: ls = self.mgr.list_servers() self.log.debug(ls) @@ -504,6 +509,11 @@ def _check_for_strays(self) -> None: # and don't have a way to check if the daemon is part of iscsi service # we assume that all tcmu-runner daemons are managed by cephadm managed.append(name) + # Don't mark daemons we just created/removed in the last minute as stray. + # It may take some time for the mgr to become aware the daemon + # had been created/removed. + if name in self.mgr.recently_altered_daemons: + continue if host not in self.mgr.inventory: missing_names.append(name) host_num_daemons += 1 @@ -1409,6 +1419,7 @@ async def _create_daemon(self, what = 'reconfigure' if reconfig else 'deploy' self.mgr.events.for_daemon( daemon_spec.name(), OrchestratorEvent.ERROR, f'Failed to {what}: {err}') + self.mgr.recently_altered_daemons[daemon_spec.name()] = datetime_now() return msg except OrchestratorError: redeploy = daemon_spec.name() in self.mgr.cache.get_daemon_names() @@ -1508,6 +1519,7 @@ def _remove_daemon(self, name: str, host: str, no_post_remove: bool = False) -> daemon_type)].post_remove(daemon, is_failed_deploy=False)) self.mgr._kick_serve_loop() + self.mgr.recently_altered_daemons[name] = datetime_now() return "Removed {} from host '{}'".format(name, host) async def _run_cephadm_json(self, From 3dbbf324f0e1717c2a95a252a0b7b34cb301e1fd Mon Sep 17 00:00:00 2001 From: Adam King Date: Wed, 17 Apr 2024 10:07:09 -0400 Subject: [PATCH 2/2] mgr/cephadm: make remote_executables test able to handle ast.BinOp Was adding a line to serve.py that did if ((datetime_now() - t).total_seconds() < 60) and this was causing the remote_executables test to fail with ValueError: _names: unexpected type: where it seems the (datetime_now() - t) was resolving to an ast.BinOp node which had no case in _names. This patch makes it so the remote_executables test can also handle these BinOp nodes and the binary operations that could be within the node Signed-off-by: Adam King (cherry picked from commit d28aabd7e066d79d28e8fff46777e29d4b48ad59) --- .../cephadm/tests/test_remote_executables.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/pybind/mgr/cephadm/tests/test_remote_executables.py b/src/pybind/mgr/cephadm/tests/test_remote_executables.py index a98f81a8df1ed..9d5bd458254c3 100644 --- a/src/pybind/mgr/cephadm/tests/test_remote_executables.py +++ b/src/pybind/mgr/cephadm/tests/test_remote_executables.py @@ -102,6 +102,24 @@ def _names(node): return [f""] if isinstance(node, ast.Subscript): return [f""] + if isinstance(node, ast.BinOp): + return [f"