Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

squid: mgr/cephadm: don't mark daemons created/removed in the last minute as stray #57397

Open
wants to merge 2 commits into
base: squid
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
5 changes: 5 additions & 0 deletions src/pybind/mgr/cephadm/module.py
Original file line number Diff line number Diff line change
Expand Up @@ -730,6 +730,11 @@ def __init__(self, *args: Any, **kwargs: Any):
self.offline_watcher = OfflineHostWatcher(self)
self.offline_watcher.start()

# Maps daemon names to timestamps (creation/removal time) for recently created or
# removed daemons. Daemons are added to the dict upon creation or removal, and entries
# older than 60 seconds are cleared as part of the handling of stray daemons.
self.recently_altered_daemons: Dict[str, datetime.datetime] = {}

def shutdown(self) -> None:
self.log.debug('shutdown')
self._worker_pool.close()
Expand Down
12 changes: 12 additions & 0 deletions src/pybind/mgr/cephadm/serve.py
Original file line number Diff line number Diff line change
Expand Up @@ -466,6 +466,11 @@ def _check_for_strays(self) -> None:
for k in ['CEPHADM_STRAY_HOST',
'CEPHADM_STRAY_DAEMON']:
self.mgr.remove_health_warning(k)
# clear recently altered daemons that were created/removed more than 60 seconds ago
self.mgr.recently_altered_daemons = {
d: t for (d, t) in self.mgr.recently_altered_daemons.items()
if ((datetime_now() - t).total_seconds() < 60)
}
if self.mgr.warn_on_stray_hosts or self.mgr.warn_on_stray_daemons:
ls = self.mgr.list_servers()
self.log.debug(ls)
Expand Down Expand Up @@ -504,6 +509,11 @@ def _check_for_strays(self) -> None:
# and don't have a way to check if the daemon is part of iscsi service
# we assume that all tcmu-runner daemons are managed by cephadm
managed.append(name)
# Don't mark daemons we created/removed within the last minute as stray.
# It may take some time for the mgr to become aware that the daemon
# has been created/removed.
if name in self.mgr.recently_altered_daemons:
continue
if host not in self.mgr.inventory:
missing_names.append(name)
host_num_daemons += 1
Expand Down Expand Up @@ -1409,6 +1419,7 @@ async def _create_daemon(self,
what = 'reconfigure' if reconfig else 'deploy'
self.mgr.events.for_daemon(
daemon_spec.name(), OrchestratorEvent.ERROR, f'Failed to {what}: {err}')
self.mgr.recently_altered_daemons[daemon_spec.name()] = datetime_now()
return msg
except OrchestratorError:
redeploy = daemon_spec.name() in self.mgr.cache.get_daemon_names()
Expand Down Expand Up @@ -1508,6 +1519,7 @@ def _remove_daemon(self, name: str, host: str, no_post_remove: bool = False) ->
daemon_type)].post_remove(daemon, is_failed_deploy=False))
self.mgr._kick_serve_loop()

self.mgr.recently_altered_daemons[name] = datetime_now()
return "Removed {} from host '{}'".format(name, host)

async def _run_cephadm_json(self,
Expand Down
18 changes: 18 additions & 0 deletions src/pybind/mgr/cephadm/tests/test_remote_executables.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,24 @@ def _names(node):
return [f"<JoinedStr: {node.values!r}>"]
if isinstance(node, ast.Subscript):
return [f"<Subscript: {node.value}{node.slice}>"]
if isinstance(node, ast.BinOp):
return [f"<BinaryOp: {_names(node.left)} {_names(node.op)} {_names(node.right)}"]
if (
isinstance(node, ast.Add)
or isinstance(node, ast.Sub)
or isinstance(node, ast.Mult)
or isinstance(node, ast.Div)
or isinstance(node, ast.FloorDiv)
or isinstance(node, ast.Mod)
or isinstance(node, ast.Pow)
or isinstance(node, ast.LShift)
or isinstance(node, ast.RShift)
or isinstance(node, ast.BitOr)
or isinstance(node, ast.BitXor)
or isinstance(node, ast.BitAnd)
or isinstance(node, ast.MatMult)
):
return [repr(node)]
raise ValueError(f"_names: unexpected type: {node}")


Expand Down