Skip to content

Commit

Permalink
mgr/cephadm: don't mark daemons created/removed in the last minute as…
Browse files Browse the repository at this point in the history
… stray

There is sometimes a slight delay between when the core
mgr knows a daemon has been created/removed and when cephadm knows
it has been created/removed. This can cause stray daemon warnings
to pop up for a few seconds at a time. This patch tries
to avoid that by not marking as stray any daemon that cephadm
knows it created/removed within the past minute.

Signed-off-by: Adam King <adking@redhat.com>
  • Loading branch information
adk3798 committed Apr 22, 2024
1 parent 56e81df commit 2f31a48
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 0 deletions.
5 changes: 5 additions & 0 deletions src/pybind/mgr/cephadm/module.py
Original file line number Diff line number Diff line change
Expand Up @@ -730,6 +730,11 @@ def __init__(self, *args: Any, **kwargs: Any):
self.offline_watcher = OfflineHostWatcher(self)
self.offline_watcher.start()

# Maps daemon names to timestamps (creation/removal time) for recently created or
# removed daemons. Daemons are added to the dict upon creation or removal and cleared
# as part of the handling of stray daemons
self.recently_altered_daemons: Dict[str, datetime.datetime] = {}

def shutdown(self) -> None:
self.log.debug('shutdown')
self._worker_pool.close()
Expand Down
12 changes: 12 additions & 0 deletions src/pybind/mgr/cephadm/serve.py
Original file line number Diff line number Diff line change
Expand Up @@ -466,6 +466,11 @@ def _check_for_strays(self) -> None:
for k in ['CEPHADM_STRAY_HOST',
'CEPHADM_STRAY_DAEMON']:
self.mgr.remove_health_warning(k)
# clear recently altered daemons that were created/removed more than 60 seconds ago
self.mgr.recently_altered_daemons = {
d: t for (d, t) in self.mgr.recently_altered_daemons.items()
if ((datetime_now() - t).total_seconds() < 60)
}
if self.mgr.warn_on_stray_hosts or self.mgr.warn_on_stray_daemons:
ls = self.mgr.list_servers()
self.log.debug(ls)
Expand Down Expand Up @@ -504,6 +509,11 @@ def _check_for_strays(self) -> None:
# and don't have a way to check if the daemon is part of iscsi service
# we assume that all tcmu-runner daemons are managed by cephadm
managed.append(name)
# Don't mark daemons we just created/removed in the last minute as stray.
# It may take some time for the mgr to become aware the daemon
# had been created/removed.
if name in self.mgr.recently_altered_daemons:
continue
if host not in self.mgr.inventory:
missing_names.append(name)
host_num_daemons += 1
Expand Down Expand Up @@ -1409,6 +1419,7 @@ async def _create_daemon(self,
what = 'reconfigure' if reconfig else 'deploy'
self.mgr.events.for_daemon(
daemon_spec.name(), OrchestratorEvent.ERROR, f'Failed to {what}: {err}')
self.mgr.recently_altered_daemons[daemon_spec.name()] = datetime_now()
return msg
except OrchestratorError:
redeploy = daemon_spec.name() in self.mgr.cache.get_daemon_names()
Expand Down Expand Up @@ -1508,6 +1519,7 @@ def _remove_daemon(self, name: str, host: str, no_post_remove: bool = False) ->
daemon_type)].post_remove(daemon, is_failed_deploy=False))
self.mgr._kick_serve_loop()

self.mgr.recently_altered_daemons[name] = datetime_now()
return "Removed {} from host '{}'".format(name, host)

async def _run_cephadm_json(self,
Expand Down

0 comments on commit 2f31a48

Please sign in to comment.