mgr/cephadm: improve offline host handling, mostly around upgrade #48592

Merged
merged 7 commits on Jan 13, 2023
9 changes: 8 additions & 1 deletion src/pybind/mgr/cephadm/module.py
@@ -1098,6 +1098,10 @@ def check_host(self, host: str, addr: Optional[str] = None) -> Tuple[int, str, s
error_ok=True, no_fsid=True))
if code:
return 1, '', ('check-host failed:\n' + '\n'.join(err))
except ssh.HostConnectionError as e:
self.log.exception(f"check-host failed for '{host}' at addr ({e.addr}) due to connection failure: {str(e)}")
return 1, '', ('check-host failed:\n'
+ f"Failed to connect to {host} at address ({e.addr}): {str(e)}")
except OrchestratorError:
self.log.exception(f"check-host failed for '{host}'")
return 1, '', ('check-host failed:\n'
@@ -1578,6 +1582,7 @@ def run_cmd(cmd_args: dict) -> None:
self.inventory.rm_host(host)
self.cache.rm_host(host)
self.ssh.reset_con(host)
self.offline_hosts_remove(host) # if host was in offline host list, we should remove it now.
self.event.set() # refresh stray health check
self.log.info('Removed host %s' % host)
return "Removed {} host '{}'".format('offline' if offline else '', host)
@@ -2989,7 +2994,9 @@ def upgrade_ls(self, image: Optional[str], tags: bool, show_all_versions: Option
def upgrade_start(self, image: str, version: str, daemon_types: Optional[List[str]] = None, host_placement: Optional[str] = None,
services: Optional[List[str]] = None, limit: Optional[int] = None) -> str:
if self.inventory.get_host_with_state("maintenance"):
raise OrchestratorError("upgrade aborted - you have host(s) in maintenance state")
raise OrchestratorError("Upgrade aborted - you have host(s) in maintenance state")
if self.offline_hosts:
raise OrchestratorError(f"Upgrade aborted - Some host(s) are currently offline: {self.offline_hosts}")
if daemon_types is not None and services is not None:
raise OrchestratorError('--daemon-types and --services are mutually exclusive')
if daemon_types is not None:
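As a rough illustration of the pre-flight guard added to upgrade_start above: any offline host now aborts the upgrade before it begins, alongside the existing maintenance-host check. The names below (start_upgrade, UpgradeAbort) are hypothetical stand-ins for the module's method and OrchestratorError, used only to show the check in isolation.

# Minimal sketch (hypothetical names) of the offline-host guard added to upgrade_start.
class UpgradeAbort(Exception):  # stand-in for orchestrator.OrchestratorError
    pass

def start_upgrade(offline_hosts: set, maintenance_hosts: set) -> str:
    if maintenance_hosts:
        raise UpgradeAbort("Upgrade aborted - you have host(s) in maintenance state")
    if offline_hosts:
        # Abort early: an unreachable host would otherwise surface mid-upgrade as daemons
        # that are never "ok to stop" or as connection failures partway through.
        raise UpgradeAbort(f"Upgrade aborted - Some host(s) are currently offline: {offline_hosts}")
    return "initiating upgrade"

try:
    start_upgrade(offline_hosts={"host2"}, maintenance_hosts=set())
except UpgradeAbort as e:
    print(e)  # Upgrade aborted - Some host(s) are currently offline: {'host2'}
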
23 changes: 18 additions & 5 deletions src/pybind/mgr/cephadm/ssh.py
@@ -26,6 +26,14 @@
asyncssh_logger = logging.getLogger('asyncssh')
asyncssh_logger.propagate = False


class HostConnectionError(OrchestratorError):
def __init__(self, message: str, hostname: str, addr: str) -> None:
super().__init__(message)
self.hostname = hostname
self.addr = addr


DEFAULT_SSH_CONFIG = """
Host *
User root
@@ -106,19 +114,19 @@ def redirect_log(self, host: str, addr: str) -> Iterator[None]:
log_content = log_string.getvalue()
msg = f"Can't communicate with remote host `{addr}`, possibly because python3 is not installed there or you are missing NOPASSWD in sudoers. {str(e)}"
logger.exception(msg)
raise OrchestratorError(msg)
raise HostConnectionError(msg, host, addr)
except asyncssh.Error as e:
self.mgr.offline_hosts.add(host)
log_content = log_string.getvalue()
msg = f'Failed to connect to {host} ({addr}). {str(e)}' + '\n' + f'Log: {log_content}'
logger.debug(msg)
raise OrchestratorError(msg)
raise HostConnectionError(msg, host, addr)
except Exception as e:
self.mgr.offline_hosts.add(host)
log_content = log_string.getvalue()
logger.exception(str(e))
raise OrchestratorError(
f'Failed to connect to {host} ({addr}): {repr(e)}' + '\n' f'Log: {log_content}')
raise HostConnectionError(
f'Failed to connect to {host} ({addr}): {repr(e)}' + '\n' f'Log: {log_content}', host, addr)
finally:
log_string.flush()
asyncssh_logger.removeHandler(ch)
@@ -148,7 +156,12 @@ async def _execute_command(self,
logger.debug(f'Connection to {host} failed. {str(e)}')
await self._reset_con(host)
self.mgr.offline_hosts.add(host)
raise OrchestratorError(f'Unable to reach remote host {host}. {str(e)}')
if not addr:
try:
addr = self.mgr.inventory.get_addr(host)
except Exception:
addr = host
raise HostConnectionError(f'Unable to reach remote host {host}. {str(e)}', host, addr)

def _rstrip(v: Union[bytes, str, None]) -> str:
if not v:
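For readers skimming the hunk above, a self-contained sketch of the new exception's shape: it keeps the human-readable message but also carries the hostname and address, so callers such as check_host and the upgrade loop can build targeted output. The plain Exception base here is only a stand-in for OrchestratorError.

# Self-contained sketch of the HostConnectionError introduced in ssh.py.
class OrchestratorErrorStub(Exception):  # stand-in for orchestrator.OrchestratorError
    pass

class HostConnectionError(OrchestratorErrorStub):
    def __init__(self, message: str, hostname: str, addr: str) -> None:
        super().__init__(message)
        self.hostname = hostname
        self.addr = addr

# Callers can now report both which host failed and which address was tried.
try:
    raise HostConnectionError("Unable to reach remote host host1. timed out", "host1", "10.0.0.5")
except HostConnectionError as e:
    print(f"Failed to connect to {e.hostname} at address ({e.addr}): {e}")
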
2 changes: 1 addition & 1 deletion src/pybind/mgr/cephadm/tests/test_ssh.py
@@ -38,7 +38,7 @@ def test_offline(self, check_execute_command, execute_command, cephadm_module):
asyncssh_connect.side_effect = ConnectionLost('reason')
code, out, err = cephadm_module.check_host('test')
assert out == ''
assert "Host 'test' not found" in err
assert "Failed to connect to test at address (1::4)" in err

out = wait(cephadm_module, cephadm_module.get_hosts())[0].to_json()
assert out == HostSpec('test', '1::4', status='Offline').to_json()
48 changes: 47 additions & 1 deletion src/pybind/mgr/cephadm/tests/test_upgrade.py
@@ -5,7 +5,8 @@

from ceph.deployment.service_spec import PlacementSpec, ServiceSpec
from cephadm import CephadmOrchestrator
from cephadm.upgrade import CephadmUpgrade
from cephadm.upgrade import CephadmUpgrade, UpgradeState
from cephadm.ssh import HostConnectionError
from orchestrator import OrchestratorError, DaemonDescription
from .fixtures import _run_cephadm, wait, with_host, with_service, \
receive_agent_metadata, async_side_effect
@@ -34,6 +35,51 @@ def test_upgrade_start(cephadm_module: CephadmOrchestrator):
) == 'Stopped upgrade to image_id'


@mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('{}'))
def test_upgrade_start_offline_hosts(cephadm_module: CephadmOrchestrator):
with with_host(cephadm_module, 'test'):
with with_host(cephadm_module, 'test2'):
cephadm_module.offline_hosts = set(['test2'])
with pytest.raises(OrchestratorError, match=r"Upgrade aborted - Some host\(s\) are currently offline: {'test2'}"):
cephadm_module.upgrade_start('image_id', None)
cephadm_module.offline_hosts = set([]) # so remove_host doesn't fail when leaving the with_host block


@mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('{}'))
def test_upgrade_daemons_offline_hosts(cephadm_module: CephadmOrchestrator):
with with_host(cephadm_module, 'test'):
with with_host(cephadm_module, 'test2'):
cephadm_module.upgrade.upgrade_state = UpgradeState('target_image', 0)
with mock.patch("cephadm.serve.CephadmServe._run_cephadm", side_effect=HostConnectionError('connection failure reason', 'test2', '192.168.122.1')):
_to_upgrade = [(DaemonDescription(daemon_type='crash', daemon_id='test2', hostname='test2'), True)]
with pytest.raises(HostConnectionError, match=r"connection failure reason"):
cephadm_module.upgrade._upgrade_daemons(_to_upgrade, 'target_image', ['digest1'])


@mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('{}'))
def test_do_upgrade_offline_hosts(cephadm_module: CephadmOrchestrator):
with with_host(cephadm_module, 'test'):
with with_host(cephadm_module, 'test2'):
cephadm_module.upgrade.upgrade_state = UpgradeState('target_image', 0)
cephadm_module.offline_hosts = set(['test2'])
with pytest.raises(HostConnectionError, match=r"Host\(s\) were marked offline: {'test2'}"):
cephadm_module.upgrade._do_upgrade()
cephadm_module.offline_hosts = set([]) # so remove_host doesn't fail when leaving the with_host block


@mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('{}'))
@mock.patch("cephadm.module.CephadmOrchestrator.remove_health_warning")
def test_upgrade_resume_clear_health_warnings(_rm_health_warning, cephadm_module: CephadmOrchestrator):
with with_host(cephadm_module, 'test'):
with with_host(cephadm_module, 'test2'):
cephadm_module.upgrade.upgrade_state = UpgradeState('target_image', 0, paused=True)
_rm_health_warning.return_value = None
assert wait(cephadm_module, cephadm_module.upgrade_resume()
) == 'Resumed upgrade to target_image'
calls_list = [mock.call(alert_id) for alert_id in cephadm_module.upgrade.UPGRADE_ERRORS]
_rm_health_warning.assert_has_calls(calls_list, any_order=True)


@mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('{}'))
@pytest.mark.parametrize("use_repo_digest",
[
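The resume test above leans on unittest.mock's assert_has_calls with any_order=True; a compact, standalone illustration of that idiom (with an abbreviated, made-up alert list) for anyone adapting these tests:

from unittest import mock

# Standalone illustration of the assert_has_calls(any_order=True) idiom used above.
UPGRADE_ERRORS = ['UPGRADE_OFFLINE_HOST', 'UPGRADE_EXCEPTION']  # abbreviated for the example

rm_health_warning = mock.Mock(return_value=None)
for alert_id in reversed(UPGRADE_ERRORS):  # call order deliberately scrambled
    rm_health_warning(alert_id)

# any_order=True only checks that every expected call occurred, not their sequence.
expected = [mock.call(alert_id) for alert_id in UPGRADE_ERRORS]
rm_health_warning.assert_has_calls(expected, any_order=True)
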
26 changes: 25 additions & 1 deletion src/pybind/mgr/cephadm/upgrade.py
@@ -10,6 +10,7 @@
from cephadm.services.cephadmservice import CephadmDaemonDeploySpec
from cephadm.utils import ceph_release_to_major, name_to_config_section, CEPH_UPGRADE_ORDER, \
MONITORING_STACK_TYPES, CEPH_TYPES, GATEWAY_TYPES
from cephadm.ssh import HostConnectionError
from orchestrator import OrchestratorError, DaemonDescription, DaemonDescriptionStatus, daemon_type_to_service

if TYPE_CHECKING:
@@ -122,7 +123,8 @@ class CephadmUpgrade:
'UPGRADE_FAILED_PULL',
'UPGRADE_REDEPLOY_DAEMON',
'UPGRADE_BAD_TARGET_VERSION',
'UPGRADE_EXCEPTION'
'UPGRADE_EXCEPTION',
'UPGRADE_OFFLINE_HOST'
]

def __init__(self, mgr: "CephadmOrchestrator"):
@@ -467,6 +469,8 @@ def upgrade_resume(self) -> str:
self.mgr.log.info('Upgrade: Resumed upgrade to %s' % self.target_image)
self._save_upgrade_state()
self.mgr.event.set()
for alert_id in self.UPGRADE_ERRORS:
self.mgr.remove_health_warning(alert_id)
return 'Resumed upgrade to %s' % self.target_image

def upgrade_stop(self) -> str:
@@ -491,6 +495,14 @@ def continue_upgrade(self) -> bool:
if self.upgrade_state and not self.upgrade_state.paused:
try:
self._do_upgrade()
except HostConnectionError as e:
self._fail_upgrade('UPGRADE_OFFLINE_HOST', {
'severity': 'error',
'summary': f'Upgrade: Failed to connect to host {e.hostname} at addr ({e.addr})',
'count': 1,
'detail': [f'SSH connection failed to {e.hostname} at addr ({e.addr}): {str(e)}'],
})
return False
except Exception as e:
self._fail_upgrade('UPGRADE_EXCEPTION', {
'severity': 'error',
@@ -1025,6 +1037,18 @@ def _do_upgrade(self):
logger.debug('_do_upgrade no state, exiting')
return

if self.mgr.offline_hosts:
# offline host(s), on top of potential connection errors when trying to upgrade a daemon
# or pull an image, can cause issues where daemons are never ok to stop. Since evaluating
# whether or not that risk is present for any given offline host is a difficult problem,
# it's best to just fail the upgrade cleanly so the user can address the offline host(s).

# the HostConnectionError expects a hostname and addr, so let's just take
# one at random. It doesn't really matter which host we say we couldn't reach here.
hostname: str = list(self.mgr.offline_hosts)[0]
addr: str = self.mgr.inventory.get_addr(hostname)
raise HostConnectionError(f'Host(s) were marked offline: {self.mgr.offline_hosts}', hostname, addr)

target_image = self.target_image
target_id = self.upgrade_state.target_id
target_digests = self.upgrade_state.target_digests
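Putting the two upgrade.py hunks together, a rough sketch of the new error routing: _do_upgrade raises HostConnectionError when any host is marked offline, and continue_upgrade converts that into the dedicated UPGRADE_OFFLINE_HOST health warning instead of the generic UPGRADE_EXCEPTION path. fail_upgrade and the bare classes below are illustrative stand-ins, not the module's real implementations.

# Illustrative sketch (hypothetical helpers) of the offline-host error routing above.
class HostConnectionError(Exception):
    def __init__(self, message: str, hostname: str, addr: str) -> None:
        super().__init__(message)
        self.hostname, self.addr = hostname, addr

def fail_upgrade(alert_id: str, alert: dict) -> None:  # stand-in for CephadmUpgrade._fail_upgrade
    print(f"{alert_id}: {alert['summary']}")

def do_upgrade(offline_hosts: set, addr_of) -> None:  # stand-in for _do_upgrade
    if offline_hosts:
        hostname = sorted(offline_hosts)[0]  # any offline host will do for the report
        raise HostConnectionError(f"Host(s) were marked offline: {offline_hosts}", hostname, addr_of(hostname))

def continue_upgrade(offline_hosts: set, addr_of) -> bool:  # stand-in for continue_upgrade
    try:
        do_upgrade(offline_hosts, addr_of)
    except HostConnectionError as e:
        # Offline hosts get their own alert id so 'ceph health' points at the host, not a traceback.
        fail_upgrade('UPGRADE_OFFLINE_HOST', {
            'severity': 'error',
            'summary': f'Upgrade: Failed to connect to host {e.hostname} at addr ({e.addr})',
            'count': 1,
            'detail': [f'SSH connection failed to {e.hostname} at addr ({e.addr}): {e}'],
        })
        return False
    return True

# Usage: one offline host pauses the upgrade with a targeted health warning.
continue_upgrade({'host2'}, addr_of=lambda h: '10.0.0.7')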