mgr/cephadm: improve offline host handling, mostly around upgrade #48592

Merged
merged 7 commits on Jan 13, 2023
9 changes: 8 additions & 1 deletion src/pybind/mgr/cephadm/module.py
@@ -1098,6 +1098,10 @@ def check_host(self, host: str, addr: Optional[str] = None) -> Tuple[int, str, s
error_ok=True, no_fsid=True))
if code:
return 1, '', ('check-host failed:\n' + '\n'.join(err))
except ssh.HostConnectionError as e:
self.log.exception(f"check-host failed for '{host}' at addr ({e.addr}) due to connection failure: {str(e)}")
return 1, '', ('check-host failed:\n'
+ f"Failed to connect to {host} at address ({e.addr}): {str(e)}")
except OrchestratorError:
self.log.exception(f"check-host failed for '{host}'")
return 1, '', ('check-host failed:\n'
@@ -1578,6 +1582,7 @@ def run_cmd(cmd_args: dict) -> None:
self.inventory.rm_host(host)
self.cache.rm_host(host)
self.ssh.reset_con(host)
self.offline_hosts_remove(host) # if host was in offline host list, we should remove it now.
self.event.set() # refresh stray health check
self.log.info('Removed host %s' % host)
return "Removed {} host '{}'".format('offline' if offline else '', host)
@@ -2989,7 +2994,9 @@ def upgrade_ls(self, image: Optional[str], tags: bool, show_all_versions: Option
def upgrade_start(self, image: str, version: str, daemon_types: Optional[List[str]] = None, host_placement: Optional[str] = None,
services: Optional[List[str]] = None, limit: Optional[int] = None) -> str:
if self.inventory.get_host_with_state("maintenance"):
raise OrchestratorError("upgrade aborted - you have host(s) in maintenance state")
raise OrchestratorError("Upgrade aborted - you have host(s) in maintenance state")
if self.offline_hosts:
raise OrchestratorError(f"Upgrade aborted - Some host(s) are currently offline: {self.offline_hosts}")
if daemon_types is not None and services is not None:
raise OrchestratorError('--daemon-types and --services are mutually exclusive')
if daemon_types is not None:
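As a rough illustration of the pre-flight guard added to upgrade_start above: any offline host now aborts the upgrade before it begins, alongside the existing maintenance-host check. The names below (start_upgrade, UpgradeAbort) are hypothetical stand-ins for the module's method and OrchestratorError, used only to show the check in isolation.

# Minimal sketch (hypothetical names) of the offline-host guard added to upgrade_start.
class UpgradeAbort(Exception):  # stand-in for orchestrator.OrchestratorError
    pass

def start_upgrade(offline_hosts: set, maintenance_hosts: set) -> str:
    if maintenance_hosts:
        raise UpgradeAbort("Upgrade aborted - you have host(s) in maintenance state")
    if offline_hosts:
        # Abort early: an unreachable host would otherwise surface mid-upgrade as daemons
        # that are never "ok to stop" or as connection failures partway through.
        raise UpgradeAbort(f"Upgrade aborted - Some host(s) are currently offline: {offline_hosts}")
    return "initiating upgrade"

try:
    start_upgrade(offline_hosts={"host2"}, maintenance_hosts=set())
except UpgradeAbort as e:
    print(e)  # Upgrade aborted - Some host(s) are currently offline: {'host2'}
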
23 changes: 18 additions & 5 deletions src/pybind/mgr/cephadm/ssh.py
@@ -26,6 +26,14 @@
asyncssh_logger = logging.getLogger('asyncssh')
asyncssh_logger.propagate = False


class HostConnectionError(OrchestratorError):
def __init__(self, message: str, hostname: str, addr: str) -> None:
super().__init__(message)
self.hostname = hostname
self.addr = addr


DEFAULT_SSH_CONFIG = """
Host *
User root
@@ -106,19 +114,19 @@ def redirect_log(self, host: str, addr: str) -> Iterator[None]:
log_content = log_string.getvalue()
msg = f"Can't communicate with remote host `{addr}`, possibly because python3 is not installed there or you are missing NOPASSWD in sudoers. {str(e)}"
logger.exception(msg)
raise OrchestratorError(msg)
raise HostConnectionError(msg, host, addr)
except asyncssh.Error as e:
self.mgr.offline_hosts.add(host)
log_content = log_string.getvalue()
msg = f'Failed to connect to {host} ({addr}). {str(e)}' + '\n' + f'Log: {log_content}'
logger.debug(msg)
raise OrchestratorError(msg)
raise HostConnectionError(msg, host, addr)
except Exception as e:
self.mgr.offline_hosts.add(host)
log_content = log_string.getvalue()
logger.exception(str(e))
raise OrchestratorError(
f'Failed to connect to {host} ({addr}): {repr(e)}' + '\n' f'Log: {log_content}')
raise HostConnectionError(
f'Failed to connect to {host} ({addr}): {repr(e)}' + '\n' f'Log: {log_content}', host, addr)
finally:
log_string.flush()
asyncssh_logger.removeHandler(ch)
@@ -148,7 +156,12 @@ async def _execute_command(self,
logger.debug(f'Connection to {host} failed. {str(e)}')
await self._reset_con(host)
self.mgr.offline_hosts.add(host)
raise OrchestratorError(f'Unable to reach remote host {host}. {str(e)}')
if not addr:
try:
addr = self.mgr.inventory.get_addr(host)
except Exception:
addr = host
raise HostConnectionError(f'Unable to reach remote host {host}. {str(e)}', host, addr)

def _rstrip(v: Union[bytes, str, None]) -> str:
if not v:
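For readers skimming the hunk above, a self-contained sketch of the new exception's shape: it keeps the human-readable message but also carries the hostname and address, so callers such as check_host and the upgrade loop can build targeted output. The plain Exception base here is only a stand-in for OrchestratorError.

# Self-contained sketch of the HostConnectionError introduced in ssh.py.
class OrchestratorErrorStub(Exception):  # stand-in for orchestrator.OrchestratorError
    pass

class HostConnectionError(OrchestratorErrorStub):
    def __init__(self, message: str, hostname: str, addr: str) -> None:
        super().__init__(message)
        self.hostname = hostname
        self.addr = addr

# Callers can now report both which host failed and which address was tried.
try:
    raise HostConnectionError("Unable to reach remote host host1. timed out", "host1", "10.0.0.5")
except HostConnectionError as e:
    print(f"Failed to connect to {e.hostname} at address ({e.addr}): {e}")
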
2 changes: 1 addition & 1 deletion src/pybind/mgr/cephadm/tests/test_ssh.py
@@ -38,7 +38,7 @@ def test_offline(self, check_execute_command, execute_command, cephadm_module):
asyncssh_connect.side_effect = ConnectionLost('reason')
code, out, err = cephadm_module.check_host('test')
assert out == ''
assert "Host 'test' not found" in err
assert "Failed to connect to test at address (1::4)" in err

out = wait(cephadm_module, cephadm_module.get_hosts())[0].to_json()
assert out == HostSpec('test', '1::4', status='Offline').to_json()
48 changes: 47 additions & 1 deletion src/pybind/mgr/cephadm/tests/test_upgrade.py
@@ -5,7 +5,8 @@

from ceph.deployment.service_spec import PlacementSpec, ServiceSpec
from cephadm import CephadmOrchestrator
from cephadm.upgrade import CephadmUpgrade
from cephadm.upgrade import CephadmUpgrade, UpgradeState
from cephadm.ssh import HostConnectionError
from orchestrator import OrchestratorError, DaemonDescription
from .fixtures import _run_cephadm, wait, with_host, with_service, \
receive_agent_metadata, async_side_effect
@@ -34,6 +35,51 @@ def test_upgrade_start(cephadm_module: CephadmOrchestrator):
) == 'Stopped upgrade to image_id'


@mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('{}'))
def test_upgrade_start_offline_hosts(cephadm_module: CephadmOrchestrator):
with with_host(cephadm_module, 'test'):
with with_host(cephadm_module, 'test2'):
cephadm_module.offline_hosts = set(['test2'])
with pytest.raises(OrchestratorError, match=r"Upgrade aborted - Some host\(s\) are currently offline: {'test2'}"):
cephadm_module.upgrade_start('image_id', None)
cephadm_module.offline_hosts = set([]) # so remove_host doesn't fail when leaving the with_host block


@mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('{}'))
def test_upgrade_daemons_offline_hosts(cephadm_module: CephadmOrchestrator):
with with_host(cephadm_module, 'test'):
with with_host(cephadm_module, 'test2'):
cephadm_module.upgrade.upgrade_state = UpgradeState('target_image', 0)
with mock.patch("cephadm.serve.CephadmServe._run_cephadm", side_effect=HostConnectionError('connection failure reason', 'test2', '192.168.122.1')):
_to_upgrade = [(DaemonDescription(daemon_type='crash', daemon_id='test2', hostname='test2'), True)]
with pytest.raises(HostConnectionError, match=r"connection failure reason"):
cephadm_module.upgrade._upgrade_daemons(_to_upgrade, 'target_image', ['digest1'])


@mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('{}'))
def test_do_upgrade_offline_hosts(cephadm_module: CephadmOrchestrator):
with with_host(cephadm_module, 'test'):
with with_host(cephadm_module, 'test2'):
cephadm_module.upgrade.upgrade_state = UpgradeState('target_image', 0)
cephadm_module.offline_hosts = set(['test2'])
with pytest.raises(HostConnectionError, match=r"Host\(s\) were marked offline: {'test2'}"):
cephadm_module.upgrade._do_upgrade()
cephadm_module.offline_hosts = set([]) # so remove_host doesn't fail when leaving the with_host block


@mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('{}'))
@mock.patch("cephadm.module.CephadmOrchestrator.remove_health_warning")
def test_upgrade_resume_clear_health_warnings(_rm_health_warning, cephadm_module: CephadmOrchestrator):
with with_host(cephadm_module, 'test'):
with with_host(cephadm_module, 'test2'):
cephadm_module.upgrade.upgrade_state = UpgradeState('target_image', 0, paused=True)
_rm_health_warning.return_value = None
assert wait(cephadm_module, cephadm_module.upgrade_resume()
) == 'Resumed upgrade to target_image'
calls_list = [mock.call(alert_id) for alert_id in cephadm_module.upgrade.UPGRADE_ERRORS]
_rm_health_warning.assert_has_calls(calls_list, any_order=True)


@mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('{}'))
@pytest.mark.parametrize("use_repo_digest",
[
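The resume test above leans on unittest.mock's assert_has_calls with any_order=True; a compact, standalone illustration of that idiom (with an abbreviated, made-up alert list) for anyone adapting these tests:

from unittest import mock

# Standalone illustration of the assert_has_calls(any_order=True) idiom used above.
UPGRADE_ERRORS = ['UPGRADE_OFFLINE_HOST', 'UPGRADE_EXCEPTION']  # abbreviated for the example

rm_health_warning = mock.Mock(return_value=None)
for alert_id in reversed(UPGRADE_ERRORS):  # call order deliberately scrambled
    rm_health_warning(alert_id)

# any_order=True only checks that every expected call occurred, not their sequence.
expected = [mock.call(alert_id) for alert_id in UPGRADE_ERRORS]
rm_health_warning.assert_has_calls(expected, any_order=True)
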
26 changes: 25 additions & 1 deletion src/pybind/mgr/cephadm/upgrade.py
@@ -10,6 +10,7 @@
from cephadm.services.cephadmservice import CephadmDaemonDeploySpec
from cephadm.utils import ceph_release_to_major, name_to_config_section, CEPH_UPGRADE_ORDER, \
MONITORING_STACK_TYPES, CEPH_TYPES, GATEWAY_TYPES
from cephadm.ssh import HostConnectionError
from orchestrator import OrchestratorError, DaemonDescription, DaemonDescriptionStatus, daemon_type_to_service

if TYPE_CHECKING:
@@ -122,7 +123,8 @@ class CephadmUpgrade:
'UPGRADE_FAILED_PULL',
'UPGRADE_REDEPLOY_DAEMON',
'UPGRADE_BAD_TARGET_VERSION',
'UPGRADE_EXCEPTION'
'UPGRADE_EXCEPTION',
'UPGRADE_OFFLINE_HOST'
]

def __init__(self, mgr: "CephadmOrchestrator"):
@@ -467,6 +469,8 @@ def upgrade_resume(self) -> str:
self.mgr.log.info('Upgrade: Resumed upgrade to %s' % self.target_image)
self._save_upgrade_state()
self.mgr.event.set()
for alert_id in self.UPGRADE_ERRORS:
self.mgr.remove_health_warning(alert_id)
return 'Resumed upgrade to %s' % self.target_image

def upgrade_stop(self) -> str:
@@ -491,6 +495,14 @@ def continue_upgrade(self) -> bool:
if self.upgrade_state and not self.upgrade_state.paused:
try:
self._do_upgrade()
except HostConnectionError as e:
self._fail_upgrade('UPGRADE_OFFLINE_HOST', {
'severity': 'error',
'summary': f'Upgrade: Failed to connect to host {e.hostname} at addr ({e.addr})',
'count': 1,
'detail': [f'SSH connection failed to {e.hostname} at addr ({e.addr}): {str(e)}'],
})
return False
except Exception as e:
self._fail_upgrade('UPGRADE_EXCEPTION', {
'severity': 'error',
@@ -1025,6 +1037,18 @@ def _do_upgrade(self):
logger.debug('_do_upgrade no state, exiting')
return

if self.mgr.offline_hosts:
# offline host(s), on top of potential connection errors when trying to upgrade a daemon
# or pull an image, can cause issues where daemons are never ok to stop. Since evaluating
# whether or not that risk is present for any given offline host is a difficult problem,
# it's best to just fail the upgrade cleanly so the user can address the offline host(s).

# the HostConnectionError expects a hostname and addr, so let's just take
# one at random. It doesn't really matter which host we say we couldn't reach here.
hostname: str = list(self.mgr.offline_hosts)[0]
addr: str = self.mgr.inventory.get_addr(hostname)
raise HostConnectionError(f'Host(s) were marked offline: {self.mgr.offline_hosts}', hostname, addr)

target_image = self.target_image
target_id = self.upgrade_state.target_id
target_digests = self.upgrade_state.target_digests
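Putting the two upgrade.py hunks together, a rough sketch of the new error routing: _do_upgrade raises HostConnectionError when any host is marked offline, and continue_upgrade converts that into the dedicated UPGRADE_OFFLINE_HOST health warning instead of the generic UPGRADE_EXCEPTION path. fail_upgrade and the bare classes below are illustrative stand-ins, not the module's real implementations.

# Illustrative sketch (hypothetical helpers) of the offline-host error routing above.
class HostConnectionError(Exception):
    def __init__(self, message: str, hostname: str, addr: str) -> None:
        super().__init__(message)
        self.hostname, self.addr = hostname, addr

def fail_upgrade(alert_id: str, alert: dict) -> None:  # stand-in for CephadmUpgrade._fail_upgrade
    print(f"{alert_id}: {alert['summary']}")

def do_upgrade(offline_hosts: set, addr_of) -> None:  # stand-in for _do_upgrade
    if offline_hosts:
        hostname = sorted(offline_hosts)[0]  # any offline host will do for the report
        raise HostConnectionError(f"Host(s) were marked offline: {offline_hosts}", hostname, addr_of(hostname))

def continue_upgrade(offline_hosts: set, addr_of) -> bool:  # stand-in for continue_upgrade
    try:
        do_upgrade(offline_hosts, addr_of)
    except HostConnectionError as e:
        # Offline hosts get their own alert id so 'ceph health' points at the host, not a traceback.
        fail_upgrade('UPGRADE_OFFLINE_HOST', {
            'severity': 'error',
            'summary': f'Upgrade: Failed to connect to host {e.hostname} at addr ({e.addr})',
            'count': 1,
            'detail': [f'SSH connection failed to {e.hostname} at addr ({e.addr}): {e}'],
        })
        return False
    return True

# Usage: one offline host pauses the upgrade with a targeted health warning.
continue_upgrade({'host2'}, addr_of=lambda h: '10.0.0.7')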