cephadm/services/cephadmservice: shutdown monitors before removing them #45511

Closed
48 changes: 25 additions & 23 deletions qa/tasks/cephadm_cases/test_cli_mon.py
@@ -1,14 +1,14 @@
import json
import logging

import time
from tasks.mgr.mgr_test_case import MgrTestCase

log = logging.getLogger(__name__)


class TestCephadmCLI(MgrTestCase):

APPLY_MON_PERIOD = 60
APPLY_MON_PERIOD = 60 # Shouldn't be this long, but just in case.

def _cmd(self, *args) -> str:
assert self.mgr_cluster is not None
@@ -20,13 +20,6 @@ def _orch_cmd(self, *args) -> str:
def setUp(self):
super(TestCephadmCLI, self).setUp()

def _create_and_write_pool(self, pool_name):
# Create new pool and write to it, simulating a small workload.
self.mgr_cluster.mon_manager.create_pool(pool_name)
args = [
"rados", "-p", pool_name, "bench", "30", "write", "-t", "16"]
self.mgr_cluster.admin_remote.run(args=args, wait=True)

def _get_quorum_size(self) -> int:
# Evaluate if the quorum size of the cluster is correct.
# log the quorum_status before reducing the monitors
@@ -43,29 +36,38 @@ def _check_no_crashes(self):
)
log.info("test_apply_mon._check_no_crashes: %s" % retstr)
self.assertEqual(0, len(retstr)) # check if there are no crashes

def _apply_mon_write_pool_and_check(self, num, pool_name):
# Increase/Decrease the monitors to <num> by exercising the orch apply mon command.
# Write to <pool_name> to simulate real workloads during the apply command.
# Check for crashes and unhealthy cluster states.

def test_apply_mon_three(self):
# Evaluating the process of reducing the number of
# monitors from 5 to 3 and increasing the number of
# monitors from 3 to 5, using the `ceph orch apply mon <num>` command.
self._orch_cmd('apply', 'mon', str(num))

self.wait_until_equal(lambda : self._get_quorum_size(), 5,
timeout=self.APPLY_MON_PERIOD, period=10)

self._orch_cmd('apply', 'mon', '3') # reduce the monitors from 5 -> 3
args = ["rados", "-p", pool_name, "bench", "60", "write", "-t", "16"]

self._create_and_write_pool('test_pool1')
self.mgr_cluster.admin_remote.run(args=args, wait=True) # Write to pool.

self.wait_until_equal(lambda : self._get_quorum_size(), 3,
self.wait_until_equal(lambda : self._get_quorum_size(), num,
timeout=self.APPLY_MON_PERIOD, period=10)

self._check_no_crashes()
self._check_no_crashes() # Check for any crashes.

time.sleep(60) # Buffer for fairness, in case any unhealthy status needs time to appear.

self._orch_cmd('apply', 'mon', '5') # increase the monitors from 3 -> 5
self.wait_for_health_clear(120) # Make sure there is no unhealthy state, e.g., Stray Daemon

self._create_and_write_pool('test_pool2')
def test_decrease_increase_mon(self):
# Evaluating the process of reducing the number of
# monitors from 5 to 3 and increasing the number of
# monitors from 3 to 5, using the `ceph orch apply mon <num>` command.

# Make sure we have 5 MONs at the beginning.
self.wait_until_equal(lambda : self._get_quorum_size(), 5,
timeout=self.APPLY_MON_PERIOD, period=10)

self._check_no_crashes()
# Create 1 pool to be used as part of the test.
self.mgr_cluster.mon_manager.create_pool("test_pool1")

self._apply_mon_write_pool_and_check(3, "test_pool1") # Reduce MON and check.
self._apply_mon_write_pool_and_check(5, "test_pool1") # Increase MON and check.
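
Read together (the hunks above interleave removed and added lines), the new helper introduced by this change boils down to roughly the following. This is only a consolidated sketch for readability; `_orch_cmd`, `_get_quorum_size`, `_check_no_crashes`, `wait_until_equal` and `wait_for_health_clear` are the MgrTestCase/test helpers already shown in the diff.

```python
def _apply_mon_write_pool_and_check(self, num, pool_name):
    # Scale the monitors to <num> via `ceph orch apply mon <num>`.
    self._orch_cmd('apply', 'mon', str(num))

    # Write to <pool_name> to simulate a real workload during the apply.
    args = ["rados", "-p", pool_name, "bench", "60", "write", "-t", "16"]
    self.mgr_cluster.admin_remote.run(args=args, wait=True)

    # Wait until the quorum reaches the requested size.
    self.wait_until_equal(lambda: self._get_quorum_size(), num,
                          timeout=self.APPLY_MON_PERIOD, period=10)

    self._check_no_crashes()

    # Give any delayed unhealthy status time to surface, then require
    # the cluster to report healthy (e.g. no stray-daemon warning).
    time.sleep(60)
    self.wait_for_health_clear(120)
```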
24 changes: 24 additions & 0 deletions src/cephadm/cephadm.py
@@ -7091,6 +7091,7 @@ def _stop_and_disable(ctx, unit_name):
##################################


@infer_config
def command_rm_daemon(ctx):
# type: (CephadmContext) -> None
lock = FileLock(ctx, ctx.fsid)
@@ -7105,6 +7106,29 @@ def command_rm_daemon(ctx):

call(ctx, ['systemctl', 'stop', unit_name],
verbosity=CallVerbosity.DEBUG)

# remove the mon from the cluster after we stop the mon.
# It is necessary to do this strictly in order here because
# the monitor elector code (and possibly other parts)
# assumes that the monitor will be unreachable (stopped)
# before it is removed. This is the correct procedure according to:
# https://docs.ceph.com/en/latest/rados/operations/add-or-rm-mons/#removing-a-monitor-manual
if daemon_type == 'mon':
mon_key_path = os.path.join(get_data_dir(ctx.fsid, ctx.data_dir, 'mon', daemon_id), 'keyring')
c = CephContainer(
ctx,
image=ctx.image,
entrypoint='/usr/bin/ceph',
args=['--name', 'mon.', 'mon', 'remove', daemon_id],
volume_mounts={
mon_key_path: '/etc/ceph/ceph.keyring:z',
ctx.config: '/etc/ceph/ceph.conf:z',
},
)
_, err, ret = call_throws(ctx, c.run_cmd(), timeout=ctx.timeout if ctx.timeout else 60)
if ret:
raise Error(f'Failed removing monitor: {err}')

call(ctx, ['systemctl', 'reset-failed', unit_name],
verbosity=CallVerbosity.DEBUG)
call(ctx, ['systemctl', 'disable', unit_name],
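
For context, the ordering the new `command_rm_daemon` code enforces for mon daemons mirrors the manual removal procedure in the linked docs: stop the monitor first, then remove it from the monmap. A minimal standalone sketch of that order (the systemd unit name below is an assumption about cephadm's naming, used only to illustrate the sequence, not code from this patch):

```python
import subprocess

def remove_mon_in_documented_order(fsid: str, daemon_id: str) -> None:
    # 1. Stop the monitor so it is unreachable and drops out of quorum.
    #    Assumed cephadm-style unit name, for illustration only.
    subprocess.run(['systemctl', 'stop', f'ceph-{fsid}@mon.{daemon_id}'],
                   check=True)

    # 2. Only after the daemon is stopped, remove it from the monmap
    #    with the documented `ceph mon remove <id>` command.
    subprocess.run(['ceph', 'mon', 'remove', daemon_id], check=True)
```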
7 changes: 0 additions & 7 deletions src/pybind/mgr/cephadm/services/cephadmservice.py
@@ -624,13 +624,6 @@ def pre_remove(self, daemon: DaemonDescription) -> None:
daemon_id: str = daemon.daemon_id
self._check_safe_to_destroy(daemon_id)

# remove mon from quorum before we destroy the daemon
logger.info('Removing monitor %s from monmap...' % daemon_id)
ret, out, err = self.mgr.check_mon_command({
'prefix': 'mon rm',

I'm still getting health warnings whenever I scale down mons at all with this patch.

[ceph: root@vm-00 /]# ceph orch ls 
NAME                       PORTS        RUNNING  REFRESHED  AGE  PLACEMENT  
alertmanager               ?:9093,9094      1/1  47s ago    6m   count:1    
crash                                       3/3  49s ago    6m   *          
grafana                    ?:3000           1/1  47s ago    6m   count:1    
mgr                                         2/2  49s ago    7m   count:2    
mon                                         3/5  49s ago    7m   count:5    
node-exporter              ?:9100           3/3  49s ago    6m   *          
osd.all-available-devices                    12  49s ago    4m   *          
prometheus                 ?:9095           1/1  49s ago    6m   count:1    
[ceph: root@vm-00 /]# ceph orch ps
NAME                 HOST   PORTS        STATUS          REFRESHED   AGE  MEM USE  MEM LIM  VERSION                 IMAGE ID      CONTAINER ID  
alertmanager.vm-00   vm-00  *:9093,9094  running (70s)     51s ago    6m    15.9M        -                          ba2b418f427c  9f6e62f1c9d1  
crash.vm-00          vm-00               running (6m)      51s ago    6m    6979k        -  17.0.0-13734-g96abb011  f4b280ba1496  966c1dfc7d74  
crash.vm-01          vm-01               running (4m)      53s ago    4m    7159k        -  17.0.0-13734-g96abb011  f4b280ba1496  c6b54137989f  
crash.vm-02          vm-02               running (4m)      53s ago    4m    7180k        -  17.0.0-13734-g96abb011  f4b280ba1496  b9d9d21e028e  
grafana.vm-00        vm-00  *:3000       running (66s)     51s ago    6m    40.8M        -  8.3.5                   dad864ee21e9  3b91c5d4cc78  
mgr.vm-00.bdwsut     vm-00  *:9283       running (7m)      51s ago    7m     479M        -  17.0.0-13734-g96abb011  f4b280ba1496  06aec7958775  
mgr.vm-02.haworo     vm-02  *:8443,9283  running (4m)      53s ago    4m     428M        -  17.0.0-13734-g96abb011  f4b280ba1496  5fd8cceb7c0d  
mon.vm-00            vm-00               running (7m)      51s ago    7m    56.5M    2048M  17.0.0-13734-g96abb011  f4b280ba1496  7d09db9b0721  
mon.vm-01            vm-01               running (4m)      53s ago    4m    48.0M    2048M  17.0.0-13734-g96abb011  f4b280ba1496  ea18ee80f985  
mon.vm-02            vm-02               running (4m)      53s ago    4m    52.6M    2048M  17.0.0-13734-g96abb011  f4b280ba1496  30934b1e44e1  
node-exporter.vm-00  vm-00  *:9100       running (6m)      51s ago    6m    19.5M        -                          1dbe0e931976  ebdb46f6cffe  
node-exporter.vm-01  vm-01  *:9100       running (4m)      53s ago    4m    24.0M        -                          1dbe0e931976  2bc17a3379c4  
node-exporter.vm-02  vm-02  *:9100       running (4m)      53s ago    4m    22.8M        -                          1dbe0e931976  1838354f8634  
osd.0                vm-01               running (3m)      53s ago    3m    71.6M    6203M  17.0.0-13734-g96abb011  f4b280ba1496  45d1ec388ca2  
osd.1                vm-02               running (3m)      53s ago    3m    71.9M    5435M  17.0.0-13734-g96abb011  f4b280ba1496  e8829f51624e  
osd.2                vm-00               running (2m)      51s ago    2m    68.1M    4923M  17.0.0-13734-g96abb011  f4b280ba1496  6340cd1be0d5  
osd.3                vm-01               running (2m)      53s ago    2m    70.8M    6203M  17.0.0-13734-g96abb011  f4b280ba1496  ad77363d8a7f  
osd.4                vm-02               running (2m)      53s ago    2m    68.7M    5435M  17.0.0-13734-g96abb011  f4b280ba1496  9219b47952b9  
osd.5                vm-00               running (2m)      51s ago    2m    70.2M    4923M  17.0.0-13734-g96abb011  f4b280ba1496  14d5661c5bab  
osd.6                vm-02               running (110s)    53s ago    2m    68.0M    5435M  17.0.0-13734-g96abb011  f4b280ba1496  427652c89872  
osd.7                vm-02               running (88s)     53s ago  100s    69.0M    5435M  17.0.0-13734-g96abb011  f4b280ba1496  f41b405ce4ca  
osd.8                vm-01               running (2m)      53s ago    2m    68.0M    6203M  17.0.0-13734-g96abb011  f4b280ba1496  3662aca0f34a  
osd.9                vm-00               running (2m)      51s ago    2m    67.4M    4923M  17.0.0-13734-g96abb011  f4b280ba1496  a3c94dc82ebf  
osd.10               vm-01               running (2m)      53s ago    2m    67.2M    6203M  17.0.0-13734-g96abb011  f4b280ba1496  30a0e569b975  
osd.11               vm-00               running (3m)      51s ago    3m    72.0M    4923M  17.0.0-13734-g96abb011  f4b280ba1496  fc1ab3e586c3  
prometheus.vm-01     vm-01  *:9095       running (4m)      53s ago    4m    59.5M        -                          514e6a882f6e  640fcf0a4ee7  
[ceph: root@vm-00 /]# ceph health detail
HEALTH_OK
[ceph: root@vm-00 /]# ceph orch apply mon 2
Scheduled mon update...
[ceph: root@vm-00 /]# ceph orch ls
NAME                       PORTS        RUNNING  REFRESHED  AGE   PLACEMENT  
alertmanager               ?:9093,9094      1/1  97s ago    9m    count:1    
crash                                       3/3  97s ago    9m    *          
grafana                    ?:3000           1/1  97s ago    9m    count:1    
mgr                                         2/2  97s ago    9m    count:2    
mon                                         2/2  97s ago    103s  count:2    
node-exporter              ?:9100           3/3  97s ago    9m    *          
osd.all-available-devices                    12  97s ago    7m    *          
prometheus                 ?:9095           1/1  32s ago    9m    count:1    
[ceph: root@vm-00 /]# ceph orch ps
NAME                 HOST   PORTS        STATUS        REFRESHED  AGE  MEM USE  MEM LIM  VERSION                 IMAGE ID      CONTAINER ID  
alertmanager.vm-00   vm-00  *:9093,9094  running (2m)     9s ago   7m    23.1M        -                          ba2b418f427c  9f6e62f1c9d1  
crash.vm-00          vm-00               running (7m)     9s ago   7m    6979k        -  17.0.0-13734-g96abb011  f4b280ba1496  966c1dfc7d74  
crash.vm-01          vm-01               running (5m)    11s ago   5m    7159k        -  17.0.0-13734-g96abb011  f4b280ba1496  c6b54137989f  
crash.vm-02          vm-02               running (5m)    11s ago   5m    7180k        -  17.0.0-13734-g96abb011  f4b280ba1496  b9d9d21e028e  
grafana.vm-00        vm-00  *:3000       running (2m)     9s ago   7m    44.9M        -  8.3.5                   dad864ee21e9  3b91c5d4cc78  
mgr.vm-00.bdwsut     vm-00  *:9283       running (8m)     9s ago   8m     487M        -  17.0.0-13734-g96abb011  f4b280ba1496  06aec7958775  
mgr.vm-02.haworo     vm-02  *:8443,9283  running (5m)    11s ago   5m     428M        -  17.0.0-13734-g96abb011  f4b280ba1496  5fd8cceb7c0d  
mon.vm-00            vm-00               running (8m)     9s ago   8m    59.7M    2048M  17.0.0-13734-g96abb011  f4b280ba1496  7d09db9b0721  
mon.vm-01            vm-01               running (5m)    11s ago   5m    50.7M    2048M  17.0.0-13734-g96abb011  f4b280ba1496  ea18ee80f985  
node-exporter.vm-00  vm-00  *:9100       running (7m)     9s ago   7m    19.6M        -                          1dbe0e931976  ebdb46f6cffe  
node-exporter.vm-01  vm-01  *:9100       running (5m)    11s ago   5m    21.4M        -                          1dbe0e931976  2bc17a3379c4  
node-exporter.vm-02  vm-02  *:9100       running (5m)    11s ago   5m    24.5M        -                          1dbe0e931976  1838354f8634  
osd.0                vm-01               running (4m)    11s ago   4m    72.9M    6203M  17.0.0-13734-g96abb011  f4b280ba1496  45d1ec388ca2  
osd.1                vm-02               running (4m)    11s ago   4m    73.2M    5435M  17.0.0-13734-g96abb011  f4b280ba1496  e8829f51624e  
osd.2                vm-00               running (3m)     9s ago   3m    69.0M    4923M  17.0.0-13734-g96abb011  f4b280ba1496  6340cd1be0d5  
osd.3                vm-01               running (3m)    11s ago   3m    71.7M    6203M  17.0.0-13734-g96abb011  f4b280ba1496  ad77363d8a7f  
osd.4                vm-02               running (3m)    11s ago   3m    69.8M    5435M  17.0.0-13734-g96abb011  f4b280ba1496  9219b47952b9  
osd.5                vm-00               running (3m)     9s ago   3m    71.1M    4923M  17.0.0-13734-g96abb011  f4b280ba1496  14d5661c5bab  
osd.6                vm-02               running (2m)    11s ago   3m    68.9M    5435M  17.0.0-13734-g96abb011  f4b280ba1496  427652c89872  
osd.7                vm-02               running (2m)    11s ago   2m    70.0M    5435M  17.0.0-13734-g96abb011  f4b280ba1496  f41b405ce4ca  
osd.8                vm-01               running (3m)    11s ago   3m    69.0M    6203M  17.0.0-13734-g96abb011  f4b280ba1496  3662aca0f34a  
osd.9                vm-00               running (3m)     9s ago   3m    68.4M    4923M  17.0.0-13734-g96abb011  f4b280ba1496  a3c94dc82ebf  
osd.10               vm-01               running (3m)    11s ago   3m    68.5M    6203M  17.0.0-13734-g96abb011  f4b280ba1496  30a0e569b975  
osd.11               vm-00               running (4m)     9s ago   4m    73.4M    4923M  17.0.0-13734-g96abb011  f4b280ba1496  fc1ab3e586c3  
prometheus.vm-01     vm-01  *:9095       running (5m)    11s ago   5m    64.2M        -                          514e6a882f6e  640fcf0a4ee7  
[ceph: root@vm-00 /]# ceph health detail
HEALTH_WARN 1/3 mons down, quorum vm-00,vm-01
[WRN] MON_DOWN: 1/3 mons down, quorum vm-00,vm-01
    mon.vm-02 (rank 1) addr [v2:192.168.122.121:3300/0,v1:192.168.122.121:6789/0] is down (out of quorum)

@kamoltat is it just me getting this? When you scale down, you don't get any warning?

'name': daemon_id,
})

def post_remove(self, daemon: DaemonDescription, is_failed_deploy: bool) -> None:
# Do not remove the mon keyring.
# super().post_remove(daemon)
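
With the `mon rm` call dropped here, the mgr-side `pre_remove` is left with only the safety check; the monmap removal now happens host-side in `command_rm_daemon`, after the unit has been stopped. Roughly, based on the remaining lines of this hunk:

```python
def pre_remove(self, daemon: DaemonDescription) -> None:
    daemon_id: str = daemon.daemon_id
    # Only verify the mon is safe to destroy; the actual `mon remove`
    # is issued by cephadm's command_rm_daemon after `systemctl stop`.
    self._check_safe_to_destroy(daemon_id)
```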