From d52c07ce9f75d6701d08a2d62629bc1fbc5ca3bf Mon Sep 17 00:00:00 2001 From: Nitzan Mordechai Date: Thu, 18 May 2023 13:37:38 +0000 Subject: [PATCH] test: monitor thrasher wait until quorum With a 1 sec. delay we may sometimes fail to get the correct quorum length, since the monitor was not updated in time. With the following fix, we wait for the quorum and re-check every 3 seconds until a 30-second timeout expires. Fixes: https://tracker.ceph.com/issues/52316 Signed-off-by: Nitzan Mordechai (cherry picked from commit fbd10badbfad71f208de6b48008a20963d375ae9) --- qa/tasks/mon_thrash.py | 40 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 37 insertions(+), 3 deletions(-) diff --git a/qa/tasks/mon_thrash.py b/qa/tasks/mon_thrash.py index 4224acf031950..30a7555b55a41 100644 --- a/qa/tasks/mon_thrash.py +++ b/qa/tasks/mon_thrash.py @@ -9,6 +9,7 @@ import json import math from teuthology import misc as teuthology +from teuthology.contextutil import safe_while from tasks import ceph_manager from tasks.cephfs.filesystem import MDSCluster from tasks.thrasher import Thrasher @@ -224,6 +225,25 @@ def max_killable(self): else: return m + def _wait_until_quorum(self, mon, size, timeout=300): + """ + Wait until the monitor specified is in the quorum. + """ + self.log('waiting for quorum size %d for mon %s' % (size, mon)) + s = {} + + with safe_while(sleep=3, + tries=timeout // 3, + action=f'wait for quorum size {size} on mon {mon}') as proceed: + while proceed(): + s = self.manager.get_mon_status(mon) + if len(s['quorum']) == size: + break + self.log("quorum is size %d" % len(s['quorum'])) + + self.log("final quorum is size %d" % len(s['quorum'])) + return s + def do_thrash(self): """ _do_thrash() wrapper. 
@@ -261,7 +281,11 @@ def _do_thrash(self): self.manager.wait_for_mon_quorum_size(len(mons)) self.log('making sure all monitors are in the quorum') for m in mons: - s = self.manager.get_mon_status(m) + try: + s = self._wait_until_quorum(m, len(mons), timeout=30) + except Exception as e: + self.log('mon.{m} is not in quorum size, exception: {e}'.format(m=m,e=e)) + self.log('mon_status: {s}'.format(s=s)) assert s['state'] == 'leader' or s['state'] == 'peon' assert len(s['quorum']) == len(mons) @@ -300,7 +324,12 @@ def _do_thrash(self): for m in mons: if m in mons_to_kill: continue - s = self.manager.get_mon_status(m) + try: + s = self._wait_until_quorum(m, len(mons)-len(mons_to_kill), timeout=30) + except Exception as e: + self.log('mon.{m} is not in quorum size, exception: {e}'.format(m=m,e=e)) + self.log('mon_status: {s}'.format(s=s)) + assert s['state'] == 'leader' or s['state'] == 'peon' assert len(s['quorum']) == len(mons)-len(mons_to_kill) @@ -322,7 +351,12 @@ def _do_thrash(self): self.manager.wait_for_mon_quorum_size(len(mons)) for m in mons: - s = self.manager.get_mon_status(m) + try: + s = self._wait_until_quorum(m, len(mons), timeout=30) + except Exception as e: + self.log('mon.{m} is not in quorum size, exception: {e}'.format(m=m,e=e)) + self.log('mon_status: {s}'.format(s=s)) + assert s['state'] == 'leader' or s['state'] == 'peon' assert len(s['quorum']) == len(mons)