Skip to content

Commit

Permalink
test: monitor thrasher wait until quorum
Browse files Browse the repository at this point in the history
With a 1 sec. delay we may sometimes fail to get the correct quorum
length, since the monitor has not been updated in time.
With the following fix, we will wait for quorum and check every few
seconds (3) until timeout (30).

Fixes: https://tracker.ceph.com/issues/52316
Signed-off-by: Nitzan Mordechai <nmordech@redhat.com>
(cherry picked from commit fbd10ba)
  • Loading branch information
NitzanMordhai committed May 29, 2023
1 parent d8c5d34 commit d52c07c
Showing 1 changed file with 37 additions and 3 deletions.
40 changes: 37 additions & 3 deletions qa/tasks/mon_thrash.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import json
import math
from teuthology import misc as teuthology
from teuthology.contextutil import safe_while
from tasks import ceph_manager
from tasks.cephfs.filesystem import MDSCluster
from tasks.thrasher import Thrasher
Expand Down Expand Up @@ -224,6 +225,25 @@ def max_killable(self):
else:
return m

def _wait_until_quorum(self, mon, size, timeout=300):
    """
    Poll one monitor's status until it reports a quorum of the given size.

    Note that this checks the *length* of the quorum list reported by
    ``mon``; it does not check that ``mon`` itself is a quorum member.

    :param mon: monitor id handed to ``self.manager.get_mon_status``
    :param size: number of quorum members to wait for
    :param timeout: approximate overall wait in seconds; it is converted
                    into a number of polls spaced ``poll_interval``
                    seconds apart
    :returns: the last mon status fetched, whose ``quorum`` list has
              ``size`` entries
    :raises: whatever ``safe_while`` raises when the tries run out
             (presumably teuthology's MaxWhileTries -- confirm); an
             exception from ``get_mon_status`` also propagates
    """
    self.log('waiting for quorum size %d for mon %s' % (size, mon))
    s = {}

    poll_interval = 3
    # A timeout shorter than one poll interval would floor-divide to 0
    # tries, which no longer means "poll at least once, then give up".
    # Always allow at least one bounded attempt.
    tries = max(1, timeout // poll_interval)

    with safe_while(sleep=poll_interval,
                    tries=tries,
                    action=f'wait for quorum size {size} on mon {mon}') as proceed:
        while proceed():
            s = self.manager.get_mon_status(mon)
            if len(s['quorum']) == size:
                break
            # Not there yet: record the size seen on this poll.
            self.log("quorum is size %d" % len(s['quorum']))

    # Only reached after the break above, so s holds the matching status.
    self.log("final quorum is size %d" % len(s['quorum']))
    return s

def do_thrash(self):
"""
_do_thrash() wrapper.
Expand Down Expand Up @@ -261,7 +281,11 @@ def _do_thrash(self):
self.manager.wait_for_mon_quorum_size(len(mons))
self.log('making sure all monitors are in the quorum')
for m in mons:
s = self.manager.get_mon_status(m)
try:
s = self._wait_until_quorum(m, len(mons), timeout=30)
except Exception as e:
self.log('mon.{m} is not in quorum size, exception: {e}'.format(m=m,e=e))
self.log('mon_status: {s}'.format(s=s))
assert s['state'] == 'leader' or s['state'] == 'peon'
assert len(s['quorum']) == len(mons)

Expand Down Expand Up @@ -300,7 +324,12 @@ def _do_thrash(self):
for m in mons:
if m in mons_to_kill:
continue
s = self.manager.get_mon_status(m)
try:
s = self._wait_until_quorum(m, len(mons)-len(mons_to_kill), timeout=30)
except Exception as e:
self.log('mon.{m} is not in quorum size, exception: {e}'.format(m=m,e=e))
self.log('mon_status: {s}'.format(s=s))

assert s['state'] == 'leader' or s['state'] == 'peon'
assert len(s['quorum']) == len(mons)-len(mons_to_kill)

Expand All @@ -322,7 +351,12 @@ def _do_thrash(self):

self.manager.wait_for_mon_quorum_size(len(mons))
for m in mons:
s = self.manager.get_mon_status(m)
try:
s = self._wait_until_quorum(m, len(mons), timeout=30)
except Exception as e:
self.log('mon.{m} is not in quorum size, exception: {e}'.format(m=m,e=e))
self.log('mon_status: {s}'.format(s=s))

assert s['state'] == 'leader' or s['state'] == 'peon'
assert len(s['quorum']) == len(mons)

Expand Down

0 comments on commit d52c07c

Please sign in to comment.