qa: disable mon-health-to-clog in upgrade test #19233

Merged
@@ -10,6 +10,7 @@ tasks:
- ceph.restart:
    daemons: [mon.a, mon.b, mon.c]
    wait-for-healthy: false
    mon-health-to-clog: false
- ceph.restart:
    daemons: [osd.0, osd.1, osd.2]
    wait-for-healthy: false
@@ -5,6 +5,7 @@ upgrade-sequence:
  sequential:
  - ceph.restart:
      daemons: [mon.a, mon.b, mon.c, mgr.x]
      mon-health-to-clog: false
  - ceph.restart:
      daemons: [osd.0, osd.1, osd.2, osd.3]
      wait-for-healthy: false
@@ -15,6 +15,7 @@ upgrade-sequence:
  - ceph.restart:
      daemons: [mon.b, mon.c, mgr.x]
      wait-for-healthy: true
      mon-health-to-clog: false
  - sleep:
      duration: 60
  - ceph.restart:
@@ -9,4 +9,5 @@ tasks:
- print: "**** done install.upgrade osd.0"
- ceph.restart:
daemons: [mon.a,mon.b,mon.c,mgr.x,osd.0,osd.1,osd.2]
mon-health-to-clog: false
- print: "**** done ceph.restart 1st half"
45 changes: 40 additions & 5 deletions qa/tasks/ceph.py
@@ -1344,6 +1344,39 @@ def created_pool(ctx, config):
new_pool, 'pg_num')


@contextlib.contextmanager
def tweaked_option(ctx, config):
"""
set an option, and then restore it with its original value

Note, due to the way how tasks are executed/nested, it's not suggested to
use this method as a standalone task. otherwise, it's likely that it will
restore the tweaked option at the /end/ of 'tasks' block.
"""
    saved_options = {}
    # we can complicate this when necessary
    options = ['mon-health-to-clog']
    type_, id_ = 'mon', '*'
    cluster = config.get('cluster', 'ceph')
    manager = ctx.managers[cluster]
    if id_ == '*':
        get_from = next(teuthology.all_roles_of_type(ctx.cluster, type_))
    else:
        get_from = id_
    for option in options:
        if option not in config:
            continue
        value = 'true' if config[option] else 'false'
        option = option.replace('-', '_')
        old_value = manager.get_config(type_, get_from, option)
        if value != old_value:
            saved_options[option] = old_value
            manager.inject_args(type_, id_, option, value)
    yield
    for option, value in saved_options.items():
        manager.inject_args(type_, id_, option, value)


@contextlib.contextmanager
def restart(ctx, config):
"""
@@ -1375,12 +1408,14 @@ def restart(ctx, config):

    daemons = ctx.daemons.resolve_role_list(config.get('daemons', None), CEPH_ROLE_TYPES, True)
    clusters = set()
    for role in daemons:
        cluster, type_, id_ = teuthology.split_role(role)
        ctx.daemons.get_daemon(type_, id_, cluster).restart()
        clusters.add(cluster)

    manager = ctx.managers['ceph']

    with tweaked_option(ctx, config):
        for role in daemons:
            cluster, type_, id_ = teuthology.split_role(role)
            ctx.daemons.get_daemon(type_, id_, cluster).restart()
            clusters.add(cluster)

    for dmon in daemons:
        if '.' in dmon:
            dm_parts = dmon.split('.')
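A rough usage sketch of the new context manager (it restates what the restart() change above already does, assuming a normal teuthology run where ctx, teuthology and CEPH_ROLE_TYPES are available in qa/tasks/ceph.py): the option is injected on mon.* before the daemons are bounced, and the saved value is injected back when the block exits. Note that the yield is not wrapped in try/finally, so an exception raised while restarting would skip the restore step.

# Sketch only, mirroring the restart() change above; not new API.
config = {'daemons': ['mon.a', 'mon.b', 'mon.c'], 'mon-health-to-clog': False}
with tweaked_option(ctx, config):
    # inside the block, mon_health_to_clog=false has been injected on mon.*
    for role in ctx.daemons.resolve_role_list(config['daemons'],
                                              CEPH_ROLE_TYPES, True):
        cluster, type_, id_ = teuthology.split_role(role)
        ctx.daemons.get_daemon(type_, id_, cluster).restart()
# on exit, the previously saved value (typically 'true') is injected back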
64 changes: 33 additions & 31 deletions qa/tasks/ceph_manager.py
@@ -153,7 +153,7 @@ def tmp(x):
first_mon[1],
opt)
self.saved_options.append((service, opt, old_value))
self._set_config(service, '*', opt, new_value)
manager.inject_args(service, '*', opt, new_value)
# initialize ceph_objectstore_tool property - must be done before
# do_thrash is spawned - http://tracker.ceph.com/issues/18799
if (self.config.get('powercycle') or
@@ -180,13 +180,6 @@ def tmp(x):
if self.noscrub_toggle_delay:
self.noscrub_toggle_thread = gevent.spawn(self.do_noscrub_toggle)

def _set_config(self, service_type, service_id, name, value):
opt_arg = '--{name} {value}'.format(name=name, value=value)
whom = '.'.join([service_type, service_id])
self.ceph_manager.raw_cluster_cmd('--', 'tell', whom,
'injectargs', opt_arg)


def cmd_exists_on_osds(self, cmd):
allremotes = self.ceph_manager.ctx.cluster.only(\
teuthology.is_type('osd', self.cluster)).remotes.keys()
@@ -392,10 +385,12 @@ def revive_osd(self, osd=None, skip_admin_check=False):
self.dead_osds.remove(osd)
self.live_osds.append(osd)
if self.random_eio > 0 and osd is self.rerrosd:
self.ceph_manager.raw_cluster_cmd('tell', 'osd.'+str(self.rerrosd),
'injectargs', '--', '--filestore_debug_random_read_err='+str(self.random_eio))
self.ceph_manager.raw_cluster_cmd('tell', 'osd.'+str(self.rerrosd),
'injectargs', '--', '--bluestore_debug_random_read_err='+str(self.random_eio))
self.ceph_manager.inject_args('osd', self.rerrosd,
'filestore_debug_random_read_err',
self.random_eio)
self.ceph_manager.inject_args('osd', self.rerrosd,
'bluestore_debug_random_read_err',
self.random_eio)


def out_osd(self, osd=None):
@@ -893,8 +888,9 @@ def do_optrack_toggle(self):
osd_state = "false"
else:
osd_state = "true"
self.ceph_manager.raw_cluster_cmd_result('tell', 'osd.*',
'injectargs', '--osd_enable_op_tracker=%s' % osd_state)
self.ceph_manager.inject_args('osd', '*',
'osd_enable_op_tracker',
osd_state)
gevent.sleep(delay)

@log_exc
@@ -952,10 +948,12 @@ def do_thrash(self):
delay = self.config.get("op_delay", 5)
self.rerrosd = self.live_osds[0]
if self.random_eio > 0:
self.ceph_manager.raw_cluster_cmd('tell', 'osd.'+str(self.rerrosd),
'injectargs', '--', '--filestore_debug_random_read_err='+str(self.random_eio))
self.ceph_manager.raw_cluster_cmd('tell', 'osd.'+str(self.rerrosd),
'injectargs', '--', '--bluestore_debug_random_read_err='+str(self.random_eio))
self.ceph_manager.inject_args('osd', self.rerrosd,
'filestore_debug_random_read_err',
self.random_eio)
self.ceph_manager.inject_args('osd', self.rerrosd,
'bluestore_debug_random_read_err',
self.random_eio)
self.log("starting do_thrash")
while not self.stopping:
to_log = [str(x) for x in ["in_osds: ", self.in_osds,
@@ -985,16 +983,16 @@ def do_thrash(self):
time.sleep(delay)
self.all_up()
if self.random_eio > 0:
self.ceph_manager.raw_cluster_cmd('tell', 'osd.'+str(self.rerrosd),
'injectargs', '--', '--filestore_debug_random_read_err=0.0')
self.ceph_manager.raw_cluster_cmd('tell', 'osd.'+str(self.rerrosd),
'injectargs', '--', '--bluestore_debug_random_read_err=0.0')
self.ceph_manager.inject_args('osd', self.rerrosd,
'filestore_debug_random_read_err', '0.0')
self.ceph_manager.inject_args('osd', self.rerrosd,
'bluestore_debug_random_read_err', '0.0')
for pool in list(self.pools_to_fix_pgp_num):
if self.ceph_manager.get_pool_pg_num(pool) > 0:
self.fix_pgp_num(pool)
self.pools_to_fix_pgp_num.clear()
for service, opt, saved_value in self.saved_options:
self._set_config(service, '*', opt, saved_value)
self.ceph_manager.inject_args(service, '*', opt, saved_value)
self.saved_options = []
self.all_up_in()

@@ -1481,6 +1479,13 @@ def get_config(self, service_type, service_id, name):
        j = json.loads(proc.stdout.getvalue())
        return j[name]

    def inject_args(self, service_type, service_id, name, value):
        whom = '{0}.{1}'.format(service_type, service_id)
        if isinstance(value, bool):
            value = 'true' if value else 'false'
        opt_arg = '--{name}={value}'.format(name=name, value=value)
        self.raw_cluster_cmd('--', 'tell', whom, 'injectargs', opt_arg)

    def set_config(self, osdnum, **argdict):
        """
        :param osdnum: osd number
@@ -2314,11 +2319,9 @@ def kill_osd(self, osd):
remote.console.power_off()
elif self.config.get('bdev_inject_crash') and self.config.get('bdev_inject_crash_probability'):
if random.uniform(0, 1) < self.config.get('bdev_inject_crash_probability', .5):
self.raw_cluster_cmd(
'--', 'tell', 'osd.%d' % osd,
'injectargs',
'--bdev-inject-crash %d' % self.config.get('bdev_inject_crash'),
)
self.inject_args(
'osd', osd,
'bdev-inject-crash', self.config.get('bdev_inject_crash'))
try:
self.ctx.daemons.get_daemon('osd', osd, self.cluster).wait()
except:
@@ -2340,9 +2343,8 @@ def blackhole_kill_osd(self, osd):
"""
Stop osd if nothing else works.
"""
self.raw_cluster_cmd('--', 'tell', 'osd.%d' % osd,
'injectargs',
'--objectstore-blackhole')
self.inject_args('osd', osd,
'objectstore-blackhole', True)
time.sleep(2)
self.ctx.daemons.get_daemon('osd', osd, self.cluster).stop()

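To summarize the ceph_manager.py refactor: the "tell ... injectargs" invocations previously built by hand with raw_cluster_cmd() now go through the new CephManager.inject_args() helper, which formats the option argument and converts booleans to 'true'/'false'. A couple of illustrative calls (option values made up for the example), given a CephManager instance named manager:

# Illustrative only: example calls to the new helper.
manager.inject_args('mon', '*', 'mon_health_to_clog', False)
# roughly equivalent to: ceph -- tell mon.* injectargs --mon_health_to_clog=false
manager.inject_args('osd', 2, 'bdev-inject-crash', 1)
# roughly equivalent to: ceph -- tell osd.2 injectargs --bdev-inject-crash=1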