diff --git a/qa/suites/rados/upgrade/jewel-x-singleton/3-thrash/default.yaml b/qa/suites/rados/upgrade/jewel-x-singleton/3-thrash/default.yaml
index bde754e4d5f5b3..8785de6c3f2455 100644
--- a/qa/suites/rados/upgrade/jewel-x-singleton/3-thrash/default.yaml
+++ b/qa/suites/rados/upgrade/jewel-x-singleton/3-thrash/default.yaml
@@ -18,4 +18,6 @@ split_tasks:
       chance_pgnum_grow: 1
       chance_pgpnum_fix: 1
       chance_thrash_cluster_full: 0
+      chance_thrash_pg_remap: 0
+      chance_thrash_pg_remap_items: 0
   - print: "**** done thrashosds 3-thrash"
diff --git a/qa/suites/upgrade/jewel-x/stress-split-erasure-code/3-thrash/default.yaml b/qa/suites/upgrade/jewel-x/stress-split-erasure-code/3-thrash/default.yaml
index 8b4de177db4ab5..0272cb94c14498 100644
--- a/qa/suites/upgrade/jewel-x/stress-split-erasure-code/3-thrash/default.yaml
+++ b/qa/suites/upgrade/jewel-x/stress-split-erasure-code/3-thrash/default.yaml
@@ -18,4 +18,6 @@ stress-tasks:
     chance_pgpnum_fix: 1
     min_in: 4
     chance_thrash_cluster_full: 0
+    chance_thrash_pg_remap: 0
+    chance_thrash_pg_remap_items: 0
 - print: "**** done thrashosds 3-thrash"
diff --git a/qa/suites/upgrade/jewel-x/stress-split/3-thrash/default.yaml b/qa/suites/upgrade/jewel-x/stress-split/3-thrash/default.yaml
index f33ade4acf9d18..185bdbceb7feac 100644
--- a/qa/suites/upgrade/jewel-x/stress-split/3-thrash/default.yaml
+++ b/qa/suites/upgrade/jewel-x/stress-split/3-thrash/default.yaml
@@ -17,4 +17,6 @@ stress-tasks:
     chance_pgnum_grow: 1
     chance_pgpnum_fix: 1
     chance_thrash_cluster_full: 0
+    chance_thrash_pg_remap: 0
+    chance_thrash_pg_remap_items: 0
 - print: "**** done thrashosds 3-thrash"
diff --git a/qa/suites/upgrade/kraken-x/stress-split-erasure-code/3-thrash/default.yaml b/qa/suites/upgrade/kraken-x/stress-split-erasure-code/3-thrash/default.yaml
index 8b4de177db4ab5..0272cb94c14498 100644
--- a/qa/suites/upgrade/kraken-x/stress-split-erasure-code/3-thrash/default.yaml
+++ b/qa/suites/upgrade/kraken-x/stress-split-erasure-code/3-thrash/default.yaml
@@ -18,4 +18,6 @@ stress-tasks:
     chance_pgpnum_fix: 1
     min_in: 4
     chance_thrash_cluster_full: 0
+    chance_thrash_pg_remap: 0
+    chance_thrash_pg_remap_items: 0
 - print: "**** done thrashosds 3-thrash"
diff --git a/qa/suites/upgrade/kraken-x/stress-split/3-thrash/default.yaml b/qa/suites/upgrade/kraken-x/stress-split/3-thrash/default.yaml
index f33ade4acf9d18..185bdbceb7feac 100644
--- a/qa/suites/upgrade/kraken-x/stress-split/3-thrash/default.yaml
+++ b/qa/suites/upgrade/kraken-x/stress-split/3-thrash/default.yaml
@@ -17,4 +17,6 @@ stress-tasks:
     chance_pgnum_grow: 1
     chance_pgpnum_fix: 1
     chance_thrash_cluster_full: 0
+    chance_thrash_pg_remap: 0
+    chance_thrash_pg_remap_items: 0
 - print: "**** done thrashosds 3-thrash"
diff --git a/qa/tasks/ceph_manager.py b/qa/tasks/ceph_manager.py
index 5b00132b20235d..8ff2556a7a0e93 100644
--- a/qa/tasks/ceph_manager.py
+++ b/qa/tasks/ceph_manager.py
@@ -123,6 +123,8 @@ def __init__(self, manager, config, logger=None):
         self.dump_ops_enable = self.config.get('dump_ops_enable')
         self.noscrub_toggle_delay = self.config.get('noscrub_toggle_delay')
         self.chance_thrash_cluster_full = self.config.get('chance_thrash_cluster_full', .05)
+        self.chance_thrash_pg_remap = self.config.get('chance_thrash_pg_remap', 1.0)
+        self.chance_thrash_pg_remap_items = self.config.get('chance_thrash_pg_remap_items', 1.0)  # own key, not the pg_remap one
 
         num_osds = self.in_osds + self.out_osds
         self.max_pgs = self.config.get("max_pgs_per_pool_osd", 1200) * num_osds
@@ -505,6 +507,86 @@ def thrash_cluster_full(self):
         self.log('Setting full ratio back to .95')
         self.ceph_manager.raw_cluster_cmd('osd', 'set-full-ratio', '.95')
 
+    def thrash_pg_remap(self):
+        """
+        Install (~70% of calls) or remove (~30%) a random pg_remap entry in the OSDMap
+        """
+        from random import shuffle
+        out = self.ceph_manager.raw_cluster_cmd('osd', 'dump', '-f', 'json-pretty')
+        j = json.loads(out)
+        self.log('j is %s' % j)
+        try:
+            if random.random() >= .3:  # install branch
+                pgs = self.ceph_manager.get_pg_stats()
+                pg = random.choice(pgs)
+                pgid = str(pg['pgid'])
+                poolid = int(pgid.split('.')[0])  # pgid is '<pool>.<seed>'
+                sizes = [x['size'] for x in j['pools'] if x['pool'] == poolid]
+                if len(sizes) == 0:  # pool no longer in the osd dump
+                    return
+                n = sizes[0]
+                osds = self.in_osds + self.out_osds  # new list; shuffle() below does not touch self.*
+                shuffle(osds)
+                osds = osds[0:n]  # one random OSD per replica slot
+                self.log('Setting %s to %s' % (pgid, osds))
+                cmd = ['osd', 'pg-remap', pgid] + [str(x) for x in osds]
+                self.log('cmd %s' % cmd)
+                self.ceph_manager.raw_cluster_cmd(*cmd)
+            else:  # removal branch
+                m = j['pg_remap']
+                if len(m) > 0:
+                    shuffle(m)
+                    pg = m[0]['pgid']
+                    self.log('Clearing pg_remap on %s' % pg)
+                    self.ceph_manager.raw_cluster_cmd(
+                        'osd',
+                        'rm-pg-remap',
+                        pg)
+                else:
+                    self.log('No pg_remap entries; doing nothing')
+        except CommandFailedError:  # best effort: covers both the install and the removal command
+            self.log('Failed to rm-pg-remap, ignoring')
+
+    def thrash_pg_remap_items(self):
+        """
+        Install (~70% of calls) or remove (~30%) a random pg_remap_items entry in the OSDMap
+        """
+        from random import shuffle
+        out = self.ceph_manager.raw_cluster_cmd('osd', 'dump', '-f', 'json-pretty')
+        j = json.loads(out)
+        self.log('j is %s' % j)
+        try:
+            if random.random() >= .3:  # install branch
+                pgs = self.ceph_manager.get_pg_stats()
+                pg = random.choice(pgs)
+                pgid = str(pg['pgid'])
+                poolid = int(pgid.split('.')[0])  # pgid is '<pool>.<seed>'
+                sizes = [x['size'] for x in j['pools'] if x['pool'] == poolid]
+                if len(sizes) == 0:  # pool no longer in the osd dump
+                    return
+                n = sizes[0]
+                osds = self.in_osds + self.out_osds  # new list; shuffle() below does not touch self.*
+                shuffle(osds)
+                osds = osds[0:n*2]  # presumably n (from, to) OSD pairs -- confirm against the mon command
+                self.log('Setting %s to %s' % (pgid, osds))
+                cmd = ['osd', 'pg-remap-items', pgid] + [str(x) for x in osds]
+                self.log('cmd %s' % cmd)
+                self.ceph_manager.raw_cluster_cmd(*cmd)
+            else:  # removal branch
+                m = j['pg_remap_items']
+                if len(m) > 0:
+                    shuffle(m)
+                    pg = m[0]['pgid']
+                    self.log('Clearing pg_remap_items on %s' % pg)
+                    self.ceph_manager.raw_cluster_cmd(
+                        'osd',
+                        'rm-pg-remap-items',
+                        pg)
+                else:
+                    self.log('No pg_remap_items entries; doing nothing')
+        except CommandFailedError:  # best effort: covers both the install and the removal command
+            self.log('Failed to rm-pg-remap-items, ignoring')
+
     def all_up(self):
         """
         Make sure all osds are up and not out.
@@ -723,6 +805,11 @@ def choose_action(self):
                             chance_test_backfill_full,))
         if self.chance_thrash_cluster_full > 0:
             actions.append((self.thrash_cluster_full, self.chance_thrash_cluster_full,))
+        if self.chance_thrash_pg_remap > 0:
+            actions.append((self.thrash_pg_remap, self.chance_thrash_pg_remap,))
+        if self.chance_thrash_pg_remap_items > 0:
+            actions.append((self.thrash_pg_remap_items, self.chance_thrash_pg_remap_items,))
+
         for key in ['heartbeat_inject_failure', 'filestore_inject_stall']:
             for scenario in [
                 (lambda:
diff --git a/qa/tasks/thrashosds.py b/qa/tasks/thrashosds.py
index 876aab395ec7d9..37c52062b5b221 100644
--- a/qa/tasks/thrashosds.py
+++ b/qa/tasks/thrashosds.py
@@ -122,6 +122,9 @@ def task(ctx, config):
 
     chance_thrash_cluster_full: .05
 
+    chance_thrash_pg_remap: 1.0
+    chance_thrash_pg_remap_items: 1.0
+
     example:
 
     tasks: