qa/tasks/ceph: raise exceptions if scrubbing fails or cannot proceed #15310

Merged
merged 7 commits on Jun 16, 2017
1 change: 1 addition & 0 deletions qa/suites/rados/basic-luminous/scrub_test.yaml
@@ -1,5 +1,6 @@
 overrides:
   ceph:
+    wait-for-scrub: false
     log-whitelist:
     - '!= data_digest'
     - '!= omap_digest'
1 change: 1 addition & 0 deletions qa/suites/rados/basic/tasks/repair_test.yaml
@@ -1,5 +1,6 @@
 overrides:
   ceph:
+    wait-for-scrub: false
     log-whitelist:
     - candidate had a stat error
     - candidate had a read error
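Both suites above opt out of the end-of-run scrub wait, presumably because their workloads intentionally corrupt objects and the stricter scrub check introduced below would otherwise fail the run. A minimal sketch of how such an override is typically consumed at teardown time; the guard, default, and function name below are assumptions, not code quoted from this PR:

    def scrub_on_exit(ctx, config):
        # Suites that set wait-for-scrub: false skip the exit scrub check entirely.
        if not config.get('wait-for-scrub', True):
            return
        osd_scrub_pgs(ctx, config)  # can now raise RuntimeError (see qa/tasks/ceph.py hunks below)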
18 changes: 12 additions & 6 deletions qa/suites/rados/thrash/workloads/radosbench.yaml
@@ -9,19 +9,25 @@ tasks:
 - full_sequential:
   - radosbench:
       clients: [client.0]
-      time: 120
+      time: 90
   - radosbench:
       clients: [client.0]
-      time: 120
+      time: 90
   - radosbench:
       clients: [client.0]
-      time: 120
+      time: 90
   - radosbench:
       clients: [client.0]
-      time: 120
+      time: 90
   - radosbench:
       clients: [client.0]
-      time: 120
+      time: 90
   - radosbench:
       clients: [client.0]
-      time: 120
+      time: 90
+  - radosbench:
+      clients: [client.0]
+      time: 90
+  - radosbench:
+      clients: [client.0]
+      time: 90
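For reference on the radosbench change: the old workload ran six 120-second passes (6 × 120 s = 720 s) and the new one runs eight 90-second passes (8 × 90 s = 720 s), so the total bench time appears unchanged; each individual pass is just shorter.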
8 changes: 3 additions & 5 deletions qa/tasks/ceph.py
@@ -1030,7 +1030,7 @@ def osd_scrub_pgs(ctx, config):
     indicate the last scrub completed. Time out if no progress is made
     here after two minutes.
     """
-    retries = 12
+    retries = 20
     delays = 10
     cluster_name = config['cluster']
     manager = ctx.managers[cluster_name]
@@ -1044,8 +1044,7 @@ def osd_scrub_pgs(ctx, config):
         log.info("Waiting for all osds to be active and clean.")
         time.sleep(delays)
     if not all_clean:
-        log.info("Scrubbing terminated -- not all pgs were active and clean.")
-        return
+        raise RuntimeError("Scrubbing terminated -- not all pgs were active and clean.")
     check_time_now = time.localtime()
     time.sleep(1)
     all_roles = teuthology.all_roles(ctx.cluster)
@@ -1074,8 +1073,7 @@ def osd_scrub_pgs(ctx, config):
         else:
             gap_cnt += 1
             if gap_cnt > retries:
-                log.info('Exiting scrub checking -- not all pgs scrubbed.')
-                return
+                raise RuntimeError('Exiting scrub checking -- not all pgs scrubbed.')
         if loop:
             log.info('Still waiting for all pgs to be scrubbed.')
             time.sleep(delays)
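With the new values, the scrub-checking loop tolerates roughly retries × delays = 20 × 10 = 200 seconds without progress before raising, up from 12 × 10 = 120 seconds (the "two minutes" mentioned in the docstring). The pattern these hunks adopt is a bounded wait that raises instead of silently returning, so the teuthology run fails loudly when scrubbing cannot proceed. A distilled, self-contained illustration of that pattern; the names here are illustrative only and not the actual teuthology code:

    import time

    def wait_until(condition, retries=20, delays=10, what='condition'):
        # Poll a condition at fixed intervals; fail loudly on timeout so the
        # run is marked failed rather than quietly skipping the check.
        for _ in range(retries):
            if condition():
                return
            time.sleep(delays)
        raise RuntimeError('timed out waiting for %s' % what)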
3 changes: 3 additions & 0 deletions qa/tasks/ceph_manager.py
@@ -1186,6 +1186,9 @@ def flush_pg_stats(self, osds, no_wait=None, wait_for_mon=3*5):
                                 'osd.{osd}: {got} < {need}'.
                                 format(osd=osd, got=got, need=need))

+    def flush_all_pg_stats(self):
+        self.flush_pg_stats(range(len(self.get_osd_dump())))
+
     def do_rados(self, remote, cmd, check_status=True):
         """
         Execute a remote rados command.
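The new flush_all_pg_stats() helper flushes pg stats for every OSD id reported by get_osd_dump(). A hypothetical caller mirroring the thrashosds change in the next hunk; the cluster name 'ceph' and the surrounding teuthology ctx are assumptions, not part of this diff:

    manager = ctx.managers['ceph']    # assumed cluster name
    manager.wait_for_all_up()
    manager.flush_all_pg_stats()      # same as flush_pg_stats(range(<number of OSDs>))
    manager.wait_for_recovery(360)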
2 changes: 2 additions & 0 deletions qa/tasks/thrashosds.py
@@ -197,4 +197,6 @@ def task(ctx, config):
     finally:
         log.info('joining thrashosds')
         thrash_proc.do_join()
+        cluster_manager.wait_for_all_up()
+        cluster_manager.flush_all_pg_stats()
         cluster_manager.wait_for_recovery(config.get('timeout', 360))
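The intent of the added teardown steps appears to be: wait until every OSD is back up after thrashing, then flush pg stats from all OSDs so the monitor's view is current before wait_for_recovery verifies that the cluster has healed; without the flush, the recovery check could act on stale pg stats.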