Merge pull request #15643 from liewegas/wip-health
mon: revamp health check/warning system
liewegas committed Jul 12, 2017
2 parents b932d7f + 5e2e708 commit 8859627
Showing 142 changed files with 3,149 additions and 611 deletions.
9 changes: 9 additions & 0 deletions PendingReleaseNotes
@@ -194,3 +194,12 @@
If you deployed Luminous dev releases or 12.1.0 rc release and made use of
the CRUSH choose_args feature, you need to remove all choose_args mappings
from your CRUSH map before starting the upgrade.

* The 'ceph health' structured output (JSON or XML) no longer contains
a 'timechecks' section describing the time sync status. This
information is now available via the 'ceph time-sync-status'
command.

* Certain extra fields in the 'ceph health' structured output that
used to appear if the mons were low on disk space (which duplicated
the information in the normal health warning messages) are now gone.
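For reference, the revamped system reports health as a set of named checks, and those check codes (OSD_DOWN, MON_DOWN, PG_*, and so on) are exactly what the qa whitelist changes below match against. A rough sketch of the new structured output, rendered here as YAML with illustrative check names and messages rather than a complete schema:

status: HEALTH_WARN
checks:
  OSD_DOWN:                # health-check code; illustrative example
    severity: HEALTH_WARN
    summary:
      message: 1 osds down
  MON_CLOCK_SKEW:
    severity: HEALTH_WARN
    summary:
      message: clock skew detected on mon.b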
5 changes: 5 additions & 0 deletions qa/cephfs/overrides/whitelist_wrongly_marked_down.yaml
@@ -1,7 +1,12 @@
overrides:
ceph:
log-whitelist:
- overall HEALTH_
- (OSD_DOWN)
- (OSD_
- wrongly marked me down
# MDS daemon 'b' is not responding, replacing it as rank 0 with standby 'a'
- is not responding
conf:
mds:
debug mds: 20
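A note on the entries above (and the similar ones throughout this commit): teuthology's log-whitelist holds regular-expression fragments that are matched against cluster log lines, so open-ended entries such as '(OSD_' are deliberate prefixes, not truncation. A minimal override fragment showing the pattern, with illustrative codes:

overrides:
  ceph:
    log-whitelist:
      - overall HEALTH_   # any periodic "overall HEALTH_WARN ..." summary line
      - (OSD_DOWN)        # exactly the OSD_DOWN health check
      - (PG_              # prefix: matches (PG_DEGRADED), (PG_AVAILABILITY), etc.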
9 changes: 9 additions & 0 deletions qa/suites/fs/basic_functional/overrides/whitelist_health.yaml
@@ -0,0 +1,9 @@
overrides:
ceph:
log-whitelist:
- overall HEALTH_
- (FS_DEGRADED)
- (MDS_FAILED)
- (MDS_DEGRADED)
- (FS_WITH_FAILED_MDS)
- (MDS_DAMAGE)
2 changes: 2 additions & 0 deletions qa/suites/fs/basic_functional/tasks/cephfs_scrub_tests.yaml
@@ -4,6 +4,8 @@ overrides:
- Scrub error on inode
- Behind on trimming
- Metadata damage detected
- overall HEALTH_
- (MDS_TRIM)
conf:
mds:
mds log max segments: 1
8 changes: 8 additions & 0 deletions qa/suites/fs/thrash/overrides/whitelist_health.yaml
@@ -0,0 +1,8 @@
overrides:
ceph:
log-whitelist:
- overall HEALTH_
- (FS_DEGRADED)
- (MDS_FAILED)
- (MDS_DEGRADED)
- (FS_WITH_FAILED_MDS)
6 changes: 6 additions & 0 deletions qa/suites/rados/basic-luminous/scrub_test.yaml
@@ -15,6 +15,12 @@ overrides:
- 'attr name mistmatch'
- 'deep-scrub 1 missing, 0 inconsistent objects'
- 'failed to pick suitable auth object'
- overall HEALTH_
- (OSDMAP_FLAGS)
- (OSD_
- (PG_
- (OSD_SCRUB_ERRORS)
- (TOO_FEW_PGS)
conf:
osd:
osd deep scrub update digest min age: 0
5 changes: 5 additions & 0 deletions qa/suites/rados/basic/tasks/rados_api_tests.yaml
@@ -3,6 +3,11 @@ overrides:
log-whitelist:
- reached quota
- wrongly marked me down
- overall HEALTH_
- (POOL_FULL)
- (SMALLER_PGP_NUM)
- (CACHE_POOL_NO_HIT_SET)
- (CACHE_POOL_NEAR_FULL)
tasks:
- workunit:
clients:
5 changes: 5 additions & 0 deletions qa/suites/rados/basic/tasks/rados_python.yaml
@@ -2,6 +2,11 @@ overrides:
ceph:
log-whitelist:
- wrongly marked me down
- overall HEALTH_
- (OSDMAP_FLAGS)
- (PG_
- (OSD_
- (OBJECT_
tasks:
- workunit:
clients:
6 changes: 6 additions & 0 deletions qa/suites/rados/basic/tasks/rados_stress_watch.yaml
@@ -1,3 +1,9 @@
overrides:
ceph:
log-whitelist:
- overall HEALTH_
- (CACHE_POOL_NO_HIT_SET)
- (TOO_FEW_PGS)
tasks:
- workunit:
clients:
4 changes: 4 additions & 0 deletions qa/suites/rados/basic/tasks/repair_test.yaml
@@ -17,6 +17,10 @@ overrides:
- 'size 1 != size'
- attr name mismatch
- Regular scrub request, losing deep-scrub details
- overall HEALTH_
- (OSDMAP_FLAGS)
- (OSD_
- (PG_
conf:
osd:
filestore debug inject read err: true
6 changes: 5 additions & 1 deletion qa/suites/rados/mgr/tasks/failover.yaml
@@ -4,7 +4,11 @@ tasks:
- ceph:
# tests may leave mgrs broken, so don't try and call into them
# to invoke e.g. pg dump during teardown.
wait-for-scrub: false
log-whitelist:
- overall HEALTH_
- (MGR_DOWN)
- (PG_
- cephfs_test_runner:
modules:
- tasks.mgr.test_failover
6 changes: 6 additions & 0 deletions qa/suites/rados/monthrash/thrashers/force-sync-many.yaml
@@ -1,3 +1,9 @@
overrides:
ceph:
log-whitelist:
- overall HEALTH_
- (MON_DOWN)
- (TOO_FEW_PGS)
tasks:
- mon_thrash:
revive_delay: 90
3 changes: 3 additions & 0 deletions qa/suites/rados/monthrash/thrashers/many.yaml
@@ -1,5 +1,8 @@
overrides:
ceph:
log-whitelist:
- overall HEALTH_
- (MON_DOWN)
conf:
osd:
mon client ping interval: 4
5 changes: 5 additions & 0 deletions qa/suites/rados/monthrash/thrashers/one.yaml
@@ -1,3 +1,8 @@
overrides:
ceph:
log-whitelist:
- overall HEALTH_
- (MON_DOWN)
tasks:
- mon_thrash:
revive_delay: 20
3 changes: 3 additions & 0 deletions qa/suites/rados/monthrash/thrashers/sync-many.yaml
@@ -1,5 +1,8 @@
overrides:
ceph:
log-whitelist:
- overall HEALTH_
- (MON_DOWN)
conf:
mon:
paxos min: 10
3 changes: 3 additions & 0 deletions qa/suites/rados/monthrash/thrashers/sync.yaml
@@ -1,5 +1,8 @@
overrides:
ceph:
log-whitelist:
- overall HEALTH_
- (MON_DOWN)
conf:
mon:
paxos min: 10
6 changes: 6 additions & 0 deletions qa/suites/rados/monthrash/workloads/rados_api_tests.yaml
@@ -2,6 +2,12 @@ overrides:
ceph:
log-whitelist:
- reached quota
- overall HEALTH_
- (CACHE_POOL_NO_HIT_SET)
- (POOL_FULL)
- (REQUEST_SLOW)
- (MON_DOWN)
- (PG_
conf:
global:
debug objecter: 20
3 changes: 3 additions & 0 deletions qa/suites/rados/monthrash/workloads/rados_mon_workunits.yaml
@@ -2,6 +2,9 @@ overrides:
ceph:
log-whitelist:
- wrongly marked me down
- overall HEALTH_
- (PG_
- (MON_DOWN)
tasks:
- workunit:
clients:
2 changes: 2 additions & 0 deletions qa/suites/rados/multimon/tasks/mon_clock_no_skews.yaml
@@ -5,5 +5,7 @@ tasks:
- slow request
- .*clock.*skew.*
- clocks not synchronized
- overall HEALTH_
- (MON_CLOCK_SKEW)
- mon_clock_skew_check:
expect-skew: false
2 changes: 2 additions & 0 deletions qa/suites/rados/multimon/tasks/mon_clock_with_skews.yaml
@@ -9,5 +9,7 @@ tasks:
- slow request
- .*clock.*skew.*
- clocks not synchronized
- overall HEALTH_
- (MON_CLOCK_SKEW)
- mon_clock_skew_check:
expect-skew: true
3 changes: 3 additions & 0 deletions qa/suites/rados/multimon/tasks/mon_recovery.yaml
@@ -1,4 +1,7 @@
tasks:
- install:
- ceph:
log-whitelist:
- overall HEALTH_
- (MON_DOWN)
- mon_recovery:
6 changes: 6 additions & 0 deletions qa/suites/rados/objectstore/ceph_objectstore_tool.yaml
@@ -12,5 +12,11 @@ tasks:
global:
osd max object name len: 460
osd max object namespace len: 64
log-whitelist:
- overall HEALTH_
- (OSDMAP_FLAGS)
- (OSD_
- (PG_
- (TOO_FEW_PGS)
- ceph_objectstore_tool:
objects: 20
3 changes: 3 additions & 0 deletions qa/suites/rados/rest/mgr-restful.yaml
@@ -3,6 +3,9 @@ roles:
tasks:
- install:
- ceph:
log-whitelist:
- overall HEALTH_
- (MGR_DOWN)
- exec:
mon.a:
- ceph config-key put mgr/restful/x/server_addr 127.0.0.1
5 changes: 5 additions & 0 deletions qa/suites/rados/singleton-bluestore/all/cephtool.yaml
@@ -21,6 +21,11 @@ tasks:
- must scrub before tier agent can activate
- failsafe engaged, dropping updates
- failsafe disengaged, no longer dropping updates
- overall HEALTH_
- (OSDMAP_FLAGS)
- (OSD_
- (PG_
- (SMALLER_PG_NUM)
- workunit:
clients:
all:
4 changes: 4 additions & 0 deletions qa/suites/rados/singleton-nomsgr/all/admin_socket_output.yaml
@@ -5,6 +5,10 @@ overrides:
log-whitelist:
- MDS in read-only mode
- force file system read-only
- overall HEALTH_
- (OSDMAP_FLAGS)
- (OSD_FULL)
- (MDS_READ_ONLY)
tasks:
- install:
- ceph:
3 changes: 3 additions & 0 deletions qa/suites/rados/singleton-nomsgr/all/cache-fs-trunc.yaml
@@ -3,6 +3,9 @@ roles:
tasks:
- install:
- ceph:
log-whitelist:
- overall HEALTH_
- (CACHE_POOL_NO_HIT_SET)
conf:
global:
osd max object name len: 460
3 changes: 3 additions & 0 deletions qa/suites/rados/singleton-nomsgr/all/export-after-evict.yaml
@@ -8,6 +8,9 @@ roles:
tasks:
- install:
- ceph:
log-whitelist:
- overall HEALTH_
- (CACHE_POOL_NO_HIT_SET)
conf:
global:
osd max object name len: 460
4 changes: 4 additions & 0 deletions qa/suites/rados/singleton-nomsgr/all/full-tiering.yaml
@@ -5,6 +5,10 @@ overrides:
ceph:
log-whitelist:
- is full
- overall HEALTH_
- (POOL_FULL)
- (POOL_NEAR_FULL)
- (CACHE_POOL_NO_HIT_SET)
tasks:
- install:
- ceph:
4 changes: 4 additions & 0 deletions qa/suites/rados/singleton-nomsgr/all/health-warnings.yaml
@@ -10,6 +10,10 @@ tasks:
osd max object namespace len: 64
log-whitelist:
- wrongly marked me down
- overall HEALTH_
- (OSDMAP_FLAGS)
- (OSD_
- (PG_
- workunit:
clients:
all:
@@ -11,6 +11,11 @@ roles:
tasks:
- install:
- ceph:
log-whitelist:
- overall HEALTH_
- (PG_
- (OSD_
- (OBJECT_
conf:
osd:
osd debug reject backfill probability: .3
3 changes: 3 additions & 0 deletions qa/suites/rados/singleton-nomsgr/all/valgrind-leaks.yaml
@@ -7,6 +7,9 @@ overrides:
flavor: notcmalloc
debuginfo: true
ceph:
log-whitelist:
- overall HEALTH_
- (PG_
conf:
global:
osd heartbeat grace: 40
6 changes: 6 additions & 0 deletions qa/suites/rados/singleton/all/divergent_priors.yaml
@@ -12,6 +12,12 @@ openstack:

overrides:
ceph:
log-whitelist:
- overall HEALTH_
- (OSDMAP_FLAGS)
- (OSD_
- (PG_
- (OBJECT_DEGRADED)
conf:
osd:
debug osd: 5
6 changes: 6 additions & 0 deletions qa/suites/rados/singleton/all/divergent_priors2.yaml
@@ -12,6 +12,12 @@ openstack:

overrides:
ceph:
log-whitelist:
- overall HEALTH_
- (OSDMAP_FLAGS)
- (OSD_
- (PG_
- (OBJECT_DEGRADED)
conf:
osd:
debug osd: 5
6 changes: 5 additions & 1 deletion qa/suites/rados/singleton/all/dump-stuck.yaml
@@ -11,5 +11,9 @@ tasks:
- install:
- ceph:
log-whitelist:
- wrongly marked me down
- overall HEALTH_
- (OSDMAP_FLAGS)
- (OSD_
- (PG_
- dump_stuck:
7 changes: 6 additions & 1 deletion qa/suites/rados/singleton/all/ec-lost-unfound.yaml
@@ -15,5 +15,10 @@ tasks:
- install:
- ceph:
log-whitelist:
- objects unfound and apparently lost
- overall HEALTH_
- (OSDMAP_FLAGS)
- (OSD_
- (PG_
- (OBJECT_
- ec_lost_unfound:
7 changes: 6 additions & 1 deletion qa/suites/rados/singleton/all/lost-unfound-delete.yaml
@@ -14,5 +14,10 @@ tasks:
- install:
- ceph:
log-whitelist:
- objects unfound and apparently lost
- overall HEALTH_
- (OSDMAP_FLAGS)
- (OSD_
- (PG_
- (OBJECT_
- rep_lost_unfound_delete:
7 changes: 6 additions & 1 deletion qa/suites/rados/singleton/all/lost-unfound.yaml
@@ -14,5 +14,10 @@ tasks:
- install:
- ceph:
log-whitelist:
- objects unfound and apparently lost
- overall HEALTH_
- (OSDMAP_FLAGS)
- (OSD_
- (PG_
- (OBJECT_
- lost_unfound:
