Merge pull request #15643 from liewegas/wip-health
mon: revamp health check/warning system
liewegas committed Jul 12, 2017
2 parents b932d7f + 5e2e708 commit 8859627
Showing 142 changed files with 3,149 additions and 611 deletions.
9 changes: 9 additions & 0 deletions PendingReleaseNotes
@@ -194,3 +194,12 @@
If you deployed Luminous dev releases or 12.1.0 rc release and made use of
the CRUSH choose_args feature, you need to remove all choose_args mappings
from your CRUSH map before starting the upgrade.

* The 'ceph health' structured output (JSON or XML) no longer contains
a 'timechecks' section describing the time sync status. This
information is now available via the 'ceph time-sync-status'
command.

* Certain extra fields in the 'ceph health' structured output that
used to appear if the mons were low on disk space (which duplicated
the information in the normal health warning messages) are now gone.
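For reference, the revamped system reports health as a set of named checks, and those check codes (OSD_DOWN, MON_DOWN, PG_*, and so on) are exactly what the qa whitelist changes below match against. A rough sketch of the new structured output, rendered here as YAML with illustrative check names and messages rather than a complete schema:

status: HEALTH_WARN
checks:
  OSD_DOWN:                # health-check code; illustrative example
    severity: HEALTH_WARN
    summary:
      message: 1 osds down
  MON_CLOCK_SKEW:
    severity: HEALTH_WARN
    summary:
      message: clock skew detected on mon.b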
5 changes: 5 additions & 0 deletions qa/cephfs/overrides/whitelist_wrongly_marked_down.yaml
@@ -1,7 +1,12 @@
overrides:
ceph:
log-whitelist:
- overall HEALTH_
- (OSD_DOWN)
- (OSD_
- wrongly marked me down
# MDS daemon 'b' is not responding, replacing it as rank 0 with standby 'a'
- is not responding
conf:
mds:
debug mds: 20
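A note on the entries above (and the similar ones throughout this commit): teuthology's log-whitelist holds regular-expression fragments that are matched against cluster log lines, so open-ended entries such as '(OSD_' are deliberate prefixes, not truncation. A minimal override fragment showing the pattern, with illustrative codes:

overrides:
  ceph:
    log-whitelist:
      - overall HEALTH_   # any periodic "overall HEALTH_WARN ..." summary line
      - (OSD_DOWN)        # exactly the OSD_DOWN health check
      - (PG_              # prefix: matches (PG_DEGRADED), (PG_AVAILABILITY), etc.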
9 changes: 9 additions & 0 deletions qa/suites/fs/basic_functional/overrides/whitelist_health.yaml
@@ -0,0 +1,9 @@
overrides:
ceph:
log-whitelist:
- overall HEALTH_
- (FS_DEGRADED)
- (MDS_FAILED)
- (MDS_DEGRADED)
- (FS_WITH_FAILED_MDS)
- (MDS_DAMAGE)
2 changes: 2 additions & 0 deletions qa/suites/fs/basic_functional/tasks/cephfs_scrub_tests.yaml
@@ -4,6 +4,8 @@ overrides:
- Scrub error on inode
- Behind on trimming
- Metadata damage detected
- overall HEALTH_
- (MDS_TRIM)
conf:
mds:
mds log max segments: 1
8 changes: 8 additions & 0 deletions qa/suites/fs/thrash/overrides/whitelist_health.yaml
@@ -0,0 +1,8 @@
overrides:
ceph:
log-whitelist:
- overall HEALTH_
- (FS_DEGRADED)
- (MDS_FAILED)
- (MDS_DEGRADED)
- (FS_WITH_FAILED_MDS)
6 changes: 6 additions & 0 deletions qa/suites/rados/basic-luminous/scrub_test.yaml
@@ -15,6 +15,12 @@ overrides:
- 'attr name mistmatch'
- 'deep-scrub 1 missing, 0 inconsistent objects'
- 'failed to pick suitable auth object'
- overall HEALTH_
- (OSDMAP_FLAGS)
- (OSD_
- (PG_
- (OSD_SCRUB_ERRORS)
- (TOO_FEW_PGS)
conf:
osd:
osd deep scrub update digest min age: 0
5 changes: 5 additions & 0 deletions qa/suites/rados/basic/tasks/rados_api_tests.yaml
@@ -3,6 +3,11 @@ overrides:
log-whitelist:
- reached quota
- wrongly marked me down
- overall HEALTH_
- (POOL_FULL)
- (SMALLER_PGP_NUM)
- (CACHE_POOL_NO_HIT_SET)
- (CACHE_POOL_NEAR_FULL)
tasks:
- workunit:
clients:
5 changes: 5 additions & 0 deletions qa/suites/rados/basic/tasks/rados_python.yaml
@@ -2,6 +2,11 @@ overrides:
ceph:
log-whitelist:
- wrongly marked me down
- overall HEALTH_
- (OSDMAP_FLAGS)
- (PG_
- (OSD_
- (OBJECT_
tasks:
- workunit:
clients:
6 changes: 6 additions & 0 deletions qa/suites/rados/basic/tasks/rados_stress_watch.yaml
@@ -1,3 +1,9 @@
overrides:
ceph:
log-whitelist:
- overall HEALTH_
- (CACHE_POOL_NO_HIT_SET)
- (TOO_FEW_PGS)
tasks:
- workunit:
clients:
4 changes: 4 additions & 0 deletions qa/suites/rados/basic/tasks/repair_test.yaml
@@ -17,6 +17,10 @@ overrides:
- 'size 1 != size'
- attr name mismatch
- Regular scrub request, losing deep-scrub details
- overall HEALTH_
- (OSDMAP_FLAGS)
- (OSD_
- (PG_
conf:
osd:
filestore debug inject read err: true
6 changes: 5 additions & 1 deletion qa/suites/rados/mgr/tasks/failover.yaml
@@ -4,7 +4,11 @@ tasks:
- ceph:
# tests may leave mgrs broken, so don't try and call into them
# to invoke e.g. pg dump during teardown.
wait-for-scrub: false
log-whitelist:
- overall HEALTH_
- (MGR_DOWN)
- (PG_
- cephfs_test_runner:
modules:
- tasks.mgr.test_failover
6 changes: 6 additions & 0 deletions qa/suites/rados/monthrash/thrashers/force-sync-many.yaml
@@ -1,3 +1,9 @@
overrides:
ceph:
log-whitelist:
- overall HEALTH_
- (MON_DOWN)
- (TOO_FEW_PGS)
tasks:
- mon_thrash:
revive_delay: 90
3 changes: 3 additions & 0 deletions qa/suites/rados/monthrash/thrashers/many.yaml
@@ -1,5 +1,8 @@
overrides:
ceph:
log-whitelist:
- overall HEALTH_
- (MON_DOWN)
conf:
osd:
mon client ping interval: 4
5 changes: 5 additions & 0 deletions qa/suites/rados/monthrash/thrashers/one.yaml
@@ -1,3 +1,8 @@
overrides:
ceph:
log-whitelist:
- overall HEALTH_
- (MON_DOWN)
tasks:
- mon_thrash:
revive_delay: 20
3 changes: 3 additions & 0 deletions qa/suites/rados/monthrash/thrashers/sync-many.yaml
@@ -1,5 +1,8 @@
overrides:
ceph:
log-whitelist:
- overall HEALTH_
- (MON_DOWN)
conf:
mon:
paxos min: 10
3 changes: 3 additions & 0 deletions qa/suites/rados/monthrash/thrashers/sync.yaml
@@ -1,5 +1,8 @@
overrides:
ceph:
log-whitelist:
- overall HEALTH_
- (MON_DOWN)
conf:
mon:
paxos min: 10
6 changes: 6 additions & 0 deletions qa/suites/rados/monthrash/workloads/rados_api_tests.yaml
@@ -2,6 +2,12 @@ overrides:
ceph:
log-whitelist:
- reached quota
- overall HEALTH_
- (CACHE_POOL_NO_HIT_SET)
- (POOL_FULL)
- (REQUEST_SLOW)
- (MON_DOWN)
- (PG_
conf:
global:
debug objecter: 20
3 changes: 3 additions & 0 deletions qa/suites/rados/monthrash/workloads/rados_mon_workunits.yaml
@@ -2,6 +2,9 @@ overrides:
ceph:
log-whitelist:
- wrongly marked me down
- overall HEALTH_
- (PG_
- (MON_DOWN)
tasks:
- workunit:
clients:
2 changes: 2 additions & 0 deletions qa/suites/rados/multimon/tasks/mon_clock_no_skews.yaml
@@ -5,5 +5,7 @@ tasks:
- slow request
- .*clock.*skew.*
- clocks not synchronized
- overall HEALTH_
- (MON_CLOCK_SKEW)
- mon_clock_skew_check:
expect-skew: false
2 changes: 2 additions & 0 deletions qa/suites/rados/multimon/tasks/mon_clock_with_skews.yaml
@@ -9,5 +9,7 @@ tasks:
- slow request
- .*clock.*skew.*
- clocks not synchronized
- overall HEALTH_
- (MON_CLOCK_SKEW)
- mon_clock_skew_check:
expect-skew: true
3 changes: 3 additions & 0 deletions qa/suites/rados/multimon/tasks/mon_recovery.yaml
@@ -1,4 +1,7 @@
tasks:
- install:
- ceph:
log-whitelist:
- overall HEALTH_
- (MON_DOWN)
- mon_recovery:
6 changes: 6 additions & 0 deletions qa/suites/rados/objectstore/ceph_objectstore_tool.yaml
@@ -12,5 +12,11 @@ tasks:
global:
osd max object name len: 460
osd max object namespace len: 64
log-whitelist:
- overall HEALTH_
- (OSDMAP_FLAGS)
- (OSD_
- (PG_
- (TOO_FEW_PGS)
- ceph_objectstore_tool:
objects: 20
3 changes: 3 additions & 0 deletions qa/suites/rados/rest/mgr-restful.yaml
@@ -3,6 +3,9 @@ roles:
tasks:
- install:
- ceph:
log-whitelist:
- overall HEALTH_
- (MGR_DOWN)
- exec:
mon.a:
- ceph config-key put mgr/restful/x/server_addr 127.0.0.1
5 changes: 5 additions & 0 deletions qa/suites/rados/singleton-bluestore/all/cephtool.yaml
@@ -21,6 +21,11 @@ tasks:
- must scrub before tier agent can activate
- failsafe engaged, dropping updates
- failsafe disengaged, no longer dropping updates
- overall HEALTH_
- (OSDMAP_FLAGS)
- (OSD_
- (PG_
- (SMALLER_PG_NUM)
- workunit:
clients:
all:
4 changes: 4 additions & 0 deletions qa/suites/rados/singleton-nomsgr/all/admin_socket_output.yaml
@@ -5,6 +5,10 @@ overrides:
log-whitelist:
- MDS in read-only mode
- force file system read-only
- overall HEALTH_
- (OSDMAP_FLAGS)
- (OSD_FULL)
- (MDS_READ_ONLY)
tasks:
- install:
- ceph:
3 changes: 3 additions & 0 deletions qa/suites/rados/singleton-nomsgr/all/cache-fs-trunc.yaml
@@ -3,6 +3,9 @@ roles:
tasks:
- install:
- ceph:
log-whitelist:
- overall HEALTH_
- (CACHE_POOL_NO_HIT_SET)
conf:
global:
osd max object name len: 460
3 changes: 3 additions & 0 deletions qa/suites/rados/singleton-nomsgr/all/export-after-evict.yaml
@@ -8,6 +8,9 @@ roles:
tasks:
- install:
- ceph:
log-whitelist:
- overall HEALTH_
- (CACHE_POOL_NO_HIT_SET)
conf:
global:
osd max object name len: 460
4 changes: 4 additions & 0 deletions qa/suites/rados/singleton-nomsgr/all/full-tiering.yaml
@@ -5,6 +5,10 @@ overrides:
ceph:
log-whitelist:
- is full
- overall HEALTH_
- (POOL_FULL)
- (POOL_NEAR_FULL)
- (CACHE_POOL_NO_HIT_SET)
tasks:
- install:
- ceph:
4 changes: 4 additions & 0 deletions qa/suites/rados/singleton-nomsgr/all/health-warnings.yaml
@@ -10,6 +10,10 @@ tasks:
osd max object namespace len: 64
log-whitelist:
- wrongly marked me down
- overall HEALTH_
- (OSDMAP_FLAGS)
- (OSD_
- (PG_
- workunit:
clients:
all:
@@ -11,6 +11,11 @@ roles:
tasks:
- install:
- ceph:
log-whitelist:
- overall HEALTH_
- (PG_
- (OSD_
- (OBJECT_
conf:
osd:
osd debug reject backfill probability: .3
3 changes: 3 additions & 0 deletions qa/suites/rados/singleton-nomsgr/all/valgrind-leaks.yaml
@@ -7,6 +7,9 @@ overrides:
flavor: notcmalloc
debuginfo: true
ceph:
log-whitelist:
- overall HEALTH_
- (PG_
conf:
global:
osd heartbeat grace: 40
6 changes: 6 additions & 0 deletions qa/suites/rados/singleton/all/divergent_priors.yaml
@@ -12,6 +12,12 @@ openstack:

overrides:
ceph:
log-whitelist:
- overall HEALTH_
- (OSDMAP_FLAGS)
- (OSD_
- (PG_
- (OBJECT_DEGRADED)
conf:
osd:
debug osd: 5
6 changes: 6 additions & 0 deletions qa/suites/rados/singleton/all/divergent_priors2.yaml
@@ -12,6 +12,12 @@ openstack:

overrides:
ceph:
log-whitelist:
- overall HEALTH_
- (OSDMAP_FLAGS)
- (OSD_
- (PG_
- (OBJECT_DEGRADED)
conf:
osd:
debug osd: 5
6 changes: 5 additions & 1 deletion qa/suites/rados/singleton/all/dump-stuck.yaml
@@ -11,5 +11,9 @@ tasks:
- install:
- ceph:
log-whitelist:
- wrongly marked me down
- overall HEALTH_
- (OSDMAP_FLAGS)
- (OSD_
- (PG_
- dump_stuck:
7 changes: 6 additions & 1 deletion qa/suites/rados/singleton/all/ec-lost-unfound.yaml
@@ -15,5 +15,10 @@ tasks:
- install:
- ceph:
log-whitelist:
- objects unfound and apparently lost
- overall HEALTH_
- (OSDMAP_FLAGS)
- (OSD_
- (PG_
- (OBJECT_
- ec_lost_unfound:
7 changes: 6 additions & 1 deletion qa/suites/rados/singleton/all/lost-unfound-delete.yaml
@@ -14,5 +14,10 @@ tasks:
- install:
- ceph:
log-whitelist:
- objects unfound and apparently lost
- overall HEALTH_
- (OSDMAP_FLAGS)
- (OSD_
- (PG_
- (OBJECT_
- rep_lost_unfound_delete:
7 changes: 6 additions & 1 deletion qa/suites/rados/singleton/all/lost-unfound.yaml
@@ -14,5 +14,10 @@ tasks:
- install:
- ceph:
log-whitelist:
- objects unfound and apparently lost
- overall HEALTH_
- (OSDMAP_FLAGS)
- (OSD_
- (PG_
- (OBJECT_
- lost_unfound:
