diff --git a/PendingReleaseNotes b/PendingReleaseNotes
index b61fb70c2acc6..572e45f438b2a 100644
--- a/PendingReleaseNotes
+++ b/PendingReleaseNotes
@@ -194,3 +194,12 @@
 If you deployed Luminous dev releases or 12.1.0 rc release and made use of
 the CRUSH choose_args feature, you need to remove all choose_args mappings
 from your CRUSH map before starting the upgrade.
+
+* The 'ceph health' structured output (JSON or XML) no longer contains
+  a 'timechecks' section describing the time sync status. This
+  information is now available via the 'ceph time-sync-status'
+  command.
+
+* Certain extra fields in the 'ceph health' structured output that
+  used to appear if the mons were low on disk space (which duplicated
+  the information in the normal health warning messages) are now gone.
diff --git a/qa/cephfs/overrides/whitelist_wrongly_marked_down.yaml b/qa/cephfs/overrides/whitelist_wrongly_marked_down.yaml
index 4f2d6df18864d..155ca72452963 100644
--- a/qa/cephfs/overrides/whitelist_wrongly_marked_down.yaml
+++ b/qa/cephfs/overrides/whitelist_wrongly_marked_down.yaml
@@ -1,7 +1,12 @@
 overrides:
   ceph:
     log-whitelist:
+      - overall HEALTH_
+      - (OSD_DOWN)
+      - (OSD_
       - wrongly marked me down
+# MDS daemon 'b' is not responding, replacing it as rank 0 with standby 'a'
+      - is not responding
     conf:
       mds:
         debug mds: 20
diff --git a/qa/suites/fs/basic_functional/overrides/whitelist_health.yaml b/qa/suites/fs/basic_functional/overrides/whitelist_health.yaml
new file mode 100644
index 0000000000000..b5bf1fa7b5eff
--- /dev/null
+++ b/qa/suites/fs/basic_functional/overrides/whitelist_health.yaml
@@ -0,0 +1,9 @@
+overrides:
+  ceph:
+    log-whitelist:
+      - overall HEALTH_
+      - (FS_DEGRADED)
+      - (MDS_FAILED)
+      - (MDS_DEGRADED)
+      - (FS_WITH_FAILED_MDS)
+      - (MDS_DAMAGE)
diff --git a/qa/suites/fs/basic_functional/tasks/cephfs_scrub_tests.yaml b/qa/suites/fs/basic_functional/tasks/cephfs_scrub_tests.yaml
index 3b2714689f0df..30b3a96e20760 100644
--- a/qa/suites/fs/basic_functional/tasks/cephfs_scrub_tests.yaml
+++ b/qa/suites/fs/basic_functional/tasks/cephfs_scrub_tests.yaml
@@ -4,6 +4,8 @@ overrides:
       - Scrub error on inode
       - Behind on trimming
       - Metadata damage detected
+      - overall HEALTH_
+      - (MDS_TRIM)
     conf:
       mds:
         mds log max segments: 1
diff --git a/qa/suites/fs/thrash/overrides/whitelist_health.yaml b/qa/suites/fs/thrash/overrides/whitelist_health.yaml
new file mode 100644
index 0000000000000..fc8119daca809
--- /dev/null
+++ b/qa/suites/fs/thrash/overrides/whitelist_health.yaml
@@ -0,0 +1,8 @@
+overrides:
+  ceph:
+    log-whitelist:
+      - overall HEALTH_
+      - (FS_DEGRADED)
+      - (MDS_FAILED)
+      - (MDS_DEGRADED)
+      - (FS_WITH_FAILED_MDS)
diff --git a/qa/suites/rados/basic-luminous/scrub_test.yaml b/qa/suites/rados/basic-luminous/scrub_test.yaml
index 07f039aae2915..d87f5bfdd35b4 100644
--- a/qa/suites/rados/basic-luminous/scrub_test.yaml
+++ b/qa/suites/rados/basic-luminous/scrub_test.yaml
@@ -15,6 +15,12 @@ overrides:
       - 'attr name mistmatch'
       - 'deep-scrub 1 missing, 0 inconsistent objects'
       - 'failed to pick suitable auth object'
+      - overall HEALTH_
+      - (OSDMAP_FLAGS)
+      - (OSD_
+      - (PG_
+      - (OSD_SCRUB_ERRORS)
+      - (TOO_FEW_PGS)
     conf:
       osd:
         osd deep scrub update digest min age: 0
diff --git a/qa/suites/rados/basic/tasks/rados_api_tests.yaml b/qa/suites/rados/basic/tasks/rados_api_tests.yaml
index b66423988d7cf..1d77207d2b43b 100644
--- a/qa/suites/rados/basic/tasks/rados_api_tests.yaml
+++ b/qa/suites/rados/basic/tasks/rados_api_tests.yaml
@@ -3,6 +3,11 @@ overrides:
     log-whitelist:
       - reached quota
       - wrongly marked me down
+      - overall HEALTH_
+      - (POOL_FULL)
+      - (SMALLER_PGP_NUM)
+      - (CACHE_POOL_NO_HIT_SET)
+      - (CACHE_POOL_NEAR_FULL)
 tasks:
 - workunit:
     clients:
diff --git a/qa/suites/rados/basic/tasks/rados_python.yaml b/qa/suites/rados/basic/tasks/rados_python.yaml
index d8b332b343dc6..aa22ccd16e1be 100644
--- a/qa/suites/rados/basic/tasks/rados_python.yaml
+++ b/qa/suites/rados/basic/tasks/rados_python.yaml
@@ -2,6 +2,11 @@ overrides:
   ceph:
     log-whitelist:
       - wrongly marked me down
+      - overall HEALTH_
+      - (OSDMAP_FLAGS)
+      - (PG_
+      - (OSD_
+      - (OBJECT_
 tasks:
 - workunit:
     clients:
diff --git a/qa/suites/rados/basic/tasks/rados_stress_watch.yaml b/qa/suites/rados/basic/tasks/rados_stress_watch.yaml
index 0e1ba010c5b4e..ded794c17412f 100644
--- a/qa/suites/rados/basic/tasks/rados_stress_watch.yaml
+++ b/qa/suites/rados/basic/tasks/rados_stress_watch.yaml
@@ -1,3 +1,9 @@
+overrides:
+  ceph:
+    log-whitelist:
+      - overall HEALTH_
+      - (CACHE_POOL_NO_HIT_SET)
+      - (TOO_FEW_PGS)
 tasks:
 - workunit:
     clients:
diff --git a/qa/suites/rados/basic/tasks/repair_test.yaml b/qa/suites/rados/basic/tasks/repair_test.yaml
index f69866994417e..8401c1a303f0c 100644
--- a/qa/suites/rados/basic/tasks/repair_test.yaml
+++ b/qa/suites/rados/basic/tasks/repair_test.yaml
@@ -17,6 +17,10 @@ overrides:
       - 'size 1 != size'
       - attr name mismatch
      - Regular scrub request, losing deep-scrub details
+      - overall HEALTH_
+      - (OSDMAP_FLAGS)
+      - (OSD_
+      - (PG_
     conf:
       osd:
         filestore debug inject read err: true
diff --git a/qa/suites/rados/mgr/tasks/failover.yaml b/qa/suites/rados/mgr/tasks/failover.yaml
index e02b8bf2cb078..fd5eb8515c905 100644
--- a/qa/suites/rados/mgr/tasks/failover.yaml
+++ b/qa/suites/rados/mgr/tasks/failover.yaml
@@ -4,7 +4,11 @@ tasks:
 - ceph:
     # tests may leave mgrs broken, so don't try and call into them
     # to invoke e.g. pg dump during teardown.
- wait-for-scrub: false + wait-for-scrub: false + log-whitelist: + - overall HEALTH_ + - (MGR_DOWN) + - (PG_ - cephfs_test_runner: modules: - tasks.mgr.test_failover diff --git a/qa/suites/rados/monthrash/thrashers/force-sync-many.yaml b/qa/suites/rados/monthrash/thrashers/force-sync-many.yaml index 2867f2db5ec7c..38570fcf615ef 100644 --- a/qa/suites/rados/monthrash/thrashers/force-sync-many.yaml +++ b/qa/suites/rados/monthrash/thrashers/force-sync-many.yaml @@ -1,3 +1,9 @@ +overrides: + ceph: + log-whitelist: + - overall HEALTH_ + - (MON_DOWN) + - (TOO_FEW_PGS) tasks: - mon_thrash: revive_delay: 90 diff --git a/qa/suites/rados/monthrash/thrashers/many.yaml b/qa/suites/rados/monthrash/thrashers/many.yaml index fe52bb2bbeb5e..e940c42ad7435 100644 --- a/qa/suites/rados/monthrash/thrashers/many.yaml +++ b/qa/suites/rados/monthrash/thrashers/many.yaml @@ -1,5 +1,8 @@ overrides: ceph: + log-whitelist: + - overall HEALTH_ + - (MON_DOWN) conf: osd: mon client ping interval: 4 diff --git a/qa/suites/rados/monthrash/thrashers/one.yaml b/qa/suites/rados/monthrash/thrashers/one.yaml index 2ce44c8601fa5..92c9eb3a808ff 100644 --- a/qa/suites/rados/monthrash/thrashers/one.yaml +++ b/qa/suites/rados/monthrash/thrashers/one.yaml @@ -1,3 +1,8 @@ +overrides: + ceph: + log-whitelist: + - overall HEALTH_ + - (MON_DOWN) tasks: - mon_thrash: revive_delay: 20 diff --git a/qa/suites/rados/monthrash/thrashers/sync-many.yaml b/qa/suites/rados/monthrash/thrashers/sync-many.yaml index 9868f18159f64..68020cd665143 100644 --- a/qa/suites/rados/monthrash/thrashers/sync-many.yaml +++ b/qa/suites/rados/monthrash/thrashers/sync-many.yaml @@ -1,5 +1,8 @@ overrides: ceph: + log-whitelist: + - overall HEALTH_ + - (MON_DOWN) conf: mon: paxos min: 10 diff --git a/qa/suites/rados/monthrash/thrashers/sync.yaml b/qa/suites/rados/monthrash/thrashers/sync.yaml index 1e7054c271d86..b07f8b511f65e 100644 --- a/qa/suites/rados/monthrash/thrashers/sync.yaml +++ b/qa/suites/rados/monthrash/thrashers/sync.yaml @@ -1,5 +1,8 @@ overrides: ceph: + log-whitelist: + - overall HEALTH_ + - (MON_DOWN) conf: mon: paxos min: 10 diff --git a/qa/suites/rados/monthrash/workloads/rados_api_tests.yaml b/qa/suites/rados/monthrash/workloads/rados_api_tests.yaml index b536557fdbac2..0834f9c34c183 100644 --- a/qa/suites/rados/monthrash/workloads/rados_api_tests.yaml +++ b/qa/suites/rados/monthrash/workloads/rados_api_tests.yaml @@ -2,6 +2,12 @@ overrides: ceph: log-whitelist: - reached quota + - overall HEALTH_ + - (CACHE_POOL_NO_HIT_SET) + - (POOL_FULL) + - (REQUEST_SLOW) + - (MON_DOWN) + - (PG_ conf: global: debug objecter: 20 diff --git a/qa/suites/rados/monthrash/workloads/rados_mon_workunits.yaml b/qa/suites/rados/monthrash/workloads/rados_mon_workunits.yaml index 31465cffe7127..86818b58dff70 100644 --- a/qa/suites/rados/monthrash/workloads/rados_mon_workunits.yaml +++ b/qa/suites/rados/monthrash/workloads/rados_mon_workunits.yaml @@ -2,6 +2,9 @@ overrides: ceph: log-whitelist: - wrongly marked me down + - overall HEALTH_ + - (PG_ + - (MON_DOWN) tasks: - workunit: clients: diff --git a/qa/suites/rados/multimon/tasks/mon_clock_no_skews.yaml b/qa/suites/rados/multimon/tasks/mon_clock_no_skews.yaml index e86bdde1d7d24..ec761e2955ee2 100644 --- a/qa/suites/rados/multimon/tasks/mon_clock_no_skews.yaml +++ b/qa/suites/rados/multimon/tasks/mon_clock_no_skews.yaml @@ -5,5 +5,7 @@ tasks: - slow request - .*clock.*skew.* - clocks not synchronized + - overall HEALTH_ + - (MON_CLOCK_SKEW) - mon_clock_skew_check: expect-skew: false diff --git 
a/qa/suites/rados/multimon/tasks/mon_clock_with_skews.yaml b/qa/suites/rados/multimon/tasks/mon_clock_with_skews.yaml index 1c6c1538b800e..2bba607152ea2 100644 --- a/qa/suites/rados/multimon/tasks/mon_clock_with_skews.yaml +++ b/qa/suites/rados/multimon/tasks/mon_clock_with_skews.yaml @@ -9,5 +9,7 @@ tasks: - slow request - .*clock.*skew.* - clocks not synchronized + - overall HEALTH_ + - (MON_CLOCK_SKEW) - mon_clock_skew_check: expect-skew: true diff --git a/qa/suites/rados/multimon/tasks/mon_recovery.yaml b/qa/suites/rados/multimon/tasks/mon_recovery.yaml index 94721ea53a495..4234bf73e6816 100644 --- a/qa/suites/rados/multimon/tasks/mon_recovery.yaml +++ b/qa/suites/rados/multimon/tasks/mon_recovery.yaml @@ -1,4 +1,7 @@ tasks: - install: - ceph: + log-whitelist: + - overall HEALTH_ + - (MON_DOWN) - mon_recovery: diff --git a/qa/suites/rados/objectstore/ceph_objectstore_tool.yaml b/qa/suites/rados/objectstore/ceph_objectstore_tool.yaml index 215d0f08f9b6b..2001faa3fe81b 100644 --- a/qa/suites/rados/objectstore/ceph_objectstore_tool.yaml +++ b/qa/suites/rados/objectstore/ceph_objectstore_tool.yaml @@ -12,5 +12,11 @@ tasks: global: osd max object name len: 460 osd max object namespace len: 64 + log-whitelist: + - overall HEALTH_ + - (OSDMAP_FLAGS) + - (OSD_ + - (PG_ + - (TOO_FEW_PGS) - ceph_objectstore_tool: objects: 20 diff --git a/qa/suites/rados/rest/mgr-restful.yaml b/qa/suites/rados/rest/mgr-restful.yaml index 571857c251107..5dd16fda288cf 100644 --- a/qa/suites/rados/rest/mgr-restful.yaml +++ b/qa/suites/rados/rest/mgr-restful.yaml @@ -3,6 +3,9 @@ roles: tasks: - install: - ceph: + log-whitelist: + - overall HEALTH_ + - (MGR_DOWN) - exec: mon.a: - ceph config-key put mgr/restful/x/server_addr 127.0.0.1 diff --git a/qa/suites/rados/singleton-bluestore/all/cephtool.yaml b/qa/suites/rados/singleton-bluestore/all/cephtool.yaml index 880628f4fd2b7..7e1a1f7b389a4 100644 --- a/qa/suites/rados/singleton-bluestore/all/cephtool.yaml +++ b/qa/suites/rados/singleton-bluestore/all/cephtool.yaml @@ -21,6 +21,11 @@ tasks: - must scrub before tier agent can activate - failsafe engaged, dropping updates - failsafe disengaged, no longer dropping updates + - overall HEALTH_ + - (OSDMAP_FLAGS) + - (OSD_ + - (PG_ + - (SMALLER_PG_NUM) - workunit: clients: all: diff --git a/qa/suites/rados/singleton-nomsgr/all/admin_socket_output.yaml b/qa/suites/rados/singleton-nomsgr/all/admin_socket_output.yaml index 969c40902fb91..3aaca87594032 100644 --- a/qa/suites/rados/singleton-nomsgr/all/admin_socket_output.yaml +++ b/qa/suites/rados/singleton-nomsgr/all/admin_socket_output.yaml @@ -5,6 +5,10 @@ overrides: log-whitelist: - MDS in read-only mode - force file system read-only + - overall HEALTH_ + - (OSDMAP_FLAGS) + - (OSD_FULL) + - (MDS_READ_ONLY) tasks: - install: - ceph: diff --git a/qa/suites/rados/singleton-nomsgr/all/cache-fs-trunc.yaml b/qa/suites/rados/singleton-nomsgr/all/cache-fs-trunc.yaml index 5009ee617035c..ac64165aaaa6a 100644 --- a/qa/suites/rados/singleton-nomsgr/all/cache-fs-trunc.yaml +++ b/qa/suites/rados/singleton-nomsgr/all/cache-fs-trunc.yaml @@ -3,6 +3,9 @@ roles: tasks: - install: - ceph: + log-whitelist: + - overall HEALTH_ + - (CACHE_POOL_NO_HIT_SET) conf: global: osd max object name len: 460 diff --git a/qa/suites/rados/singleton-nomsgr/all/export-after-evict.yaml b/qa/suites/rados/singleton-nomsgr/all/export-after-evict.yaml index e0badd4d3afad..1b777ab0f00b7 100644 --- a/qa/suites/rados/singleton-nomsgr/all/export-after-evict.yaml +++ 
b/qa/suites/rados/singleton-nomsgr/all/export-after-evict.yaml @@ -8,6 +8,9 @@ roles: tasks: - install: - ceph: + log-whitelist: + - overall HEALTH_ + - (CACHE_POOL_NO_HIT_SET) conf: global: osd max object name len: 460 diff --git a/qa/suites/rados/singleton-nomsgr/all/full-tiering.yaml b/qa/suites/rados/singleton-nomsgr/all/full-tiering.yaml index 9dc1fe7dcc9bf..5eb42f4dd6390 100644 --- a/qa/suites/rados/singleton-nomsgr/all/full-tiering.yaml +++ b/qa/suites/rados/singleton-nomsgr/all/full-tiering.yaml @@ -5,6 +5,10 @@ overrides: ceph: log-whitelist: - is full + - overall HEALTH_ + - (POOL_FULL) + - (POOL_NEAR_FULL) + - (CACHE_POOL_NO_HIT_SET) tasks: - install: - ceph: diff --git a/qa/suites/rados/singleton-nomsgr/all/health-warnings.yaml b/qa/suites/rados/singleton-nomsgr/all/health-warnings.yaml index 4c8228b0cd948..749bd8d39c3be 100644 --- a/qa/suites/rados/singleton-nomsgr/all/health-warnings.yaml +++ b/qa/suites/rados/singleton-nomsgr/all/health-warnings.yaml @@ -10,6 +10,10 @@ tasks: osd max object namespace len: 64 log-whitelist: - wrongly marked me down + - overall HEALTH_ + - (OSDMAP_FLAGS) + - (OSD_ + - (PG_ - workunit: clients: all: diff --git a/qa/suites/rados/singleton-nomsgr/all/multi-backfill-reject.yaml b/qa/suites/rados/singleton-nomsgr/all/multi-backfill-reject.yaml index b73899738e674..cadf3044a1d12 100644 --- a/qa/suites/rados/singleton-nomsgr/all/multi-backfill-reject.yaml +++ b/qa/suites/rados/singleton-nomsgr/all/multi-backfill-reject.yaml @@ -11,6 +11,11 @@ roles: tasks: - install: - ceph: + log-whitelist: + - overall HEALTH_ + - (PG_ + - (OSD_ + - (OBJECT_ conf: osd: osd debug reject backfill probability: .3 diff --git a/qa/suites/rados/singleton-nomsgr/all/valgrind-leaks.yaml b/qa/suites/rados/singleton-nomsgr/all/valgrind-leaks.yaml index e5d5702a32b65..65af1a2e817c5 100644 --- a/qa/suites/rados/singleton-nomsgr/all/valgrind-leaks.yaml +++ b/qa/suites/rados/singleton-nomsgr/all/valgrind-leaks.yaml @@ -7,6 +7,9 @@ overrides: flavor: notcmalloc debuginfo: true ceph: + log-whitelist: + - overall HEALTH_ + - (PG_ conf: global: osd heartbeat grace: 40 diff --git a/qa/suites/rados/singleton/all/divergent_priors.yaml b/qa/suites/rados/singleton/all/divergent_priors.yaml index bb7c2b57f50f8..f15fb88961587 100644 --- a/qa/suites/rados/singleton/all/divergent_priors.yaml +++ b/qa/suites/rados/singleton/all/divergent_priors.yaml @@ -12,6 +12,12 @@ openstack: overrides: ceph: + log-whitelist: + - overall HEALTH_ + - (OSDMAP_FLAGS) + - (OSD_ + - (PG_ + - (OBJECT_DEGRADED) conf: osd: debug osd: 5 diff --git a/qa/suites/rados/singleton/all/divergent_priors2.yaml b/qa/suites/rados/singleton/all/divergent_priors2.yaml index ab749f1b516ec..90d8b1838b986 100644 --- a/qa/suites/rados/singleton/all/divergent_priors2.yaml +++ b/qa/suites/rados/singleton/all/divergent_priors2.yaml @@ -12,6 +12,12 @@ openstack: overrides: ceph: + log-whitelist: + - overall HEALTH_ + - (OSDMAP_FLAGS) + - (OSD_ + - (PG_ + - (OBJECT_DEGRADED) conf: osd: debug osd: 5 diff --git a/qa/suites/rados/singleton/all/dump-stuck.yaml b/qa/suites/rados/singleton/all/dump-stuck.yaml index 7d3b443021bbb..f3900e121fe72 100644 --- a/qa/suites/rados/singleton/all/dump-stuck.yaml +++ b/qa/suites/rados/singleton/all/dump-stuck.yaml @@ -11,5 +11,9 @@ tasks: - install: - ceph: log-whitelist: - - wrongly marked me down + - wrongly marked me down + - overall HEALTH_ + - (OSDMAP_FLAGS) + - (OSD_ + - (PG_ - dump_stuck: diff --git a/qa/suites/rados/singleton/all/ec-lost-unfound.yaml 
b/qa/suites/rados/singleton/all/ec-lost-unfound.yaml index 6ceefe1222e35..e095fd0d58690 100644 --- a/qa/suites/rados/singleton/all/ec-lost-unfound.yaml +++ b/qa/suites/rados/singleton/all/ec-lost-unfound.yaml @@ -15,5 +15,10 @@ tasks: - install: - ceph: log-whitelist: - - objects unfound and apparently lost + - objects unfound and apparently lost + - overall HEALTH_ + - (OSDMAP_FLAGS) + - (OSD_ + - (PG_ + - (OBJECT_ - ec_lost_unfound: diff --git a/qa/suites/rados/singleton/all/lost-unfound-delete.yaml b/qa/suites/rados/singleton/all/lost-unfound-delete.yaml index 15f4710bd9bf4..5502b5c9b0ffc 100644 --- a/qa/suites/rados/singleton/all/lost-unfound-delete.yaml +++ b/qa/suites/rados/singleton/all/lost-unfound-delete.yaml @@ -14,5 +14,10 @@ tasks: - install: - ceph: log-whitelist: - - objects unfound and apparently lost + - objects unfound and apparently lost + - overall HEALTH_ + - (OSDMAP_FLAGS) + - (OSD_ + - (PG_ + - (OBJECT_ - rep_lost_unfound_delete: diff --git a/qa/suites/rados/singleton/all/lost-unfound.yaml b/qa/suites/rados/singleton/all/lost-unfound.yaml index 3f22ba3c0136a..bb0bb2c0afe49 100644 --- a/qa/suites/rados/singleton/all/lost-unfound.yaml +++ b/qa/suites/rados/singleton/all/lost-unfound.yaml @@ -14,5 +14,10 @@ tasks: - install: - ceph: log-whitelist: - - objects unfound and apparently lost + - objects unfound and apparently lost + - overall HEALTH_ + - (OSDMAP_FLAGS) + - (OSD_ + - (PG_ + - (OBJECT_ - lost_unfound: diff --git a/qa/suites/rados/singleton/all/mon-thrasher.yaml b/qa/suites/rados/singleton/all/mon-thrasher.yaml index 1b4622998f9ba..66a1e905f3b83 100644 --- a/qa/suites/rados/singleton/all/mon-thrasher.yaml +++ b/qa/suites/rados/singleton/all/mon-thrasher.yaml @@ -13,6 +13,10 @@ openstack: tasks: - install: - ceph: + log-whitelist: + - overall HEALTH_ + - (MON_DOWN) + - (PG_ - mon_thrash: revive_delay: 20 thrash_delay: 1 diff --git a/qa/suites/rados/singleton/all/osd-backfill.yaml b/qa/suites/rados/singleton/all/osd-backfill.yaml index f84a0df8d6600..84e2273d3f883 100644 --- a/qa/suites/rados/singleton/all/osd-backfill.yaml +++ b/qa/suites/rados/singleton/all/osd-backfill.yaml @@ -14,7 +14,12 @@ tasks: - install: - ceph: log-whitelist: - - wrongly marked me down + - wrongly marked me down + - overall HEALTH_ + - (OSDMAP_FLAGS) + - (OSD_ + - (PG_ + - (OBJECT_ conf: osd: osd min pg log entries: 5 diff --git a/qa/suites/rados/singleton/all/osd-recovery-incomplete.yaml b/qa/suites/rados/singleton/all/osd-recovery-incomplete.yaml index 773cb2480a6fe..60789d5ca68c6 100644 --- a/qa/suites/rados/singleton/all/osd-recovery-incomplete.yaml +++ b/qa/suites/rados/singleton/all/osd-recovery-incomplete.yaml @@ -15,7 +15,12 @@ tasks: - install: - ceph: log-whitelist: - - wrongly marked me down + - wrongly marked me down + - overall HEALTH_ + - (OSDMAP_FLAGS) + - (OSD_ + - (PG_ + - (OBJECT_ conf: osd: osd min pg log entries: 5 diff --git a/qa/suites/rados/singleton/all/osd-recovery.yaml b/qa/suites/rados/singleton/all/osd-recovery.yaml index 214d7f20cc74c..d6e5e957f24c6 100644 --- a/qa/suites/rados/singleton/all/osd-recovery.yaml +++ b/qa/suites/rados/singleton/all/osd-recovery.yaml @@ -14,7 +14,12 @@ tasks: - install: - ceph: log-whitelist: - - wrongly marked me down + - wrongly marked me down + - overall HEALTH_ + - (OSDMAP_FLAGS) + - (OSD_ + - (PG_ + - (OBJECT_DEGRADED) conf: osd: osd min pg log entries: 5 diff --git a/qa/suites/rados/singleton/all/peer.yaml b/qa/suites/rados/singleton/all/peer.yaml index 6e22b44563e9d..e87cd543ce6c9 100644 --- 
a/qa/suites/rados/singleton/all/peer.yaml +++ b/qa/suites/rados/singleton/all/peer.yaml @@ -17,5 +17,9 @@ tasks: global: osd pool default min size : 1 log-whitelist: - - objects unfound and apparently lost + - objects unfound and apparently lost + - overall HEALTH_ + - (OSDMAP_FLAGS) + - (OSD_ + - (PG_ - peer: diff --git a/qa/suites/rados/singleton/all/pg-removal-interruption.yaml b/qa/suites/rados/singleton/all/pg-removal-interruption.yaml index f7e61c962633f..856b08dd43735 100644 --- a/qa/suites/rados/singleton/all/pg-removal-interruption.yaml +++ b/qa/suites/rados/singleton/all/pg-removal-interruption.yaml @@ -13,8 +13,12 @@ tasks: - install: - ceph: log-whitelist: - - wrongly marked me down - - slow request + - wrongly marked me down + - slow request + - overall HEALTH_ + - (OSDMAP_FLAGS) + - (OSD_ + - (PG_ - exec: client.0: - sudo ceph osd pool create foo 128 128 diff --git a/qa/suites/rados/singleton/all/radostool.yaml b/qa/suites/rados/singleton/all/radostool.yaml index 8bc9dbdcd0f06..700b3a33a3cfb 100644 --- a/qa/suites/rados/singleton/all/radostool.yaml +++ b/qa/suites/rados/singleton/all/radostool.yaml @@ -17,6 +17,8 @@ tasks: - had wrong client addr - had wrong cluster addr - reached quota + - overall HEALTH_ + - (POOL_FULL) - workunit: clients: all: diff --git a/qa/suites/rados/singleton/all/rebuild-mondb.yaml b/qa/suites/rados/singleton/all/rebuild-mondb.yaml index c3be13ae6a2cf..6847cef8da29a 100644 --- a/qa/suites/rados/singleton/all/rebuild-mondb.yaml +++ b/qa/suites/rados/singleton/all/rebuild-mondb.yaml @@ -15,7 +15,12 @@ tasks: - install: - ceph: log-whitelist: - - no reply from + - no reply from + - overall HEALTH_ + - (MON_DOWN) + - (OSDMAP_FLAGS) + - (OSD_ + - (PG_ - full_sequential: - radosbench: clients: [client.0] diff --git a/qa/suites/rados/singleton/all/resolve_stuck_peering.yaml b/qa/suites/rados/singleton/all/resolve_stuck_peering.yaml index c64593212a5f2..97da137909608 100644 --- a/qa/suites/rados/singleton/all/resolve_stuck_peering.yaml +++ b/qa/suites/rados/singleton/all/resolve_stuck_peering.yaml @@ -6,5 +6,11 @@ tasks: - install: - ceph: fs: xfs + log-whitelist: + - overall HEALTH_ + - (OSDMAP_FLAGS) + - (OSD_ + - (PG_ + - (OBJECT_DEGRADED) - resolve_stuck_peering: diff --git a/qa/suites/rados/singleton/all/rest-api.yaml b/qa/suites/rados/singleton/all/rest-api.yaml index cbd90e4097da0..77c881b0ee502 100644 --- a/qa/suites/rados/singleton/all/rest-api.yaml +++ b/qa/suites/rados/singleton/all/rest-api.yaml @@ -16,8 +16,13 @@ tasks: - install: - ceph: log-whitelist: - - wrongly marked me down - - had wrong client addr + - wrongly marked me down + - had wrong client addr + - overall HEALTH_ + - (OSDMAP_FLAGS) + - (OSD_ + - (PG_ + - (OBJECT_DEGRADED) conf: client.rest0: debug ms: 1 diff --git a/qa/suites/rados/singleton/all/thrash-eio.yaml b/qa/suites/rados/singleton/all/thrash-eio.yaml index a70636549cd53..6ff629033cd07 100644 --- a/qa/suites/rados/singleton/all/thrash-eio.yaml +++ b/qa/suites/rados/singleton/all/thrash-eio.yaml @@ -24,6 +24,11 @@ tasks: - wrongly marked me down - missing primary copy of - objects unfound and apparently lost + - overall HEALTH_ + - (OSDMAP_FLAGS) + - (REQUEST_SLOW) + - (PG_ + - (OSD_ - thrashosds: op_delay: 30 clean_interval: 120 diff --git a/qa/suites/rados/singleton/all/thrash-rados/+ b/qa/suites/rados/singleton/all/thrash-rados/+ new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/qa/suites/rados/singleton/all/thrash-rados.yaml b/qa/suites/rados/singleton/all/thrash-rados/thrash-rados.yaml similarity 
index 92% rename from qa/suites/rados/singleton/all/thrash-rados.yaml rename to qa/suites/rados/singleton/all/thrash-rados/thrash-rados.yaml index 49e3e8799c143..f61897eaabae9 100644 --- a/qa/suites/rados/singleton/all/thrash-rados.yaml +++ b/qa/suites/rados/singleton/all/thrash-rados/thrash-rados.yaml @@ -16,7 +16,7 @@ tasks: - install: - ceph: log-whitelist: - - wrongly marked me down + - wrongly marked me down - thrashosds: op_delay: 30 clean_interval: 120 diff --git a/qa/suites/rados/singleton/all/thrash-rados/thrashosds-health.yaml b/qa/suites/rados/singleton/all/thrash-rados/thrashosds-health.yaml new file mode 120000 index 0000000000000..0b1d7b060a925 --- /dev/null +++ b/qa/suites/rados/singleton/all/thrash-rados/thrashosds-health.yaml @@ -0,0 +1 @@ +../../../../../tasks/thrashosds-health.yaml \ No newline at end of file diff --git a/qa/suites/rados/singleton/all/thrash_cache_writeback_proxy_none.yaml b/qa/suites/rados/singleton/all/thrash_cache_writeback_proxy_none.yaml index 1875da409a25e..02fee3e88ea7d 100644 --- a/qa/suites/rados/singleton/all/thrash_cache_writeback_proxy_none.yaml +++ b/qa/suites/rados/singleton/all/thrash_cache_writeback_proxy_none.yaml @@ -16,8 +16,10 @@ tasks: - install: - ceph: log-whitelist: - - wrongly marked me down - - slow request + - wrongly marked me down + - slow request + - overall HEALTH_ + - (CACHE_POOL_ - exec: client.0: - sudo ceph osd pool create base 4 diff --git a/qa/suites/rados/singleton/all/watch-notify-same-primary.yaml b/qa/suites/rados/singleton/all/watch-notify-same-primary.yaml index ad1fd17d5f589..3efdb955fe62c 100644 --- a/qa/suites/rados/singleton/all/watch-notify-same-primary.yaml +++ b/qa/suites/rados/singleton/all/watch-notify-same-primary.yaml @@ -22,6 +22,11 @@ tasks: debug objecter: 20 debug rados: 20 log-whitelist: - - objects unfound and apparently lost + - objects unfound and apparently lost + - overall HEALTH_ + - (OSDMAP_FLAGS) + - (OSD_ + - (PG_ + - (OBJECT_DEGRADED) - watch_notify_same_primary: clients: [client.0] diff --git a/qa/suites/rados/thrash-erasure-code-big/thrashosds-health.yaml b/qa/suites/rados/thrash-erasure-code-big/thrashosds-health.yaml new file mode 120000 index 0000000000000..ebf7f34f39bcf --- /dev/null +++ b/qa/suites/rados/thrash-erasure-code-big/thrashosds-health.yaml @@ -0,0 +1 @@ +../../../tasks/thrashosds-health.yaml \ No newline at end of file diff --git a/qa/suites/rados/thrash-erasure-code-isa/thrashosds-health.yaml b/qa/suites/rados/thrash-erasure-code-isa/thrashosds-health.yaml new file mode 120000 index 0000000000000..ebf7f34f39bcf --- /dev/null +++ b/qa/suites/rados/thrash-erasure-code-isa/thrashosds-health.yaml @@ -0,0 +1 @@ +../../../tasks/thrashosds-health.yaml \ No newline at end of file diff --git a/qa/suites/rados/thrash-erasure-code-overwrites/thrashosds-health.yaml b/qa/suites/rados/thrash-erasure-code-overwrites/thrashosds-health.yaml new file mode 120000 index 0000000000000..ebf7f34f39bcf --- /dev/null +++ b/qa/suites/rados/thrash-erasure-code-overwrites/thrashosds-health.yaml @@ -0,0 +1 @@ +../../../tasks/thrashosds-health.yaml \ No newline at end of file diff --git a/qa/suites/rados/thrash-erasure-code-shec/thrashosds-health.yaml b/qa/suites/rados/thrash-erasure-code-shec/thrashosds-health.yaml new file mode 120000 index 0000000000000..ebf7f34f39bcf --- /dev/null +++ b/qa/suites/rados/thrash-erasure-code-shec/thrashosds-health.yaml @@ -0,0 +1 @@ +../../../tasks/thrashosds-health.yaml \ No newline at end of file diff --git 
a/qa/suites/rados/thrash-erasure-code/thrashosds-health.yaml b/qa/suites/rados/thrash-erasure-code/thrashosds-health.yaml new file mode 120000 index 0000000000000..ebf7f34f39bcf --- /dev/null +++ b/qa/suites/rados/thrash-erasure-code/thrashosds-health.yaml @@ -0,0 +1 @@ +../../../tasks/thrashosds-health.yaml \ No newline at end of file diff --git a/qa/suites/rados/thrash-luminous/thrashosds-health.yaml b/qa/suites/rados/thrash-luminous/thrashosds-health.yaml new file mode 120000 index 0000000000000..ebf7f34f39bcf --- /dev/null +++ b/qa/suites/rados/thrash-luminous/thrashosds-health.yaml @@ -0,0 +1 @@ +../../../tasks/thrashosds-health.yaml \ No newline at end of file diff --git a/qa/suites/rados/thrash/thrashosds-health.yaml b/qa/suites/rados/thrash/thrashosds-health.yaml new file mode 120000 index 0000000000000..ebf7f34f39bcf --- /dev/null +++ b/qa/suites/rados/thrash/thrashosds-health.yaml @@ -0,0 +1 @@ +../../../tasks/thrashosds-health.yaml \ No newline at end of file diff --git a/qa/suites/rados/upgrade/jewel-x-singleton/thrashosds-health.yaml b/qa/suites/rados/upgrade/jewel-x-singleton/thrashosds-health.yaml new file mode 120000 index 0000000000000..e0426dbe49935 --- /dev/null +++ b/qa/suites/rados/upgrade/jewel-x-singleton/thrashosds-health.yaml @@ -0,0 +1 @@ +../../../../tasks/thrashosds-health.yaml \ No newline at end of file diff --git a/qa/suites/rados/verify/d-thrash/default/+ b/qa/suites/rados/verify/d-thrash/default/+ new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/qa/suites/rados/verify/d-thrash/default.yaml b/qa/suites/rados/verify/d-thrash/default/default.yaml similarity index 100% rename from qa/suites/rados/verify/d-thrash/default.yaml rename to qa/suites/rados/verify/d-thrash/default/default.yaml diff --git a/qa/suites/rados/verify/d-thrash/default/thrashosds-health.yaml b/qa/suites/rados/verify/d-thrash/default/thrashosds-health.yaml new file mode 120000 index 0000000000000..0b1d7b060a925 --- /dev/null +++ b/qa/suites/rados/verify/d-thrash/default/thrashosds-health.yaml @@ -0,0 +1 @@ +../../../../../tasks/thrashosds-health.yaml \ No newline at end of file diff --git a/qa/suites/rados/verify/tasks/mon_recovery.yaml b/qa/suites/rados/verify/tasks/mon_recovery.yaml index 6986303409ee7..412db863022be 100644 --- a/qa/suites/rados/verify/tasks/mon_recovery.yaml +++ b/qa/suites/rados/verify/tasks/mon_recovery.yaml @@ -1,2 +1,9 @@ +overrides: + ceph: + log-whitelist: + - overall HEALTH_ + - (MON_DOWN) + - (OSDMAP_FLAGS) + - (SMALLER_PGP_NUM) tasks: - mon_recovery: diff --git a/qa/suites/rados/verify/tasks/rados_api_tests.yaml b/qa/suites/rados/verify/tasks/rados_api_tests.yaml index 11e3858f6a7da..7c06248d25492 100644 --- a/qa/suites/rados/verify/tasks/rados_api_tests.yaml +++ b/qa/suites/rados/verify/tasks/rados_api_tests.yaml @@ -2,6 +2,12 @@ overrides: ceph: log-whitelist: - reached quota + - overall HEALTH_ + - (CACHE_POOL_NO_HIT_SET) + - (POOL_FULL) + - (SMALLER_PGP_NUM) + - (REQUEST_SLOW) + - (CACHE_POOL_NEAR_FULL) conf: client: debug ms: 1 diff --git a/qa/suites/rbd/basic/cachepool/small.yaml b/qa/suites/rbd/basic/cachepool/small.yaml index 8262be3304469..5c8f924abadcd 100644 --- a/qa/suites/rbd/basic/cachepool/small.yaml +++ b/qa/suites/rbd/basic/cachepool/small.yaml @@ -1,3 +1,8 @@ +overrides: + ceph: + log-whitelist: + - overall HEALTH_ + - (CACHE_POOL_NO_HIT_SET) tasks: - exec: client.0: diff --git a/qa/suites/rbd/basic/tasks/rbd_api_tests_old_format.yaml b/qa/suites/rbd/basic/tasks/rbd_api_tests_old_format.yaml index a98768540ba97..9d34002a15882 
100644 --- a/qa/suites/rbd/basic/tasks/rbd_api_tests_old_format.yaml +++ b/qa/suites/rbd/basic/tasks/rbd_api_tests_old_format.yaml @@ -1,3 +1,8 @@ +overrides: + ceph: + log-whitelist: + - overall HEALTH_ + - (CACHE_POOL_NO_HIT_SET) tasks: - workunit: clients: diff --git a/qa/suites/rbd/basic/tasks/rbd_python_api_tests_old_format.yaml b/qa/suites/rbd/basic/tasks/rbd_python_api_tests_old_format.yaml index 263b784e27dd2..f60a5ffa7e1b2 100644 --- a/qa/suites/rbd/basic/tasks/rbd_python_api_tests_old_format.yaml +++ b/qa/suites/rbd/basic/tasks/rbd_python_api_tests_old_format.yaml @@ -1,3 +1,7 @@ +overrides: + ceph: + log-whitelist: + - (REQUEST_SLOW) tasks: - workunit: clients: diff --git a/qa/suites/rbd/cli/pool/ec-data-pool.yaml b/qa/suites/rbd/cli/pool/ec-data-pool.yaml index 75dfc6a45534e..32dd2ab90792e 100644 --- a/qa/suites/rbd/cli/pool/ec-data-pool.yaml +++ b/qa/suites/rbd/cli/pool/ec-data-pool.yaml @@ -11,6 +11,9 @@ overrides: bdev_inject_crash_probability: .5 ceph: fs: xfs + log-whitelist: + - overall HEALTH_ + - (CACHE_POOL_NO_HIT_SET) conf: client: rbd default data pool: datapool diff --git a/qa/suites/rbd/cli/pool/small-cache-pool.yaml b/qa/suites/rbd/cli/pool/small-cache-pool.yaml index 8262be3304469..5c8f924abadcd 100644 --- a/qa/suites/rbd/cli/pool/small-cache-pool.yaml +++ b/qa/suites/rbd/cli/pool/small-cache-pool.yaml @@ -1,3 +1,8 @@ +overrides: + ceph: + log-whitelist: + - overall HEALTH_ + - (CACHE_POOL_NO_HIT_SET) tasks: - exec: client.0: diff --git a/qa/suites/rbd/librbd/pool/small-cache-pool.yaml b/qa/suites/rbd/librbd/pool/small-cache-pool.yaml index 8262be3304469..5c8f924abadcd 100644 --- a/qa/suites/rbd/librbd/pool/small-cache-pool.yaml +++ b/qa/suites/rbd/librbd/pool/small-cache-pool.yaml @@ -1,3 +1,8 @@ +overrides: + ceph: + log-whitelist: + - overall HEALTH_ + - (CACHE_POOL_NO_HIT_SET) tasks: - exec: client.0: diff --git a/qa/suites/rbd/librbd/workloads/c_api_tests.yaml b/qa/suites/rbd/librbd/workloads/c_api_tests.yaml index 188ddc56c6078..b70e8d52b80af 100644 --- a/qa/suites/rbd/librbd/workloads/c_api_tests.yaml +++ b/qa/suites/rbd/librbd/workloads/c_api_tests.yaml @@ -1,3 +1,8 @@ +overrides: + ceph: + log-whitelist: + - overall HEALTH_ + - (CACHE_POOL_NO_HIT_SET) tasks: - workunit: clients: diff --git a/qa/suites/rbd/librbd/workloads/c_api_tests_with_defaults.yaml b/qa/suites/rbd/librbd/workloads/c_api_tests_with_defaults.yaml index ee1de610a91c0..c2af3573dc5df 100644 --- a/qa/suites/rbd/librbd/workloads/c_api_tests_with_defaults.yaml +++ b/qa/suites/rbd/librbd/workloads/c_api_tests_with_defaults.yaml @@ -1,3 +1,8 @@ +overrides: + ceph: + log-whitelist: + - overall HEALTH_ + - (CACHE_POOL_NO_HIT_SET) tasks: - workunit: clients: diff --git a/qa/suites/rbd/librbd/workloads/c_api_tests_with_journaling.yaml b/qa/suites/rbd/librbd/workloads/c_api_tests_with_journaling.yaml index eda2b5e8a607b..f1121a4039658 100644 --- a/qa/suites/rbd/librbd/workloads/c_api_tests_with_journaling.yaml +++ b/qa/suites/rbd/librbd/workloads/c_api_tests_with_journaling.yaml @@ -1,3 +1,8 @@ +overrides: + ceph: + log-whitelist: + - overall HEALTH_ + - (CACHE_POOL_NO_HIT_SET) tasks: - workunit: clients: diff --git a/qa/suites/rbd/nbd/thrashosds-health.yaml b/qa/suites/rbd/nbd/thrashosds-health.yaml new file mode 120000 index 0000000000000..ebf7f34f39bcf --- /dev/null +++ b/qa/suites/rbd/nbd/thrashosds-health.yaml @@ -0,0 +1 @@ +../../../tasks/thrashosds-health.yaml \ No newline at end of file diff --git a/qa/suites/rbd/qemu/pool/ec-cache-pool.yaml b/qa/suites/rbd/qemu/pool/ec-cache-pool.yaml 
index 80379a1026b78..09e8bc3f24b6c 100644 --- a/qa/suites/rbd/qemu/pool/ec-cache-pool.yaml +++ b/qa/suites/rbd/qemu/pool/ec-cache-pool.yaml @@ -1,3 +1,8 @@ +overrides: + ceph: + log-whitelist: + - overall HEALTH_ + - (CACHE_POOL_NO_HIT_SET) tasks: - exec: client.0: diff --git a/qa/suites/rbd/qemu/pool/small-cache-pool.yaml b/qa/suites/rbd/qemu/pool/small-cache-pool.yaml index 8262be3304469..5c8f924abadcd 100644 --- a/qa/suites/rbd/qemu/pool/small-cache-pool.yaml +++ b/qa/suites/rbd/qemu/pool/small-cache-pool.yaml @@ -1,3 +1,8 @@ +overrides: + ceph: + log-whitelist: + - overall HEALTH_ + - (CACHE_POOL_NO_HIT_SET) tasks: - exec: client.0: diff --git a/qa/suites/rbd/singleton/all/rbd_mirror.yaml b/qa/suites/rbd/singleton/all/rbd_mirror.yaml index 21624164beba5..5006dd8017b39 100644 --- a/qa/suites/rbd/singleton/all/rbd_mirror.yaml +++ b/qa/suites/rbd/singleton/all/rbd_mirror.yaml @@ -4,6 +4,9 @@ tasks: - install: - ceph: fs: xfs + log-whitelist: + - overall HEALTH_ + - (CACHE_POOL_NO_HIT_SET) - workunit: clients: all: [rbd/test_rbd_mirror.sh] diff --git a/qa/suites/rbd/thrash/thrashosds-health.yaml b/qa/suites/rbd/thrash/thrashosds-health.yaml new file mode 120000 index 0000000000000..ebf7f34f39bcf --- /dev/null +++ b/qa/suites/rbd/thrash/thrashosds-health.yaml @@ -0,0 +1 @@ +../../../tasks/thrashosds-health.yaml \ No newline at end of file diff --git a/qa/suites/rbd/thrash/workloads/rbd_api_tests.yaml b/qa/suites/rbd/thrash/workloads/rbd_api_tests.yaml index ee1de610a91c0..c2af3573dc5df 100644 --- a/qa/suites/rbd/thrash/workloads/rbd_api_tests.yaml +++ b/qa/suites/rbd/thrash/workloads/rbd_api_tests.yaml @@ -1,3 +1,8 @@ +overrides: + ceph: + log-whitelist: + - overall HEALTH_ + - (CACHE_POOL_NO_HIT_SET) tasks: - workunit: clients: diff --git a/qa/suites/rbd/thrash/workloads/rbd_api_tests_copy_on_read.yaml b/qa/suites/rbd/thrash/workloads/rbd_api_tests_copy_on_read.yaml index cfa0a25a70d0f..7f64ef3f13618 100644 --- a/qa/suites/rbd/thrash/workloads/rbd_api_tests_copy_on_read.yaml +++ b/qa/suites/rbd/thrash/workloads/rbd_api_tests_copy_on_read.yaml @@ -7,6 +7,9 @@ tasks: RBD_FEATURES: "61" overrides: ceph: + log-whitelist: + - overall HEALTH_ + - (CACHE_POOL_NO_HIT_SET) conf: client: rbd clone copy on read: true diff --git a/qa/suites/rbd/thrash/workloads/rbd_api_tests_journaling.yaml b/qa/suites/rbd/thrash/workloads/rbd_api_tests_journaling.yaml index eda2b5e8a607b..f1121a4039658 100644 --- a/qa/suites/rbd/thrash/workloads/rbd_api_tests_journaling.yaml +++ b/qa/suites/rbd/thrash/workloads/rbd_api_tests_journaling.yaml @@ -1,3 +1,8 @@ +overrides: + ceph: + log-whitelist: + - overall HEALTH_ + - (CACHE_POOL_NO_HIT_SET) tasks: - workunit: clients: diff --git a/qa/suites/rbd/thrash/workloads/rbd_api_tests_no_locking.yaml b/qa/suites/rbd/thrash/workloads/rbd_api_tests_no_locking.yaml index 188ddc56c6078..b70e8d52b80af 100644 --- a/qa/suites/rbd/thrash/workloads/rbd_api_tests_no_locking.yaml +++ b/qa/suites/rbd/thrash/workloads/rbd_api_tests_no_locking.yaml @@ -1,3 +1,8 @@ +overrides: + ceph: + log-whitelist: + - overall HEALTH_ + - (CACHE_POOL_NO_HIT_SET) tasks: - workunit: clients: diff --git a/qa/suites/rbd/valgrind/workloads/c_api_tests.yaml b/qa/suites/rbd/valgrind/workloads/c_api_tests.yaml index 188ddc56c6078..b70e8d52b80af 100644 --- a/qa/suites/rbd/valgrind/workloads/c_api_tests.yaml +++ b/qa/suites/rbd/valgrind/workloads/c_api_tests.yaml @@ -1,3 +1,8 @@ +overrides: + ceph: + log-whitelist: + - overall HEALTH_ + - (CACHE_POOL_NO_HIT_SET) tasks: - workunit: clients: diff --git 
a/qa/suites/rbd/valgrind/workloads/c_api_tests_with_defaults.yaml b/qa/suites/rbd/valgrind/workloads/c_api_tests_with_defaults.yaml index ee1de610a91c0..c2af3573dc5df 100644 --- a/qa/suites/rbd/valgrind/workloads/c_api_tests_with_defaults.yaml +++ b/qa/suites/rbd/valgrind/workloads/c_api_tests_with_defaults.yaml @@ -1,3 +1,8 @@ +overrides: + ceph: + log-whitelist: + - overall HEALTH_ + - (CACHE_POOL_NO_HIT_SET) tasks: - workunit: clients: diff --git a/qa/suites/rbd/valgrind/workloads/c_api_tests_with_journaling.yaml b/qa/suites/rbd/valgrind/workloads/c_api_tests_with_journaling.yaml index eda2b5e8a607b..f1121a4039658 100644 --- a/qa/suites/rbd/valgrind/workloads/c_api_tests_with_journaling.yaml +++ b/qa/suites/rbd/valgrind/workloads/c_api_tests_with_journaling.yaml @@ -1,3 +1,8 @@ +overrides: + ceph: + log-whitelist: + - overall HEALTH_ + - (CACHE_POOL_NO_HIT_SET) tasks: - workunit: clients: diff --git a/qa/suites/rbd/valgrind/workloads/rbd_mirror.yaml b/qa/suites/rbd/valgrind/workloads/rbd_mirror.yaml index 4a2ee40e394ca..8adc7209ad7df 100644 --- a/qa/suites/rbd/valgrind/workloads/rbd_mirror.yaml +++ b/qa/suites/rbd/valgrind/workloads/rbd_mirror.yaml @@ -1,3 +1,8 @@ +overrides: + ceph: + log-whitelist: + - overall HEALTH_ + - (CACHE_POOL_NO_HIT_SET) tasks: - workunit: clients: diff --git a/qa/suites/rgw/thrash/thrashosds-health.yaml b/qa/suites/rgw/thrash/thrashosds-health.yaml new file mode 120000 index 0000000000000..ebf7f34f39bcf --- /dev/null +++ b/qa/suites/rgw/thrash/thrashosds-health.yaml @@ -0,0 +1 @@ +../../../tasks/thrashosds-health.yaml \ No newline at end of file diff --git a/qa/tasks/ceph.py b/qa/tasks/ceph.py index 69ccbde5aa340..5318643d27a55 100644 --- a/qa/tasks/ceph.py +++ b/qa/tasks/ceph.py @@ -331,6 +331,13 @@ def create_rbd_pool(ctx, config): cluster_name = config['cluster'] first_mon = teuthology.get_first_mon(ctx, config, cluster_name) (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys() + log.info('Waiting for OSDs to come up') + teuthology.wait_until_osds_up( + ctx, + cluster=ctx.cluster, + remote=mon_remote, + ceph_cluster=cluster_name, + ) log.info('Creating RBD pool') mon_remote.run( args=['sudo', 'ceph', '--cluster', cluster_name, @@ -1621,3 +1628,20 @@ def task(ctx, config): finally: if config.get('wait-for-scrub', True): osd_scrub_pgs(ctx, config) + + # stop logging health to clog during shutdown, or else we generate + # a bunch of scary messages unrelated to our actual run. 
+ firstmon = teuthology.get_first_mon(ctx, config, config['cluster']) + (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys() + mon0_remote.run( + args=[ + 'sudo', + 'ceph', + '--cluster', config['cluster'], + 'tell', + 'mon.*', + 'injectargs', + '--', + '--no-mon-health-to-clog', + ] + ) diff --git a/qa/tasks/ceph_test_case.py b/qa/tasks/ceph_test_case.py index 270c18553edbb..47f3921347dbd 100644 --- a/qa/tasks/ceph_test_case.py +++ b/qa/tasks/ceph_test_case.py @@ -83,7 +83,8 @@ def wait_for_health(self, pattern, timeout): """ def seen_health_warning(): health = self.ceph_cluster.mon_manager.get_mon_health() - summary_strings = [s['summary'] for s in health['summary']] + codes = [s for s in health['checks']] + summary_strings = [s[1]['message'] for s in health['checks'].iteritems()] if len(summary_strings) == 0: log.debug("Not expected number of summary strings ({0})".format(summary_strings)) return False @@ -91,6 +92,8 @@ def seen_health_warning(): for ss in summary_strings: if pattern in ss: return True + if pattern in codes: + return True log.debug("Not found expected summary strings yet ({0})".format(summary_strings)) return False @@ -103,7 +106,7 @@ def wait_for_health_clear(self, timeout): """ def is_clear(): health = self.ceph_cluster.mon_manager.get_mon_health() - return len(health['summary']) == 0 + return len(health['checks']) == 0 self.wait_until_true(is_clear, timeout) diff --git a/qa/tasks/cephfs/test_auto_repair.py b/qa/tasks/cephfs/test_auto_repair.py index 033d8dde902c6..c0aa2e4c70fae 100644 --- a/qa/tasks/cephfs/test_auto_repair.py +++ b/qa/tasks/cephfs/test_auto_repair.py @@ -81,7 +81,7 @@ def test_mds_readonly(self): self.assertTrue(writer.finished) # The MDS should report its readonly health state to the mon - self.wait_for_health("MDS in read-only mode", timeout=30) + self.wait_for_health("MDS_READ_ONLY", timeout=30) # restart mds to make it writable self.fs.mds_fail_restart() diff --git a/qa/tasks/cephfs/test_client_limits.py b/qa/tasks/cephfs/test_client_limits.py index f25cb4a21f33b..d8675fdad8b16 100644 --- a/qa/tasks/cephfs/test_client_limits.py +++ b/qa/tasks/cephfs/test_client_limits.py @@ -62,12 +62,12 @@ def _test_client_pin(self, use_subdir): # MDS should not be happy about that, as the client is failing to comply # with the SESSION_RECALL messages it is being sent mds_recall_state_timeout = int(self.fs.get_config("mds_recall_state_timeout")) - self.wait_for_health("failing to respond to cache pressure", + self.wait_for_health("MDS_HEALTH_CLIENT_RECALL", mds_recall_state_timeout + 10) # We can also test that the MDS health warning for oversized # cache is functioning as intended. - self.wait_for_health("Too many inodes in cache", + self.wait_for_health("MDS_CACHE_OVERSIZED", mds_recall_state_timeout + 10) # When the client closes the files, it should retain only as many caps as allowed @@ -123,7 +123,7 @@ def test_client_release_bug(self): # After mds_revoke_cap_timeout, we should see a health warning (extra lag from # MDS beacon period) mds_revoke_cap_timeout = int(self.fs.get_config("mds_revoke_cap_timeout")) - self.wait_for_health("failing to respond to capability release", mds_revoke_cap_timeout + 10) + self.wait_for_health("MDS_CLIENT_RECALL", mds_revoke_cap_timeout + 10) # Client B should still be stuck self.assertFalse(rproc.finished) @@ -163,7 +163,7 @@ def test_client_oldest_tid(self): self.mount_a.create_n_files("testdir/file2", 5, True) # Wait for the health warnings. 
Assume mds can handle 10 request per second at least - self.wait_for_health("failing to advance its oldest client/flush tid", max_requests / 10) + self.wait_for_health("MDS_CLIENT_OLDEST_TID", max_requests / 10) def _test_client_cache_size(self, mount_subdir): """ diff --git a/qa/tasks/cephfs/test_failover.py b/qa/tasks/cephfs/test_failover.py index 1bb2ff7322b33..53c2d5e301e7d 100644 --- a/qa/tasks/cephfs/test_failover.py +++ b/qa/tasks/cephfs/test_failover.py @@ -112,7 +112,7 @@ def test_standby_count_wanted(self): victim = standbys.pop() self.fs.mds_stop(victim) log.info("waiting for insufficient standby daemon warning") - self.wait_for_health("insufficient standby daemons available", grace*2) + self.wait_for_health("MDS_INSUFFICIENT_STANDBY", grace*2) # restart the standby, see that he becomes a standby, check health clears self.fs.mds_restart(victim) @@ -127,7 +127,7 @@ def test_standby_count_wanted(self): self.assertGreaterEqual(len(standbys), 1) self.fs.mon_manager.raw_cluster_cmd('fs', 'set', self.fs.name, 'standby_count_wanted', str(len(standbys)+1)) log.info("waiting for insufficient standby daemon warning") - self.wait_for_health("insufficient standby daemons available", grace*2) + self.wait_for_health("MDS_INSUFFICIENT_STANDBY", grace*2) # Set it to 0 self.fs.mon_manager.raw_cluster_cmd('fs', 'set', self.fs.name, 'standby_count_wanted', '0') diff --git a/qa/tasks/mon_clock_skew_check.py b/qa/tasks/mon_clock_skew_check.py index 891e6ec484ede..547339f79a11b 100644 --- a/qa/tasks/mon_clock_skew_check.py +++ b/qa/tasks/mon_clock_skew_check.py @@ -13,43 +13,19 @@ class ClockSkewCheck: """ - Periodically check if there are any clock skews among the monitors in the - quorum. By default, assume no skews are supposed to exist; that can be - changed using the 'expect-skew' option. If 'fail-on-skew' is set to false, - then we will always succeed and only report skews if any are found. - - This class does not spawn a thread. It assumes that, if that is indeed - wanted, it should be done by a third party (for instance, the task using - this class). We intend it as such in order to reuse this class if need be. + Check if there are any clock skews among the monitors in the + quorum. This task accepts the following options: - interval amount of seconds to wait in-between checks. (default: 30.0) - max-skew maximum skew, in seconds, that is considered tolerable before - issuing a warning. (default: 0.05) + interval amount of seconds to wait before check. (default: 30.0) expect-skew 'true' or 'false', to indicate whether to expect a skew during the run or not. If 'true', the test will fail if no skew is found, and succeed if a skew is indeed found; if 'false', it's the other way around. (default: false) - never-fail Don't fail the run if a skew is detected and we weren't - expecting it, or if no skew is detected and we were expecting - it. (default: False) - - at-least-once Runs at least once, even if we are told to stop. - (default: True) - at-least-once-timeout If we were told to stop but we are attempting to - run at least once, timeout after this many seconds. - (default: 600) - - Example: - Expect a skew higher than 0.05 seconds, but only report it without - failing the teuthology run. 
- mon_clock_skew_check: - interval: 30 - max-skew: 0.05 - expect_skew: true - never-fail: true + expect-skew: true """ def __init__(self, ctx, manager, config, logger): @@ -63,181 +39,15 @@ def __init__(self, ctx, manager, config, logger): if self.config is None: self.config = dict() - self.check_interval = float(self.config.get('interval', 30.0)) - - first_mon = teuthology.get_first_mon(ctx, config) - remote = ctx.cluster.only(first_mon).remotes.keys()[0] - proc = remote.run( - args=[ - 'sudo', - 'ceph-mon', - '-i', first_mon[4:], - '--show-config-value', 'mon_clock_drift_allowed' - ], stdout=StringIO(), wait=True - ) - self.max_skew = self.config.get('max-skew', float(proc.stdout.getvalue())) - - self.expect_skew = self.config.get('expect-skew', False) - self.never_fail = self.config.get('never-fail', False) - self.at_least_once = self.config.get('at-least-once', True) - self.at_least_once_timeout = self.config.get('at-least-once-timeout', 600.0) - - def info(self, x): - """ - locally define logger for info messages - """ - self.logger.info(x) - - def warn(self, x): - """ - locally define logger for warnings - """ - self.logger.warn(x) - - def debug(self, x): - """ - locally define logger for debug messages - """ - self.logger.info(x) - self.logger.debug(x) - - def finish(self): - """ - Break out of the do_check loop. - """ - self.stopping = True - - def sleep_interval(self): - """ - If a sleep interval is set, sleep for that amount of time. - """ - if self.check_interval > 0.0: - self.debug('sleeping for {s} seconds'.format( - s=self.check_interval)) - time.sleep(self.check_interval) - - def print_skews(self, skews): - """ - Display skew values. - """ - total = len(skews) - if total > 0: - self.info('---------- found {n} skews ----------'.format(n=total)) - for mon_id, values in skews.iteritems(): - self.info('mon.{id}: {v}'.format(id=mon_id, v=values)) - self.info('-------------------------------------') - else: - self.info('---------- no skews were found ----------') - - def do_check(self): - """ - Clock skew checker. Loops until finish() is called. 
- """ - self.info('start checking for clock skews') - skews = dict() - ran_once = False - - started_on = None - - while not self.stopping or (self.at_least_once and not ran_once): - - if self.at_least_once and not ran_once and self.stopping: - if started_on is None: - self.info('kicking-off timeout (if any)') - started_on = time.time() - elif self.at_least_once_timeout > 0.0: - assert time.time() - started_on < self.at_least_once_timeout, \ - 'failed to obtain a timecheck before timeout expired' - - quorum_size = len(teuthology.get_mon_names(self.ctx)) - self.manager.wait_for_mon_quorum_size(quorum_size) - - health = self.manager.get_mon_health(True) - timechecks = health['timechecks'] - - clean_check = False - if timechecks['round_status'] == 'finished': - assert (timechecks['round'] % 2) == 0, \ - 'timecheck marked as finished but round ' \ - 'disagrees (r {r})'.format( - r=timechecks['round']) - clean_check = True - else: - assert timechecks['round_status'] == 'on-going', \ - 'timecheck status expected \'on-going\' ' \ - 'but found \'{s}\' instead'.format( - s=timechecks['round_status']) - if 'mons' in timechecks.keys() and len(timechecks['mons']) > 1: - self.info('round still on-going, but there are available reports') - else: - self.info('no timechecks available just yet') - self.sleep_interval() - continue - - assert len(timechecks['mons']) > 1, \ - 'there are not enough reported timechecks; ' \ - 'expected > 1 found {n}'.format(n=len(timechecks['mons'])) - - for check in timechecks['mons']: - mon_skew = float(check['skew']) - mon_health = check['health'] - mon_id = check['name'] - if abs(mon_skew) > self.max_skew: - assert mon_health == 'HEALTH_WARN', \ - 'mon.{id} health is \'{health}\' but skew {s} > max {ms}'.format( - id=mon_id,health=mon_health,s=abs(mon_skew),ms=self.max_skew) - - log_str = 'mon.{id} with skew {s} > max {ms}'.format( - id=mon_id,s=abs(mon_skew),ms=self.max_skew) - - """ add to skew list """ - details = check['details'] - skews[mon_id] = {'skew': mon_skew, 'details': details} - - if self.expect_skew: - self.info('expected skew: {str}'.format(str=log_str)) - else: - self.warn('unexpected skew: {str}'.format(str=log_str)) - - if clean_check or (self.expect_skew and len(skews) > 0): - ran_once = True - self.print_skews(skews) - self.sleep_interval() - - total = len(skews) - self.print_skews(skews) - - error_str = '' - found_error = False - - if self.expect_skew: - if total == 0: - error_str = 'We were expecting a skew, but none was found!' - found_error = True - else: - if total > 0: - error_str = 'We were not expecting a skew, but we did find it!' - found_error = True - - if found_error: - self.info(error_str) - if not self.never_fail: - assert False, error_str - -@contextlib.contextmanager def task(ctx, config): - """ - Use clas ClockSkewCheck to check for clock skews on the monitors. - This task will spawn a thread running ClockSkewCheck's do_check(). - - All the configuration will be directly handled by ClockSkewCheck, - so please refer to the class documentation for further information. 
- """ if config is None: config = {} assert isinstance(config, dict), \ 'mon_clock_skew_check task only accepts a dict for configuration' + interval = float(config.get('interval', 30.0)) + expect_skew = config.get('expect-skew', False) + log.info('Beginning mon_clock_skew_check...') first_mon = teuthology.get_first_mon(ctx, config) (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys() @@ -247,15 +57,20 @@ def task(ctx, config): logger=log.getChild('ceph_manager'), ) - skew_check = ClockSkewCheck(ctx, - manager, config, - logger=log.getChild('mon_clock_skew_check')) - skew_check_thread = gevent.spawn(skew_check.do_check) - try: - yield - finally: - log.info('joining mon_clock_skew_check') - skew_check.finish() - skew_check_thread.get() - + quorum_size = len(teuthology.get_mon_names(ctx)) + manager.wait_for_mon_quorum_size(quorum_size) + + # wait a bit + log.info('sleeping for {s} seconds'.format( + s=interval)) + time.sleep(interval) + + health = manager.get_mon_health(True) + log.info('got health %s' % health) + if expect_skew: + if 'MON_CLOCK_SKEW' not in health['checks']: + raise RuntimeError('expected MON_CLOCK_SKEW but got none') + else: + if 'MON_CLOCK_SKEW' in health['checks']: + raise RuntimeError('got MON_CLOCK_SKEW but expected none') diff --git a/qa/tasks/thrashosds-health.yaml b/qa/tasks/thrashosds-health.yaml new file mode 100644 index 0000000000000..7113e5948aa2b --- /dev/null +++ b/qa/tasks/thrashosds-health.yaml @@ -0,0 +1,13 @@ +overrides: + ceph: + log-whitelist: + - overall HEALTH_ + - (OSDMAP_FLAGS) + - (OSD_ + - (PG_ + - (POOL_ + - (CACHE_POOL_ + - (SMALLER_PGP_NUM) + - (OBJECT_ + - (REQUEST_SLOW) + - (TOO_FEW_PGS) diff --git a/qa/workunits/cephtool/test.sh b/qa/workunits/cephtool/test.sh index e17f97efdabe6..428c404baf5c4 100755 --- a/qa/workunits/cephtool/test.sh +++ b/qa/workunits/cephtool/test.sh @@ -706,6 +706,8 @@ function test_mon_misc() ceph health --format json-pretty ceph health detail --format xml-pretty + ceph time-sync-status + ceph node ls for t in mon osd mds ; do ceph node ls $t @@ -1437,21 +1439,21 @@ function test_mon_osd() ceph osd find 0 ceph osd add-nodown 0 1 - ceph health detail | grep 'nodown osd(s).*0.*1' + ceph health detail | grep 'NODOWN' ceph osd rm-nodown 0 1 - ! ceph health detail | grep 'nodown osd(s).*0.*1' + ! ceph health detail | grep 'NODOWN' ceph osd out 0 # so we can mark it as noin later ceph osd add-noin 0 - ceph health detail | grep 'noin osd(s).*0' + ceph health detail | grep 'NOIN' ceph osd rm-noin 0 - ! ceph health detail | grep 'noin osd(s).*0' + ! ceph health detail | grep 'NOIN' ceph osd in 0 ceph osd add-noout 0 - ceph health detail | grep 'noout osd(s).*0' + ceph health detail | grep 'NOOUT' ceph osd rm-noout 0 - ! ceph health detail | grep 'noout osds(s).*0' + ! ceph health detail | grep 'NOOUT' # test osd id parse expect_false ceph osd add-noup 797er @@ -1470,12 +1472,12 @@ function test_mon_osd() ceph osd add-nodown $osd ceph osd add-noout $osd done - ceph -s | grep 'nodown osd(s)' - ceph -s | grep 'noout osd(s)' + ceph -s | grep 'NODOWN' + ceph -s | grep 'NOOUT' ceph osd rm-nodown any ceph osd rm-noout all - ! ceph -s | grep 'nodown osd(s)' - ! ceph -s | grep 'noout osd(s)' + ! ceph -s | grep 'NODOWN' + ! 
ceph -s | grep 'NOOUT' # make sure mark out preserves weight ceph osd reweight osd.0 .5 @@ -1777,29 +1779,38 @@ function test_mon_pg() # Check health status ceph osd set-nearfull-ratio .913 - ceph health | grep 'HEALTH_ERR.*Full ratio(s) out of order' - ceph health detail | grep 'backfillfull_ratio (0.912) < nearfull_ratio (0.913), increased' + ceph health -f json | grep OSD_OUT_OF_ORDER_FULL + ceph health detail | grep OSD_OUT_OF_ORDER_FULL ceph osd set-nearfull-ratio .892 ceph osd set-backfillfull-ratio .963 - ceph health detail | grep 'full_ratio (0.962) < backfillfull_ratio (0.963), increased' + ceph health -f json | grep OSD_OUT_OF_ORDER_FULL + ceph health detail | grep OSD_OUT_OF_ORDER_FULL ceph osd set-backfillfull-ratio .912 # Check injected full results $SUDO ceph --admin-daemon $(get_admin_socket osd.0) injectfull nearfull - wait_for_health "HEALTH_WARN.*1 nearfull osd(s)" + wait_for_health "OSD_NEARFULL" + ceph health detail | grep "osd.0 is near full" + $SUDO ceph --admin-daemon $(get_admin_socket osd.0) injectfull none + wait_for_health_ok + $SUDO ceph --admin-daemon $(get_admin_socket osd.1) injectfull backfillfull - wait_for_health "HEALTH_WARN.*1 backfillfull osd(s)" + wait_for_health "OSD_BACKFILLFULL" + ceph health detail | grep "osd.1 is backfill full" + $SUDO ceph --admin-daemon $(get_admin_socket osd.1) injectfull none + wait_for_health_ok + $SUDO ceph --admin-daemon $(get_admin_socket osd.2) injectfull failsafe # failsafe and full are the same as far as the monitor is concerned - wait_for_health "HEALTH_ERR.*1 full osd(s)" + wait_for_health "OSD_FULL" + ceph health detail | grep "osd.2 is full" + $SUDO ceph --admin-daemon $(get_admin_socket osd.2) injectfull none + wait_for_health_ok + $SUDO ceph --admin-daemon $(get_admin_socket osd.0) injectfull full - wait_for_health "HEALTH_ERR.*2 full osd(s)" + wait_for_health "OSD_FULL" ceph health detail | grep "osd.0 is full" - ceph health detail | grep "osd.2 is full" - ceph health detail | grep "osd.1 is backfill full" $SUDO ceph --admin-daemon $(get_admin_socket osd.0) injectfull none - $SUDO ceph --admin-daemon $(get_admin_socket osd.1) injectfull none - $SUDO ceph --admin-daemon $(get_admin_socket osd.2) injectfull none wait_for_health_ok ceph pg stat | grep 'pgs:' diff --git a/src/common/config_opts.h b/src/common/config_opts.h index d879a2c38a858..6f1156a29017f 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -308,8 +308,7 @@ OPTION(mon_clock_drift_allowed, OPT_FLOAT, .050) // allowed clock drift between OPTION(mon_clock_drift_warn_backoff, OPT_FLOAT, 5) // exponential backoff for clock drift warnings OPTION(mon_timecheck_interval, OPT_FLOAT, 300.0) // on leader, timecheck (clock drift check) interval (seconds) OPTION(mon_timecheck_skew_interval, OPT_FLOAT, 30.0) // on leader, timecheck (clock drift check) interval when in presence of a skew (seconds) -OPTION(mon_pg_stuck_threshold, OPT_INT, 300) // number of seconds after which pgs can be considered inactive, unclean, or stale (see doc/control.rst under dump_stuck for more info) -OPTION(mon_health_max_detail, OPT_INT, 50) // max detailed pgs to report in health detail +OPTION(mon_pg_stuck_threshold, OPT_INT, 60) // number of seconds after which pgs can be considered stuck inactive, unclean, etc (see doc/control.rst under dump_stuck for more info) OPTION(mon_pg_min_inactive, OPT_U64, 1) // the number of PGs which have to be inactive longer than 'mon_pg_stuck_threshold' before health goes into ERR. 0 means disabled, never go into ERR. 
OPTION(mon_pg_warn_min_per_osd, OPT_INT, 30) // min # pgs per (in) osd before we warn the admin OPTION(mon_pg_warn_max_per_osd, OPT_INT, 300) // max # pgs per (in) osd before we warn the admin @@ -352,6 +351,8 @@ OPTION(mon_health_data_update_interval, OPT_FLOAT, 60.0) OPTION(mon_health_to_clog, OPT_BOOL, true) OPTION(mon_health_to_clog_interval, OPT_INT, 3600) OPTION(mon_health_to_clog_tick_interval, OPT_DOUBLE, 60.0) +OPTION(mon_health_preluminous_compat, OPT_BOOL, false) +OPTION(mon_health_max_detail, OPT_INT, 50) // max detailed pgs to report in health detail OPTION(mon_data_avail_crit, OPT_INT, 5) OPTION(mon_data_avail_warn, OPT_INT, 30) OPTION(mon_data_size_warn, OPT_U64, 15*1024*1024*1024) // issue a warning when the monitor's data store goes over 15GB (in bytes) @@ -1754,6 +1755,7 @@ OPTION(mgr_service_beacon_grace, OPT_DOUBLE, 60.0) OPTION(mon_mgr_digest_period, OPT_INT, 5) // How frequently to send digests OPTION(mon_mgr_beacon_grace, OPT_INT, 30) // How long to wait to failover OPTION(mon_mgr_inactive_grace, OPT_INT, 60) // How long before health WARN -> ERR +OPTION(mon_mgr_mkfs_grace, OPT_INT, 60) // How long before we complain about MGR_DOWN OPTION(rgw_crypt_require_ssl, OPT_BOOL, true) // requests including encryption key headers must be sent over ssl OPTION(rgw_crypt_default_encryption_key, OPT_STR, "") // base64 encoded key for encryption of rgw objects OPTION(rgw_crypt_s3_kms_encryption_keys, OPT_STR, "") // extra keys that may be used for aws:kms diff --git a/src/include/health.h b/src/include/health.h new file mode 100644 index 0000000000000..b23a4d4e2b32e --- /dev/null +++ b/src/include/health.h @@ -0,0 +1,68 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include +#include + +#include "include/encoding.h" + +// health_status_t +enum health_status_t { + HEALTH_ERR = 0, + HEALTH_WARN = 1, + HEALTH_OK = 2, +}; + +static inline void encode(health_status_t hs, bufferlist& bl) { + uint8_t v = hs; + ::encode(v, bl); +} +static inline void decode(health_status_t& hs, bufferlist::iterator& p) { + uint8_t v; + ::decode(v, p); + hs = health_status_t(v); +} +template<> +struct denc_traits { + static constexpr bool supported = true; + static constexpr bool featured = false; + static constexpr bool bounded = true; + static constexpr bool need_contiguous = false; + static void bound_encode(const bufferptr& v, size_t& p, uint64_t f=0) { + p++; + } + static void encode(const health_status_t& v, + buffer::list::contiguous_appender& p, + uint64_t f=0) { + ::denc((uint8_t)v, p); + } + static void decode(health_status_t& v, buffer::ptr::iterator& p, + uint64_t f=0) { + uint8_t tmp; + ::denc(tmp, p); + v = health_status_t(tmp); + } + static void decode(health_status_t& v, buffer::list::iterator& p, + uint64_t f=0) { + uint8_t tmp; + ::denc(tmp, p); + v = health_status_t(tmp); + } +}; + +inline std::ostream& operator<<(std::ostream &oss, const health_status_t status) { + switch (status) { + case HEALTH_ERR: + oss << "HEALTH_ERR"; + break; + case HEALTH_WARN: + oss << "HEALTH_WARN"; + break; + case HEALTH_OK: + oss << "HEALTH_OK"; + break; + } + return oss; +} diff --git a/src/include/types.h b/src/include/types.h index 371f884f82e85..e904a151d75d8 100644 --- a/src/include/types.h +++ b/src/include/types.h @@ -411,29 +411,6 @@ inline ostream& operator<<(ostream& out, const ceph_mon_subscribe_item& i) << ((i.flags & CEPH_SUBSCRIBE_ONETIME) ? 
"" : "+"); } -enum health_status_t { - HEALTH_ERR = 0, - HEALTH_WARN = 1, - HEALTH_OK = 2, -}; - -#ifdef __cplusplus -inline ostream& operator<<(ostream &oss, const health_status_t status) { - switch (status) { - case HEALTH_ERR: - oss << "HEALTH_ERR"; - break; - case HEALTH_WARN: - oss << "HEALTH_WARN"; - break; - case HEALTH_OK: - oss << "HEALTH_OK"; - break; - } - return oss; -} -#endif - struct weightf_t { float v; // cppcheck-suppress noExplicitConstructor diff --git a/src/mds/FSMap.cc b/src/mds/FSMap.cc index eb08d02b7dcb0..cdce14b60892c 100644 --- a/src/mds/FSMap.cc +++ b/src/mds/FSMap.cc @@ -18,6 +18,8 @@ #include using std::stringstream; +#include "mon/health_check.h" + void Filesystem::dump(Formatter *f) const { @@ -327,6 +329,30 @@ bool FSMap::check_health(void) return changed; } +void FSMap::get_health_checks(health_check_map_t *checks) const +{ + mds_rank_t standby_count_wanted = 0; + for (const auto &i : filesystems) { + const auto &fs = i.second; + health_check_map_t fschecks; + fs->mds_map.get_health_checks(&fschecks); + checks->merge(fschecks); + standby_count_wanted = std::max( + standby_count_wanted, + fs->mds_map.get_standby_count_wanted((mds_rank_t)standby_daemons.size())); + } + + // MDS_INSUFFICIENT_STANDBY + if (standby_count_wanted) { + std::ostringstream oss, dss; + oss << "insufficient standby daemons available"; + auto& d = checks->add("MDS_INSUFFICIENT_STANDBY", HEALTH_WARN, oss.str()); + dss << "have " << standby_daemons.size() << "; want " << standby_count_wanted + << " more"; + d.detail.push_back(dss.str()); + } +} + void FSMap::encode(bufferlist& bl, uint64_t features) const { if (features & CEPH_FEATURE_SERVER_JEWEL) { diff --git a/src/mds/FSMap.h b/src/mds/FSMap.h index 3d389c48885b5..ea102a712740c 100644 --- a/src/mds/FSMap.h +++ b/src/mds/FSMap.h @@ -35,6 +35,7 @@ #include "mds/mdstypes.h" class CephContext; +class health_check_map_t; #define MDS_FEATURE_INCOMPAT_BASE CompatSet::Feature(1, "base v0.20") #define MDS_FEATURE_INCOMPAT_CLIENTRANGES CompatSet::Feature(2, "client writeable ranges") @@ -476,6 +477,8 @@ class FSMap { void get_health(list >& summary, list > *detail) const; + void get_health_checks(health_check_map_t *checks) const; + bool check_health(void); /** diff --git a/src/mds/MDSMap.cc b/src/mds/MDSMap.cc index b397eb089e9fa..bd54469756f42 100644 --- a/src/mds/MDSMap.cc +++ b/src/mds/MDSMap.cc @@ -18,6 +18,8 @@ #include using std::stringstream; +#include "mon/health_check.h" + // features CompatSet get_mdsmap_compat_set_all() { @@ -404,6 +406,78 @@ void MDSMap::get_health(list >& summary, } } +void MDSMap::get_health_checks(health_check_map_t *checks) const +{ + // FS_WITH_FAILED_MDS + // MDS_FAILED + if (!failed.empty()) { + health_check_t& fscheck = checks->add( + "FS_WITH_FAILED_MDS", HEALTH_WARN, + "%num% filesystem%plurals% %isorare% have a failed mds daemon"); + ostringstream ss; + ss << "fs " << fs_name << " has " << failed.size() << " failed mds" + << (failed.size() > 1 ? "s" : ""); + fscheck.detail.push_back(ss.str()); + + health_check_t& check = checks->add("MDS_FAILED", HEALTH_ERR, + "%num% mds daemon%plurals% down"); + for (auto p : failed) { + std::ostringstream oss; + oss << "fs " << fs_name << " mds." << p << " has failed"; + check.detail.push_back(oss.str()); + } + } + + // MDS_DAMAGED + if (!damaged.empty()) { + health_check_t& check = checks->add("MDS_DAMAGED", HEALTH_ERR, + "%num% mds daemon%plurals% damaged"); + for (auto p : damaged) { + std::ostringstream oss; + oss << "fs " << fs_name << " mds." 
<< p << " is damaged"; + check.detail.push_back(oss.str()); + } + } + + // FS_DEGRADED + // MDS_DEGRADED + if (is_degraded()) { + health_check_t& fscheck = checks->add( + "FS_DEGRADED", HEALTH_WARN, + "%num% filesystem%plurals% %isorare% degraded"); + ostringstream ss; + ss << "fs " << fs_name << " is degraded"; + fscheck.detail.push_back(ss.str()); + + list detail; + for (mds_rank_t i = mds_rank_t(0); i< get_max_mds(); i++) { + if (!is_up(i)) + continue; + mds_gid_t gid = up.find(i)->second; + map::const_iterator info = mds_info.find(gid); + stringstream ss; + ss << "fs " << fs_name << " mds." << info->second.name << " at " + << info->second.addr << " rank " << i; + if (is_resolve(i)) + ss << " is resolving"; + if (is_replay(i)) + ss << " is replaying journal"; + if (is_rejoin(i)) + ss << " is rejoining"; + if (is_reconnect(i)) + ss << " is reconnecting to clients"; + if (ss.str().length()) + detail.push_back(ss.str()); + } + if (!detail.empty()) { + health_check_t& check = checks->add( + "MDS_DEGRADED", HEALTH_WARN, + "%num% mds daemon%plurals% %isorare% degraded"); + check.detail.insert(check.detail.end(), detail.begin(), detail.end()); + } + } +} + void MDSMap::mds_info_t::encode_versioned(bufferlist& bl, uint64_t features) const { ENCODE_START(7, 4, bl); diff --git a/src/mds/MDSMap.h b/src/mds/MDSMap.h index e99be2be67b57..e6423c9bea1fa 100644 --- a/src/mds/MDSMap.h +++ b/src/mds/MDSMap.h @@ -21,6 +21,7 @@ #include "include/types.h" #include "common/Clock.h" #include "msg/Message.h" +#include "include/health.h" #include #include @@ -58,6 +59,7 @@ */ class CephContext; +class health_check_map_t; extern CompatSet get_mdsmap_compat_set_all(); extern CompatSet get_mdsmap_compat_set_default(); @@ -461,6 +463,8 @@ class MDSMap { void get_health(list >& summary, list > *detail) const; + void get_health_checks(health_check_map_t *checks) const; + typedef enum { AVAILABLE = 0, diff --git a/src/messages/MMDSBeacon.h b/src/messages/MMDSBeacon.h index 31febe50a353c..a83502e85c444 100644 --- a/src/messages/MMDSBeacon.h +++ b/src/messages/MMDSBeacon.h @@ -43,6 +43,56 @@ enum mds_metric_t { MDS_HEALTH_CACHE_OVERSIZED }; +static inline const char *mds_metric_name(mds_metric_t m) +{ + switch (m) { + case MDS_HEALTH_TRIM: return "MDS_TRIM"; + case MDS_HEALTH_CLIENT_RECALL: return "MDS_CLIENT_RECALL"; + case MDS_HEALTH_CLIENT_LATE_RELEASE: return "MDS_CLIENT_LATE_RELEASE"; + case MDS_HEALTH_CLIENT_RECALL_MANY: return "MDS_CLIENT_RECALL_MANY"; + case MDS_HEALTH_CLIENT_LATE_RELEASE_MANY: return "MDS_CLIENT_LATE_RELEASE_MANY"; + case MDS_HEALTH_CLIENT_OLDEST_TID: return "MDS_CLIENT_OLDEST_TID"; + case MDS_HEALTH_CLIENT_OLDEST_TID_MANY: return "MDS_CLIENT_OLDEST_TID_MANY"; + case MDS_HEALTH_DAMAGE: return "MDS_DAMAGE"; + case MDS_HEALTH_READ_ONLY: return "MDS_READ_ONLY"; + case MDS_HEALTH_SLOW_REQUEST: return "MDS_SLOW_REQUEST"; + case MDS_HEALTH_CACHE_OVERSIZED: return "MDS_CACHE_OVERSIZED"; + default: + return "???"; + } +} + +static inline const char *mds_metric_summary(mds_metric_t m) +{ + switch (m) { + case MDS_HEALTH_TRIM: + return "%num% MDSs behind on trimming"; + case MDS_HEALTH_CLIENT_RECALL: + return "%num% clients failing to respond to cache pressure"; + case MDS_HEALTH_CLIENT_LATE_RELEASE: + return "%num% clients failing to respond to capability release"; + case MDS_HEALTH_CLIENT_RECALL_MANY: + return "%num% MDSs have many clients failing to respond to cache pressure"; + case MDS_HEALTH_CLIENT_LATE_RELEASE_MANY: + return "%num% MDSs have many clients failing to respond to capability " + 
"release"; + case MDS_HEALTH_CLIENT_OLDEST_TID: + return "%num% clients failing to advance oldest client/flush tid"; + case MDS_HEALTH_CLIENT_OLDEST_TID_MANY: + return "%num% MDSs have clients failing to advance oldest client/flush tid"; + case MDS_HEALTH_DAMAGE: + return "%num% MDSs report damaged metadata"; + case MDS_HEALTH_READ_ONLY: + return "%num% MDSs are read only"; + case MDS_HEALTH_SLOW_REQUEST: + return "%num% MDSs report slow requests"; + case MDS_HEALTH_CACHE_OVERSIZED: + return "%num% MDSs report oversized cache"; + default: + return "???"; + } +} + /** * This structure is designed to allow some flexibility in how we emit health * complaints, such that: diff --git a/src/messages/MMonHealthChecks.h b/src/messages/MMonHealthChecks.h new file mode 100644 index 0000000000000..6b66847633f9d --- /dev/null +++ b/src/messages/MMonHealthChecks.h @@ -0,0 +1,47 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef CEPH_MMON_HEALTH_CHECKS_H +#define CEPH_MMON_HEALTH_CHECKS_H + +#include "messages/PaxosServiceMessage.h" +#include "mon/health_check.h" + +struct MMonHealthChecks : public PaxosServiceMessage +{ + static const int HEAD_VERSION = 1; + static const int COMPAT_VERSION = 1; + + health_check_map_t health_checks; + + MMonHealthChecks() + : PaxosServiceMessage(MSG_MON_HEALTH_CHECKS, HEAD_VERSION, COMPAT_VERSION) { + } + MMonHealthChecks(health_check_map_t& m) + : PaxosServiceMessage(MSG_MON_HEALTH_CHECKS, HEAD_VERSION, COMPAT_VERSION), + health_checks(m) { + } + +private: + ~MMonHealthChecks() override { } + +public: + const char *get_type_name() const override { return "mon_health_checks"; } + void print(ostream &o) const override { + o << "mon_health_checks(" << health_checks.checks.size() << " checks)"; + } + + void decode_payload() override { + bufferlist::iterator p = payload.begin(); + paxos_decode(p); + ::decode(health_checks, p); + } + + void encode_payload(uint64_t features) override { + paxos_encode(); + ::encode(health_checks, payload); + } + +}; + +#endif diff --git a/src/messages/MMonMgrReport.h b/src/messages/MMonMgrReport.h index 8f3a8fe911540..eef0966f15c0e 100644 --- a/src/messages/MMonMgrReport.h +++ b/src/messages/MMonMgrReport.h @@ -17,17 +17,8 @@ #include "messages/PaxosServiceMessage.h" #include "include/types.h" - -// health_status_t -static inline void encode(health_status_t hs, bufferlist& bl) { - uint8_t v = hs; - ::encode(v, bl); -} -static inline void decode(health_status_t& hs, bufferlist::iterator& p) { - uint8_t v; - ::decode(v, p); - hs = health_status_t(v); -} +#include "include/health.h" +#include "mon/health_check.h" class MMonMgrReport : public PaxosServiceMessage { @@ -36,7 +27,7 @@ class MMonMgrReport : public PaxosServiceMessage { public: // PGMapDigest is in data payload - list> health_summary, health_detail; + health_check_map_t health_checks; bufferlist service_map_bl; // encoded ServiceMap MMonMgrReport() @@ -49,20 +40,18 @@ class MMonMgrReport : public PaxosServiceMessage { const char *get_type_name() const override { return "monmgrreport"; } void print(ostream& out) const override { - out << get_type_name(); + out << get_type_name() << "(" << health_checks.checks.size() << " checks)"; } void encode_payload(uint64_t features) override { paxos_encode(); - ::encode(health_summary, payload); - ::encode(health_detail, payload); + ::encode(health_checks, payload); ::encode(service_map_bl, payload); } void decode_payload() override { bufferlist::iterator p = payload.begin(); 
paxos_decode(p); - ::decode(health_summary, p); - ::decode(health_detail, p); + ::decode(health_checks, p); ::decode(service_map_bl, p); } }; diff --git a/src/mgr/DaemonServer.cc b/src/mgr/DaemonServer.cc index 7b9086c4800dd..6454c8da306a5 100644 --- a/src/mgr/DaemonServer.cc +++ b/src/mgr/DaemonServer.cc @@ -113,6 +113,8 @@ int DaemonServer::init(uint64_t gid, entity_addr_t client_addr) msgr->start(); msgr->add_dispatcher_tail(this); + started_at = ceph_clock_now(); + return 0; } @@ -235,6 +237,7 @@ bool DaemonServer::ms_dispatch(Message *m) switch (m->get_type()) { case MSG_PGSTATS: cluster_state.ingest_pgstats(static_cast(m)); + maybe_ready(m->get_source().num()); m->put(); return true; case MSG_MGR_REPORT: @@ -249,6 +252,35 @@ bool DaemonServer::ms_dispatch(Message *m) }; } +void DaemonServer::maybe_ready(int32_t osd_id) +{ + if (!pgmap_ready && reported_osds.find(osd_id) == reported_osds.end()) { + dout(4) << "initial report from osd " << osd_id << dendl; + reported_osds.insert(osd_id); + std::set up_osds; + + cluster_state.with_osdmap([&](const OSDMap& osdmap) { + osdmap.get_up_osds(up_osds); + }); + + std::set unreported_osds; + std::set_difference(up_osds.begin(), up_osds.end(), + reported_osds.begin(), reported_osds.end(), + std::inserter(unreported_osds, unreported_osds.begin())); + + if (unreported_osds.size() == 0) { + dout(4) << "all osds have reported, sending PG state to mon" << dendl; + pgmap_ready = true; + reported_osds.clear(); + // Avoid waiting for next tick + send_report(); + } else { + dout(4) << "still waiting for " << unreported_osds.size() << " osds" + " to report in before PGMap is ready" << dendl; + } + } +} + void DaemonServer::shutdown() { dout(10) << "begin" << dendl; @@ -977,6 +1009,19 @@ void DaemonServer::_prune_pending_service_map() void DaemonServer::send_report() { + if (!pgmap_ready) { + if (ceph_clock_now() - started_at > g_conf->mgr_stats_period * 4.0) { + pgmap_ready = true; + reported_osds.clear(); + dout(1) << "Giving up on OSDs that haven't reported yet, sending " + << "potentially incomplete PG state to mon" << dendl; + } else { + dout(1) << "Not sending PG status to monitor yet, waiting for OSDs" + << dendl; + return; + } + } + auto m = new MMonMgrReport(); cluster_state.with_pgmap([&](const PGMap& pg_map) { cluster_state.update_delta_stats(); @@ -992,15 +1037,21 @@ void DaemonServer::send_report() } } - // FIXME: reporting health detail here might be a bad idea? cluster_state.with_osdmap([&](const OSDMap& osdmap) { // FIXME: no easy way to get mon features here. this will do for // now, though, as long as we don't make a backward-incompat change. pg_map.encode_digest(osdmap, m->get_data(), CEPH_FEATURES_ALL); dout(10) << pg_map << dendl; - pg_map.get_health(g_ceph_context, osdmap, - m->health_summary, - &m->health_detail); + + pg_map.get_health_checks(g_ceph_context, osdmap, + &m->health_checks); + dout(10) << m->health_checks.checks.size() << " health checks" + << dendl; + dout(20) << "health checks:\n"; + JSONFormatter jf(true); + jf.dump_object("health_checks", m->health_checks); + jf.flush(*_dout); + *_dout << dendl; }); }); // TODO? 
We currently do not notify the PyModules diff --git a/src/mgr/DaemonServer.h b/src/mgr/DaemonServer.h index 06ee68b8adc80..4877cfe85aed8 100644 --- a/src/mgr/DaemonServer.h +++ b/src/mgr/DaemonServer.h @@ -89,6 +89,11 @@ class DaemonServer : public Dispatcher void _prune_pending_service_map(); + utime_t started_at; + bool pgmap_ready = false; + std::set reported_osds; + void maybe_ready(int32_t osd_id); + public: int init(uint64_t gid, entity_addr_t client_addr); void shutdown(); diff --git a/src/mon/CMakeLists.txt b/src/mon/CMakeLists.txt index 9e40ef58863ca..556157132f805 100644 --- a/src/mon/CMakeLists.txt +++ b/src/mon/CMakeLists.txt @@ -16,6 +16,7 @@ set(lib_mon_srcs AuthMonitor.cc Elector.cc HealthMonitor.cc + OldHealthMonitor.cc DataHealthService.cc PGMonitor.cc PGMap.cc diff --git a/src/mon/ConfigKeyService.h b/src/mon/ConfigKeyService.h index 9977968736593..7dfb140c7e7c3 100644 --- a/src/mon/ConfigKeyService.h +++ b/src/mon/ConfigKeyService.h @@ -57,8 +57,7 @@ class ConfigKeyService : public QuorumService * @{ */ void init() override { } - void get_health(Formatter *f, - list >& summary, + void get_health(list >& summary, list > *detail) override { } bool service_dispatch(MonOpRequestRef op) override; diff --git a/src/mon/DataHealthService.cc b/src/mon/DataHealthService.cc index 6305263a80920..4a5b42ab38889 100644 --- a/src/mon/DataHealthService.cc +++ b/src/mon/DataHealthService.cc @@ -65,16 +65,10 @@ void DataHealthService::start_epoch() } void DataHealthService::get_health( - Formatter *f, list >& summary, list > *detail) { dout(10) << __func__ << dendl; - if (f) { - f->open_object_section("data_health"); - f->open_array_section("mons"); - } - for (map::iterator it = stats.begin(); it != stats.end(); ++it) { string mon_name = mon->monmap->get_name(it->first.addr); @@ -110,22 +104,6 @@ void DataHealthService::get_health( if (detail) detail->push_back(make_pair(health_status, ss.str())); } - - if (f) { - f->open_object_section("mon"); - f->dump_string("name", mon_name.c_str()); - // leave this unenclosed by an object section to avoid breaking backward-compatibility - stats.dump(f); - f->dump_stream("health") << health_status; - if (health_status != HEALTH_OK) - f->dump_string("health_detail", health_detail); - f->close_section(); - } - } - - if (f) { - f->close_section(); // mons - f->close_section(); // data_health } } diff --git a/src/mon/DataHealthService.h b/src/mon/DataHealthService.h index 8834b600b8d9c..91caf4e32115c 100644 --- a/src/mon/DataHealthService.h +++ b/src/mon/DataHealthService.h @@ -65,9 +65,9 @@ class DataHealthService : start_tick(); } - void get_health(Formatter *f, - list >& summary, - list > *detail) override; + void get_health( + list >& summary, + list > *detail) override; int get_type() override { return HealthService::SERVICE_HEALTH_DATA; diff --git a/src/mon/HealthMonitor.cc b/src/mon/HealthMonitor.cc index 0887bdc1b759f..32f62667e25f1 100644 --- a/src/mon/HealthMonitor.cc +++ b/src/mon/HealthMonitor.cc @@ -12,13 +12,13 @@ * */ -#include #include #include +#include +#include -// #include -// Because intusive_ptr clobbers our assert... 
#include "include/assert.h" +#include "include/stringify.h" #include "mon/Monitor.h" #include "mon/HealthService.h" @@ -26,8 +26,9 @@ #include "mon/DataHealthService.h" #include "messages/MMonHealth.h" +#include "messages/MMonHealthChecks.h" + #include "common/Formatter.h" -// #include "common/config.h" #define dout_subsys ceph_subsys_mon #undef dout_prefix @@ -35,84 +36,345 @@ static ostream& _prefix(std::ostream *_dout, const Monitor *mon, const HealthMonitor *hmon) { return *_dout << "mon." << mon->name << "@" << mon->rank - << "(" << mon->get_state_name() << ")." << hmon->get_name() - << "(" << hmon->get_epoch() << ") "; + << "(" << mon->get_state_name() << ").health "; +} + +HealthMonitor::HealthMonitor(Monitor *m, Paxos *p, const string& service_name) + : PaxosService(m, p, service_name) { } void HealthMonitor::init() { dout(10) << __func__ << dendl; - assert(services.empty()); - services[HealthService::SERVICE_HEALTH_DATA] = new DataHealthService(mon); +} + +void HealthMonitor::create_initial() +{ + dout(10) << __func__ << dendl; +} + +void HealthMonitor::update_from_paxos(bool *need_bootstrap) +{ + version = get_last_committed(); + dout(10) << __func__ << dendl; + load_health(); + + bufferlist qbl; + mon->store->get(service_name, "quorum", qbl); + if (qbl.length()) { + auto p = qbl.begin(); + ::decode(quorum_checks, p); + } else { + quorum_checks.clear(); + } + + bufferlist lbl; + mon->store->get(service_name, "leader", lbl); + if (lbl.length()) { + auto p = lbl.begin(); + ::decode(leader_checks, p); + } else { + leader_checks.clear(); + } - for (map::iterator it = services.begin(); - it != services.end(); - ++it) { - it->second->init(); + dout(20) << "dump:"; + JSONFormatter jf(true); + jf.open_object_section("health"); + jf.open_object_section("quorum_health"); + for (auto& p : quorum_checks) { + string s = string("mon.") + stringify(p.first); + jf.dump_object(s.c_str(), p.second); } + jf.close_section(); + jf.dump_object("leader_health", leader_checks); + jf.close_section(); + jf.flush(*_dout); + *_dout << dendl; } -bool HealthMonitor::service_dispatch(MonOpRequestRef op) +void HealthMonitor::create_pending() { - assert(op->get_req()->get_type() == MSG_MON_HEALTH); - MMonHealth *hm = static_cast(op->get_req()); - int service_type = hm->get_service_type(); - if (services.count(service_type) == 0) { - dout(1) << __func__ << " service type " << service_type - << " not registered -- drop message!" << dendl; - return false; - } - return services[service_type]->service_dispatch(op); + dout(10) << " " << version << dendl; } -void HealthMonitor::start_epoch() { - epoch_t epoch = get_epoch(); - for (map::iterator it = services.begin(); - it != services.end(); ++it) { - it->second->start(epoch); +void HealthMonitor::encode_pending(MonitorDBStore::TransactionRef t) +{ + ++version; + dout(10) << " " << version << dendl; + put_last_committed(t, version); + + bufferlist qbl; + ::encode(quorum_checks, qbl); + t->put(service_name, "quorum", qbl); + bufferlist lbl; + ::encode(leader_checks, lbl); + t->put(service_name, "leader", lbl); + + health_check_map_t pending_health; + + // combine per-mon details carefully... 
+ map> names; // code -> + for (auto p : quorum_checks) { + for (auto q : p.second.checks) { + names[q.first].insert(mon->monmap->get_name(p.first)); + } + pending_health.merge(p.second); } + for (auto p : pending_health.checks) { + p.second.summary = boost::regex_replace( + p.second.summary, + boost::regex("%num%"), stringify(names[p.first].size())); + p.second.summary = boost::regex_replace( + p.second.summary, + boost::regex("%names%"), stringify(names[p.first])); + p.second.summary = boost::regex_replace( + p.second.summary, + boost::regex("%plurals%"), + names[p.first].size() > 1 ? "s" : ""); + p.second.summary = boost::regex_replace( + p.second.summary, + boost::regex("%isorare%"), + names[p.first].size() > 1 ? "are" : "is"); + } + + pending_health.merge(leader_checks); + encode_health(pending_health, t); } -void HealthMonitor::finish_epoch() { - generic_dout(20) << "HealthMonitor::finish_epoch()" << dendl; - for (map::iterator it = services.begin(); - it != services.end(); ++it) { - assert(it->second != NULL); - it->second->finish(); +version_t HealthMonitor::get_trim_to() +{ + // we don't actually need *any* old states, but keep a few. + if (version > 5) { + return version - 5; } + return 0; } -void HealthMonitor::service_shutdown() +bool HealthMonitor::preprocess_query(MonOpRequestRef op) { - dout(0) << "HealthMonitor::service_shutdown " - << services.size() << " services" << dendl; - for (map::iterator it = services.begin(); - it != services.end(); - ++it) { - it->second->shutdown(); - delete it->second; - } - services.clear(); + switch (op->get_req()->get_type()) { + case MSG_MON_HEALTH: + { + MMonHealth *hm = static_cast(op->get_req()); + int service_type = hm->get_service_type(); + if (services.count(service_type) == 0) { + dout(1) << __func__ << " service type " << service_type + << " not registered -- drop message!" 
<< dendl; + return false; + } + return services[service_type]->service_dispatch(op); + } + + case MSG_MON_HEALTH_CHECKS: + return preprocess_health_checks(op); + } + return false; } -void HealthMonitor::get_health(Formatter *f, - list >& summary, - list > *detail) +bool HealthMonitor::prepare_update(MonOpRequestRef op) { - if (f) { - f->open_object_section("health"); - f->open_array_section("health_services"); + return false; +} + +bool HealthMonitor::preprocess_health_checks(MonOpRequestRef op) +{ + MMonHealthChecks *m = static_cast(op->get_req()); + quorum_checks[m->get_source().num()] = m->health_checks; + return true; +} + +void HealthMonitor::tick() +{ + if (!is_active()) { + return; + } + dout(10) << __func__ << dendl; + bool changed = false; + if (check_member_health()) { + changed = true; + } + if (mon->is_leader()) { + if (check_leader_health()) { + changed = true; + } } + if (changed) { + propose_pending(); + } +} + +bool HealthMonitor::check_member_health() +{ + dout(20) << __func__ << dendl; + bool changed = false; - for (map::iterator it = services.begin(); - it != services.end(); - ++it) { - it->second->get_health(f, summary, detail); + // snapshot of usage + DataStats stats; + get_fs_stats(stats.fs_stats, g_conf->mon_data.c_str()); + map extra; + uint64_t store_size = mon->store->get_estimated_size(extra); + assert(store_size > 0); + stats.store_stats.bytes_total = store_size; + stats.store_stats.bytes_sst = extra["sst"]; + stats.store_stats.bytes_log = extra["log"]; + stats.store_stats.bytes_misc = extra["misc"]; + stats.last_update = ceph_clock_now(); + dout(10) << __func__ << " avail " << stats.fs_stats.avail_percent << "%" + << " total " << prettybyte_t(stats.fs_stats.byte_total) + << ", used " << prettybyte_t(stats.fs_stats.byte_used) + << ", avail " << prettybyte_t(stats.fs_stats.byte_avail) << dendl; + + // MON_DISK_{LOW,CRIT,BIG} + health_check_map_t next; + if (stats.fs_stats.avail_percent <= g_conf->mon_data_avail_crit) { + stringstream ss, ss2; + ss << "mon%plurals% %names% %isorare% very low on available space"; + auto& d = next.add("MON_DISK_CRIT", HEALTH_ERR, ss.str()); + ss2 << "mon." << mon->name << " has " << stats.fs_stats.avail_percent + << "% avail"; + d.detail.push_back(ss2.str()); + } else if (stats.fs_stats.avail_percent <= g_conf->mon_data_avail_warn) { + stringstream ss, ss2; + ss << "mon%plurals% %names% %isorare% low on available space"; + auto& d = next.add("MON_DISK_LOW", HEALTH_ERR, ss.str()); + ss2 << "mon." << mon->name << " has " << stats.fs_stats.avail_percent + << "% avail"; + d.detail.push_back(ss2.str()); + } + if (stats.store_stats.bytes_total >= g_conf->mon_data_size_warn) { + stringstream ss, ss2; + ss << "mon%plurals% %names% %isorare% using a lot of disk space"; + auto& d = next.add("MON_DISK_BIG", HEALTH_WARN, ss.str()); + ss2 << "mon." 
<< mon->name << " is " + << prettybyte_t(stats.store_stats.bytes_total) + << " >= mon_data_size_warn (" + << prettybyte_t(g_conf->mon_data_size_warn) << ")"; + d.detail.push_back(ss2.str()); } - if (f) { - f->close_section(); // health_services - f->close_section(); // health + auto p = quorum_checks.find(mon->rank); + if (p == quorum_checks.end() || + p->second != next) { + if (mon->is_leader()) { + // prepare to propose + quorum_checks[mon->rank] = next; + changed = true; + } else { + // tell the leader + mon->messenger->send_message(new MMonHealthChecks(next), + mon->monmap->get_inst(mon->get_leader())); + } } + + // OSD_NO_DOWN_OUT_INTERVAL + { + // Warn if 'mon_osd_down_out_interval' is set to zero. + // Having this option set to zero on the leader acts much like the + // 'noout' flag. It's hard to figure out what's going wrong with clusters + // without the 'noout' flag set but acting like that just the same, so + // we report a HEALTH_WARN in case this option is set to zero. + // This is an ugly hack to get the warning out, but until we find a way + // to spread global options throughout the mon cluster and have all mons + // using a base set of the same options, we need to work around this sort + // of things. + // There's also the obvious drawback that if this is set on a single + // monitor on a 3-monitor cluster, this warning will only be shown every + // third monitor connection. + if (g_conf->mon_warn_on_osd_down_out_interval_zero && + g_conf->mon_osd_down_out_interval == 0) { + ostringstream ss, ds; + ss << "mon%plurals% %names %hasorhave% mon_osd_down_out_interval set to 0"; + auto& d = next.add("OSD_NO_DOWN_OUT_INTERVAL", HEALTH_WARN, ss.str()); + ds << "mon." << mon->name << " has mon_osd_down_out_interval set to 0"; + d.detail.push_back(ds.str()); + } + } + + return changed; } +bool HealthMonitor::check_leader_health() +{ + dout(20) << __func__ << dendl; + bool changed = false; + + // prune quorum_health + { + auto& qset = mon->get_quorum(); + auto p = quorum_checks.begin(); + while (p != quorum_checks.end()) { + if (qset.count(p->first) == 0) { + p = quorum_checks.erase(p); + changed = true; + } else { + ++p; + } + } + } + + health_check_map_t next; + + // MON_DOWN + { + int max = mon->monmap->size(); + int actual = mon->get_quorum().size(); + if (actual < max) { + ostringstream ss; + ss << (max-actual) << "/" << max << " mons down, quorum " + << mon->get_quorum_names(); + auto& d = next.add("MON_DOWN", HEALTH_WARN, ss.str()); + set q = mon->get_quorum(); + for (int i=0; imonmap->get_name(i) << " (rank " << i + << ") addr " << mon->monmap->get_addr(i) + << " is down (out of quorum)"; + d.detail.push_back(ss.str()); + } + } + } + } + + // MON_CLOCK_SKEW + if (!mon->timecheck_skews.empty()) { + list warns; + list details; + for (map::iterator i = mon->timecheck_skews.begin(); + i != mon->timecheck_skews.end(); ++i) { + entity_inst_t inst = i->first; + double skew = i->second; + double latency = mon->timecheck_latencies[inst]; + string name = mon->monmap->get_name(inst.addr); + ostringstream tcss; + health_status_t tcstatus = mon->timecheck_status(tcss, skew, latency); + if (tcstatus != HEALTH_OK) { + warns.push_back(name); + ostringstream tmp_ss; + tmp_ss << "mon." << name + << " addr " << inst.addr << " " << tcss.str() + << " (latency " << latency << "s)"; + details.push_back(tmp_ss.str()); + } + } + if (!warns.empty()) { + ostringstream ss; + ss << "clock skew detected on"; + while (!warns.empty()) { + ss << " mon." 
<< warns.front(); + warns.pop_front(); + if (!warns.empty()) + ss << ","; + } + auto& d = next.add("MON_CLOCK_SKEW", HEALTH_WARN, + "monitor clock skew detected"); + d.detail.swap(details); + } + } + + if (next != leader_checks) { + changed = true; + leader_checks = next; + } + return changed; +} diff --git a/src/mon/HealthMonitor.h b/src/mon/HealthMonitor.h index 9d05c64e990a5..5387ce0340a89 100644 --- a/src/mon/HealthMonitor.h +++ b/src/mon/HealthMonitor.h @@ -14,50 +14,54 @@ #ifndef CEPH_HEALTH_MONITOR_H #define CEPH_HEALTH_MONITOR_H -#include "mon/QuorumService.h" +#include "mon/PaxosService.h" //forward declaration namespace ceph { class Formatter; } class HealthService; -class HealthMonitor : public QuorumService +class HealthMonitor : public PaxosService { map services; - -protected: - void service_shutdown() override; + version_t version = 0; + map quorum_checks; // for each quorum member + health_check_map_t leader_checks; // leader only public: - HealthMonitor(Monitor *m) : QuorumService(m) { } + HealthMonitor(Monitor *m, Paxos *p, const string& service_name); ~HealthMonitor() override { assert(services.empty()); } - /** * @defgroup HealthMonitor_Inherited_h Inherited abstract methods * @{ */ void init() override; - void get_health(Formatter *f, - list >& summary, - list > *detail) override; - bool service_dispatch(MonOpRequestRef op) override; - void start_epoch() override; + void get_health( + list >& summary, + list > *detail, + CephContext *cct) const override {} - void finish_epoch() override; + bool preprocess_query(MonOpRequestRef op) override; + bool prepare_update(MonOpRequestRef op) override; - void cleanup() override { } - void service_tick() override { } + bool preprocess_health_checks(MonOpRequestRef op); + bool prepare_health_checks(MonOpRequestRef op); - int get_type() override { - return QuorumService::SERVICE_HEALTH; - } + bool check_leader_health(); + bool check_member_health(); - string get_name() const override { - return "health"; - } + void create_initial() override; + void update_from_paxos(bool *need_bootstrap) override; + void create_pending() override; + void encode_pending(MonitorDBStore::TransactionRef t) override; + version_t get_trim_to() override; + + void encode_full(MonitorDBStore::TransactionRef t) override { } + + void tick() override; /** * @} // HealthMonitor_Inherited_h diff --git a/src/mon/MDSMonitor.cc b/src/mon/MDSMonitor.cc index 608e1aeedc3e8..c38e681a0312c 100644 --- a/src/mon/MDSMonitor.cc +++ b/src/mon/MDSMonitor.cc @@ -14,6 +14,7 @@ #include #include +#include #include "MDSMonitor.h" #include "FSCommands.h" @@ -99,6 +100,8 @@ void MDSMonitor::update_from_paxos(bool *need_bootstrap) << ", my e " << fsmap.epoch << dendl; assert(version > fsmap.epoch); + load_health(); + // read and decode bufferlist fsmap_bl; fsmap_bl.clear(); @@ -174,6 +177,65 @@ void MDSMonitor::encode_pending(MonitorDBStore::TransactionRef t) } pending_daemon_health_rm.clear(); remove_from_metadata(t); + + // health + health_check_map_t new_checks; + const auto info_map = pending_fsmap.get_mds_info(); + for (const auto &i : info_map) { + const auto &gid = i.first; + const auto &info = i.second; + if (pending_daemon_health_rm.count(gid)) { + continue; + } + MDSHealth health; + auto p = pending_daemon_health.find(gid); + if (p != pending_daemon_health.end()) { + health = p->second; + } else { + bufferlist bl; + mon->store->get(MDS_HEALTH_PREFIX, stringify(gid), bl); + if (!bl.length()) { + derr << "Missing health data for MDS " << gid << dendl; + continue; + } + 
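      // no pending in-memory update for this gid, so decode the health blob
      // most recently persisted for it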
bufferlist::iterator bl_i = bl.begin(); + health.decode(bl_i); + } + for (const auto &metric : health.metrics) { + int const rank = info.rank; + health_check_t *check = &new_checks.add( + mds_metric_name(metric.type), + metric.sev, + mds_metric_summary(metric.type)); + ostringstream ss; + ss << "mds" << info.name << "(mds." << rank << "): " << metric.message; + for (auto p = metric.metadata.begin(); + p != metric.metadata.end(); + ++p) { + if (p != metric.metadata.begin()) { + ss << ", "; + } + ss << p->first << ": " << p->second; + } + check->detail.push_back(ss.str()); + } + } + pending_fsmap.get_health_checks(&new_checks); + for (auto& p : new_checks.checks) { + p.second.summary = boost::regex_replace( + p.second.summary, + boost::regex("%num%"), + stringify(p.second.detail.size())); + p.second.summary = boost::regex_replace( + p.second.summary, + boost::regex("%plurals%"), + p.second.detail.size() > 1 ? "s" : ""); + p.second.summary = boost::regex_replace( + p.second.summary, + boost::regex("%isorare%"), + p.second.detail.size() > 1 ? "are" : "is"); + } + encode_health(new_checks, t); } version_t MDSMonitor::get_trim_to() @@ -741,8 +803,9 @@ void MDSMonitor::on_active() tick(); update_logger(); - if (mon->is_leader()) - mon->clog->info() << "fsmap " << fsmap; + if (mon->is_leader()) { + mon->clog->debug() << "fsmap " << fsmap; + } } void MDSMonitor::get_health(list >& summary, diff --git a/src/mon/MgrMonitor.cc b/src/mon/MgrMonitor.cc index 234453c7a7e2d..e1688a39681ef 100644 --- a/src/mon/MgrMonitor.cc +++ b/src/mon/MgrMonitor.cc @@ -60,6 +60,10 @@ void MgrMonitor::update_from_paxos(bool *need_bootstrap) dout(4) << "active server: " << map.active_addr << "(" << map.active_gid << ")" << dendl; + ever_had_active_mgr = get_value("ever_had_active_mgr"); + + load_health(); + if (map.available) { first_seen_inactive = utime_t(); } else { @@ -79,6 +83,27 @@ void MgrMonitor::create_pending() pending_map.epoch++; } +health_status_t MgrMonitor::should_warn_about_mgr_down() +{ + utime_t now = ceph_clock_now(); + // we warn if + // - we've ever had an active mgr, or + // - we have osds AND we've exceeded the grace period + // which means a new mon cluster and be HEALTH_OK indefinitely as long as + // no OSDs are ever created. 
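  // (the grace period means a cluster that already has OSDs but was only just
  // created gets mon_mgr_mkfs_grace seconds to start a mgr before MGR_DOWN is
  // raised)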
+ if (ever_had_active_mgr || + (mon->osdmon()->osdmap.get_num_osds() > 0 && + now > mon->monmap->created + g_conf->mon_mgr_mkfs_grace)) { + health_status_t level = HEALTH_WARN; + if (first_seen_inactive != utime_t() && + now - first_seen_inactive > g_conf->mon_mgr_inactive_grace) { + level = HEALTH_ERR; + } + return level; + } + return HEALTH_OK; +} + void MgrMonitor::encode_pending(MonitorDBStore::TransactionRef t) { dout(10) << __func__ << " " << pending_map << dendl; @@ -86,6 +111,20 @@ void MgrMonitor::encode_pending(MonitorDBStore::TransactionRef t) pending_map.encode(bl, mon->get_quorum_con_features()); put_version(t, pending_map.epoch, bl); put_last_committed(t, pending_map.epoch); + + health_check_map_t next; + if (pending_map.active_gid == 0) { + auto level = should_warn_about_mgr_down(); + if (level != HEALTH_OK) { + next.add("MGR_DOWN", level, "no active mgr"); + } else { + dout(10) << __func__ << " no health warning (never active and new cluster)" + << dendl; + } + } else { + put_value(t, "ever_had_active_mgr", 1); + } + encode_health(next, t); } bool MgrMonitor::check_caps(MonOpRequestRef op, const uuid_d& fsid) @@ -314,8 +353,7 @@ void MgrMonitor::send_digests() MMgrDigest *mdigest = new MMgrDigest; JSONFormatter f; - std::list health_strs; - mon->get_health(health_strs, nullptr, &f); + mon->get_health_status(true, &f, nullptr, nullptr, nullptr); f.flush(mdigest->health_json); f.reset(); @@ -343,8 +381,9 @@ void MgrMonitor::cancel_timer() void MgrMonitor::on_active() { - if (mon->is_leader()) - mon->clog->info() << "mgrmap e" << map.epoch << ": " << map; + if (mon->is_leader()) { + mon->clog->debug() << "mgrmap e" << map.epoch << ": " << map; + } } void MgrMonitor::get_health( @@ -363,7 +402,7 @@ void MgrMonitor::get_health( return; } - if (!map.available) { + if (map.active_gid == 0) { auto level = HEALTH_WARN; // do not escalate to ERR if they are still upgrading to jewel. if (mon->osdmon()->osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS) { @@ -434,11 +473,25 @@ void MgrMonitor::tick() } } + if (!pending_map.available && + should_warn_about_mgr_down() != HEALTH_OK) { + dout(10) << " exceeded mon_mgr_mkfs_grace " << g_conf->mon_mgr_mkfs_grace + << " seconds" << dendl; + propose = true; + } + if (propose) { propose_pending(); } } +void MgrMonitor::on_restart() +{ + // Clear out the leader-specific state. 
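  // beacon timestamps recorded before the restart are stale; drop them rather
  // than carrying them forward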
+ last_beacon.clear(); +} + + bool MgrMonitor::promote_standby() { assert(pending_map.active_gid == 0); diff --git a/src/mon/MgrMonitor.h b/src/mon/MgrMonitor.h index ea1a0a91a4aca..0dc1af571deea 100644 --- a/src/mon/MgrMonitor.h +++ b/src/mon/MgrMonitor.h @@ -22,6 +22,7 @@ class MgrMonitor: public PaxosService { MgrMap map; MgrMap pending_map; + bool ever_had_active_mgr = false; utime_t first_seen_inactive; @@ -42,6 +43,8 @@ class MgrMonitor: public PaxosService bool check_caps(MonOpRequestRef op, const uuid_d& fsid); + health_status_t should_warn_about_mgr_down(); + public: MgrMonitor(Monitor *mn, Paxos *p, const string& service_name) : PaxosService(mn, p, service_name) @@ -76,6 +79,8 @@ class MgrMonitor: public PaxosService void send_digests(); void on_active() override; + void on_restart() override; + void get_health(list >& summary, list > *detail, CephContext *cct) const override; diff --git a/src/mon/MgrStatMonitor.cc b/src/mon/MgrStatMonitor.cc index add84e278b72a..6a0606a4be9fa 100644 --- a/src/mon/MgrStatMonitor.cc +++ b/src/mon/MgrStatMonitor.cc @@ -71,7 +71,7 @@ MonPGStatService *MgrStatMonitor::get_pg_stat_service() void MgrStatMonitor::create_initial() { - dout(10) << dendl; + dout(10) << __func__ << dendl; version = 0; service_map.epoch = 1; ::encode(service_map, pending_service_map_bl, CEPH_FEATURES_ALL); @@ -81,6 +81,7 @@ void MgrStatMonitor::update_from_paxos(bool *need_bootstrap) { version = get_last_committed(); dout(10) << " " << version << dendl; + load_health(); bufferlist bl; get_version(version, bl); if (version) { @@ -88,8 +89,6 @@ void MgrStatMonitor::update_from_paxos(bool *need_bootstrap) try { auto p = bl.begin(); ::decode(digest, p); - ::decode(health_summary, p); - ::decode(health_detail, p); ::decode(service_map, p); dout(10) << __func__ << " v" << version << " service_map e" << service_map.epoch << dendl; @@ -151,8 +150,7 @@ void MgrStatMonitor::create_pending() { dout(10) << " " << version << dendl; pending_digest = digest; - pending_health_summary = health_summary; - pending_health_detail = health_detail; + pending_health_checks = get_health_checks(); pending_service_map_bl.clear(); ::encode(service_map, pending_service_map_bl, mon->get_quorum_con_features()); } @@ -168,12 +166,12 @@ void MgrStatMonitor::encode_pending(MonitorDBStore::TransactionRef t) dout(10) << " " << version << dendl; bufferlist bl; ::encode(pending_digest, bl, mon->get_quorum_con_features()); - ::encode(pending_health_summary, bl); - ::encode(pending_health_detail, bl); assert(pending_service_map_bl.length()); bl.append(pending_service_map_bl); put_version(t, version, bl); put_last_committed(t, version); + + encode_health(pending_health_checks, t); } version_t MgrStatMonitor::get_trim_to() @@ -194,14 +192,6 @@ void MgrStatMonitor::get_health(list >& summary, list > *detail, CephContext *cct) const { - if (mon->osdmon()->osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) { - return; - } - - summary.insert(summary.end(), health_summary.begin(), health_summary.end()); - if (detail) { - detail->insert(detail->end(), health_detail.begin(), health_detail.end()); - } } void MgrStatMonitor::tick() @@ -254,12 +244,12 @@ bool MgrStatMonitor::prepare_report(MonOpRequestRef op) bufferlist bl = m->get_data(); auto p = bl.begin(); ::decode(pending_digest, p); - dout(10) << __func__ << " " << pending_digest << dendl; - pending_health_summary.swap(m->health_summary); - pending_health_detail.swap(m->health_detail); + pending_health_checks.swap(m->health_checks); if (m->service_map_bl.length()) { 
pending_service_map_bl.swap(m->service_map_bl); } + dout(10) << __func__ << " " << pending_digest << ", " + << pending_health_checks.checks.size() << " health checks" << dendl; return true; } diff --git a/src/mon/MgrStatMonitor.h b/src/mon/MgrStatMonitor.h index ee58e293012f5..c1846a5447798 100644 --- a/src/mon/MgrStatMonitor.h +++ b/src/mon/MgrStatMonitor.h @@ -15,14 +15,11 @@ class MgrStatMonitor : public PaxosService { // live version version_t version = 0; PGMapDigest digest; - list> health_summary; - list> health_detail; ServiceMap service_map; // pending commit PGMapDigest pending_digest; - list> pending_health_summary; - list> pending_health_detail; + health_check_map_t pending_health_checks; bufferlist pending_service_map_bl; std::unique_ptr pgservice; diff --git a/src/mon/MonCommands.h b/src/mon/MonCommands.h index 8d974660f29d0..569c52760a353 100644 --- a/src/mon/MonCommands.h +++ b/src/mon/MonCommands.h @@ -210,6 +210,7 @@ COMMAND_WITH_FLAG("injectargs " \ COMMAND("status", "show cluster status", "mon", "r", "cli,rest") COMMAND("health name=detail,type=CephChoices,strings=detail,req=false", \ "show cluster health", "mon", "r", "cli,rest") +COMMAND("time-sync-status", "show time sync status", "mon", "r", "cli,rest") COMMAND("df name=detail,type=CephChoices,strings=detail,req=false", \ "show cluster free space stats", "mon", "r", "cli,rest") COMMAND("report name=tags,type=CephString,n=N,req=false", \ diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc index f4593f58499b3..9421b4cbf14fc 100644 --- a/src/mon/Monitor.cc +++ b/src/mon/Monitor.cc @@ -77,6 +77,7 @@ #include "MgrMonitor.h" #include "MgrStatMonitor.h" #include "mon/QuorumService.h" +#include "mon/OldHealthMonitor.h" #include "mon/HealthMonitor.h" #include "mon/ConfigKeyService.h" #include "common/config.h" @@ -204,8 +205,9 @@ Monitor::Monitor(CephContext* cct_, string nm, MonitorDBStore *s, paxos_service[PAXOS_AUTH] = new AuthMonitor(this, paxos, "auth"); paxos_service[PAXOS_MGR] = new MgrMonitor(this, paxos, "mgr"); paxos_service[PAXOS_MGRSTAT] = new MgrStatMonitor(this, paxos, "mgrstat"); + paxos_service[PAXOS_HEALTH] = new HealthMonitor(this, paxos, "health"); - health_monitor = new HealthMonitor(this); + health_monitor = new OldHealthMonitor(this); config_key_service = new ConfigKeyService(this, paxos); mon_caps = new MonCap(); @@ -2422,27 +2424,180 @@ void Monitor::do_health_to_clog(bool force) dout(10) << __func__ << (force ? " (force)" : "") << dendl; - list status; - health_status_t overall = get_health(status, NULL, NULL); + if (osdmon()->osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS) { + string summary; + health_status_t level = get_health_status(false, nullptr, &summary); + if (!force && + summary == health_status_cache.summary && + level == health_status_cache.overall) + return; + if (level == HEALTH_OK) + clog->info() << "overall " << summary; + else if (level == HEALTH_WARN) + clog->warn() << "overall " << summary; + else if (level == HEALTH_ERR) + clog->error() << "overall " << summary; + else + ceph_abort(); + health_status_cache.summary = summary; + health_status_cache.overall = level; + } else { + // for jewel only + list status; + health_status_t overall = get_health(status, NULL, NULL); + dout(25) << __func__ + << (force ? " (force)" : "") + << dendl; - dout(25) << __func__ - << (force ? 
" (force)" : "") - << dendl; + string summary = joinify(status.begin(), status.end(), string("; ")); + + if (!force && + overall == health_status_cache.overall && + !health_status_cache.summary.empty() && + health_status_cache.summary == summary) { + // we got a dup! + return; + } + + clog->info() << summary; + + health_status_cache.overall = overall; + health_status_cache.summary = summary; + } +} + +health_status_t Monitor::get_health_status( + bool want_detail, + Formatter *f, + std::string *plain, + const char *sep1, + const char *sep2) +{ + health_status_t r = HEALTH_OK; + bool compat = g_conf->mon_health_preluminous_compat; + if (f) { + f->open_object_section("health"); + f->open_object_section("checks"); + } + + string summary; + string *psummary = f ? nullptr : &summary; + for (auto& svc : paxos_service) { + r = std::min(r, svc->get_health_checks().dump_summary( + f, psummary, sep2, want_detail)); + } + + if (f) { + f->close_section(); + f->dump_stream("status") << r; + } else { + // one-liner: HEALTH_FOO[ thing1[; thing2 ...]] + *plain = stringify(r); + if (summary.size()) { + *plain += sep1; + *plain += summary; + } + *plain += "\n"; + } - string summary = joinify(status.begin(), status.end(), string("; ")); + if (f && compat) { + f->open_array_section("summary"); + for (auto& svc : paxos_service) { + svc->get_health_checks().dump_summary_compat(f); + } + f->close_section(); + f->dump_stream("overall_status") << r; + } - if (!force && - overall == health_status_cache.overall && - !health_status_cache.summary.empty() && - health_status_cache.summary == summary) { - // we got a dup! + if (want_detail) { + if (f && compat) { + f->open_array_section("detail"); + } + + for (auto& svc : paxos_service) { + svc->get_health_checks().dump_detail(f, plain, compat); + } + + if (f && compat) { + f->close_section(); + } + } + if (f) { + f->close_section(); + } + return r; +} + +void Monitor::log_health( + const health_check_map_t& updated, + const health_check_map_t& previous, + MonitorDBStore::TransactionRef t) +{ + if (!g_conf->mon_health_to_clog) { return; } + // FIXME: log atomically as part of @t instead of using clog. 
+ dout(10) << __func__ << " updated " << updated.checks.size() + << " previous " << previous.checks.size() + << dendl; + for (auto& p : updated.checks) { + auto q = previous.checks.find(p.first); + if (q == previous.checks.end()) { + // new + ostringstream ss; + ss << "Health check failed: " << p.second.summary << " (" + << p.first << ")"; + if (p.second.severity == HEALTH_WARN) + clog->warn() << ss.str(); + else + clog->error() << ss.str(); + } else { + if (p.second.summary != q->second.summary || + p.second.severity != q->second.severity) { + // summary or severity changed (ignore detail changes at this level) + ostringstream ss; + ss << "Health check update: " << p.second.summary << " (" << p.first << ")"; + if (p.second.severity == HEALTH_WARN) + clog->warn() << ss.str(); + else + clog->error() << ss.str(); + } + } + } + for (auto& p : previous.checks) { + if (!updated.checks.count(p.first)) { + // cleared + ostringstream ss; + if (p.first == "DEGRADED_OBJECTS") { + clog->info() << "All degraded objects recovered"; + } else if (p.first == "OSD_FLAGS") { + clog->info() << "OSD flags cleared"; + } else { + clog->info() << "Health check cleared: " << p.first << " (was: " + << p.second.summary << ")"; + } + } + } - clog->info() << summary; + if (previous.checks.size() && updated.checks.size() == 0) { + // We might be going into a fully healthy state, check + // other subsystems + bool any_checks = false; + for (auto& svc : paxos_service) { + if (&(svc->get_health_checks()) == &(previous)) { + // Ignore the ones we're clearing right now + continue; + } - health_status_cache.overall = overall; - health_status_cache.summary = summary; + if (svc->get_health_checks().checks.size() > 0) { + any_checks = true; + break; + } + } + if (!any_checks) { + clog->info() << "Cluster is now healthy"; + } + } } health_status_t Monitor::get_health(list& status, @@ -2462,52 +2617,29 @@ health_status_t Monitor::get_health(list& status, s->get_health(summary, detailbl ? &detail : NULL, cct); } - health_monitor->get_health(f, summary, (detailbl ? &detail : NULL)); - - if (f) { - f->open_object_section("timechecks"); - f->dump_unsigned("epoch", get_epoch()); - f->dump_int("round", timecheck_round); - f->dump_stream("round_status") - << ((timecheck_round%2) ? "on-going" : "finished"); - } + health_monitor->get_health(summary, (detailbl ? &detail : NULL)); health_status_t overall = HEALTH_OK; if (!timecheck_skews.empty()) { list warns; - if (f) - f->open_array_section("mons"); for (map::iterator i = timecheck_skews.begin(); i != timecheck_skews.end(); ++i) { entity_inst_t inst = i->first; double skew = i->second; double latency = timecheck_latencies[inst]; string name = monmap->get_name(inst.addr); - ostringstream tcss; health_status_t tcstatus = timecheck_status(tcss, skew, latency); if (tcstatus != HEALTH_OK) { if (overall > tcstatus) overall = tcstatus; warns.push_back(name); - ostringstream tmp_ss; tmp_ss << "mon." 
<< name << " addr " << inst.addr << " " << tcss.str() << " (latency " << latency << "s)"; detail.push_back(make_pair(tcstatus, tmp_ss.str())); } - - if (f) { - f->open_object_section("mon"); - f->dump_string("name", name.c_str()); - f->dump_float("skew", skew); - f->dump_float("latency", latency); - f->dump_stream("health") << tcstatus; - if (tcstatus != HEALTH_OK) - f->dump_stream("details") << tcss.str(); - f->close_section(); - } } if (!warns.empty()) { ostringstream ss; @@ -2521,11 +2653,7 @@ health_status_t Monitor::get_health(list& status, status.push_back(ss.str()); summary.push_back(make_pair(HEALTH_WARN, "Monitor clock skew detected ")); } - if (f) - f->close_section(); } - if (f) - f->close_section(); if (f) f->open_array_section("summary"); @@ -2577,12 +2705,9 @@ void Monitor::get_cluster_status(stringstream &ss, Formatter *f) if (f) f->open_object_section("status"); - // reply with the status for all the components - list health; - get_health(health, NULL, f); - if (f) { f->dump_stream("fsid") << monmap->get_fsid(); + get_health_status(false, f, nullptr); f->dump_unsigned("election_epoch", get_epoch()); { f->open_array_section("quorum"); @@ -2606,7 +2731,6 @@ void Monitor::get_cluster_status(stringstream &ss, Formatter *f) f->open_object_section("fsmap"); mdsmon()->get_fsmap().print_summary(f, NULL); f->close_section(); - f->open_object_section("mgrmap"); mgrmon()->get_map().print_summary(f, nullptr); f->close_section(); @@ -2614,11 +2738,21 @@ void Monitor::get_cluster_status(stringstream &ss, Formatter *f) f->dump_object("servicemap", mgrstatmon()->get_service_map()); f->close_section(); } else { - ss << " cluster:\n"; ss << " id: " << monmap->get_fsid() << "\n"; - ss << " health: " << joinify(health.begin(), health.end(), - string("\n ")) << "\n"; + + string health; + if (osdmon()->osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS) { + get_health_status(false, nullptr, &health, + "\n ", "\n "); + } else { + list ls; + get_health(ls, NULL, f); + health = joinify(ls.begin(), ls.end(), + string("\n ")); + } + ss << " health: " << health << "\n"; + ss << "\n \n services:\n"; { size_t maxlen = 3; @@ -3089,6 +3223,40 @@ void Monitor::handle_command(MonOpRequestRef op) rs = "must supply options to be parsed in a single string"; r = -EINVAL; } + } else if (prefix == "time-sync-status") { + if (!f) + f.reset(Formatter::create("json-pretty")); + f->open_object_section("time_sync"); + if (!timecheck_skews.empty()) { + f->open_object_section("time_skew_status"); + for (auto& i : timecheck_skews) { + entity_inst_t inst = i.first; + double skew = i.second; + double latency = timecheck_latencies[inst]; + string name = monmap->get_name(inst.addr); + ostringstream tcss; + health_status_t tcstatus = timecheck_status(tcss, skew, latency); + f->open_object_section(name.c_str()); + f->dump_float("skew", skew); + f->dump_float("latency", latency); + f->dump_stream("health") << tcstatus; + if (tcstatus != HEALTH_OK) { + f->dump_stream("details") << tcss.str(); + } + f->close_section(); + } + f->close_section(); + } + f->open_object_section("timechecks"); + f->dump_unsigned("epoch", get_epoch()); + f->dump_int("round", timecheck_round); + f->dump_stream("round_status") << ((timecheck_round%2) ? 
+ "on-going" : "finished"); + f->close_section(); + f->close_section(); + f->flush(rdata); + r = 0; + rs = ""; } else if (prefix == "status" || prefix == "health" || prefix == "df") { @@ -3105,25 +3273,35 @@ void Monitor::handle_command(MonOpRequestRef op) } rdata.append(ds); } else if (prefix == "health") { - list health_str; - get_health(health_str, detail == "detail" ? &rdata : NULL, f.get()); - if (f) { - f->flush(ds); - ds << '\n'; + if (osdmon()->osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS) { + string plain; + get_health_status(detail == "detail", f.get(), f ? nullptr : &plain); + if (f) { + f->flush(rdata); + } else { + rdata.append(plain); + } } else { - assert(!health_str.empty()); - ds << health_str.front(); - health_str.pop_front(); - if (!health_str.empty()) { - ds << ' '; - ds << joinify(health_str.begin(), health_str.end(), string("; ")); + list health_str; + get_health(health_str, detail == "detail" ? &rdata : NULL, f.get()); + if (f) { + f->flush(ds); + ds << '\n'; + } else { + assert(!health_str.empty()); + ds << health_str.front(); + health_str.pop_front(); + if (!health_str.empty()) { + ds << ' '; + ds << joinify(health_str.begin(), health_str.end(), string("; ")); + } } + bufferlist comb; + comb.append(ds); + if (detail == "detail") + comb.append(rdata); + rdata = comb; } - bufferlist comb; - comb.append(ds); - if (detail == "detail") - comb.append(rdata); - rdata = comb; } else if (prefix == "df") { bool verbose = (detail == "detail"); if (f) @@ -4112,6 +4290,11 @@ void Monitor::dispatch_op(MonOpRequestRef op) health_monitor->dispatch(op); break; + case MSG_MON_HEALTH_CHECKS: + op->set_type_service(); + paxos_service[PAXOS_HEALTH]->dispatch(op); + break; + default: dealt_with = false; break; diff --git a/src/mon/Monitor.h b/src/mon/Monitor.h index ac3f3f0b01838..fa7f9e9acdd32 100644 --- a/src/mon/Monitor.h +++ b/src/mon/Monitor.h @@ -27,10 +27,12 @@ #include #include "include/types.h" +#include "include/health.h" #include "msg/Messenger.h" #include "common/Timer.h" +#include "health_check.h" #include "MonMap.h" #include "Elector.h" #include "Paxos.h" @@ -496,6 +498,7 @@ class Monitor : public Dispatcher, version_t timecheck_round; unsigned int timecheck_acks; utime_t timecheck_round_start; + friend class HealthMonitor; /* When we hit a skew we will start a new round based off of * 'mon_timecheck_skew_interval'. 
Each new round will be backed off * until we hit 'mon_timecheck_interval' -- which is the typical @@ -648,6 +651,10 @@ class Monitor : public Dispatcher, return (class MgrStatMonitor*) paxos_service[PAXOS_MGRSTAT]; } + class MgrStatMonitor *healthmon() { + return (class MgrStatMonitor*) paxos_service[PAXOS_MGRSTAT]; + } + friend class Paxos; friend class OSDMonitor; friend class MDSMonitor; @@ -737,6 +744,18 @@ class Monitor : public Dispatcher, */ health_status_t get_health(list& status, bufferlist *detailbl, Formatter *f); + + health_status_t get_health_status( + bool want_detail, + Formatter *f, + std::string *plain, + const char *sep1 = " ", + const char *sep2 = "; "); + void log_health( + const health_check_map_t& updated, + const health_check_map_t& previous, + MonitorDBStore::TransactionRef t); + void get_cluster_status(stringstream &ss, Formatter *f); void reply_command(MonOpRequestRef op, int rc, const string &rs, version_t version); diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 05177b1f71e6c..a39f58ce21498 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -17,6 +17,8 @@ */ #include +#include +#include #include #include "mon/OSDMonitor.h" @@ -275,6 +277,8 @@ void OSDMonitor::update_from_paxos(bool *need_bootstrap) mapping_job.reset(); } + load_health(); + /* * We will possibly have a stashed latest that *we* wrote, and we will * always be sure to have the oldest full map in the first..last range @@ -532,7 +536,7 @@ void OSDMonitor::on_active() update_logger(); if (mon->is_leader()) { - mon->clog->info() << "osdmap " << osdmap; + mon->clog->debug() << "osdmap " << osdmap; } else { list ls; take_all_failures(ls); @@ -1101,6 +1105,11 @@ void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t) ::encode(pending_creatings, creatings_bl); t->put(OSD_PG_CREATING_PREFIX, "creating", creatings_bl); } + + // health + health_check_map_t next; + tmp.check_health(&next); + encode_health(next, t); } void OSDMonitor::trim_creating_pgs(creating_pgs_t* creating_pgs, @@ -3246,7 +3255,8 @@ void OSDMonitor::tick() do_propose = true; - mon->clog->info() << "osd." << o << " out (down for " << down << ")"; + mon->clog->info() << "Marking osd." 
<< o << " out (has been down for " + << int(down.sec()) << " seconds)"; } else continue; } @@ -3365,7 +3375,7 @@ void OSDMonitor::get_health(list >& summary, osds.insert(i); } continue; - } + } if (osdmap.is_out(i)) continue; ++num_in_osds; @@ -3776,7 +3786,17 @@ void OSDMonitor::get_health(list >& summary, } } - get_pools_health(summary, detail); + for (auto it : osdmap.get_pools()) { + const pg_pool_t &pool = it.second; + if (pool.has_flag(pg_pool_t::FLAG_FULL)) { + const string& pool_name = osdmap.get_pool_name(it.first); + stringstream ss; + ss << "pool '" << pool_name << "' is full"; + summary.push_back(make_pair(HEALTH_WARN, ss.str())); + if (detail) + detail->push_back(make_pair(HEALTH_WARN, ss.str())); + } + } } } @@ -5038,90 +5058,6 @@ bool OSDMonitor::update_pools_status() return ret; } -void OSDMonitor::get_pools_health( - list >& summary, - list > *detail) const -{ - auto& pools = osdmap.get_pools(); - for (auto it = pools.begin(); it != pools.end(); ++it) { - const pool_stat_t *pstat = mon->pgservice->get_pool_stat(it->first); - if (!pstat) - continue; - const object_stat_sum_t& sum = pstat->stats.sum; - const pg_pool_t &pool = it->second; - const string& pool_name = osdmap.get_pool_name(it->first); - - if (pool.has_flag(pg_pool_t::FLAG_FULL)) { - // uncomment these asserts if/when we update the FULL flag on pg_stat update - //assert((pool.quota_max_objects > 0) || (pool.quota_max_bytes > 0)); - - stringstream ss; - ss << "pool '" << pool_name << "' is full"; - summary.push_back(make_pair(HEALTH_WARN, ss.str())); - if (detail) - detail->push_back(make_pair(HEALTH_WARN, ss.str())); - } - - float warn_threshold = (float)g_conf->mon_pool_quota_warn_threshold/100; - float crit_threshold = (float)g_conf->mon_pool_quota_crit_threshold/100; - - if (pool.quota_max_objects > 0) { - stringstream ss; - health_status_t status = HEALTH_OK; - if ((uint64_t)sum.num_objects >= pool.quota_max_objects) { - // uncomment these asserts if/when we update the FULL flag on pg_stat update - //assert(pool.has_flag(pg_pool_t::FLAG_FULL)); - } else if (crit_threshold > 0 && - sum.num_objects >= pool.quota_max_objects*crit_threshold) { - ss << "pool '" << pool_name - << "' has " << sum.num_objects << " objects" - << " (max " << pool.quota_max_objects << ")"; - status = HEALTH_ERR; - } else if (warn_threshold > 0 && - sum.num_objects >= pool.quota_max_objects*warn_threshold) { - ss << "pool '" << pool_name - << "' has " << sum.num_objects << " objects" - << " (max " << pool.quota_max_objects << ")"; - status = HEALTH_WARN; - } - if (status != HEALTH_OK) { - pair s(status, ss.str()); - summary.push_back(s); - if (detail) - detail->push_back(s); - } - } - - if (pool.quota_max_bytes > 0) { - health_status_t status = HEALTH_OK; - stringstream ss; - if ((uint64_t)sum.num_bytes >= pool.quota_max_bytes) { - // uncomment these asserts if/when we update the FULL flag on pg_stat update - //assert(pool.has_flag(pg_pool_t::FLAG_FULL)); - } else if (crit_threshold > 0 && - sum.num_bytes >= pool.quota_max_bytes*crit_threshold) { - ss << "pool '" << pool_name - << "' has " << si_t(sum.num_bytes) << " bytes" - << " (max " << si_t(pool.quota_max_bytes) << ")"; - status = HEALTH_ERR; - } else if (warn_threshold > 0 && - sum.num_bytes >= pool.quota_max_bytes*warn_threshold) { - ss << "pool '" << pool_name - << "' has " << si_t(sum.num_bytes) << " bytes" - << " (max " << si_t(pool.quota_max_bytes) << ")"; - status = HEALTH_WARN; - } - if (status != HEALTH_OK) { - pair s(status, ss.str()); - summary.push_back(s); - if (detail) 
- detail->push_back(s); - } - } - } -} - - int OSDMonitor::prepare_new_pool(MonOpRequestRef op) { op->mark_osdmon_event(__func__); @@ -8468,6 +8404,17 @@ bool OSDMonitor::prepare_command_impl(MonOpRequestRef op, pending_inc.new_xinfo[osd].old_weight = osdmap.osd_weight[osd]; } ss << "marked out osd." << osd << ". "; + std::ostringstream msg; + msg << "Client " << op->get_session()->entity_name + << " marked osd." << osd << " out"; + if (osdmap.is_up(osd)) { + msg << ", while it was still marked up"; + } else { + msg << ", after it was down for " << int(down_pending_out[osd].sec()) + << " seconds"; + } + + mon->clog->info() << msg.str(); any = true; } } else if (prefix == "osd in") { diff --git a/src/mon/OSDMonitor.h b/src/mon/OSDMonitor.h index 3a9a27f5c37fa..9a944107970b5 100644 --- a/src/mon/OSDMonitor.h +++ b/src/mon/OSDMonitor.h @@ -359,8 +359,6 @@ class OSDMonitor : public PaxosService { void update_pool_flags(int64_t pool_id, uint64_t flags); bool update_pools_status(); - void get_pools_health(list >& summary, - list > *detail) const; bool prepare_set_flag(MonOpRequestRef op, int flag); bool prepare_unset_flag(MonOpRequestRef op, int flag); diff --git a/src/mon/OldHealthMonitor.cc b/src/mon/OldHealthMonitor.cc new file mode 100644 index 0000000000000..d7264a7ee26bd --- /dev/null +++ b/src/mon/OldHealthMonitor.cc @@ -0,0 +1,107 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 Inktank, Inc + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include +#include +#include + +// #include +// Because intusive_ptr clobbers our assert... +#include "include/assert.h" + +#include "mon/Monitor.h" +#include "mon/HealthService.h" +#include "mon/OldHealthMonitor.h" +#include "mon/DataHealthService.h" + +#include "messages/MMonHealth.h" +#include "common/Formatter.h" +// #include "common/config.h" + +#define dout_subsys ceph_subsys_mon +#undef dout_prefix +#define dout_prefix _prefix(_dout, mon, this) +static ostream& _prefix(std::ostream *_dout, const Monitor *mon, + const OldHealthMonitor *hmon) { + return *_dout << "mon." << mon->name << "@" << mon->rank + << "(" << mon->get_state_name() << ")." << hmon->get_name() + << "(" << hmon->get_epoch() << ") "; +} + +void OldHealthMonitor::init() +{ + dout(10) << __func__ << dendl; + assert(services.empty()); + services[HealthService::SERVICE_HEALTH_DATA] = new DataHealthService(mon); + + for (map::iterator it = services.begin(); + it != services.end(); + ++it) { + it->second->init(); + } +} + +bool OldHealthMonitor::service_dispatch(MonOpRequestRef op) +{ + assert(op->get_req()->get_type() == MSG_MON_HEALTH); + MMonHealth *hm = static_cast(op->get_req()); + int service_type = hm->get_service_type(); + if (services.count(service_type) == 0) { + dout(1) << __func__ << " service type " << service_type + << " not registered -- drop message!" 
<< dendl; + return false; + } + return services[service_type]->service_dispatch(op); +} + +void OldHealthMonitor::start_epoch() { + epoch_t epoch = get_epoch(); + for (map::iterator it = services.begin(); + it != services.end(); ++it) { + it->second->start(epoch); + } +} + +void OldHealthMonitor::finish_epoch() { + generic_dout(20) << "OldHealthMonitor::finish_epoch()" << dendl; + for (map::iterator it = services.begin(); + it != services.end(); ++it) { + assert(it->second != NULL); + it->second->finish(); + } +} + +void OldHealthMonitor::service_shutdown() +{ + dout(0) << "OldHealthMonitor::service_shutdown " + << services.size() << " services" << dendl; + for (map::iterator it = services.begin(); + it != services.end(); + ++it) { + it->second->shutdown(); + delete it->second; + } + services.clear(); +} + +void OldHealthMonitor::get_health( + list >& summary, + list > *detail) +{ + for (map::iterator it = services.begin(); + it != services.end(); + ++it) { + it->second->get_health(summary, detail); + } +} diff --git a/src/mon/OldHealthMonitor.h b/src/mon/OldHealthMonitor.h new file mode 100644 index 0000000000000..f295693611b25 --- /dev/null +++ b/src/mon/OldHealthMonitor.h @@ -0,0 +1,66 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 Inktank, Inc + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ +#ifndef CEPH_MON_OLDHEALTHMONITOR_H +#define CEPH_MON_OLDHEALTHMONITOR_H + +#include "mon/QuorumService.h" + +//forward declaration +namespace ceph { class Formatter; } +class HealthService; + +class OldHealthMonitor : public QuorumService +{ + map services; + +protected: + void service_shutdown() override; + +public: + OldHealthMonitor(Monitor *m) : QuorumService(m) { } + ~OldHealthMonitor() override { + assert(services.empty()); + } + + + /** + * @defgroup OldHealthMonitor_Inherited_h Inherited abstract methods + * @{ + */ + void init() override; + void get_health(list >& summary, + list > *detail) override; + bool service_dispatch(MonOpRequestRef op) override; + + void start_epoch() override; + + void finish_epoch() override; + + void cleanup() override { } + void service_tick() override { } + + int get_type() override { + return QuorumService::SERVICE_HEALTH; + } + + string get_name() const override { + return "health"; + } + + /** + * @} // OldHealthMonitor_Inherited_h + */ +}; + +#endif diff --git a/src/mon/PGMap.cc b/src/mon/PGMap.cc index 88add444b75cd..913e035f7ef21 100644 --- a/src/mon/PGMap.cc +++ b/src/mon/PGMap.cc @@ -1,6 +1,8 @@ // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab +#include + #include "PGMap.h" #define dout_subsys ceph_subsys_mon @@ -2548,7 +2550,671 @@ namespace { ss << pgs_count << " unscrubbed pgs"; summary.push_back(make_pair(HEALTH_WARN, ss.str())); } + } +} + +void PGMap::get_health_checks( + CephContext *cct, + const OSDMap& osdmap, + health_check_map_t *checks) const +{ + utime_t now = ceph_clock_now(); + const unsigned max = cct->_conf->mon_health_max_detail; + const auto& pools = osdmap.get_pools(); + + checks->clear(); + + typedef enum pg_consequence_t { + UNAVAILABLE = 1, // Client IO to the pool may block + DEGRADED = 2, // Fewer than the requested number of replicas are present + DEGRADED_FULL = 3, // 
Fewer than the requested number of replicas may be present + // and insufficient resources are present to fix this + DAMAGED = 4 // The data may be missing or inconsistent on disk and + // requires repair + } pg_consequence_t; + + // For a given PG state, how should it be reported at the pool level? + class PgStateResponse { + public: + pg_consequence_t consequence; + typedef std::function< utime_t(const pg_stat_t&) > stuck_cb; + stuck_cb stuck_since; + bool invert; + + PgStateResponse(const pg_consequence_t &c, stuck_cb s) + : consequence(c), stuck_since(s), invert(false) + { + } + + PgStateResponse(const pg_consequence_t &c, stuck_cb s, bool i) + : consequence(c), stuck_since(s), invert(i) + { + } + }; + + // Record the PG state counts that contributed to a reported pool state + class PgCauses { + public: + // Map of PG_STATE_* to number of pgs in that state. + std::map states; + + // List of all PG IDs that had a state contributing + // to this health condition. + std::set pgs; + + std::map pg_messages; + }; + + // Map of PG state to how to respond to it + std::map state_to_response = { + // Immediate reports + { PG_STATE_INCONSISTENT, {DAMAGED, {}} }, + { PG_STATE_INCOMPLETE, {DEGRADED, {}} }, + { PG_STATE_REPAIR, {DAMAGED, {}} }, + { PG_STATE_SNAPTRIM_ERROR, {DAMAGED, {}} }, + { PG_STATE_BACKFILL_TOOFULL, {DEGRADED, {}} }, + { PG_STATE_RECOVERY_TOOFULL, {DEGRADED, {}} }, + { PG_STATE_DEGRADED, {DEGRADED, {}} }, + { PG_STATE_DOWN, {UNAVAILABLE, {}} }, + // Delayed (wait until stuck) reports + { PG_STATE_PEERING, {UNAVAILABLE, [](const pg_stat_t &p){return p.last_peered;} } }, + { PG_STATE_UNDERSIZED, {DEGRADED, [](const pg_stat_t &p){return p.last_fullsized;} } }, + { PG_STATE_STALE, {UNAVAILABLE, [](const pg_stat_t &p){return p.last_unstale;} } }, + // Delayed and inverted reports + { PG_STATE_ACTIVE, {UNAVAILABLE, [](const pg_stat_t &p){return p.last_active;}, true} }, + { PG_STATE_CLEAN, {DEGRADED, [](const pg_stat_t &p){return p.last_clean;}, true} } + }; + + // Specialized state printer that takes account of inversion of + // ACTIVE, CLEAN checks. + auto state_name = [](const uint32_t &state) { + // Special cases for the states that are inverted checks + if (state == PG_STATE_CLEAN) { + return std::string("unclean"); + } else if (state == PG_STATE_ACTIVE) { + return std::string("inactive"); + } else { + return pg_state_string(state); + } + }; + + // Map of what is wrong to information about why, implicitly also stores + // the list of what is wrong. 
+ std::map detected; + + // Optimisation: trim down the number of checks to apply based on + // the summary counters + std::map possible_responses; + for (const auto &i : num_pg_by_state) { + for (const auto &j : state_to_response) { + if (!j.second.invert) { + // Check for normal tests by seeing if any pgs have the flag + if (i.first & j.first) { + possible_responses.insert(j); + } + } + } + } + + for (const auto &j : state_to_response) { + if (j.second.invert) { + // Check for inverted tests by seeing if not-all pgs have the flag + const auto &found = num_pg_by_state.find(j.first); + if (found == num_pg_by_state.end() || found->second != num_pg) { + possible_responses.insert(j); + } + } + } + + utime_t cutoff = now - utime_t(cct->_conf->mon_pg_stuck_threshold, 0); + // Loop over all PGs, if there are any possibly-unhealthy states in there + if (!possible_responses.empty()) { + for (const auto& i : pg_stat) { + const auto &pg_id = i.first; + const auto &pg_info = i.second; + + for (const auto &j : state_to_response) { + const auto &pg_response_state = j.first; + const auto &pg_response = j.second; + + // Apply the state test + if (!(bool(pg_info.state & pg_response_state) != pg_response.invert)) { + continue; + } + + // Apply stuckness test if needed + if (pg_response.stuck_since) { + // Delayed response, check for stuckness + utime_t last_whatever = pg_response.stuck_since(pg_info); + if (last_whatever >= cutoff) { + // Not stuck enough, ignore. + continue; + } else { + + } + } + + auto &causes = detected[pg_response.consequence]; + causes.states[pg_response_state]++; + causes.pgs.insert(pg_id); + + // Don't bother composing detail string if we have already recorded + // too many + if (causes.pg_messages.size() > max) { + continue; + } + + std::ostringstream ss; + if (pg_response.stuck_since) { + utime_t since = pg_response.stuck_since(pg_info); + ss << "pg " << pg_id << " is stuck " << state_name(pg_response_state); + if (since == utime_t()) { + ss << " since forever"; + } else { + utime_t dur = now - since; + ss << " for " << dur; + } + ss << ", current state " << pg_state_string(pg_info.state) + << ", last acting " << pg_info.acting; + } else { + ss << "pg " << pg_id << " is " + << pg_state_string(pg_info.state); + ss << ", acting " << pg_info.acting; + if (pg_info.stats.sum.num_objects_unfound) { + ss << ", " << pg_info.stats.sum.num_objects_unfound + << " unfound"; + } + } + + if (pg_info.state & PG_STATE_INCOMPLETE) { + const pg_pool_t *pi = osdmap.get_pg_pool(pg_id.pool()); + if (pi && pi->min_size > 1) { + ss << " (reducing pool " + << osdmap.get_pool_name(pg_id.pool()) + << " min_size from " << (int)pi->min_size + << " may help; search ceph.com/docs for 'incomplete')"; + } + } + + causes.pg_messages[pg_id] = ss.str(); + } + } + } else { + dout(10) << __func__ << " skipping loop over PGs: counters look OK" << dendl; + } + + for (const auto &i : detected) { + std::string health_code; + health_status_t sev; + std::string summary; + switch(i.first) { + case UNAVAILABLE: + health_code = "PG_AVAILABILITY"; + sev = HEALTH_WARN; + summary = "Reduced data availability: "; + break; + case DEGRADED: + health_code = "PG_DEGRADED"; + summary = "Degraded data redundancy: "; + sev = HEALTH_WARN; + break; + case DEGRADED_FULL: + health_code = "PG_DEGRADED_FULL"; + summary = "Degraded data redundancy (low space): "; + sev = HEALTH_ERR; + break; + case DAMAGED: + health_code = "PG_DAMAGED"; + summary = "Possible data damage: "; + sev = HEALTH_ERR; + break; + default: + assert(false); + } + + if 
(i.first == DEGRADED) { + if (pg_sum.stats.sum.num_objects_degraded && + pg_sum.stats.sum.num_object_copies > 0) { + double pc = (double)pg_sum.stats.sum.num_objects_degraded / + (double)pg_sum.stats.sum.num_object_copies * (double)100.0; + char b[20]; + snprintf(b, sizeof(b), "%.3lf", pc); + ostringstream ss; + ss << pg_sum.stats.sum.num_objects_degraded + << "/" << pg_sum.stats.sum.num_object_copies << " objects degraded (" + << b << "%)"; + + // Throw in a comma for the benefit of the following PG counts + summary += ss.str() + ", "; + } + } + + // Compose summary message saying how many PGs in what states led + // to this health check failing + std::vector pg_msgs; + for (const auto &j : i.second.states) { + std::ostringstream msg; + msg << j.second << (j.second > 1 ? " pgs " : " pg ") << state_name(j.first); + pg_msgs.push_back(msg.str()); + } + summary += joinify(pg_msgs.begin(), pg_msgs.end(), std::string(", ")); + + + + health_check_t *check = &checks->add( + health_code, + sev, + summary); + + // Compose list of PGs contributing to this health check failing + for (const auto &j : i.second.pg_messages) { + check->detail.push_back(j.second); + } + } + + // OSD_SKEWED_USAGE + if (cct->_conf->mon_warn_osd_usage_min_max_delta) { + int max_osd = -1, min_osd = -1; + float max_osd_usage = 0.0, min_osd_usage = 1.0; + for (auto p = osd_stat.begin(); p != osd_stat.end(); ++p) { + // kb should never be 0, but avoid divide by zero in case of corruption + if (p->second.kb <= 0) + continue; + float usage = ((float)p->second.kb_used) / ((float)p->second.kb); + if (usage > max_osd_usage) { + max_osd_usage = usage; + max_osd = p->first; + } + if (usage < min_osd_usage) { + min_osd_usage = usage; + min_osd = p->first; + } + } + float diff = max_osd_usage - min_osd_usage; + if (diff > cct->_conf->mon_warn_osd_usage_min_max_delta) { + auto& d = checks->add("OSD_SKEWED_USAGE", HEALTH_WARN, + "skewed osd utilization"); + ostringstream ss; + ss << "difference between min (osd." << min_osd << " at " + << roundf(min_osd_usage*1000.0)/100.0 + << "%) and max (osd." 
<< max_osd << " at " + << roundf(max_osd_usage*1000.0)/100.0 + << "%) osd usage " << roundf(diff*1000.0)/100.0 << "% > " + << roundf(cct->_conf->mon_warn_osd_usage_min_max_delta*1000.0)/100.0 + << " (mon_warn_osd_usage_min_max_delta)"; + d.detail.push_back(ss.str()); + } + } + + // OSD_SCRUB_ERRORS + if (pg_sum.stats.sum.num_scrub_errors) { + ostringstream ss; + ss << pg_sum.stats.sum.num_scrub_errors << " scrub errors"; + checks->add("OSD_SCRUB_ERRORS", HEALTH_ERR, ss.str()); + } + + // CACHE_POOL_NEAR_FULL + { + list detail; + unsigned num_pools = 0; + for (auto& p : pools) { + if ((!p.second.target_max_objects && !p.second.target_max_bytes) || + !pg_pool_sum.count(p.first)) { + continue; + } + bool nearfull = false; + const string& name = osdmap.get_pool_name(p.first); + const pool_stat_t& st = get_pg_pool_sum_stat(p.first); + uint64_t ratio = p.second.cache_target_full_ratio_micro + + ((1000000 - p.second.cache_target_full_ratio_micro) * + cct->_conf->mon_cache_target_full_warn_ratio); + if (p.second.target_max_objects && + (uint64_t)(st.stats.sum.num_objects - + st.stats.sum.num_objects_hit_set_archive) > + p.second.target_max_objects * (ratio / 1000000.0)) { + ostringstream ss; + ss << "cache pool '" << name << "' with " + << si_t(st.stats.sum.num_objects) + << " objects at/near target max " + << si_t(p.second.target_max_objects) << " objects"; + detail.push_back(ss.str()); + nearfull = true; + } + if (p.second.target_max_bytes && + (uint64_t)(st.stats.sum.num_bytes - + st.stats.sum.num_bytes_hit_set_archive) > + p.second.target_max_bytes * (ratio / 1000000.0)) { + ostringstream ss; + ss << "cache pool '" << name + << "' with " << si_t(st.stats.sum.num_bytes) + << "B at/near target max " + << si_t(p.second.target_max_bytes) << "B"; + detail.push_back(ss.str()); + nearfull = true; + } + if (nearfull) { + ++num_pools; + } + } + if (!detail.empty()) { + ostringstream ss; + ss << num_pools << " cache pools at or near target size"; + auto& d = checks->add("CACHE_POOL_NEAR_FULL", HEALTH_WARN, ss.str()); + d.detail.swap(detail); + } + } + + // TOO_FEW_PGS + int num_in = osdmap.get_num_in_osds(); + int sum_pg_up = MAX(pg_sum.up, static_cast(pg_stat.size())); + if (num_in && + cct->_conf->mon_pg_warn_min_per_osd > 0 && + osdmap.get_pools().size() > 0) { + int per = sum_pg_up / num_in; + if (per < cct->_conf->mon_pg_warn_min_per_osd && per) { + ostringstream ss; + ss << "too few PGs per OSD (" << per + << " < min " << cct->_conf->mon_pg_warn_min_per_osd << ")"; + checks->add("TOO_FEW_PGS", HEALTH_WARN, ss.str()); + } + } + + // TOO_MANY_PGS + if (num_in && cct->_conf->mon_pg_warn_max_per_osd > 0) { + int per = sum_pg_up / num_in; + if (per > cct->_conf->mon_pg_warn_max_per_osd) { + ostringstream ss; + ss << "too many PGs per OSD (" << per + << " > max " << cct->_conf->mon_pg_warn_max_per_osd << ")"; + checks->add("TOO_MANY_PGS", HEALTH_WARN, ss.str()); + } + } + + // SMALLER_PGP_NUM + // MANY_OBJECTS_PER_PG + if (!pg_stat.empty()) { + list pgp_detail, many_detail; + for (auto p = pg_pool_sum.begin(); + p != pg_pool_sum.end(); + ++p) { + const pg_pool_t *pi = osdmap.get_pg_pool(p->first); + if (!pi) + continue; // in case osdmap changes haven't propagated to PGMap yet + const string& name = osdmap.get_pool_name(p->first); + if (pi->get_pg_num() > pi->get_pgp_num() && + !(name.find(".DELETED") != string::npos && + cct->_conf->mon_fake_pool_delete)) { + ostringstream ss; + ss << "pool " << name << " pg_num " + << pi->get_pg_num() << " > pgp_num " << pi->get_pgp_num(); + 
pgp_detail.push_back(ss.str()); + } + int average_objects_per_pg = pg_sum.stats.sum.num_objects / pg_stat.size(); + if (average_objects_per_pg > 0 && + pg_sum.stats.sum.num_objects >= cct->_conf->mon_pg_warn_min_objects && + p->second.stats.sum.num_objects >= + cct->_conf->mon_pg_warn_min_pool_objects) { + int objects_per_pg = p->second.stats.sum.num_objects / pi->get_pg_num(); + float ratio = (float)objects_per_pg / (float)average_objects_per_pg; + if (cct->_conf->mon_pg_warn_max_object_skew > 0 && + ratio > cct->_conf->mon_pg_warn_max_object_skew) { + ostringstream ss; + ss << "pool " << name << " objects per pg (" + << objects_per_pg << ") is more than " << ratio + << " times cluster average (" + << average_objects_per_pg << ")"; + many_detail.push_back(ss.str()); + } + } + } + if (!pgp_detail.empty()) { + ostringstream ss; + ss << pgp_detail.size() << " pools have pg_num > pgp_num"; + auto& d = checks->add("SMALLER_PGP_NUM", HEALTH_WARN, ss.str()); + d.detail.swap(pgp_detail); + } + if (!many_detail.empty()) { + ostringstream ss; + ss << many_detail.size() << " pools have many more objects per pg than" + << " average"; + auto& d = checks->add("MANY_OBJECTS_PER_PG", HEALTH_WARN, ss.str()); + d.detail.swap(many_detail); + } + } + + // POOL_FULL + // POOL_NEAR_FULL + { + float warn_threshold = (float)g_conf->mon_pool_quota_warn_threshold/100; + float crit_threshold = (float)g_conf->mon_pool_quota_crit_threshold/100; + list full_detail, nearfull_detail; + unsigned full_pools = 0, nearfull_pools = 0; + for (auto it : pools) { + auto it2 = pg_pool_sum.find(it.first); + if (it2 == pg_pool_sum.end()) { + continue; + } + const pool_stat_t *pstat = &it2->second; + const object_stat_sum_t& sum = pstat->stats.sum; + const string& pool_name = osdmap.get_pool_name(it.first); + const pg_pool_t &pool = it.second; + bool full = false, nearfull = false; + if (pool.quota_max_objects > 0) { + stringstream ss; + if ((uint64_t)sum.num_objects >= pool.quota_max_objects) { + } else if (crit_threshold > 0 && + sum.num_objects >= pool.quota_max_objects*crit_threshold) { + ss << "pool '" << pool_name + << "' has " << sum.num_objects << " objects" + << " (max " << pool.quota_max_objects << ")"; + full_detail.push_back(ss.str()); + full = true; + } else if (warn_threshold > 0 && + sum.num_objects >= pool.quota_max_objects*warn_threshold) { + ss << "pool '" << pool_name + << "' has " << sum.num_objects << " objects" + << " (max " << pool.quota_max_objects << ")"; + nearfull_detail.push_back(ss.str()); + nearfull = true; + } + } + if (pool.quota_max_bytes > 0) { + stringstream ss; + if ((uint64_t)sum.num_bytes >= pool.quota_max_bytes) { + } else if (crit_threshold > 0 && + sum.num_bytes >= pool.quota_max_bytes*crit_threshold) { + ss << "pool '" << pool_name + << "' has " << si_t(sum.num_bytes) << " bytes" + << " (max " << si_t(pool.quota_max_bytes) << ")"; + full_detail.push_back(ss.str()); + full = true; + } else if (warn_threshold > 0 && + sum.num_bytes >= pool.quota_max_bytes*warn_threshold) { + ss << "pool '" << pool_name + << "' has " << si_t(sum.num_bytes) << " bytes" + << " (max " << si_t(pool.quota_max_bytes) << ")"; + nearfull_detail.push_back(ss.str()); + nearfull = true; + } + } + if (full) { + ++full_pools; + } + if (nearfull) { + ++nearfull_pools; + } + } + if (full_pools) { + ostringstream ss; + ss << full_pools << " pools full"; + auto& d = checks->add("POOL_FULL", HEALTH_ERR, ss.str()); + d.detail.swap(full_detail); + } + if (nearfull_pools) { + ostringstream ss; + ss << nearfull_pools << " pools 
full"; + auto& d = checks->add("POOL_NEAR_FULL", HEALTH_WARN, ss.str()); + d.detail.swap(nearfull_detail); + } + } + // OBJECT_MISPLACED + if (pg_sum.stats.sum.num_objects_misplaced && + pg_sum.stats.sum.num_object_copies > 0) { + double pc = (double)pg_sum.stats.sum.num_objects_misplaced / + (double)pg_sum.stats.sum.num_object_copies * (double)100.0; + char b[20]; + snprintf(b, sizeof(b), "%.3lf", pc); + ostringstream ss; + ss << pg_sum.stats.sum.num_objects_misplaced + << "/" << pg_sum.stats.sum.num_object_copies << " objects misplaced (" + << b << "%)"; + checks->add("OBJECT_MISPLACED", HEALTH_WARN, ss.str()); + } + + // OBJECT_UNFOUND + if (pg_sum.stats.sum.num_objects_unfound && + pg_sum.stats.sum.num_objects) { + double pc = (double)pg_sum.stats.sum.num_objects_unfound / + (double)pg_sum.stats.sum.num_objects * (double)100.0; + char b[20]; + snprintf(b, sizeof(b), "%.3lf", pc); + ostringstream ss; + ss << pg_sum.stats.sum.num_objects_unfound + << "/" << pg_sum.stats.sum.num_objects << " unfound (" << b << "%)"; + checks->add("OBJECT_UNFOUND", HEALTH_WARN, ss.str()); + } + + // REQUEST_SLOW + // REQUEST_STUCK + if (cct->_conf->mon_osd_warn_op_age > 0 && + osd_sum.op_queue_age_hist.upper_bound() > + cct->_conf->mon_osd_warn_op_age) { + list warn_detail, error_detail; + unsigned warn = 0, error = 0; + float err_age = + cct->_conf->mon_osd_warn_op_age * cct->_conf->mon_osd_err_op_age_ratio; + const pow2_hist_t& h = osd_sum.op_queue_age_hist; + for (unsigned i = h.h.size() - 1; i > 0; --i) { + float ub = (float)(1 << i) / 1000.0; + if (ub < cct->_conf->mon_osd_warn_op_age) + break; + if (h.h[i]) { + ostringstream ss; + ss << h.h[i] << " ops are blocked > " << ub << " sec"; + if (ub > err_age) { + error += h.h[i]; + error_detail.push_back(ss.str()); + } else { + warn += h.h[i]; + warn_detail.push_back(ss.str()); + } + } + } + + map> warn_osd_by_max; // max -> osds + map> error_osd_by_max; // max -> osds + if (!warn_detail.empty() || !error_detail.empty()) { + for (auto& p : osd_stat) { + const pow2_hist_t& h = p.second.op_queue_age_hist; + for (unsigned i = h.h.size() - 1; i > 0; --i) { + float ub = (float)(1 << i) / 1000.0; + if (ub < cct->_conf->mon_osd_warn_op_age) + break; + if (h.h[i]) { + if (ub > err_age) { + error_osd_by_max[ub].insert(p.first); + } else { + warn_osd_by_max[ub].insert(p.first); + } + break; + } + } + } + } + + if (!warn_detail.empty()) { + ostringstream ss; + ss << warn << " slow requests are blocked > " + << cct->_conf->mon_osd_warn_op_age << " sec"; + auto& d = checks->add("REQUEST_SLOW", HEALTH_WARN, ss.str()); + d.detail.swap(warn_detail); + int left = max; + for (auto& p : warn_osd_by_max) { + ostringstream ss; + if (p.second.size() > 1) { + ss << "osds " << p.second; + } else { + ss << "osd." << *p.second.begin(); + } + ss << " have blocked requests > " << p.first << " sec"; + d.detail.push_back(ss.str()); + if (--left == 0) { + break; + } + } + } + if (!error_detail.empty()) { + ostringstream ss; + ss << warn << " stuck requests are blocked > " + << err_age << " sec"; + auto& d = checks->add("REQUEST_STUCK", HEALTH_ERR, ss.str()); + d.detail.swap(error_detail); + int left = max; + for (auto& p : error_osd_by_max) { + ostringstream ss; + if (p.second.size() > 1) { + ss << "osds " << p.second; + } else { + ss << "osd." 
<< *p.second.begin(); + } + ss << " have stuck requests > " << p.first << " sec"; + d.detail.push_back(ss.str()); + if (--left == 0) { + break; + } + } + } + } + + // PG_NOT_SCRUBBED + // PG_NOT_DEEP_SCRUBBED + { + list detail, deep_detail; + const double age = cct->_conf->mon_warn_not_scrubbed + + cct->_conf->mon_scrub_interval; + utime_t cutoff = now; + cutoff -= age; + const double deep_age = cct->_conf->mon_warn_not_deep_scrubbed + + cct->_conf->osd_deep_scrub_interval; + utime_t deep_cutoff = now; + deep_cutoff -= deep_age; + for (auto& p : pg_stat) { + if (p.second.last_scrub_stamp < cutoff) { + ostringstream ss; + ss << "pg " << p.first << " not scrubbed since " + << p.second.last_scrub_stamp; + detail.push_back(ss.str()); + } + if (p.second.last_deep_scrub_stamp < deep_cutoff) { + ostringstream ss; + ss << "pg " << p.first << " not deep-scrubbed since " + << p.second.last_deep_scrub_stamp; + deep_detail.push_back(ss.str()); + } + } + if (!detail.empty()) { + ostringstream ss; + ss << detail.size() << " pgs not scrubbed for " << age; + auto& d = checks->add("PG_NOT_SCRUBBED", HEALTH_WARN, ss.str()); + d.detail.swap(detail); + } + if (!deep_detail.empty()) { + ostringstream ss; + ss << deep_detail.size() << " pgs not deep-scrubbed for " << age; + auto& d = checks->add("PG_NOT_DEEP_SCRUBBED", HEALTH_WARN, ss.str()); + d.detail.swap(deep_detail); + } } } @@ -2938,6 +3604,70 @@ void PGMap::get_health( } } + for (auto it : pools) { + auto it2 = pg_pool_sum.find(it.first); + if (it2 == pg_pool_sum.end()) { + continue; + } + const pool_stat_t *pstat = &it2->second; + const object_stat_sum_t& sum = pstat->stats.sum; + const string& pool_name = osdmap.get_pool_name(it.first); + const pg_pool_t &pool = it.second; + + float warn_threshold = (float)g_conf->mon_pool_quota_warn_threshold/100; + float crit_threshold = (float)g_conf->mon_pool_quota_crit_threshold/100; + + if (pool.quota_max_objects > 0) { + stringstream ss; + health_status_t status = HEALTH_OK; + if ((uint64_t)sum.num_objects >= pool.quota_max_objects) { + } else if (crit_threshold > 0 && + sum.num_objects >= pool.quota_max_objects*crit_threshold) { + ss << "pool '" << pool_name + << "' has " << sum.num_objects << " objects" + << " (max " << pool.quota_max_objects << ")"; + status = HEALTH_ERR; + } else if (warn_threshold > 0 && + sum.num_objects >= pool.quota_max_objects*warn_threshold) { + ss << "pool '" << pool_name + << "' has " << sum.num_objects << " objects" + << " (max " << pool.quota_max_objects << ")"; + status = HEALTH_WARN; + } + if (status != HEALTH_OK) { + pair s(status, ss.str()); + summary.push_back(s); + if (detail) + detail->push_back(s); + } + } + + if (pool.quota_max_bytes > 0) { + health_status_t status = HEALTH_OK; + stringstream ss; + if ((uint64_t)sum.num_bytes >= pool.quota_max_bytes) { + } else if (crit_threshold > 0 && + sum.num_bytes >= pool.quota_max_bytes*crit_threshold) { + ss << "pool '" << pool_name + << "' has " << si_t(sum.num_bytes) << " bytes" + << " (max " << si_t(pool.quota_max_bytes) << ")"; + status = HEALTH_ERR; + } else if (warn_threshold > 0 && + sum.num_bytes >= pool.quota_max_bytes*warn_threshold) { + ss << "pool '" << pool_name + << "' has " << si_t(sum.num_bytes) << " bytes" + << " (max " << si_t(pool.quota_max_bytes) << ")"; + status = HEALTH_WARN; + } + if (status != HEALTH_OK) { + pair s(status, ss.str()); + summary.push_back(s); + if (detail) + detail->push_back(s); + } + } + } + print_unscrubbed_pgs(pg_stat, summary, detail, cct); } diff --git a/src/mon/PGMap.h b/src/mon/PGMap.h 
index 6d58e6b2546d7..3e81c7e05fa61 100644 --- a/src/mon/PGMap.h +++ b/src/mon/PGMap.h @@ -21,10 +21,12 @@ #ifndef CEPH_PGMAP_H #define CEPH_PGMAP_H +#include "include/health.h" #include "common/debug.h" #include "common/TextTable.h" #include "osd/osd_types.h" #include "include/mempool.h" +#include "mon/health_check.h" #include #include "mon/PGStatService.h" @@ -495,6 +497,11 @@ class PGMap : public PGMapDigest { list >& summary, list > *detail) const; + void get_health_checks( + CephContext *cct, + const OSDMap& osdmap, + health_check_map_t *checks) const; + static void generate_test_instances(list& o); }; WRITE_CLASS_ENCODER_FEATURES(PGMap::Incremental) diff --git a/src/mon/PaxosService.cc b/src/mon/PaxosService.cc index b133fc1a582bb..91152943b0602 100644 --- a/src/mon/PaxosService.cc +++ b/src/mon/PaxosService.cc @@ -431,3 +431,12 @@ void PaxosService::trim(MonitorDBStore::TransactionRef t, } } +void PaxosService::load_health() +{ + bufferlist bl; + mon->store->get("health", service_name, bl); + if (bl.length()) { + auto p = bl.begin(); + ::decode(health_checks, p); + } +} diff --git a/src/mon/PaxosService.h b/src/mon/PaxosService.h index ca75915841e59..da3038ff1e9f9 100644 --- a/src/mon/PaxosService.h +++ b/src/mon/PaxosService.h @@ -77,15 +77,23 @@ class PaxosService { */ bool have_pending; -protected: + /** + * health checks for this service + * + * Child must populate this during encode_pending() by calling encode_health(). + */ + health_check_map_t health_checks; +public: + const health_check_map_t& get_health_checks() { + return health_checks; + } +protected: /** * format of our state in leveldb, 0 for default */ version_t format_version; - - /** * @defgroup PaxosService_h_callbacks Callback classes * @{ @@ -428,6 +436,15 @@ class PaxosService { list > *detail, CephContext *cct) const { } + void encode_health(const health_check_map_t& next, + MonitorDBStore::TransactionRef t) { + bufferlist bl; + ::encode(next, bl); + t->put("health", service_name, bl); + mon->log_health(next, health_checks, t); + } + void load_health(); + private: /** * @defgroup PaxosService_h_store_keys Set of keys that are usually used on diff --git a/src/mon/QuorumService.h b/src/mon/QuorumService.h index b354c40a77f77..626ce659e573c 100644 --- a/src/mon/QuorumService.h +++ b/src/mon/QuorumService.h @@ -117,8 +117,7 @@ class QuorumService virtual void init() { } - virtual void get_health(Formatter *f, - list >& summary, + virtual void get_health(list >& summary, list > *detail) = 0; virtual int get_type() = 0; virtual string get_name() const = 0; diff --git a/src/mon/health_check.h b/src/mon/health_check.h new file mode 100644 index 0000000000000..e9e53836e4774 --- /dev/null +++ b/src/mon/health_check.h @@ -0,0 +1,192 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include +#include + +#include "include/health.h" +#include "common/Formatter.h" + +struct health_check_t { + health_status_t severity; + std::string summary; + std::list detail; + + DENC(health_check_t, v, p) { + DENC_START(1, 1, p); + denc(v.severity, p); + denc(v.summary, p); + denc(v.detail, p); + DENC_FINISH(p); + } + + friend bool operator==(const health_check_t& l, + const health_check_t& r) { + return l.severity == r.severity && + l.summary == r.summary && + l.detail == r.detail; + } + friend bool operator!=(const health_check_t& l, + const health_check_t& r) { + return !(l == r); + } + + void dump(Formatter *f) const { + f->dump_stream("severity") << severity; + 
f->dump_string("summary", summary); + f->open_array_section("detail"); + for (auto& p : detail) { + f->dump_string("item", p); + } + f->close_section(); + } + + static void generate_test_instances(list& ls) { + ls.push_back(new health_check_t); + ls.push_back(new health_check_t); + ls.back()->severity = HEALTH_ERR; + ls.back()->summary = "summarization"; + ls.back()->detail = {"one", "two", "three"}; + } +}; +WRITE_CLASS_DENC(health_check_t) + + +struct health_check_map_t { + map checks; + + DENC(health_check_map_t, v, p) { + DENC_START(1, 1, p); + denc(v.checks, p); + DENC_FINISH(p); + } + + void dump(Formatter *f) const { + for (auto& p : checks) { + f->dump_object(p.first.c_str(), p.second); + } + } + + static void generate_test_instances(list& ls) { + ls.push_back(new health_check_map_t); + ls.push_back(new health_check_map_t); + { + auto& d = ls.back()->add("FOO", HEALTH_WARN, "foo"); + d.detail.push_back("a"); + d.detail.push_back("b"); + } + { + auto& d = ls.back()->add("BAR", HEALTH_ERR, "bar!"); + d.detail.push_back("c"); + d.detail.push_back("d"); + } + } + + void clear() { + checks.clear(); + } + void swap(health_check_map_t& other) { + checks.swap(other.checks); + } + + health_check_t& add(const std::string& code, + health_status_t severity, + const std::string& summary) { + assert(checks.count(code) == 0); + health_check_t& r = checks[code]; + r.severity = severity; + r.summary = summary; + return r; + } + + void merge(const health_check_map_t& o) { + for (auto& p : o.checks) { + auto q = checks.find(p.first); + if (q == checks.end()) { + // new check + checks[p.first] = p.second; + } else { + // merge details, and hope the summary matches! + q->second.detail.insert( + q->second.detail.end(), + p.second.detail.begin(), + p.second.detail.end()); + } + } + } + + health_status_t dump_summary(Formatter *f, std::string *plain, + const char *sep, bool detail) const { + health_status_t r = HEALTH_OK; + for (auto& p : checks) { + if (r > p.second.severity) { + r = p.second.severity; + } + if (f) { + f->open_object_section(p.first.c_str()); + f->dump_stream("severity") << p.second.severity; + f->dump_string("message", p.second.summary); + if (detail) { + f->open_array_section("detail"); + for (auto& d : p.second.detail) { + f->dump_string("item", d); + } + f->close_section(); + } + f->close_section(); + } else { + if (!plain->empty()) { + *plain += sep; + } + *plain += p.second.summary; + } + } + return r; + } + + void dump_summary_compat(Formatter *f) const { + for (auto& p : checks) { + f->open_object_section("item"); + f->dump_stream("severity") << p.second.severity; + f->dump_string("summary", p.second.summary); + f->close_section(); + } + } + + void dump_detail(Formatter *f, std::string *plain, bool compat) const { + for (auto& p : checks) { + if (f) { + if (compat) { + // this is sloppy, but the best we can do: just dump all of the + // individual checks' details together + for (auto& d : p.second.detail) { + f->dump_string("item", d); + } + } + } else { + if (!compat) { + *plain += p.first + " " + p.second.summary + "\n"; + } + for (auto& d : p.second.detail) { + if (!compat) { + *plain += " "; + } + *plain += d; + *plain += "\n"; + } + } + } + } + + friend bool operator==(const health_check_map_t& l, + const health_check_map_t& r) { + return l.checks == r.checks; + } + friend bool operator!=(const health_check_map_t& l, + const health_check_map_t& r) { + return !(l == r); + } +}; +WRITE_CLASS_DENC(health_check_map_t) diff --git a/src/mon/mon_types.h b/src/mon/mon_types.h index 
883f4669e2b7f..a23238b7d0b9c 100644 --- a/src/mon/mon_types.h +++ b/src/mon/mon_types.h @@ -31,7 +31,8 @@ #define PAXOS_AUTH 5 #define PAXOS_MGR 6 #define PAXOS_MGRSTAT 7 -#define PAXOS_NUM 8 +#define PAXOS_HEALTH 8 +#define PAXOS_NUM 9 inline const char *get_paxos_name(int p) { switch (p) { @@ -43,6 +44,7 @@ inline const char *get_paxos_name(int p) { case PAXOS_AUTH: return "auth"; case PAXOS_MGR: return "mgr"; case PAXOS_MGRSTAT: return "mgrstat"; + case PAXOS_HEALTH: return "health"; default: ceph_abort(); return 0; } } diff --git a/src/msg/Message.cc b/src/msg/Message.cc index 4860889989fe9..9d1953d75b195 100644 --- a/src/msg/Message.cc +++ b/src/msg/Message.cc @@ -96,6 +96,7 @@ using namespace std; #include "messages/MMonGetVersion.h" #include "messages/MMonGetVersionReply.h" #include "messages/MMonHealth.h" +#include "messages/MMonHealthChecks.h" #include "messages/MMonMetadata.h" #include "messages/MDataPing.h" #include "messages/MAuth.h" @@ -783,6 +784,11 @@ Message *decode_message(CephContext *cct, int crcflags, case MSG_MON_HEALTH: m = new MMonHealth(); break; + + case MSG_MON_HEALTH_CHECKS: + m = new MMonHealthChecks(); + break; + #if defined(HAVE_XIO) case MSG_DATA_PING: m = new MDataPing(); diff --git a/src/msg/Message.h b/src/msg/Message.h index 611d691df992c..d1b63ac1f2199 100644 --- a/src/msg/Message.h +++ b/src/msg/Message.h @@ -183,6 +183,8 @@ // Special #define MSG_NOP 0x607 +#define MSG_MON_HEALTH_CHECKS 0x608 + // *** ceph-mgr <-> OSD/MDS daemons *** #define MSG_MGR_OPEN 0x700 #define MSG_MGR_CONFIGURE 0x701 diff --git a/src/osd/OSDMap.cc b/src/osd/OSDMap.cc index d9cf86338375b..2f441eb9a66ad 100644 --- a/src/osd/OSDMap.cc +++ b/src/osd/OSDMap.cc @@ -15,6 +15,8 @@ * */ +#include + #include "OSDMap.h" #include #include "common/config.h" @@ -24,6 +26,7 @@ #include "include/str_map.h" #include "common/code_environment.h" +#include "mon/health_check.h" #include "crush/CrushTreeDumper.h" #include "common/Clock.h" @@ -4261,3 +4264,362 @@ void print_osd_utilization(const OSDMap& osdmap, out << tbl << d.summary() << "\n"; } } + +void OSDMap::check_health(health_check_map_t *checks) const +{ + int num_osds = get_num_osds(); + + // OSD_DOWN + // OSD_$subtree_DOWN + // OSD_ORPHAN + if (num_osds >= 0) { + int num_in_osds = 0; + int num_down_in_osds = 0; + set osds; + set down_in_osds; + set up_in_osds; + set subtree_up; + unordered_map > subtree_type_down; + unordered_map num_osds_subtree; + int max_type = crush->get_max_type_id(); + + for (int i = 0; i < get_max_osd(); i++) { + if (!exists(i)) { + if (crush->item_exists(i)) { + osds.insert(i); + } + continue; + } + if (is_out(i)) + continue; + ++num_in_osds; + if (down_in_osds.count(i) || up_in_osds.count(i)) + continue; + if (!is_up(i)) { + down_in_osds.insert(i); + int parent_id = 0; + int current = i; + for (int type = 0; type <= max_type; type++) { + if (!crush->get_type_name(type)) + continue; + int r = crush->get_immediate_parent_id(current, &parent_id); + if (r == -ENOENT) + break; + // break early if this parent is already marked as up + if (subtree_up.count(parent_id)) + break; + type = crush->get_bucket_type(parent_id); + if (!subtree_type_is_down( + g_ceph_context, parent_id, type, + &down_in_osds, &up_in_osds, &subtree_up, &subtree_type_down)) + break; + current = parent_id; + } + } + } + + // calculate the number of down osds in each down subtree and + // store it in num_osds_subtree + for (int type = 1; type <= max_type; type++) { + if (!crush->get_type_name(type)) + continue; + for (auto j = 
subtree_type_down[type].begin(); + j != subtree_type_down[type].end(); + ++j) { + if (type == 1) { + list children; + int num = crush->get_children(*j, &children); + num_osds_subtree[*j] = num; + } else { + list children; + int num = 0; + int num_children = crush->get_children(*j, &children); + if (num_children == 0) + continue; + for (auto l = children.begin(); l != children.end(); ++l) { + if (num_osds_subtree[*l] > 0) { + num = num + num_osds_subtree[*l]; + } + } + num_osds_subtree[*j] = num; + } + } + } + num_down_in_osds = down_in_osds.size(); + assert(num_down_in_osds <= num_in_osds); + if (num_down_in_osds > 0) { + // summary of down subtree types and osds + for (int type = max_type; type > 0; type--) { + if (!crush->get_type_name(type)) + continue; + if (subtree_type_down[type].size() > 0) { + ostringstream ss; + ss << subtree_type_down[type].size() << " " + << crush->get_type_name(type); + if (subtree_type_down[type].size() > 1) { + ss << "s"; + } + int sum_down_osds = 0; + for (auto j = subtree_type_down[type].begin(); + j != subtree_type_down[type].end(); + ++j) { + sum_down_osds = sum_down_osds + num_osds_subtree[*j]; + } + ss << " (" << sum_down_osds << " osds) down"; + string err = string("OSD_") + + string(crush->get_type_name(type)) + "_DOWN"; + boost::to_upper(err); + auto& d = checks->add(err, HEALTH_WARN, ss.str()); + for (auto j = subtree_type_down[type].rbegin(); + j != subtree_type_down[type].rend(); + ++j) { + ostringstream ss; + ss << crush->get_type_name(type); + ss << " "; + ss << crush->get_item_name(*j); + // at the top level, do not print location + if (type != max_type) { + ss << " ("; + ss << crush->get_full_location_ordered_string(*j); + ss << ")"; + } + int num = num_osds_subtree[*j]; + ss << " (" << num << " osds)"; + ss << " is down"; + d.detail.push_back(ss.str()); + } + } + } + ostringstream ss; + ss << down_in_osds.size() << " osds down"; + auto& d = checks->add("OSD_DOWN", HEALTH_WARN, ss.str()); + for (auto it = down_in_osds.begin(); it != down_in_osds.end(); ++it) { + ostringstream ss; + ss << "osd." << *it << " ("; + ss << crush->get_full_location_ordered_string(*it); + ss << ") is down"; + d.detail.push_back(ss.str()); + } + } + + if (!osds.empty()) { + ostringstream ss; + ss << osds.size() << " osds exist in the crush map but not in the osdmap"; + auto& d = checks->add("OSD_ORPHAN", HEALTH_WARN, ss.str()); + for (auto osd : osds) { + ostringstream ss; + ss << "osd." << osd << " exists in crush map but not in osdmap"; + d.detail.push_back(ss.str()); + } + } + } + + // OSD_OUT_OF_ORDER_FULL + { + // An osd could configure failsafe ratio, to something different + // but for now assume it is the same here. + float fsr = g_conf->osd_failsafe_full_ratio; + if (fsr > 1.0) fsr /= 100; + float fr = get_full_ratio(); + float br = get_backfillfull_ratio(); + float nr = get_nearfull_ratio(); + + list detail; + // These checks correspond to how OSDService::check_full_status() in an OSD + // handles the improper setting of these values. 
+ if (br < nr) { + ostringstream ss; + ss << "backfillfull_ratio (" << br + << ") < nearfull_ratio (" << nr << "), increased"; + detail.push_back(ss.str()); + br = nr; + } + if (fr < br) { + ostringstream ss; + ss << "full_ratio (" << fr << ") < backfillfull_ratio (" << br + << "), increased"; + detail.push_back(ss.str()); + fr = br; + } + if (fsr < fr) { + ostringstream ss; + ss << "osd_failsafe_full_ratio (" << fsr << ") < full_ratio (" << fr + << "), increased"; + detail.push_back(ss.str()); + } + if (!detail.empty()) { + auto& d = checks->add("OSD_OUT_OF_ORDER_FULL", HEALTH_ERR, + "full ratio(s) out of order"); + d.detail.swap(detail); + } + } + + // OSD_FULL + // OSD_NEARFULL + // OSD_BACKFILLFULL + // OSD_FAILSAFE_FULL + { + set full, backfillfull, nearfull; + get_full_osd_counts(&full, &backfillfull, &nearfull); + if (full.size()) { + ostringstream ss; + ss << full.size() << " full osd(s)"; + auto& d = checks->add("OSD_FULL", HEALTH_ERR, ss.str()); + for (auto& i: full) { + ostringstream ss; + ss << "osd." << i << " is full"; + d.detail.push_back(ss.str()); + } + } + if (backfillfull.size()) { + ostringstream ss; + ss << backfillfull.size() << " backfillfull osd(s)"; + auto& d = checks->add("OSD_BACKFILLFULL", HEALTH_WARN, ss.str()); + for (auto& i: backfillfull) { + ostringstream ss; + ss << "osd." << i << " is backfill full"; + d.detail.push_back(ss.str()); + } + } + if (nearfull.size()) { + ostringstream ss; + ss << nearfull.size() << " nearfull osd(s)"; + auto& d = checks->add("OSD_NEARFULL", HEALTH_WARN, ss.str()); + for (auto& i: nearfull) { + ostringstream ss; + ss << "osd." << i << " is near full"; + d.detail.push_back(ss.str()); + } + } + } + + // OSDMAP_FLAGS + { + // warn about flags + uint64_t warn_flags = + CEPH_OSDMAP_FULL | + CEPH_OSDMAP_PAUSERD | + CEPH_OSDMAP_PAUSEWR | + CEPH_OSDMAP_PAUSEREC | + CEPH_OSDMAP_NOUP | + CEPH_OSDMAP_NODOWN | + CEPH_OSDMAP_NOIN | + CEPH_OSDMAP_NOOUT | + CEPH_OSDMAP_NOBACKFILL | + CEPH_OSDMAP_NORECOVER | + CEPH_OSDMAP_NOSCRUB | + CEPH_OSDMAP_NODEEP_SCRUB | + CEPH_OSDMAP_NOTIERAGENT | + CEPH_OSDMAP_NOREBALANCE; + if (test_flag(warn_flags)) { + ostringstream ss; + ss << get_flag_string(get_flags() & warn_flags) + << " flag(s) set"; + checks->add("OSDMAP_FLAGS", HEALTH_WARN, ss.str()); + } + } + + // OSD_FLAGS + { + list detail; + const unsigned flags = + CEPH_OSD_NOUP | + CEPH_OSD_NOIN | + CEPH_OSD_NODOWN | + CEPH_OSD_NOOUT; + for (int i = 0; i < max_osd; ++i) { + if (osd_state[i] & flags) { + ostringstream ss; + set states; + OSDMap::calc_state_set(osd_state[i] & flags, states); + ss << "osd." 
<< i << " has flags " << states; + detail.push_back(ss.str()); + } + } + if (!detail.empty()) { + ostringstream ss; + ss << detail.size() << " osd(s) have {NOUP,NODOWN,NOIN,NOOUT} flags set"; + auto& d = checks->add("OSD_FLAGS", HEALTH_WARN, ss.str()); + d.detail.swap(detail); + } + } + + // OLD_CRUSH_TUNABLES + if (g_conf->mon_warn_on_legacy_crush_tunables) { + string min = crush->get_min_required_version(); + if (min < g_conf->mon_crush_min_required_version) { + ostringstream ss; + ss << "crush map has legacy tunables (require " << min + << ", min is " << g_conf->mon_crush_min_required_version << ")"; + auto& d = checks->add("OLD_CRUSH_TUNABLES", HEALTH_WARN, ss.str()); + d.detail.push_back("see http://docs.ceph.com/docs/master/rados/operations/crush-map/#tunables"); + } + } + + // OLD_CRUSH_STRAW_CALC_VERSION + if (g_conf->mon_warn_on_crush_straw_calc_version_zero) { + if (crush->get_straw_calc_version() == 0) { + ostringstream ss; + ss << "crush map has straw_calc_version=0"; + auto& d = checks->add("OLD_CRUSH_STRAW_CALC_VERSION", HEALTH_WARN, ss.str()); + d.detail.push_back( + "see http://docs.ceph.com/docs/master/rados/operations/crush-map/#tunables"); + } + } + + // CACHE_POOL_NO_HIT_SET + if (g_conf->mon_warn_on_cache_pools_without_hit_sets) { + list detail; + for (map::const_iterator p = pools.begin(); + p != pools.end(); + ++p) { + const pg_pool_t& info = p->second; + if (info.cache_mode_requires_hit_set() && + info.hit_set_params.get_type() == HitSet::TYPE_NONE) { + ostringstream ss; + ss << "pool '" << get_pool_name(p->first) + << "' with cache_mode " << info.get_cache_mode_name() + << " needs hit_set_type to be set but it is not"; + detail.push_back(ss.str()); + } + } + if (!detail.empty()) { + ostringstream ss; + ss << detail.size() << " cache pools are missing hit_sets"; + auto& d = checks->add("CACHE_POOL_NO_HIT_SET", HEALTH_WARN, ss.str()); + d.detail.swap(detail); + } + } + + // OSD_NO_SORTBITWISE + if (!test_flag(CEPH_OSDMAP_SORTBITWISE) && + (get_up_osd_features() & + CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT)) { + ostringstream ss; + ss << "no legacy OSD present but 'sortbitwise' flag is not set"; + checks->add("OSD_NO_SORTBITWISE", HEALTH_WARN, ss.str()); + } + + // OSD_UPGRADE_FINISHED + // none of these (yet) since we don't run until luminous upgrade is done. + + // POOL_FULL + { + list detail; + for (auto it : get_pools()) { + const pg_pool_t &pool = it.second; + if (pool.has_flag(pg_pool_t::FLAG_FULL)) { + const string& pool_name = get_pool_name(it.first); + stringstream ss; + ss << "pool '" << pool_name << "' is full"; + detail.push_back(ss.str()); + } + } + if (!detail.empty()) { + ostringstream ss; + ss << detail.size() << " pool(s) full"; + auto& d = checks->add("POOL_FULL", HEALTH_WARN, ss.str()); + d.detail.swap(detail); + } + } +} diff --git a/src/osd/OSDMap.h b/src/osd/OSDMap.h index b6301f1fdc3c4..6538c9e62964d 100644 --- a/src/osd/OSDMap.h +++ b/src/osd/OSDMap.h @@ -41,6 +41,7 @@ using namespace std; // forward declaration class CephContext; class CrushWrapper; +class health_check_map_t; // FIXME C++11 does not have std::equal for two differently-typed containers. 
// use this until we move to c++14 @@ -1362,6 +1363,8 @@ class OSDMap { void dump(Formatter *f) const; static void generate_test_instances(list& o); bool check_new_blacklist_entries() const { return new_blacklist_entries; } + + void check_health(health_check_map_t *checks) const; }; WRITE_CLASS_ENCODER_FEATURES(OSDMap) WRITE_CLASS_ENCODER_FEATURES(OSDMap::Incremental) diff --git a/src/osd/PG.cc b/src/osd/PG.cc index 725c5dd2d5afc..05ad63c69c121 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -1677,7 +1677,7 @@ void PG::activate(ObjectStore::Transaction& t, * behind. */ // backfill - osd->clog->info() << info.pgid << " starting backfill to osd." << peer + osd->clog->debug() << info.pgid << " starting backfill to osd." << peer << " from (" << pi.log_tail << "," << pi.last_update << "] " << pi.last_backfill << " to " << info.last_update; diff --git a/src/pybind/mgr/dashboard/base.html b/src/pybind/mgr/dashboard/base.html index e7256d1ba0aeb..18874fb565f06 100644 --- a/src/pybind/mgr/dashboard/base.html +++ b/src/pybind/mgr/dashboard/base.html @@ -39,7 +39,7 @@ var refresh = function() { $.get("/toplevel_data", function(data) { - _.extend(toplevel_data.health, data.health); + _.extend(toplevel_data, data); setTimeout(refresh, refresh_interval); }); }; @@ -60,6 +60,14 @@ } } + rivets.formatters.health_ok = function(status_str) { + if (status_str == "HEALTH_OK") { + return true; + } else { + return false; + } + } + var truncate = function(n, max_width) { var stringized = n.toString(); var parts = stringized.split("."); @@ -106,7 +114,7 @@ - rivets.bind($("#health"), toplevel_data.health); + rivets.bind($("#health"), toplevel_data); rivets.bind($("section.sidebar"), toplevel_data); setTimeout(refresh, refresh_interval); }); @@ -140,10 +148,11 @@ Toggle navigation -
- Health:  - - {overall_status} +
+ + + {health_status} +
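The base.html change above stops binding the nested health object in the navbar and instead binds a single top-level health_status string, coloured via the new health_ok formatter. A minimal sketch of the payload shape /toplevel_data is assumed to return after this change; only the keys mirror _toplevel_data() in module.py further below, the values are illustrative:

# Hypothetical sketch of the /toplevel_data payload the navbar now binds to.
# Keys follow _toplevel_data() in module.py; the values are made up.
toplevel_data = {
    "health_status": "HEALTH_WARN",  # shown in the navbar; the health_ok formatter
                                     # returns true only for the string "HEALTH_OK"
    "rbd_pools": [],
    "filesystems": [],
}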
diff --git a/src/pybind/mgr/dashboard/health.html b/src/pybind/mgr/dashboard/health.html index e41a1e2da5e2f..de5a794f27dc5 100644 --- a/src/pybind/mgr/dashboard/health.html +++ b/src/pybind/mgr/dashboard/health.html @@ -99,12 +99,16 @@

- Overall status: {health.overall_status} +
+ Overall status: {health.status}
    -
  • - {summary.severity}: {summary.summary} -
  • +
      +
    • + {check.type}: + {check.message} +
    • +
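The health.html template above now iterates a list of structured checks rather than the old severity/summary pairs. A hedged sketch of the health structure it binds, assuming the per-check fields produced by _health_data() in module.py below; the specific check codes and messages are illustrative only:

# Illustrative values only. 'status', 'severity' and 'message' follow the new
# structured 'ceph health' output; the 'type' key is the check code added by
# _health_data() before rendering.
health = {
    "status": "HEALTH_WARN",
    "checks": [
        {"type": "OSD_DOWN", "severity": "HEALTH_WARN",
         "message": "1 osds down"},
        {"type": "PG_DEGRADED", "severity": "HEALTH_WARN",
         "message": "Degraded data redundancy: 3 pgs degraded"},
    ],
}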
diff --git a/src/pybind/mgr/dashboard/module.py b/src/pybind/mgr/dashboard/module.py index 2576680e395de..93300135f49c2 100644 --- a/src/pybind/mgr/dashboard/module.py +++ b/src/pybind/mgr/dashboard/module.py @@ -434,8 +434,8 @@ def _toplevel_data(self): ] return { - 'health': global_instance().get_sync_object(Health).data, 'rbd_pools': rbd_pools, + 'health_status': self._health_data()['status'], 'filesystems': filesystems } @@ -635,6 +635,21 @@ def _servers(self): def servers_data(self): return self._servers() + def _health_data(self): + health = global_instance().get_sync_object(Health).data + # Transform the `checks` dict into a list for the convenience + # of rendering from javascript. + checks = [] + for k, v in health['checks'].iteritems(): + v['type'] = k + checks.append(v) + + checks = sorted(checks, cmp=lambda a, b: a['severity'] > b['severity']) + + health['checks'] = checks + + return health + def _health(self): # Fuse osdmap with pg_summary to get description of pools # including their PG states @@ -670,7 +685,7 @@ def get_rate(series): del osd_map['pg_temp'] return { - "health": global_instance().get_sync_object(Health).data, + "health": self._health_data(), "mon_status": global_instance().get_sync_object( MonStatus).data, "osd_map": osd_map,
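_health_data() above reshapes the 'checks' mapping from the Health sync object into a list the template can iterate, tagging each entry with its check code and ordering entries by severity. A rough standalone re-sketch of that reshaping, assuming the input matches the structured 'ceph health' output; the helper name and the key-based sort are ours, not the module's exact Python 2 code:

# Rough re-sketch of the _health_data() reshaping step, not the module's exact code.
def flatten_health_checks(health):
    checks = []
    for code, check in health.get("checks", {}).items():
        entry = dict(check)
        entry["type"] = code  # expose the check code (e.g. "OSD_DOWN") to the template
        checks.append(entry)
    # order by severity string so more severe entries group together for display
    checks.sort(key=lambda c: c["severity"])
    return dict(health, checks=checks)

For example, passing the parsed JSON from 'ceph health' (or the mgr's Health sync object) to flatten_health_checks() would yield the list-shaped structure that health.html consumes.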