From 3c1f297032010a454cb008d3f998c8f567a20f98 Mon Sep 17 00:00:00 2001 From: Xiaoguang Wang Date: Thu, 30 Aug 2018 10:26:41 +0800 Subject: [PATCH] os/bluestore: fix deep-scrub operation againest disk silent errors Say a object who has data caches, but in a while later, caches' underlying physical device has silent disk erros accidentally, then caches and physical data are not same. In such case, deep-scrub operation still tries to read caches firstly and won't do crc checksum, then deep-scrub won't find such data corruptions timely. Here introduce a new flag 'CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE' which tells deep-scrub to bypass object caches. Note that we only bypass cache who is in STATE_CLEAN state. For STATE_WRITING caches, currently they are not written to physical device, so deep-scrub operation can not read physical device and can read these dirty caches safely. Once they are in STATE_CLEAN state(or not added to bluestore cache), next round deep-scurb can check them correctly. As to above discussions, I refactor BlueStore::BufferSpace::read sightly, adding a new 'flags' argument, whose value will be 0 or: enum { BYPASS_CLEAN_CACHE = 0x1, // bypass clean cache }; flags 0: normal read, do not bypass clean or dirty cache flags BYPASS_CLEAN_CACHE: bypass clean cache, currently only for deep-scrube operation Test: I deliberately corrupt a object with cache, with this patch, deep-scrub can find data error very timely. Signed-off-by: Xiaoguang Wang (cherry picked from commit a7f1af25dd2ba88a322ed21828f073a277b09d02) Conflicts: src/include/rados.h src/os/bluestore/BlueStore.cc: trivial resolution --- src/include/rados.h | 1 + src/os/bluestore/BlueStore.cc | 22 +++++++++++++++++++--- src/os/bluestore/BlueStore.h | 7 ++++++- src/osd/ReplicatedBackend.cc | 3 ++- src/osd/osd_types.cc | 3 +++ 5 files changed, 31 insertions(+), 5 deletions(-) diff --git a/src/include/rados.h b/src/include/rados.h index 7fd874ff117178..e91236a011c47d 100644 --- a/src/include/rados.h +++ b/src/include/rados.h @@ -463,6 +463,7 @@ enum { CEPH_OSD_OP_FLAG_FADVISE_WILLNEED = 0x10,/* data will be accessed in the near future */ CEPH_OSD_OP_FLAG_FADVISE_DONTNEED = 0x20,/* data will not be accessed in the near future */ CEPH_OSD_OP_FLAG_FADVISE_NOCACHE = 0x40, /* data will be accessed only once by this client */ + CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE = 0x100, /* bypass ObjectStore cache, mainly for deep-scrub */ }; #define EOLDSNAPC 85 /* ORDERSNAP flag set; writer has old snapc*/ diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index 07dbe7cae33d2a..aefd5b31b88c03 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -1302,7 +1302,8 @@ void BlueStore::BufferSpace::read( uint32_t offset, uint32_t length, BlueStore::ready_regions_t& res, - interval_set& res_intervals) + interval_set& res_intervals, + int flags) { res.clear(); res_intervals.clear(); @@ -1316,7 +1317,13 @@ void BlueStore::BufferSpace::read( ++i) { Buffer *b = i->second.get(); assert(b->end() > offset); - if (b->is_writing() || b->is_clean()) { + + bool val = false; + if (flags & BYPASS_CLEAN_CACHE) + val = b->is_writing(); + else + val = b->is_writing() || b->is_clean(); + if (val) { if (b->offset < offset) { uint32_t skip = offset - b->offset; uint32_t l = MIN(length, b->length - skip); @@ -6693,6 +6700,7 @@ int BlueStore::_do_read( { FUNCTRACE(); int r = 0; + int read_cache_policy = 0; // do not bypass clean or dirty cache dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length << " size 0x" << o->onode.size << " (" << std::dec @@ -6727,6 +6735,13 @@ int BlueStore::_do_read( ready_regions_t ready_regions; + // for deep-scrub, we only read dirty cache and bypass clean cache in + // order to read underlying block device in case there are silent disk errors. + if (op_flags & CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE) { + dout(20) << __func__ << " will bypass cache and do direct read" << dendl; + read_cache_policy = BufferSpace::BYPASS_CLEAN_CACHE; + } + // build blob-wise list to of stuff read (that isn't cached) blobs2read_t blobs2read; unsigned left = length; @@ -6752,7 +6767,8 @@ int BlueStore::_do_read( ready_regions_t cache_res; interval_set cache_interval; bptr->shared_blob->bc.read( - bptr->shared_blob->get_cache(), b_off, b_len, cache_res, cache_interval); + bptr->shared_blob->get_cache(), b_off, b_len, cache_res, cache_interval, + read_cache_policy); dout(20) << __func__ << " blob " << *bptr << std::hex << " need 0x" << b_off << "~" << b_len << " cache has 0x" << cache_interval diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h index a3878779d6181c..6faa1e87164bbf 100644 --- a/src/os/bluestore/BlueStore.h +++ b/src/os/bluestore/BlueStore.h @@ -240,6 +240,10 @@ class BlueStore : public ObjectStore, /// map logical extent range (object) onto buffers struct BufferSpace { + enum { + BYPASS_CLEAN_CACHE = 0x1, // bypass clean cache + }; + typedef boost::intrusive::list< Buffer, boost::intrusive::member_hook< @@ -339,7 +343,8 @@ class BlueStore : public ObjectStore, void read(Cache* cache, uint32_t offset, uint32_t length, BlueStore::ready_regions_t& res, - interval_set& res_intervals); + interval_set& res_intervals, + int flags = 0); void truncate(Cache* cache, uint32_t offset) { discard(cache, offset, (uint32_t)-1 - offset); diff --git a/src/osd/ReplicatedBackend.cc b/src/osd/ReplicatedBackend.cc index 602a0f08f76b13..54a49aeaeea338 100644 --- a/src/osd/ReplicatedBackend.cc +++ b/src/osd/ReplicatedBackend.cc @@ -712,7 +712,8 @@ int ReplicatedBackend::be_deep_scrub( dout(10) << __func__ << " " << poid << " pos " << pos << dendl; int r; uint32_t fadvise_flags = CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL | - CEPH_OSD_OP_FLAG_FADVISE_DONTNEED; + CEPH_OSD_OP_FLAG_FADVISE_DONTNEED | + CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE; utime_t sleeptime; sleeptime.set_from_double(cct->_conf->osd_debug_deep_scrub_sleep); diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc index 2c5e37fca95ba0..efcb169a485c44 100644 --- a/src/osd/osd_types.cc +++ b/src/osd/osd_types.cc @@ -103,6 +103,9 @@ const char * ceph_osd_op_flag_name(unsigned flag) case CEPH_OSD_OP_FLAG_WITH_REFERENCE: name = "with_reference"; break; + case CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE: + name = "bypass_clean_cache"; + break; default: name = "???"; };