diff --git a/src/include/rados.h b/src/include/rados.h index 7fd874ff117178..e91236a011c47d 100644 --- a/src/include/rados.h +++ b/src/include/rados.h @@ -463,6 +463,7 @@ enum { CEPH_OSD_OP_FLAG_FADVISE_WILLNEED = 0x10,/* data will be accessed in the near future */ CEPH_OSD_OP_FLAG_FADVISE_DONTNEED = 0x20,/* data will not be accessed in the near future */ CEPH_OSD_OP_FLAG_FADVISE_NOCACHE = 0x40, /* data will be accessed only once by this client */ + CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE = 0x100, /* bypass ObjectStore cache, mainly for deep-scrub */ }; #define EOLDSNAPC 85 /* ORDERSNAP flag set; writer has old snapc*/ diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index 07dbe7cae33d2a..aefd5b31b88c03 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -1302,7 +1302,8 @@ void BlueStore::BufferSpace::read( uint32_t offset, uint32_t length, BlueStore::ready_regions_t& res, - interval_set& res_intervals) + interval_set& res_intervals, + int flags) { res.clear(); res_intervals.clear(); @@ -1316,7 +1317,13 @@ void BlueStore::BufferSpace::read( ++i) { Buffer *b = i->second.get(); assert(b->end() > offset); - if (b->is_writing() || b->is_clean()) { + + bool val = false; + if (flags & BYPASS_CLEAN_CACHE) + val = b->is_writing(); + else + val = b->is_writing() || b->is_clean(); + if (val) { if (b->offset < offset) { uint32_t skip = offset - b->offset; uint32_t l = MIN(length, b->length - skip); @@ -6693,6 +6700,7 @@ int BlueStore::_do_read( { FUNCTRACE(); int r = 0; + int read_cache_policy = 0; // do not bypass clean or dirty cache dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length << " size 0x" << o->onode.size << " (" << std::dec @@ -6727,6 +6735,13 @@ int BlueStore::_do_read( ready_regions_t ready_regions; + // for deep-scrub, we only read dirty cache and bypass clean cache in + // order to read underlying block device in case there are silent disk errors. + if (op_flags & CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE) { + dout(20) << __func__ << " will bypass cache and do direct read" << dendl; + read_cache_policy = BufferSpace::BYPASS_CLEAN_CACHE; + } + // build blob-wise list to of stuff read (that isn't cached) blobs2read_t blobs2read; unsigned left = length; @@ -6752,7 +6767,8 @@ int BlueStore::_do_read( ready_regions_t cache_res; interval_set cache_interval; bptr->shared_blob->bc.read( - bptr->shared_blob->get_cache(), b_off, b_len, cache_res, cache_interval); + bptr->shared_blob->get_cache(), b_off, b_len, cache_res, cache_interval, + read_cache_policy); dout(20) << __func__ << " blob " << *bptr << std::hex << " need 0x" << b_off << "~" << b_len << " cache has 0x" << cache_interval diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h index a3878779d6181c..6faa1e87164bbf 100644 --- a/src/os/bluestore/BlueStore.h +++ b/src/os/bluestore/BlueStore.h @@ -240,6 +240,10 @@ class BlueStore : public ObjectStore, /// map logical extent range (object) onto buffers struct BufferSpace { + enum { + BYPASS_CLEAN_CACHE = 0x1, // bypass clean cache + }; + typedef boost::intrusive::list< Buffer, boost::intrusive::member_hook< @@ -339,7 +343,8 @@ class BlueStore : public ObjectStore, void read(Cache* cache, uint32_t offset, uint32_t length, BlueStore::ready_regions_t& res, - interval_set& res_intervals); + interval_set& res_intervals, + int flags = 0); void truncate(Cache* cache, uint32_t offset) { discard(cache, offset, (uint32_t)-1 - offset); diff --git a/src/osd/ReplicatedBackend.cc b/src/osd/ReplicatedBackend.cc index 602a0f08f76b13..54a49aeaeea338 100644 --- a/src/osd/ReplicatedBackend.cc +++ b/src/osd/ReplicatedBackend.cc @@ -712,7 +712,8 @@ int ReplicatedBackend::be_deep_scrub( dout(10) << __func__ << " " << poid << " pos " << pos << dendl; int r; uint32_t fadvise_flags = CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL | - CEPH_OSD_OP_FLAG_FADVISE_DONTNEED; + CEPH_OSD_OP_FLAG_FADVISE_DONTNEED | + CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE; utime_t sleeptime; sleeptime.set_from_double(cct->_conf->osd_debug_deep_scrub_sleep); diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc index 2c5e37fca95ba0..efcb169a485c44 100644 --- a/src/osd/osd_types.cc +++ b/src/osd/osd_types.cc @@ -103,6 +103,9 @@ const char * ceph_osd_op_flag_name(unsigned flag) case CEPH_OSD_OP_FLAG_WITH_REFERENCE: name = "with_reference"; break; + case CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE: + name = "bypass_clean_cache"; + break; default: name = "???"; };