Skip to content

Commit

Permalink
os/bluestore: fix deep-scrub operation againest disk silent errors
Browse files Browse the repository at this point in the history
Say a object who has data caches, but in a while later, caches' underlying
physical device has silent disk erros accidentally, then caches and physical
data are not same. In such case, deep-scrub operation still tries to read
caches firstly and won't do crc checksum, then deep-scrub won't find such
data corruptions timely.

Here introduce a new flag 'CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE' which tells
deep-scrub to bypass object caches. Note that we only bypass cache who is in
STATE_CLEAN state. For STATE_WRITING caches, currently they are not written
to physical device, so deep-scrub operation can not read physical device and
can read these dirty caches safely. Once they are in STATE_CLEAN state(or not
added to bluestore cache), next round deep-scurb can check them correctly.

As to above discussions, I refactor BlueStore::BufferSpace::read sightly,
adding a new 'flags' argument, whose value will be 0 or:
     enum {
       BYPASS_CLEAN_CACHE = 0x1,     // bypass clean cache
     };

flags 0: normal read, do not bypass clean or dirty cache
flags BYPASS_CLEAN_CACHE: bypass clean cache, currently only for deep-scrube
                        operation

Test:
   I deliberately corrupt a object with cache, with this patch, deep-scrub
   can find data error very timely.

Signed-off-by: Xiaoguang Wang <xiaoguang.wang@easystack.cn>
(cherry picked from commit a7f1af2)

Conflicts:
	src/include/rados.h
	src/os/bluestore/BlueStore.cc: trivial resolution
  • Loading branch information
wangxiaoguang authored and tchaikov committed Oct 29, 2018
1 parent da3b62d commit a7bcb26
Show file tree
Hide file tree
Showing 5 changed files with 31 additions and 5 deletions.
1 change: 1 addition & 0 deletions src/include/rados.h
Expand Up @@ -463,6 +463,7 @@ enum {
CEPH_OSD_OP_FLAG_FADVISE_WILLNEED = 0x10,/* data will be accessed in the near future */
CEPH_OSD_OP_FLAG_FADVISE_DONTNEED = 0x20,/* data will not be accessed in the near future */
CEPH_OSD_OP_FLAG_FADVISE_NOCACHE = 0x40, /* data will be accessed only once by this client */
CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE = 0x100, /* bypass ObjectStore cache, mainly for deep-scrub */
};

#define EOLDSNAPC 85 /* ORDERSNAP flag set; writer has old snapc*/
Expand Down
22 changes: 19 additions & 3 deletions src/os/bluestore/BlueStore.cc
Expand Up @@ -1302,7 +1302,8 @@ void BlueStore::BufferSpace::read(
uint32_t offset,
uint32_t length,
BlueStore::ready_regions_t& res,
interval_set<uint32_t>& res_intervals)
interval_set<uint32_t>& res_intervals,
int flags)
{
res.clear();
res_intervals.clear();
Expand All @@ -1316,7 +1317,13 @@ void BlueStore::BufferSpace::read(
++i) {
Buffer *b = i->second.get();
assert(b->end() > offset);
if (b->is_writing() || b->is_clean()) {

bool val = false;
if (flags & BYPASS_CLEAN_CACHE)
val = b->is_writing();
else
val = b->is_writing() || b->is_clean();
if (val) {
if (b->offset < offset) {
uint32_t skip = offset - b->offset;
uint32_t l = MIN(length, b->length - skip);
Expand Down Expand Up @@ -6693,6 +6700,7 @@ int BlueStore::_do_read(
{
FUNCTRACE();
int r = 0;
int read_cache_policy = 0; // do not bypass clean or dirty cache

dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
<< " size 0x" << o->onode.size << " (" << std::dec
Expand Down Expand Up @@ -6727,6 +6735,13 @@ int BlueStore::_do_read(

ready_regions_t ready_regions;

// for deep-scrub, we only read dirty cache and bypass clean cache in
// order to read underlying block device in case there are silent disk errors.
if (op_flags & CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE) {
dout(20) << __func__ << " will bypass cache and do direct read" << dendl;
read_cache_policy = BufferSpace::BYPASS_CLEAN_CACHE;
}

// build blob-wise list to of stuff read (that isn't cached)
blobs2read_t blobs2read;
unsigned left = length;
Expand All @@ -6752,7 +6767,8 @@ int BlueStore::_do_read(
ready_regions_t cache_res;
interval_set<uint32_t> cache_interval;
bptr->shared_blob->bc.read(
bptr->shared_blob->get_cache(), b_off, b_len, cache_res, cache_interval);
bptr->shared_blob->get_cache(), b_off, b_len, cache_res, cache_interval,
read_cache_policy);
dout(20) << __func__ << " blob " << *bptr << std::hex
<< " need 0x" << b_off << "~" << b_len
<< " cache has 0x" << cache_interval
Expand Down
7 changes: 6 additions & 1 deletion src/os/bluestore/BlueStore.h
Expand Up @@ -240,6 +240,10 @@ class BlueStore : public ObjectStore,

/// map logical extent range (object) onto buffers
struct BufferSpace {
enum {
BYPASS_CLEAN_CACHE = 0x1, // bypass clean cache
};

typedef boost::intrusive::list<
Buffer,
boost::intrusive::member_hook<
Expand Down Expand Up @@ -339,7 +343,8 @@ class BlueStore : public ObjectStore,

void read(Cache* cache, uint32_t offset, uint32_t length,
BlueStore::ready_regions_t& res,
interval_set<uint32_t>& res_intervals);
interval_set<uint32_t>& res_intervals,
int flags = 0);

void truncate(Cache* cache, uint32_t offset) {
discard(cache, offset, (uint32_t)-1 - offset);
Expand Down
3 changes: 2 additions & 1 deletion src/osd/ReplicatedBackend.cc
Expand Up @@ -712,7 +712,8 @@ int ReplicatedBackend::be_deep_scrub(
dout(10) << __func__ << " " << poid << " pos " << pos << dendl;
int r;
uint32_t fadvise_flags = CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
CEPH_OSD_OP_FLAG_FADVISE_DONTNEED |
CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE;

utime_t sleeptime;
sleeptime.set_from_double(cct->_conf->osd_debug_deep_scrub_sleep);
Expand Down
3 changes: 3 additions & 0 deletions src/osd/osd_types.cc
Expand Up @@ -100,6 +100,9 @@ const char * ceph_osd_op_flag_name(unsigned flag)
case CEPH_OSD_OP_FLAG_FADVISE_NOCACHE:
name = "fadvise_nocache";
break;
case CEPH_OSD_OP_FLAG_BYPASS_CLEAN_CACHE:
name = "bypass_clean_cache";
break;
default:
name = "???";
};
Expand Down

0 comments on commit a7bcb26

Please sign in to comment.