Skip to content

Commit

Permalink
Merge pull request #24647 from liewegas/wip-pr-23273-mimic
Browse files Browse the repository at this point in the history
mimic: os/bluestore: handle spurious read errors

Reviewed-by: Igor Fedotov <ifedotov@suse.com>
  • Loading branch information
yuriw committed Oct 29, 2018
2 parents da3aede + 34a9104 commit b20ee1a
Show file tree
Hide file tree
Showing 5 changed files with 112 additions and 4 deletions.
2 changes: 2 additions & 0 deletions src/common/legacy_config_opts.h
Original file line number Diff line number Diff line change
Expand Up @@ -971,6 +971,7 @@ OPTION(bluestore_block_wal_size, OPT_U64) // rocksdb wal
OPTION(bluestore_block_wal_create, OPT_BOOL)
OPTION(bluestore_block_preallocate_file, OPT_BOOL) //whether preallocate space if block/db_path/wal_path is file rather that block device.
OPTION(bluestore_csum_type, OPT_STR) // none|xxhash32|xxhash64|crc32c|crc32c_16|crc32c_8
OPTION(bluestore_retry_disk_reads, OPT_U64)
OPTION(bluestore_min_alloc_size, OPT_U32)
OPTION(bluestore_min_alloc_size_hdd, OPT_U32)
OPTION(bluestore_min_alloc_size_ssd, OPT_U32)
Expand Down Expand Up @@ -1066,6 +1067,7 @@ OPTION(bluestore_debug_permit_any_bdev_label, OPT_BOOL)
OPTION(bluestore_shard_finishers, OPT_BOOL)
OPTION(bluestore_debug_random_read_err, OPT_DOUBLE)
OPTION(bluestore_debug_inject_bug21040, OPT_BOOL)
OPTION(bluestore_debug_inject_csum_err_probability, OPT_FLOAT)

OPTION(kstore_max_ops, OPT_U64)
OPTION(kstore_max_bytes, OPT_U64)
Expand Down
12 changes: 12 additions & 0 deletions src/common/options.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3877,6 +3877,13 @@ std::vector<Option> get_global_options() {
.set_description("Default checksum algorithm to use")
.set_long_description("crc32c, xxhash32, and xxhash64 are available. The _16 and _8 variants use only a subset of the bits for more compact (but less reliable) checksumming."),

Option("bluestore_retry_disk_reads", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
.set_default(3)
.set_min_max(0, 255)
.set_flag(Option::FLAG_RUNTIME)
.set_description("Number of read retries on checksum validation error")
.set_long_description("Retries to read data from the disk this many times when checksum validation fails to handle spurious read errors gracefully."),

Option("bluestore_min_alloc_size", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
.set_default(0)
.set_flag(Option::FLAG_CREATE)
Expand Down Expand Up @@ -4253,6 +4260,11 @@ std::vector<Option> get_global_options() {
Option("bluestore_debug_inject_bug21040", Option::TYPE_BOOL, Option::LEVEL_DEV)
.set_default(false)
.set_description(""),

Option("bluestore_debug_inject_csum_err_probability", Option::TYPE_FLOAT, Option::LEVEL_DEV)
.set_default(0.0)
.set_description("inject crc verification errors into bluestore device reads"),

// -----------------------------------------
// kstore

Expand Down
35 changes: 32 additions & 3 deletions src/os/bluestore/BlueStore.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4157,6 +4157,8 @@ void BlueStore::_init_logger()
"collection");
b.add_u64_counter(l_bluestore_read_eio, "bluestore_read_eio",
"Read EIO errors propagated to high level callers");
b.add_u64_counter(l_bluestore_reads_with_retries, "bluestore_reads_with_retries",
"Read operations that required at least one retry due to failed checksum validation");
b.add_u64(l_bluestore_fragmentation, "bluestore_fragmentation_micros",
"How fragmented bluestore free space is (free extents / max possible number of free extents) * 1000");
logger = b.create_perf_counters();
Expand Down Expand Up @@ -7192,7 +7194,8 @@ int BlueStore::_do_read(
uint64_t offset,
size_t length,
bufferlist& bl,
uint32_t op_flags)
uint32_t op_flags,
uint64_t retry_count)
{
FUNCTRACE(cct);
int r = 0;
Expand Down Expand Up @@ -7413,7 +7416,14 @@ int BlueStore::_do_read(
bufferlist& compressed_bl = *p++;
if (_verify_csum(o, &bptr->get_blob(), 0, compressed_bl,
b2r_it->second.front().logical_offset) < 0) {
return -EIO;
// Handles spurious read errors caused by a kernel bug.
// We sometimes get all-zero pages as a result of the read under
// high memory pressure. Retrying the failing read succeeds in most cases.
// See also: http://tracker.ceph.com/issues/22464
if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
return -EIO;
}
return _do_read(c, o, offset, length, bl, op_flags, retry_count + 1);
}
bufferlist raw_bl;
r = _decompress(compressed_bl, &raw_bl);
Expand All @@ -7431,7 +7441,14 @@ int BlueStore::_do_read(
for (auto& reg : b2r_it->second) {
if (_verify_csum(o, &bptr->get_blob(), reg.r_off, reg.bl,
reg.logical_offset) < 0) {
return -EIO;
// Handles spurious read errors caused by a kernel bug.
// We sometimes get all-zero pages as a result of the read under
// high memory pressure. Retrying the failing read succeeds in most cases.
// See also: http://tracker.ceph.com/issues/22464
if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
return -EIO;
}
return _do_read(c, o, offset, length, bl, op_flags, retry_count + 1);
}
if (buffered) {
bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(),
Expand Down Expand Up @@ -7475,6 +7492,11 @@ int BlueStore::_do_read(
ceph_assert(pos == length);
ceph_assert(pr == pr_end);
r = bl.length();
if (retry_count) {
logger->inc(l_bluestore_reads_with_retries);
dout(5) << __func__ << " read at 0x" << std::hex << offset << "~" << length
<< " failed " << std::dec << retry_count << " times before succeeding" << dendl;
}
return r;
}

Expand All @@ -7487,6 +7509,13 @@ int BlueStore::_verify_csum(OnodeRef& o,
uint64_t bad_csum;
utime_t start = ceph_clock_now();
int r = blob->verify_csum(blob_xoffset, bl, &bad, &bad_csum);
if (cct->_conf->bluestore_debug_inject_csum_err_probability > 0 &&
(rand() % 10000) < cct->_conf->bluestore_debug_inject_csum_err_probability * 10000.0) {
derr << __func__ << " injecting bluestore checksum verifcation error" << dendl;
bad = blob_xoffset;
r = -1;
bad_csum = 0xDEADBEEF;
}
if (r < 0) {
if (r == -1) {
PExtentVector pex;
Expand Down
4 changes: 3 additions & 1 deletion src/os/bluestore/BlueStore.h
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,7 @@ enum {
l_bluestore_extent_compress,
l_bluestore_gc_merged,
l_bluestore_read_eio,
l_bluestore_reads_with_retries,
l_bluestore_fragmentation,
l_bluestore_last
};
Expand Down Expand Up @@ -2238,7 +2239,8 @@ class BlueStore : public ObjectStore,
uint64_t offset,
size_t len,
bufferlist& bl,
uint32_t op_flags = 0);
uint32_t op_flags = 0,
uint64_t retry_count = 0);

private:
int _fiemap(CollectionHandle &c_, const ghobject_t& oid,
Expand Down
63 changes: 63 additions & 0 deletions src/test/objectstore/store_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -7040,6 +7040,69 @@ TEST_P(StoreTest, BluestoreStatistics) {
f->flush(cout);
cout << std::endl;
}

TEST_P(StoreTest, SpuriousReadErrorTest) {
if (string(GetParam()) != "bluestore")
return;

int r;
auto logger = store->get_perf_counters();
coll_t cid;
auto ch = store->create_new_collection(cid);
ghobject_t hoid(hobject_t(sobject_t("foo", CEPH_NOSNAP)));
{
ObjectStore::Transaction t;
t.create_collection(cid, 0);
cerr << "Creating collection " << cid << std::endl;
r = queue_transaction(store, ch, std::move(t));
ASSERT_EQ(r, 0);
}
bufferlist test_data;
bufferptr ap(0x2000);
memset(ap.c_str(), 'a', 0x2000);
test_data.append(ap);
{
ObjectStore::Transaction t;
t.write(cid, hoid, 0, 0x2000, test_data);
r = queue_transaction(store, ch, std::move(t));
ASSERT_EQ(r, 0);
// force cache clear
EXPECT_EQ(store->umount(), 0);
EXPECT_EQ(store->mount(), 0);
}

cerr << "Injecting CRC error with no retry, expecting EIO" << std::endl;
SetVal(g_conf, "bluestore_retry_disk_reads", "0");
SetVal(g_conf, "bluestore_debug_inject_csum_err_probability", "1");
g_ceph_context->_conf->apply_changes(nullptr);
{
bufferlist in;
r = store->read(ch, hoid, 0, 0x2000, in, CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
ASSERT_EQ(-EIO, r);
ASSERT_EQ(logger->get(l_bluestore_read_eio), 1u);
ASSERT_EQ(logger->get(l_bluestore_reads_with_retries), 0u);
}

cerr << "Injecting CRC error with retries, expecting success after several retries" << std::endl;
SetVal(g_conf, "bluestore_retry_disk_reads", "255");
SetVal(g_conf, "bluestore_debug_inject_csum_err_probability", "0.8");
/**
* Probabilistic test: 25 reads, each has a 80% chance of failing with 255 retries
* Probability of at least one retried read: 1 - (0.2 ** 25) = 100% - 3e-18
* Probability of a random test failure: 1 - ((1 - (0.8 ** 255)) ** 25) ~= 5e-24
*/
g_ceph_context->_conf->apply_changes(nullptr);
{
for (int i = 0; i < 25; ++i) {
bufferlist in;
r = store->read(ch, hoid, 0, 0x2000, in, CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
ASSERT_EQ(0x2000, r);
ASSERT_TRUE(bl_eq(test_data, in));
}
ASSERT_GE(logger->get(l_bluestore_reads_with_retries), 1u);
}
}

#endif // WITH_BLUESTORE

int main(int argc, char **argv) {
Expand Down

0 comments on commit b20ee1a

Please sign in to comment.