Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

mimic: os/bluestore: handle spurious read errors #24647

Merged
merged 1 commit into from Oct 29, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/common/legacy_config_opts.h
Expand Up @@ -971,6 +971,7 @@ OPTION(bluestore_block_wal_size, OPT_U64) // rocksdb wal
OPTION(bluestore_block_wal_create, OPT_BOOL)
OPTION(bluestore_block_preallocate_file, OPT_BOOL) //whether preallocate space if block/db_path/wal_path is file rather that block device.
OPTION(bluestore_csum_type, OPT_STR) // none|xxhash32|xxhash64|crc32c|crc32c_16|crc32c_8
OPTION(bluestore_retry_disk_reads, OPT_U64)
OPTION(bluestore_min_alloc_size, OPT_U32)
OPTION(bluestore_min_alloc_size_hdd, OPT_U32)
OPTION(bluestore_min_alloc_size_ssd, OPT_U32)
Expand Down Expand Up @@ -1066,6 +1067,7 @@ OPTION(bluestore_debug_permit_any_bdev_label, OPT_BOOL)
OPTION(bluestore_shard_finishers, OPT_BOOL)
OPTION(bluestore_debug_random_read_err, OPT_DOUBLE)
OPTION(bluestore_debug_inject_bug21040, OPT_BOOL)
OPTION(bluestore_debug_inject_csum_err_probability, OPT_FLOAT)

OPTION(kstore_max_ops, OPT_U64)
OPTION(kstore_max_bytes, OPT_U64)
Expand Down
12 changes: 12 additions & 0 deletions src/common/options.cc
Expand Up @@ -3877,6 +3877,13 @@ std::vector<Option> get_global_options() {
.set_description("Default checksum algorithm to use")
.set_long_description("crc32c, xxhash32, and xxhash64 are available. The _16 and _8 variants use only a subset of the bits for more compact (but less reliable) checksumming."),

Option("bluestore_retry_disk_reads", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
.set_default(3)
.set_min_max(0, 255)
.set_flag(Option::FLAG_RUNTIME)
.set_description("Number of read retries on checksum validation error")
.set_long_description("Retries to read data from the disk this many times when checksum validation fails to handle spurious read errors gracefully."),

Option("bluestore_min_alloc_size", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
.set_default(0)
.set_flag(Option::FLAG_CREATE)
Expand Down Expand Up @@ -4253,6 +4260,11 @@ std::vector<Option> get_global_options() {
Option("bluestore_debug_inject_bug21040", Option::TYPE_BOOL, Option::LEVEL_DEV)
.set_default(false)
.set_description(""),

Option("bluestore_debug_inject_csum_err_probability", Option::TYPE_FLOAT, Option::LEVEL_DEV)
.set_default(0.0)
.set_description("inject crc verification errors into bluestore device reads"),

// -----------------------------------------
// kstore

Expand Down
35 changes: 32 additions & 3 deletions src/os/bluestore/BlueStore.cc
Expand Up @@ -4157,6 +4157,8 @@ void BlueStore::_init_logger()
"collection");
b.add_u64_counter(l_bluestore_read_eio, "bluestore_read_eio",
"Read EIO errors propagated to high level callers");
b.add_u64_counter(l_bluestore_reads_with_retries, "bluestore_reads_with_retries",
"Read operations that required at least one retry due to failed checksum validation");
b.add_u64(l_bluestore_fragmentation, "bluestore_fragmentation_micros",
"How fragmented bluestore free space is (free extents / max possible number of free extents) * 1000");
logger = b.create_perf_counters();
Expand Down Expand Up @@ -7192,7 +7194,8 @@ int BlueStore::_do_read(
uint64_t offset,
size_t length,
bufferlist& bl,
uint32_t op_flags)
uint32_t op_flags,
uint64_t retry_count)
{
FUNCTRACE(cct);
int r = 0;
Expand Down Expand Up @@ -7413,7 +7416,14 @@ int BlueStore::_do_read(
bufferlist& compressed_bl = *p++;
if (_verify_csum(o, &bptr->get_blob(), 0, compressed_bl,
b2r_it->second.front().logical_offset) < 0) {
return -EIO;
// Handles spurious read errors caused by a kernel bug.
// We sometimes get all-zero pages as a result of the read under
// high memory pressure. Retrying the failing read succeeds in most cases.
// See also: http://tracker.ceph.com/issues/22464
if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
return -EIO;
}
return _do_read(c, o, offset, length, bl, op_flags, retry_count + 1);
}
bufferlist raw_bl;
r = _decompress(compressed_bl, &raw_bl);
Expand All @@ -7431,7 +7441,14 @@ int BlueStore::_do_read(
for (auto& reg : b2r_it->second) {
if (_verify_csum(o, &bptr->get_blob(), reg.r_off, reg.bl,
reg.logical_offset) < 0) {
return -EIO;
// Handles spurious read errors caused by a kernel bug.
// We sometimes get all-zero pages as a result of the read under
// high memory pressure. Retrying the failing read succeeds in most cases.
// See also: http://tracker.ceph.com/issues/22464
if (retry_count >= cct->_conf->bluestore_retry_disk_reads) {
return -EIO;
}
return _do_read(c, o, offset, length, bl, op_flags, retry_count + 1);
}
if (buffered) {
bptr->shared_blob->bc.did_read(bptr->shared_blob->get_cache(),
Expand Down Expand Up @@ -7475,6 +7492,11 @@ int BlueStore::_do_read(
ceph_assert(pos == length);
ceph_assert(pr == pr_end);
r = bl.length();
if (retry_count) {
logger->inc(l_bluestore_reads_with_retries);
dout(5) << __func__ << " read at 0x" << std::hex << offset << "~" << length
<< " failed " << std::dec << retry_count << " times before succeeding" << dendl;
}
return r;
}

Expand All @@ -7487,6 +7509,13 @@ int BlueStore::_verify_csum(OnodeRef& o,
uint64_t bad_csum;
utime_t start = ceph_clock_now();
int r = blob->verify_csum(blob_xoffset, bl, &bad, &bad_csum);
if (cct->_conf->bluestore_debug_inject_csum_err_probability > 0 &&
(rand() % 10000) < cct->_conf->bluestore_debug_inject_csum_err_probability * 10000.0) {
derr << __func__ << " injecting bluestore checksum verifcation error" << dendl;
bad = blob_xoffset;
r = -1;
bad_csum = 0xDEADBEEF;
}
if (r < 0) {
if (r == -1) {
PExtentVector pex;
Expand Down
4 changes: 3 additions & 1 deletion src/os/bluestore/BlueStore.h
Expand Up @@ -119,6 +119,7 @@ enum {
l_bluestore_extent_compress,
l_bluestore_gc_merged,
l_bluestore_read_eio,
l_bluestore_reads_with_retries,
l_bluestore_fragmentation,
l_bluestore_last
};
Expand Down Expand Up @@ -2238,7 +2239,8 @@ class BlueStore : public ObjectStore,
uint64_t offset,
size_t len,
bufferlist& bl,
uint32_t op_flags = 0);
uint32_t op_flags = 0,
uint64_t retry_count = 0);

private:
int _fiemap(CollectionHandle &c_, const ghobject_t& oid,
Expand Down
63 changes: 63 additions & 0 deletions src/test/objectstore/store_test.cc
Expand Up @@ -7032,6 +7032,69 @@ TEST_P(StoreTest, BluestoreStatistics) {
f->flush(cout);
cout << std::endl;
}

TEST_P(StoreTest, SpuriousReadErrorTest) {
if (string(GetParam()) != "bluestore")
return;

int r;
auto logger = store->get_perf_counters();
coll_t cid;
auto ch = store->create_new_collection(cid);
ghobject_t hoid(hobject_t(sobject_t("foo", CEPH_NOSNAP)));
{
ObjectStore::Transaction t;
t.create_collection(cid, 0);
cerr << "Creating collection " << cid << std::endl;
r = queue_transaction(store, ch, std::move(t));
ASSERT_EQ(r, 0);
}
bufferlist test_data;
bufferptr ap(0x2000);
memset(ap.c_str(), 'a', 0x2000);
test_data.append(ap);
{
ObjectStore::Transaction t;
t.write(cid, hoid, 0, 0x2000, test_data);
r = queue_transaction(store, ch, std::move(t));
ASSERT_EQ(r, 0);
// force cache clear
EXPECT_EQ(store->umount(), 0);
EXPECT_EQ(store->mount(), 0);
}

cerr << "Injecting CRC error with no retry, expecting EIO" << std::endl;
SetVal(g_conf, "bluestore_retry_disk_reads", "0");
SetVal(g_conf, "bluestore_debug_inject_csum_err_probability", "1");
g_ceph_context->_conf->apply_changes(nullptr);
{
bufferlist in;
r = store->read(ch, hoid, 0, 0x2000, in, CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
ASSERT_EQ(-EIO, r);
ASSERT_EQ(logger->get(l_bluestore_read_eio), 1u);
ASSERT_EQ(logger->get(l_bluestore_reads_with_retries), 0u);
}

cerr << "Injecting CRC error with retries, expecting success after several retries" << std::endl;
SetVal(g_conf, "bluestore_retry_disk_reads", "255");
SetVal(g_conf, "bluestore_debug_inject_csum_err_probability", "0.8");
/**
* Probabilistic test: 25 reads, each has a 80% chance of failing with 255 retries
* Probability of at least one retried read: 1 - (0.2 ** 25) = 100% - 3e-18
* Probability of a random test failure: 1 - ((1 - (0.8 ** 255)) ** 25) ~= 5e-24
*/
g_ceph_context->_conf->apply_changes(nullptr);
{
for (int i = 0; i < 25; ++i) {
bufferlist in;
r = store->read(ch, hoid, 0, 0x2000, in, CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
ASSERT_EQ(0x2000, r);
ASSERT_TRUE(bl_eq(test_data, in));
}
ASSERT_GE(logger->get(l_bluestore_reads_with_retries), 1u);
}
}

#endif // WITH_BLUESTORE

int main(int argc, char **argv) {
Expand Down