From 5075b27f007a63e097d4c442f3b3160f086f164a Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 3 Apr 2017 15:46:45 -0400 Subject: [PATCH 1/5] os/bluestore: release throttle before commit Signed-off-by: Sage Weil --- src/os/bluestore/BlueStore.cc | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index b77f271ba8672..5edb8e347a83f 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -7497,8 +7497,6 @@ void BlueStore::_txc_committed_kv(TransContext *txc) if (!txc->oncommits.empty()) { finishers[n]->queue(txc->oncommits); } - throttle_ops.put(txc->ops); - throttle_bytes.put(txc->bytes); } void BlueStore::_txc_finish(TransContext *txc) @@ -7786,12 +7784,19 @@ void BlueStore::_kv_sync_thread() } } } - if (num_aios) { - for (auto txc : kv_committing) { - if (txc->had_ios) { - --txc->osr->txc_with_unstable_io; - } + for (auto txc : kv_committing) { + if (txc->had_ios) { + --txc->osr->txc_with_unstable_io; } + + // release throttle *before* we commit. this allows new ops + // to be prepared and enter pipeline while we are waiting on + // the kv commit sync/flush. then hopefully on the next + // iteration there will already be ops awake. otherwise, we + // end up going to sleep, and then wake up when the very first + // transaction is ready for commit. + throttle_ops.put(txc->ops); + throttle_bytes.put(txc->bytes); } PExtentVector bluefs_gift_extents; From b4b800f3e27895ddd801e8f50eb62264db7586fa Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 3 Apr 2017 16:44:42 -0400 Subject: [PATCH 2/5] os/bluestore: unify throttling model Implement a super simple model for the cost of a transaction for the purposes of throttling. This replaces two independent throttles (one for transaction ops, one for bytes) and puts them both under the 'bytes' throttle. The txc model cost is expressed in terms of bytes as this is probably the simplest thing for users to reason about. Signed-off-by: Sage Weil --- src/common/config_opts.h | 3 ++ src/os/bluestore/BlueStore.cc | 77 +++++++++++++++++++++-------------- src/os/bluestore/BlueStore.h | 10 +++-- 3 files changed, 57 insertions(+), 33 deletions(-) diff --git a/src/common/config_opts.h b/src/common/config_opts.h index cf59928e03b63..4c07b5b5040ca 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -1106,6 +1106,9 @@ OPTION(bluestore_fsck_on_mkfs_deep, OPT_BOOL, false) OPTION(bluestore_sync_submit_transaction, OPT_BOOL, false) // submit kv txn in queueing thread (not kv_sync_thread) OPTION(bluestore_max_ops, OPT_U64, 512) OPTION(bluestore_max_bytes, OPT_U64, 64*1024*1024) +OPTION(bluestore_throttle_cost_per_io_hdd, OPT_U64, 200000) +OPTION(bluestore_throttle_cost_per_io_ssd, OPT_U64, 4000) +OPTION(bluestore_throttle_cost_per_io, OPT_U64, 0) OPTION(bluestore_deferred_max_ops, OPT_U64, 512) OPTION(bluestore_deferred_max_bytes, OPT_U64, 128*1024*1024) OPTION(bluestore_deferred_batch_ops, OPT_U64, 8) diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index 5edb8e347a83f..09c30c2b56b4f 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -3170,11 +3170,7 @@ static void aio_cb(void *priv, void *priv2) BlueStore::BlueStore(CephContext *cct, const string& path) : ObjectStore(cct, path), - throttle_ops(cct, "bluestore_max_ops", cct->_conf->bluestore_max_ops), throttle_bytes(cct, "bluestore_max_bytes", cct->_conf->bluestore_max_bytes), - throttle_deferred_ops(cct, "bluestore_deferred_max_ops", - cct->_conf->bluestore_max_ops + - cct->_conf->bluestore_deferred_max_ops), throttle_deferred_bytes(cct, "bluestore_deferred_max_bytes", cct->_conf->bluestore_max_bytes + cct->_conf->bluestore_deferred_max_bytes), @@ -3201,11 +3197,7 @@ BlueStore::BlueStore(CephContext *cct, const string& path, uint64_t _min_alloc_size) : ObjectStore(cct, path), - throttle_ops(cct, "bluestore_max_ops", cct->_conf->bluestore_max_ops), throttle_bytes(cct, "bluestore_max_bytes", cct->_conf->bluestore_max_bytes), - throttle_deferred_ops(cct, "bluestore_deferred_max_ops", - cct->_conf->bluestore_max_ops + - cct->_conf->bluestore_deferred_max_ops), throttle_deferred_bytes(cct, "bluestore_deferred_max_bytes", cct->_conf->bluestore_max_bytes + cct->_conf->bluestore_deferred_max_bytes), @@ -3288,20 +3280,18 @@ void BlueStore::handle_conf_change(const struct md_config_t *conf, _set_alloc_sizes(); } } - if (changed.count("bluestore_max_ops")) { - throttle_ops.reset_max(conf->bluestore_max_ops); - throttle_deferred_ops.reset_max( - conf->bluestore_max_ops + conf->bluestore_deferred_max_ops); + if (changed.count("bluestore_throttle_cost_per_io") || + changed.count("bluestore_throttle_cost_per_io_hdd") || + changed.count("bluestore_throttle_cost_per_io_ssd")) { + if (bdev) { + _set_throttle_params(); + } } if (changed.count("bluestore_max_bytes")) { throttle_bytes.reset_max(conf->bluestore_max_bytes); throttle_deferred_bytes.reset_max( conf->bluestore_max_bytes + conf->bluestore_deferred_max_bytes); } - if (changed.count("bluestore_deferred_max_ops")) { - throttle_deferred_ops.reset_max( - conf->bluestore_max_ops + conf->bluestore_deferred_max_ops); - } if (changed.count("bluestore_deferred_max_bytes")) { throttle_deferred_bytes.reset_max( conf->bluestore_max_bytes + conf->bluestore_deferred_max_bytes); @@ -3352,6 +3342,23 @@ void BlueStore::_set_csum() << dendl; } +void BlueStore::_set_throttle_params() +{ + if (cct->_conf->bluestore_throttle_cost_per_io) { + throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io; + } else { + assert(bdev); + if (bdev->is_rotational()) { + throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_hdd; + } else { + throttle_cost_per_io = cct->_conf->bluestore_throttle_cost_per_io_ssd; + } + } + + dout(10) << __func__ << " throttle_cost_per_io " << throttle_cost_per_io + << dendl; +} + void BlueStore::_init_logger() { PerfCountersBuilder b(cct, "bluestore", @@ -7067,6 +7074,7 @@ int BlueStore::_open_super_meta() << std::dec << dendl; } _set_alloc_sizes(); + _set_throttle_params(); return 0; } @@ -7145,6 +7153,23 @@ BlueStore::TransContext *BlueStore::_txc_create(OpSequencer *osr) return txc; } +void BlueStore::_txc_calc_cost(TransContext *txc) +{ + // this is about the simplest model for trasnaction cost you can + // imagine. there is some fixed overhead cost by saying there is a + // minimum of one "io". and then we have some cost per "io" that is + // a configurable (with different hdd and ssd defaults), and add + // that to the bytes value. + int ios = 1; // one "io" for the kv commit + for (auto& p : txc->ioc.pending_aios) { + ios += p.iov.size(); + } + txc->cost = ios * throttle_cost_per_io + txc->bytes; + dout(10) << __func__ << " " << txc << " cost " << txc->cost << " (" + << ios << " ios * " << throttle_cost_per_io << " + " << txc->bytes + << " bytes)" << dendl; +} + void BlueStore::_txc_update_store_statfs(TransContext *txc) { if (txc->statfs_delta.is_empty()) @@ -7795,8 +7820,7 @@ void BlueStore::_kv_sync_thread() // iteration there will already be ops awake. otherwise, we // end up going to sleep, and then wake up when the very first // transaction is ready for commit. - throttle_ops.put(txc->ops); - throttle_bytes.put(txc->bytes); + throttle_bytes.put(txc->cost); } PExtentVector bluefs_gift_extents; @@ -7873,7 +7897,6 @@ void BlueStore::_kv_sync_thread() if (!deferred_aggressive) { std::lock_guard l(deferred_lock); if (deferred_queue_size >= (int)g_conf->bluestore_deferred_batch_ops || - throttle_deferred_ops.past_midpoint() || throttle_deferred_bytes.past_midpoint()) { _deferred_try_submit(); } @@ -8026,8 +8049,7 @@ int BlueStore::_deferred_finish(TransContext *txc) TransContext *txc = &i; txc->state = TransContext::STATE_DEFERRED_CLEANUP; txc->osr->qcond.notify_all(); - throttle_deferred_ops.put(txc->ops); - throttle_deferred_bytes.put(txc->bytes); + throttle_deferred_bytes.put(txc->cost); deferred_done_queue.push_back(txc); } finished.clear(); @@ -8119,10 +8141,10 @@ int BlueStore::queue_transactions( for (vector::iterator p = tls.begin(); p != tls.end(); ++p) { (*p).set_osr(osr); - txc->ops += (*p).get_num_ops(); txc->bytes += (*p).get_num_bytes(); _txc_add_transaction(txc, &(*p)); } + _txc_calc_cost(txc); _txc_write_nodes(txc, txc->t); @@ -8140,17 +8162,12 @@ int BlueStore::queue_transactions( handle->suspend_tp_timeout(); utime_t tstart = ceph_clock_now(); - throttle_ops.get(txc->ops); - throttle_bytes.get(txc->bytes); + throttle_bytes.get(txc->cost); if (txc->deferred_txn) { // ensure we do not block here because of deferred writes - if (!throttle_deferred_ops.get_or_fail(txc->ops)) { - deferred_try_submit(); - throttle_deferred_ops.get(txc->ops); - } - if (!throttle_deferred_bytes.get_or_fail(txc->bytes)) { + if (!throttle_deferred_bytes.get_or_fail(txc->cost)) { deferred_try_submit(); - throttle_deferred_bytes.get(txc->bytes); + throttle_deferred_bytes.get(txc->cost); } } utime_t tend = ceph_clock_now(); diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h index 8543b8ed13139..818c1897ba728 100644 --- a/src/os/bluestore/BlueStore.h +++ b/src/os/bluestore/BlueStore.h @@ -124,6 +124,7 @@ class BlueStore : public ObjectStore, void _set_csum(); void _set_compression(); + void _set_throttle_params(); class TransContext; @@ -1439,7 +1440,7 @@ class BlueStore : public ObjectStore, OpSequencerRef osr; boost::intrusive::list_member_hook<> sequencer_item; - uint64_t ops = 0, bytes = 0; + uint64_t bytes = 0, cost = 0; set onodes; ///< these need to be updated/written set modified_objects; ///< objects we modified (and need a ref) @@ -1745,8 +1746,8 @@ class BlueStore : public ObjectStore, std::atomic blobid_last = {0}; std::atomic blobid_max = {0}; - Throttle throttle_ops, throttle_bytes; ///< submit to commit - Throttle throttle_deferred_ops, throttle_deferred_bytes; ///< submit to deferred complete + Throttle throttle_bytes; ///< submit to commit + Throttle throttle_deferred_bytes; ///< submit to deferred complete interval_set bluefs_extents; ///< block extents owned by bluefs interval_set bluefs_extents_reclaiming; ///< currently reclaiming @@ -1791,6 +1792,8 @@ class BlueStore : public ObjectStore, uint64_t max_alloc_size = 0; ///< maximum allocation unit (power of 2) + uint64_t throttle_cost_per_io = 0; ///< approx cost per io, in bytes + std::atomic comp_mode = {Compressor::COMP_NONE}; ///< compression mode CompressorRef compressor; std::atomic comp_min_blob_size = {0}; @@ -1892,6 +1895,7 @@ class BlueStore : public ObjectStore, TransContext *_txc_create(OpSequencer *osr); void _txc_update_store_statfs(TransContext *txc); void _txc_add_transaction(TransContext *txc, Transaction *t); + void _txc_calc_cost(TransContext *txc); void _txc_write_nodes(TransContext *txc, KeyValueDB::Transaction t); void _txc_state_proc(TransContext *txc); void _txc_aio_submit(TransContext *txc); From 36dd54989323cb01914ff341689c8561061128dd Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 3 Apr 2017 16:53:33 -0400 Subject: [PATCH 3/5] os/bluestore: bluestore_deferred_batch_ops = 32 (not 8) A bit more batching. Signed-off-by: Sage Weil --- src/common/config_opts.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common/config_opts.h b/src/common/config_opts.h index 4c07b5b5040ca..7891e8faa9de2 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -1111,7 +1111,7 @@ OPTION(bluestore_throttle_cost_per_io_ssd, OPT_U64, 4000) OPTION(bluestore_throttle_cost_per_io, OPT_U64, 0) OPTION(bluestore_deferred_max_ops, OPT_U64, 512) OPTION(bluestore_deferred_max_bytes, OPT_U64, 128*1024*1024) -OPTION(bluestore_deferred_batch_ops, OPT_U64, 8) +OPTION(bluestore_deferred_batch_ops, OPT_U64, 32) OPTION(bluestore_nid_prealloc, OPT_INT, 1024) OPTION(bluestore_blobid_prealloc, OPT_U64, 10240) OPTION(bluestore_clone_cow, OPT_BOOL, true) // do copy-on-write for clones From 4652f34c5feb6f22f207fd08ad07410fff2a9d2b Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 3 Apr 2017 16:54:23 -0400 Subject: [PATCH 4/5] os/bluestore: do not wake kv thread if only deferred events pending No need to wake up if there is only deferred work; we can do it lazily. Signed-off-by: Sage Weil --- src/os/bluestore/BlueStore.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index 09c30c2b56b4f..8bdee10a504c5 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -7701,8 +7701,8 @@ void BlueStore::_kv_sync_thread() while (true) { assert(kv_committing.empty()); if (kv_queue.empty() && - deferred_done_queue.empty() && - deferred_stable_queue.empty()) { + ((deferred_done_queue.empty() && deferred_stable_queue.empty()) || + !deferred_aggressive)) { if (kv_stop) break; dout(20) << __func__ << " sleep" << dendl; From c38e83f69a2393350e5e90865c41a33bd5e5b046 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 7 Apr 2017 11:31:21 -0400 Subject: [PATCH 5/5] os/bluestore: make deferred_aggressive a counter Multiple threads may run _osr_drain_preceding; use a count to prevent the first finisher from prematurely disabling aggressive mode. Fixes: http://tracker.ceph.com/issues/19542 Signed-off-by: Sage Weil --- src/os/bluestore/BlueStore.cc | 8 ++++---- src/os/bluestore/BlueStore.h | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index 8bdee10a504c5..599d6793240ab 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -7619,7 +7619,7 @@ void BlueStore::_osr_drain_preceding(TransContext *txc) { OpSequencer *osr = txc->osr.get(); dout(10) << __func__ << " " << txc << " osr " << osr << dendl; - deferred_aggressive = true; // FIXME: maybe osr-local aggressive flag? + ++deferred_aggressive; // FIXME: maybe osr-local aggressive flag? { // submit anything pending std::lock_guard l(deferred_lock); @@ -7633,7 +7633,7 @@ void BlueStore::_osr_drain_preceding(TransContext *txc) kv_cond.notify_one(); } osr->drain_preceding(txc); - deferred_aggressive = false; + --deferred_aggressive; dout(10) << __func__ << " " << osr << " done" << dendl; } @@ -7648,7 +7648,7 @@ void BlueStore::_osr_drain_all() } dout(20) << __func__ << " osr_set " << s << dendl; - deferred_aggressive = true; + ++deferred_aggressive; { // submit anything pending std::lock_guard l(deferred_lock); @@ -7663,7 +7663,7 @@ void BlueStore::_osr_drain_all() dout(20) << __func__ << " drain " << osr << dendl; osr->drain(); } - deferred_aggressive = false; + --deferred_aggressive; dout(10) << __func__ << " done" << dendl; } diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h index 818c1897ba728..475ccc9d93a0b 100644 --- a/src/os/bluestore/BlueStore.h +++ b/src/os/bluestore/BlueStore.h @@ -1756,7 +1756,7 @@ class BlueStore : public ObjectStore, std::atomic deferred_seq = {0}; deferred_osr_queue_t deferred_queue; ///< osr's with deferred io pending int deferred_queue_size = 0; ///< num txc's queued across all osrs - atomic_bool deferred_aggressive = {false}; ///< aggressive wakeup of kv thread + atomic_int deferred_aggressive = {0}; ///< aggressive wakeup of kv thread int m_finisher_num = 1; vector finishers;