From 8240be8f324b244061b17bb2c1676b192a12dca2 Mon Sep 17 00:00:00 2001 From: Xuehan Xu Date: Tue, 13 Sep 2022 15:50:35 +0800 Subject: [PATCH 01/21] crimson/os/seastore/cached_extent: record transaction id in pending CachedExtents Signed-off-by: Xuehan Xu --- src/crimson/os/seastore/cache.cc | 9 +++- src/crimson/os/seastore/cache.h | 14 +++-- src/crimson/os/seastore/cached_extent.h | 54 +++++++++++++++++-- src/crimson/os/seastore/logging.h | 6 ++- src/crimson/os/seastore/seastore_types.h | 3 ++ src/crimson/os/seastore/transaction.h | 16 ++++-- src/crimson/os/seastore/transaction_manager.h | 3 +- 7 files changed, 90 insertions(+), 15 deletions(-) diff --git a/src/crimson/os/seastore/cache.cc b/src/crimson/os/seastore/cache.cc index 86c3730fd8b4d..65c0f8ef5fe40 100644 --- a/src/crimson/os/seastore/cache.cc +++ b/src/crimson/os/seastore/cache.cc @@ -98,7 +98,8 @@ Cache::retire_extent_ret Cache::retire_extent_addr( ext->init(CachedExtent::extent_state_t::CLEAN, addr, PLACEMENT_HINT_NULL, - NULL_GENERATION); + NULL_GENERATION, + TRANS_ID_NULL); DEBUGT("retire {}~{} as placeholder, add extent -- {}", t, addr, length, *ext); const auto t_src = t.get_src(); @@ -1012,6 +1013,7 @@ CachedExtentRef Cache::duplicate_for_write( } auto ret = i->duplicate_for_write(); + ret->pending_for_transaction = t.get_trans_id(); ret->prior_instance = i; // duplicate_for_write won't occur after ool write finished assert(!i->prior_poffset); @@ -1448,6 +1450,7 @@ void Cache::complete_commit( i->set_paddr(final_block_start.add_relative(i->get_paddr())); } i->last_committed_crc = i->get_crc32c(); + i->pending_for_transaction = TRANS_ID_NULL; i->on_initial_write(); i->state = CachedExtent::extent_state_t::CLEAN; @@ -1489,6 +1492,7 @@ void Cache::complete_commit( assert(i->is_exist_mutation_pending() || i->prior_instance); i->on_delta_write(final_block_start); + i->pending_for_transaction = TRANS_ID_NULL; i->prior_instance = CachedExtentRef(); i->state = CachedExtent::extent_state_t::DIRTY; 
assert(i->version > 0); @@ -1593,7 +1597,8 @@ void Cache::init() root->init(CachedExtent::extent_state_t::CLEAN, P_ADDR_ROOT, PLACEMENT_HINT_NULL, - NULL_GENERATION); + NULL_GENERATION, + TRANS_ID_NULL); INFO("init root -- {}", *root); extents.insert(*root); } diff --git a/src/crimson/os/seastore/cache.h b/src/crimson/os/seastore/cache.h index 253836276675d..3b73594157f8b 100644 --- a/src/crimson/os/seastore/cache.h +++ b/src/crimson/os/seastore/cache.h @@ -205,7 +205,8 @@ class Cache { last_commit, [this](Transaction& t) { return on_transaction_destruct(t); - } + }, + ++next_id ); SUBDEBUGT(seastore_t, "created name={}, source={}, is_weak={}", *ret, name, src, is_weak); @@ -284,7 +285,8 @@ class Cache { ret->init(CachedExtent::extent_state_t::CLEAN_PENDING, offset, PLACEMENT_HINT_NULL, - NULL_GENERATION); + NULL_GENERATION, + TRANS_ID_NULL); SUBDEBUG(seastore_cache, "{} {}~{} is absent, add extent and reading ... -- {}", T::TYPE, offset, length, *ret); @@ -303,7 +305,8 @@ class Cache { ret->init(CachedExtent::extent_state_t::CLEAN_PENDING, offset, PLACEMENT_HINT_NULL, - NULL_GENERATION); + NULL_GENERATION, + TRANS_ID_NULL); SUBDEBUG(seastore_cache, "{} {}~{} is absent(placeholder), reading ... 
-- {}", T::TYPE, offset, length, *ret); @@ -648,7 +651,8 @@ class Cache { ret->init(CachedExtent::extent_state_t::INITIAL_WRITE_PENDING, result.paddr, hint, - result.gen); + result.gen, + t.get_trans_id()); t.add_fresh_extent(ret); SUBDEBUGT(seastore_cache, "allocated {} {}B extent at {}, hint={}, gen={} -- {}", @@ -990,6 +994,8 @@ class Cache { // FIXME: This is specific to the segmented implementation std::vector segment_providers_by_device_id; + transaction_id_t next_id = 0; + /** * dirty * diff --git a/src/crimson/os/seastore/cached_extent.h b/src/crimson/os/seastore/cached_extent.h index 5ccafc5fa1961..7f76c26145302 100644 --- a/src/crimson/os/seastore/cached_extent.h +++ b/src/crimson/os/seastore/cached_extent.h @@ -76,9 +76,55 @@ using read_set_t = std::set< read_set_item_t, typename read_set_item_t::cmp_t>; +struct trans_spec_view_t { + // if the extent is pending, contains the id of the owning transaction; + // TRANS_ID_NULL otherwise + transaction_id_t pending_for_transaction = TRANS_ID_NULL; + + struct cmp_t { + bool operator()( + const trans_spec_view_t &lhs, + const trans_spec_view_t &rhs) const + { + return lhs.pending_for_transaction < rhs.pending_for_transaction; + } + bool operator()( + const transaction_id_t &lhs, + const trans_spec_view_t &rhs) const + { + return lhs < rhs.pending_for_transaction; + } + bool operator()( + const trans_spec_view_t &lhs, + const transaction_id_t &rhs) const + { + return lhs.pending_for_transaction < rhs; + } + }; + + using trans_view_hook_t = + boost::intrusive::set_member_hook< + boost::intrusive::link_mode< + boost::intrusive::auto_unlink>>; + trans_view_hook_t trans_view_hook; + + using trans_view_member_options = + boost::intrusive::member_hook< + trans_spec_view_t, + trans_view_hook_t, + &trans_spec_view_t::trans_view_hook>; + using trans_view_set_t = boost::intrusive::set< + trans_spec_view_t, + trans_view_member_options, + boost::intrusive::constant_time_size, + boost::intrusive::compare>; +}; + class 
ExtentIndex; -class CachedExtent : public boost::intrusive_ref_counter< - CachedExtent, boost::thread_unsafe_counter> { +class CachedExtent + : public boost::intrusive_ref_counter< + CachedExtent, boost::thread_unsafe_counter>, + public trans_spec_view_t { enum class extent_state_t : uint8_t { INITIAL_WRITE_PENDING, // In Transaction::write_set and fresh_block_list MUTATION_PENDING, // In Transaction::write_set and mutated_block_list @@ -117,12 +163,14 @@ class CachedExtent : public boost::intrusive_ref_counter< void init(extent_state_t _state, paddr_t paddr, placement_hint_t hint, - rewrite_gen_t gen) { + rewrite_gen_t gen, + transaction_id_t trans_id) { assert(gen == NULL_GENERATION || is_rewrite_generation(gen)); state = _state; set_paddr(paddr); user_hint = hint; rewrite_generation = gen; + pending_for_transaction = trans_id; } void set_modify_time(sea_time_point t) { diff --git a/src/crimson/os/seastore/logging.h b/src/crimson/os/seastore/logging.h index ec9b7df5ed181..3f12ee72cfed7 100644 --- a/src/crimson/os/seastore/logging.h +++ b/src/crimson/os/seastore/logging.h @@ -8,9 +8,11 @@ #include "crimson/common/log.h" #define LOGT(level_, MSG, t, ...) \ - LOCAL_LOGGER.log(level_, "{} {}: " MSG, (void*)&t, FNAME , ##__VA_ARGS__) + LOCAL_LOGGER.log(level_, "{} trans.{} {}: " MSG, (void*)&t, \ + (t).get_trans_id(), FNAME , ##__VA_ARGS__) #define SUBLOGT(subname_, level_, MSG, t, ...) \ - LOGGER(subname_).log(level_, "{} {}: " MSG, (void*)&t, FNAME , ##__VA_ARGS__) + LOGGER(subname_).log(level_, "{} trans.{} {}: " MSG, (void*)&t, \ + (t).get_trans_id(), FNAME , ##__VA_ARGS__) #define TRACET(...) LOGT(seastar::log_level::trace, __VA_ARGS__) #define SUBTRACET(subname_, ...) 
SUBLOGT(subname_, seastar::log_level::trace, __VA_ARGS__) diff --git a/src/crimson/os/seastore/seastore_types.h b/src/crimson/os/seastore/seastore_types.h index 99d17472eb3c9..a1efc729b96fa 100644 --- a/src/crimson/os/seastore/seastore_types.h +++ b/src/crimson/os/seastore/seastore_types.h @@ -24,6 +24,9 @@ namespace crimson::os::seastore { /* using a special xattr key "omap_header" to store omap header */ const std::string OMAP_HEADER_XATTR_KEY = "omap_header"; +using transaction_id_t = uint64_t; +constexpr transaction_id_t TRANS_ID_NULL = 0; + /* * Note: NULL value is usually the default and max value. */ diff --git a/src/crimson/os/seastore/transaction.h b/src/crimson/os/seastore/transaction.h index 8899e105cefc5..e7eef11427a81 100644 --- a/src/crimson/os/seastore/transaction.h +++ b/src/crimson/os/seastore/transaction.h @@ -337,11 +337,13 @@ class Transaction { bool weak, src_t src, journal_seq_t initiated_after, - on_destruct_func_t&& f + on_destruct_func_t&& f, + transaction_id_t trans_id ) : weak(weak), handle(std::move(handle)), on_destruct(std::move(f)), - src(src) + src(src), + trans_id(trans_id) {} void invalidate_clear_write_set() { @@ -468,6 +470,10 @@ class Transaction { return existing_block_stats; } + transaction_id_t get_trans_id() const { + return trans_id; + } + private: friend class Cache; friend Ref make_test_transaction(); @@ -553,17 +559,21 @@ class Transaction { on_destruct_func_t on_destruct; const src_t src; + + transaction_id_t trans_id = TRANS_ID_NULL; }; using TransactionRef = Transaction::Ref; /// Should only be used with dummy staged-fltree node extent manager inline TransactionRef make_test_transaction() { + static transaction_id_t next_id = 0; return std::make_unique( get_dummy_ordering_handle(), false, Transaction::src_t::MUTATE, JOURNAL_SEQ_NULL, - [](Transaction&) {} + [](Transaction&) {}, + ++next_id ); } diff --git a/src/crimson/os/seastore/transaction_manager.h b/src/crimson/os/seastore/transaction_manager.h index 
47dcfc7129446..6cb18c560fab4 100644 --- a/src/crimson/os/seastore/transaction_manager.h +++ b/src/crimson/os/seastore/transaction_manager.h @@ -371,7 +371,8 @@ class TransactionManager : public ExtentCallbackInterface { ext->init(CachedExtent::extent_state_t::EXIST_CLEAN, existing_paddr, PLACEMENT_HINT_NULL, - NULL_GENERATION); + NULL_GENERATION, + t.get_trans_id()); t.add_fresh_extent(ext); From 06ae16141470a9052224fbe76c842fa13bbf4520 Mon Sep 17 00:00:00 2001 From: Xuehan Xu Date: Tue, 13 Sep 2022 15:53:53 +0800 Subject: [PATCH 02/21] crimson/os/seastore/cached_extent: duplicate_for_write accepts the transaction ref as its parameter Signed-off-by: Xuehan Xu --- src/crimson/os/seastore/btree/fixed_kv_node.h | 4 ++-- src/crimson/os/seastore/cache.cc | 2 +- src/crimson/os/seastore/cached_extent.cc | 1 + src/crimson/os/seastore/cached_extent.h | 4 ++-- .../os/seastore/collection_manager/collection_flat_node.h | 2 +- src/crimson/os/seastore/object_data_handler.h | 2 +- .../os/seastore/omap_manager/btree/omap_btree_node_impl.h | 4 ++-- .../onode_manager/staged-fltree/node_extent_manager/dummy.h | 2 +- .../staged-fltree/node_extent_manager/seastore.h | 2 +- .../staged-fltree/node_extent_manager/test_replay.h | 2 +- src/crimson/os/seastore/root_block.h | 2 +- src/test/crimson/seastore/test_block.h | 4 ++-- 12 files changed, 16 insertions(+), 15 deletions(-) diff --git a/src/crimson/os/seastore/btree/fixed_kv_node.h b/src/crimson/os/seastore/btree/fixed_kv_node.h index 5658d0bf5d40f..75628f6fcbff7 100644 --- a/src/crimson/os/seastore/btree/fixed_kv_node.h +++ b/src/crimson/os/seastore/btree/fixed_kv_node.h @@ -109,7 +109,7 @@ struct FixedKVInternalNode ? &delta_buffer : nullptr; } - CachedExtentRef duplicate_for_write() override { + CachedExtentRef duplicate_for_write(Transaction&) override { assert(delta_buffer.empty()); return CachedExtentRef(new node_type_t(*this)); }; @@ -338,7 +338,7 @@ struct FixedKVLeafNode return this->is_mutation_pending() ? 
&delta_buffer : nullptr; } - CachedExtentRef duplicate_for_write() override { + CachedExtentRef duplicate_for_write(Transaction&) override { assert(delta_buffer.empty()); return CachedExtentRef(new node_type_t(*this)); }; diff --git a/src/crimson/os/seastore/cache.cc b/src/crimson/os/seastore/cache.cc index 65c0f8ef5fe40..8209e37499404 100644 --- a/src/crimson/os/seastore/cache.cc +++ b/src/crimson/os/seastore/cache.cc @@ -1012,7 +1012,7 @@ CachedExtentRef Cache::duplicate_for_write( return i; } - auto ret = i->duplicate_for_write(); + auto ret = i->duplicate_for_write(t); ret->pending_for_transaction = t.get_trans_id(); ret->prior_instance = i; // duplicate_for_write won't occur after ool write finished diff --git a/src/crimson/os/seastore/cached_extent.cc b/src/crimson/os/seastore/cached_extent.cc index dc7b8e6165b85..84f5f89c99b25 100644 --- a/src/crimson/os/seastore/cached_extent.cc +++ b/src/crimson/os/seastore/cached_extent.cc @@ -2,6 +2,7 @@ // vim: ts=8 sw=2 smarttab #include "crimson/os/seastore/cached_extent.h" +#include "crimson/os/seastore/transaction.h" #include "crimson/common/log.h" diff --git a/src/crimson/os/seastore/cached_extent.h b/src/crimson/os/seastore/cached_extent.h index 7f76c26145302..f79eb3e644ee9 100644 --- a/src/crimson/os/seastore/cached_extent.h +++ b/src/crimson/os/seastore/cached_extent.h @@ -192,7 +192,7 @@ class CachedExtent * structure which defers updating the actual buffer until * on_delta_write(). 
*/ - virtual CachedExtentRef duplicate_for_write() = 0; + virtual CachedExtentRef duplicate_for_write(Transaction &t) = 0; /** * prepare_write @@ -846,7 +846,7 @@ class RetiredExtentPlaceholder : public CachedExtent { extent_len_t get_length() const final { return length; } - CachedExtentRef duplicate_for_write() final { + CachedExtentRef duplicate_for_write(Transaction&) final { ceph_assert(0 == "Should never happen for a placeholder"); return CachedExtentRef(); } diff --git a/src/crimson/os/seastore/collection_manager/collection_flat_node.h b/src/crimson/os/seastore/collection_manager/collection_flat_node.h index c35500ffd6795..1652eb92f227b 100644 --- a/src/crimson/os/seastore/collection_manager/collection_flat_node.h +++ b/src/crimson/os/seastore/collection_manager/collection_flat_node.h @@ -105,7 +105,7 @@ struct CollectionNode coll_map_t decoded; delta_buffer_t delta_buffer; - CachedExtentRef duplicate_for_write() final { + CachedExtentRef duplicate_for_write(Transaction&) final { assert(delta_buffer.empty()); return CachedExtentRef(new CollectionNode(*this)); } diff --git a/src/crimson/os/seastore/object_data_handler.h b/src/crimson/os/seastore/object_data_handler.h index 510dd8f1449dc..6fd73dc762a3a 100644 --- a/src/crimson/os/seastore/object_data_handler.h +++ b/src/crimson/os/seastore/object_data_handler.h @@ -24,7 +24,7 @@ struct ObjectDataBlock : crimson::os::seastore::LogicalCachedExtent { ObjectDataBlock(const ObjectDataBlock &other) : LogicalCachedExtent(other) {} - CachedExtentRef duplicate_for_write() final { + CachedExtentRef duplicate_for_write(Transaction&) final { return CachedExtentRef(new ObjectDataBlock(*this)); }; diff --git a/src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.h b/src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.h index 574f29bea9668..a2b51bbb0e135 100644 --- a/src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.h +++ b/src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.h 
@@ -46,7 +46,7 @@ struct OMapInnerNode bool extent_is_below_min() const { return below_min(); } uint32_t get_node_size() { return get_size(); } - CachedExtentRef duplicate_for_write() final { + CachedExtentRef duplicate_for_write(Transaction&) final { assert(delta_buffer.empty()); return CachedExtentRef(new OMapInnerNode(*this)); } @@ -164,7 +164,7 @@ struct OMapLeafNode bool extent_is_below_min() const { return below_min(); } uint32_t get_node_size() { return get_size(); } - CachedExtentRef duplicate_for_write() final { + CachedExtentRef duplicate_for_write(Transaction&) final { assert(delta_buffer.empty()); return CachedExtentRef(new OMapLeafNode(*this)); } diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/dummy.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/dummy.h index e18112ab6e6eb..24df8b548e933 100644 --- a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/dummy.h +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/dummy.h @@ -55,7 +55,7 @@ class DummyNodeExtent final: public NodeExtent { ceph_abort("impossible path"); } DeltaRecorder* get_recorder() const override { return nullptr; } - CachedExtentRef duplicate_for_write() override { + CachedExtentRef duplicate_for_write(Transaction&) override { ceph_abort("impossible path"); } extent_types_t get_type() const override { return extent_types_t::TEST_BLOCK; } diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.h index b648baacf00a4..f7cfa8c2112d6 100644 --- a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.h +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.h @@ -59,7 +59,7 @@ class SeastoreNodeExtent final: public NodeExtent { return recorder.get(); } - CachedExtentRef duplicate_for_write() override { + 
CachedExtentRef duplicate_for_write(Transaction&) override { return CachedExtentRef(new SeastoreNodeExtent(*this)); } ceph::bufferlist get_delta() override { diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/test_replay.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/test_replay.h index b63a362f845ed..bce74e3814058 100644 --- a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/test_replay.h +++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/test_replay.h @@ -47,7 +47,7 @@ class TestReplayExtent final: public NodeExtent { ceph_abort("impossible path"); } DeltaRecorder* get_recorder() const override { ceph_abort("impossible path"); } - CachedExtentRef duplicate_for_write() override { + CachedExtentRef duplicate_for_write(Transaction&) override { ceph_abort("impossible path"); } extent_types_t get_type() const override { return extent_types_t::TEST_BLOCK; } diff --git a/src/crimson/os/seastore/root_block.h b/src/crimson/os/seastore/root_block.h index b686e56a46497..435860ff36bb1 100644 --- a/src/crimson/os/seastore/root_block.h +++ b/src/crimson/os/seastore/root_block.h @@ -42,7 +42,7 @@ struct RootBlock : CachedExtent { RootBlock(const RootBlock &rhs) = default; - CachedExtentRef duplicate_for_write() final { + CachedExtentRef duplicate_for_write(Transaction&) final { return CachedExtentRef(new RootBlock(*this)); }; diff --git a/src/test/crimson/seastore/test_block.h b/src/test/crimson/seastore/test_block.h index 26588321d0979..ccdafb7843fec 100644 --- a/src/test/crimson/seastore/test_block.h +++ b/src/test/crimson/seastore/test_block.h @@ -54,7 +54,7 @@ struct TestBlock : crimson::os::seastore::LogicalCachedExtent { TestBlock(const TestBlock &other) : LogicalCachedExtent(other) {} - CachedExtentRef duplicate_for_write() final { + CachedExtentRef duplicate_for_write(Transaction&) final { return CachedExtentRef(new TestBlock(*this)); }; @@ -93,7 +93,7 @@ 
struct TestBlockPhysical : crimson::os::seastore::CachedExtent{ TestBlockPhysical(const TestBlockPhysical &other) : CachedExtent(other) {} - CachedExtentRef duplicate_for_write() final { + CachedExtentRef duplicate_for_write(Transaction&) final { return CachedExtentRef(new TestBlockPhysical(*this)); }; From c56af30a9c199b4a5d2cba9d2208b8c1d8a6abfb Mon Sep 17 00:00:00 2001 From: Xuehan Xu Date: Tue, 16 Aug 2022 16:48:28 +0800 Subject: [PATCH 03/21] crimson/os/seastore/cached_extent: add on_replace_prior interface Signed-off-by: Xuehan Xu --- src/crimson/os/seastore/cache.cc | 1 + src/crimson/os/seastore/cached_extent.h | 10 ++++++++++ 2 files changed, 11 insertions(+) diff --git a/src/crimson/os/seastore/cache.cc b/src/crimson/os/seastore/cache.cc index 8209e37499404..ebeb123cf5e43 100644 --- a/src/crimson/os/seastore/cache.cc +++ b/src/crimson/os/seastore/cache.cc @@ -802,6 +802,7 @@ void Cache::commit_replace_extent( add_to_dirty(next); } + next->on_replace_prior(t); invalidate_extent(t, *prev); } diff --git a/src/crimson/os/seastore/cached_extent.h b/src/crimson/os/seastore/cached_extent.h index f79eb3e644ee9..0245044955139 100644 --- a/src/crimson/os/seastore/cached_extent.h +++ b/src/crimson/os/seastore/cached_extent.h @@ -232,6 +232,16 @@ class CachedExtent */ virtual void on_delta_write(paddr_t record_block_offset) {} + /** + * on_replace_prior + * + * Called after the extent has replaced a previous one. State + * of the extent must be MUTATION_PENDING. Implementation + * may use this call to synchronize states that must be synchronized + * with the states of Cache and can't wait till transaction + * completes. 
+ */ + virtual void on_replace_prior(Transaction &t) {} /** * get_type * From e5aabe6c306061b2ce3aff5ca30a9049c1a7f9bf Mon Sep 17 00:00:00 2001 From: Xuehan Xu Date: Tue, 16 Aug 2022 17:05:52 +0800 Subject: [PATCH 04/21] crimson/os/seastore/cached_extent: add on_invalidated interface Signed-off-by: Xuehan Xu --- src/crimson/os/seastore/cache.cc | 4 ++-- src/crimson/os/seastore/cached_extent.cc | 8 ++++++++ src/crimson/os/seastore/cached_extent.h | 12 ++++++++++++ src/crimson/os/seastore/transaction.h | 8 ++++---- 4 files changed, 26 insertions(+), 6 deletions(-) diff --git a/src/crimson/os/seastore/cache.cc b/src/crimson/os/seastore/cache.cc index ebeb123cf5e43..f0c02b08b12a2 100644 --- a/src/crimson/os/seastore/cache.cc +++ b/src/crimson/os/seastore/cache.cc @@ -812,7 +812,7 @@ void Cache::invalidate_extent( { if (!extent.may_conflict()) { assert(extent.transactions.empty()); - extent.state = CachedExtent::extent_state_t::INVALID; + extent.set_invalid(t); return; } @@ -829,7 +829,7 @@ void Cache::invalidate_extent( mark_transaction_conflicted(*i.t, extent); } } - extent.state = CachedExtent::extent_state_t::INVALID; + extent.set_invalid(t); } void Cache::mark_transaction_conflicted( diff --git a/src/crimson/os/seastore/cached_extent.cc b/src/crimson/os/seastore/cached_extent.cc index 84f5f89c99b25..c84475344517d 100644 --- a/src/crimson/os/seastore/cached_extent.cc +++ b/src/crimson/os/seastore/cached_extent.cc @@ -90,6 +90,14 @@ std::ostream &LogicalCachedExtent::print_detail(std::ostream &out) const return print_detail_l(out); } +void CachedExtent::set_invalid(Transaction &t) { + state = extent_state_t::INVALID; + if (trans_view_hook.is_linked()) { + trans_view_hook.unlink(); + } + on_invalidated(t); +} + std::ostream &operator<<(std::ostream &out, const LBAPin &rhs) { return out << "LBAPin(" << rhs.get_key() << "~" << rhs.get_length() diff --git a/src/crimson/os/seastore/cached_extent.h b/src/crimson/os/seastore/cached_extent.h index 
0245044955139..4603d5a2cd8c3 100644 --- a/src/crimson/os/seastore/cached_extent.h +++ b/src/crimson/os/seastore/cached_extent.h @@ -242,6 +242,16 @@ class CachedExtent * completes. */ virtual void on_replace_prior(Transaction &t) {} + + /** + * on_invalidated + * + * Called after the extent is invalidated, either by Cache::invalidate_extent + * or Transaction::add_to_retired_set. Implementation may use this + * call to adjust states that must be changed immediately once + * invalidated. + */ + virtual void on_invalidated(Transaction &t) {} /** * get_type * @@ -489,6 +499,8 @@ class CachedExtent return ret; } + void set_invalid(Transaction &t); + private: template friend class read_set_item_t; diff --git a/src/crimson/os/seastore/transaction.h b/src/crimson/os/seastore/transaction.h index e7eef11427a81..14ba61cee07df 100644 --- a/src/crimson/os/seastore/transaction.h +++ b/src/crimson/os/seastore/transaction.h @@ -117,13 +117,13 @@ class Transaction { if (ref->is_exist_clean() || ref->is_exist_mutation_pending()) { existing_block_stats.dec(ref); - ref->state = CachedExtent::extent_state_t::INVALID; + ref->set_invalid(*this); write_set.erase(*ref); } else if (ref->is_initial_pending()) { - ref->state = CachedExtent::extent_state_t::INVALID; + ref->set_invalid(*this); write_set.erase(*ref); } else if (ref->is_mutation_pending()) { - ref->state = CachedExtent::extent_state_t::INVALID; + ref->set_invalid(*this); write_set.erase(*ref); assert(ref->prior_instance); retired_set.insert(ref->prior_instance); @@ -348,7 +348,7 @@ class Transaction { void invalidate_clear_write_set() { for (auto &&i: write_set) { - i.state = CachedExtent::extent_state_t::INVALID; + i.set_invalid(*this); } write_set.clear(); } From 71051f997fca8ff0fdcd55586e3150bffa2a916a Mon Sep 17 00:00:00 2001 From: Xuehan Xu Date: Tue, 11 Oct 2022 10:34:16 +0800 Subject: [PATCH 05/21] crimson/os/seastore/btree: introduce parent<->child pointers for fixed-kv-btree nodes maintain correct parent<->child 
pointers when modifying the btree Signed-off-by: Xuehan Xu --- .../os/seastore/backref/backref_tree_node.h | 1 + .../os/seastore/btree/btree_range_pin.h | 8 + .../os/seastore/btree/fixed_kv_btree.h | 43 +- src/crimson/os/seastore/btree/fixed_kv_node.h | 652 +++++++++++++++++- src/crimson/os/seastore/cache.cc | 2 + src/crimson/os/seastore/cached_extent.cc | 11 + src/crimson/os/seastore/cached_extent.h | 12 +- .../lba_manager/btree/lba_btree_node.cc | 9 +- .../lba_manager/btree/lba_btree_node.h | 1 + 9 files changed, 706 insertions(+), 33 deletions(-) diff --git a/src/crimson/os/seastore/backref/backref_tree_node.h b/src/crimson/os/seastore/backref/backref_tree_node.h index 5a40675de98b5..bebbc0aa4b2f5 100644 --- a/src/crimson/os/seastore/backref/backref_tree_node.h +++ b/src/crimson/os/seastore/backref/backref_tree_node.h @@ -130,4 +130,5 @@ using BackrefLeafNodeRef = BackrefLeafNode::Ref; template <> struct fmt::formatter : fmt::ostream_formatter {}; template <> struct fmt::formatter : fmt::ostream_formatter {}; template <> struct fmt::formatter : fmt::ostream_formatter {}; +template <> struct fmt::formatter : fmt::ostream_formatter {}; #endif diff --git a/src/crimson/os/seastore/btree/btree_range_pin.h b/src/crimson/os/seastore/btree/btree_range_pin.h index da1a67ba5d359..5942e85f3175f 100644 --- a/src/crimson/os/seastore/btree/btree_range_pin.h +++ b/src/crimson/os/seastore/btree/btree_range_pin.h @@ -41,6 +41,10 @@ struct fixed_kv_node_meta_t { (end > other.begin); } + bool is_in_range(const bound_t key) const { + return begin <= key && end > key; + } + std::pair split_into(bound_t pivot) const { return std::make_pair( fixed_kv_node_meta_t{begin, pivot, depth}, @@ -116,9 +120,13 @@ struct fixed_kv_node_meta_le_t { template class btree_pin_set_t; +template +class FixedKVNode; + template class btree_range_pin_t : public boost::intrusive::set_base_hook<> { friend class btree_pin_set_t; + friend class FixedKVNode; fixed_kv_node_meta_t range; btree_pin_set_t *pins = 
nullptr; diff --git a/src/crimson/os/seastore/btree/fixed_kv_btree.h b/src/crimson/os/seastore/btree/fixed_kv_btree.h index e2e94f0454c44..4a5c22d6de02a 100644 --- a/src/crimson/os/seastore/btree/fixed_kv_btree.h +++ b/src/crimson/os/seastore/btree/fixed_kv_btree.h @@ -823,6 +823,21 @@ class FixedKVBtree { n_fixed_kv_extent->get_bptr().c_str()); n_fixed_kv_extent->set_modify_time(fixed_kv_extent.get_modify_time()); n_fixed_kv_extent->pin.set_range(n_fixed_kv_extent->get_node_meta()); + + if (fixed_kv_extent.get_type() == internal_node_t::TYPE) { + if (!fixed_kv_extent.is_pending()) { + n_fixed_kv_extent->copy_sources.emplace(&fixed_kv_extent); + n_fixed_kv_extent->prior_instance = &fixed_kv_extent; + } else { + ceph_assert(fixed_kv_extent.is_mutation_pending()); + n_fixed_kv_extent->copy_sources.emplace( + (typename internal_node_t::base_t* + )fixed_kv_extent.get_prior_instance().get()); + n_fixed_kv_extent->children = std::move(fixed_kv_extent.children); + n_fixed_kv_extent->prior_instance = fixed_kv_extent.get_prior_instance(); + n_fixed_kv_extent->adjust_ptracker_for_children(); + } + } /* This is a bit underhanded. Any relative addrs here must necessarily * be record relative as we are rewriting a dirty extent. 
Thus, we @@ -853,7 +868,8 @@ class FixedKVBtree { n_fixed_kv_extent->get_node_meta().depth, n_fixed_kv_extent->get_node_meta().begin, e->get_paddr(), - n_fixed_kv_extent->get_paddr() + n_fixed_kv_extent->get_paddr(), + n_fixed_kv_extent ).si_then([c, e] { c.cache.retire_extent(c.trans, e); }); @@ -877,17 +893,19 @@ class FixedKVBtree { depth_t depth, node_key_t laddr, paddr_t old_addr, - paddr_t new_addr) + paddr_t new_addr, + typename internal_node_t::base_ref nextent) { LOG_PREFIX(FixedKVBtree::update_internal_mapping); SUBTRACET( seastore_fixedkv_tree, - "updating laddr {} at depth {} from {} to {}", + "updating laddr {} at depth {} from {} to {}, nextent {}", c.trans, laddr, depth, old_addr, - new_addr); + new_addr, + *nextent); return lower_bound( c, laddr @@ -970,7 +988,7 @@ class FixedKVBtree { parent.node ); typename internal_node_t::Ref mparent = mut->cast(); - mparent->update(piter, new_addr); + mparent->update(piter, new_addr, nextent.get()); /* Note, iter is now invalid as we didn't udpate either the parent * node reference to the new mutable instance nor did we update the @@ -1439,11 +1457,13 @@ class FixedKVBtree { parent_node->update( parent_iter, - left->get_paddr()); + left->get_paddr(), + left.get()); parent_node->insert( parent_iter + 1, pivot, - right->get_paddr()); + right->get_paddr(), + right.get()); SUBTRACET( seastore_fixedkv_tree, @@ -1669,7 +1689,8 @@ class FixedKVBtree { parent_pos.node->update( liter, - replacement->get_paddr()); + replacement->get_paddr(), + replacement.get()); parent_pos.node->remove(riter); pos.node = replacement; @@ -1692,11 +1713,13 @@ class FixedKVBtree { parent_pos.node->update( liter, - replacement_l->get_paddr()); + replacement_l->get_paddr(), + replacement_l.get()); parent_pos.node->replace( riter, pivot, - replacement_r->get_paddr()); + replacement_r->get_paddr(), + replacement_r.get()); if (donor_is_left) { assert(parent_pos.pos > 0); diff --git a/src/crimson/os/seastore/btree/fixed_kv_node.h 
b/src/crimson/os/seastore/btree/fixed_kv_node.h index 75628f6fcbff7..101ad4945d3f1 100644 --- a/src/crimson/os/seastore/btree/fixed_kv_node.h +++ b/src/crimson/os/seastore/btree/fixed_kv_node.h @@ -29,27 +29,435 @@ namespace crimson::os::seastore { template struct FixedKVNode : CachedExtent { using FixedKVNodeRef = TCachedExtentRef; + struct parent_tracker_t + : public boost::intrusive_ref_counter< + parent_tracker_t, boost::thread_unsafe_counter> { + parent_tracker_t(FixedKVNodeRef parent) + : parent(parent) {} + parent_tracker_t(FixedKVNode* parent) + : parent(parent) {} + FixedKVNodeRef parent = nullptr; + ~parent_tracker_t() { + // this is parent's tracker, reset it + if (parent->my_tracker == this) { + parent->my_tracker = nullptr; + } + } + }; + using parent_tracker_ref = boost::intrusive_ptr; btree_range_pin_t pin; - FixedKVNode(ceph::bufferptr &&ptr) : CachedExtent(std::move(ptr)), pin(this) {} + struct copy_source_cmp_t { + using is_transparent = node_key_t; + bool operator()(const FixedKVNodeRef &l, const FixedKVNodeRef &r) const { + assert(l->pin.range.end <= r->pin.range.begin + || r->pin.range.end <= l->pin.range.begin + || (l->pin.range.begin == r->pin.range.begin + && l->pin.range.end == r->pin.range.end)); + return l->pin.range.begin < r->pin.range.begin; + } + bool operator()(const node_key_t &l, const FixedKVNodeRef &r) const { + return l < r->pin.range.begin; + } + bool operator()(const FixedKVNodeRef &l, const node_key_t &r) const { + return l->pin.range.begin < r; + } + }; + + /* + * + * Nodes of fixed-kv-btree connect to their child nodes by pointers following + * invariants below: + * + * 1. if nodes are stable: + * a. parent points at the node's stable parent + * b. prior_instance is empty + * c. child pointers point at stable children. Child resolution is done + * directly via this array. + * d. copy_sources is empty + * 2. if nodes are mutation_pending: + * a. parent is empty and needs to be fixed upon commit + * b. 
prior_instance points to its stable version + * c. child pointers are null except for initial_pending() children of + * this transaction. Child resolution is done by first checking this + * array, and then recursively resolving via the parent. We copy child + * pointers from parent on commit. + * d. copy_sources is empty + * 3. if nodes are initial_pending + * a. parent points at its pending parent on this transaction (must exist) + * b. prior_instance is empty or, if it's the result of rewrite, points to + * its stable predecessor + * c. child pointers are null except for initial_pending() children of + * this transaction (live due to 3a above). Child resolution is done + * by first checking this array, and then recursively resolving via + * the correct copy_sources entry. We copy child pointers from copy_sources + * on commit. + * d. copy_sources contains the set of stable nodes at the same tree-level (only + * its "prior_instance" if the node is the result of a rewrite), with which + * the lba range of this node overlaps. 
+ */ + std::vector children; + std::set copy_sources; + uint16_t capacity = 0; + parent_tracker_t* my_tracker = nullptr; + parent_tracker_ref parent_tracker; + + FixedKVNode(uint16_t capacity, ceph::bufferptr &&ptr) + : CachedExtent(std::move(ptr)), + pin(this), + children(capacity, nullptr), + capacity(capacity) {} FixedKVNode(const FixedKVNode &rhs) - : CachedExtent(rhs), pin(rhs.pin, this) {} + : CachedExtent(rhs), + pin(rhs.pin, this), + children(rhs.capacity, nullptr), + capacity(rhs.capacity) {} virtual fixed_kv_node_meta_t get_node_meta() const = 0; + virtual uint16_t get_node_size() const = 0; virtual ~FixedKVNode() = default; + virtual node_key_t get_key_from_idx(uint16_t idx) const = 0; + + template + void update_child_ptr(iter_t iter, ChildableCachedExtent* child) { + children[iter.get_offset()] = child; + set_child_ptracker(child); + } + + template + void insert_child_ptr(iter_t iter, ChildableCachedExtent* child) { + auto raw_children = children.data(); + auto offset = iter.get_offset(); + std::memmove( + &raw_children[offset + 1], + &raw_children[offset], + (get_node_size() - offset) * sizeof(ChildableCachedExtent*)); + children[offset] = child; + set_child_ptracker(child); + } + + template + void remove_child_ptr(iter_t iter) { + LOG_PREFIX(FixedKVNode::remove_child_ptr); + auto raw_children = children.data(); + auto offset = iter.get_offset(); + SUBTRACE(seastore_fixedkv_tree, "trans.{}, pos {}, total size {}, extent {}", + this->pending_for_transaction, + offset, + get_node_size(), + (void*)raw_children[offset]); + // parent tracker of the child being removed will be + // reset when the child is invalidated, so no need to + // reset it here + std::memmove( + &raw_children[offset], + &raw_children[offset + 1], + (get_node_size() - offset - 1) * sizeof(ChildableCachedExtent*)); + } + + FixedKVNode& get_stable_for_key(node_key_t key) { + ceph_assert(is_pending()); + if (is_mutation_pending()) { + return (FixedKVNode&)*get_prior_instance(); + } else { 
+ ceph_assert(!copy_sources.empty()); + auto it = copy_sources.upper_bound(key); + it--; + auto ©_source = *it; + ceph_assert(copy_source->get_node_meta().is_in_range(key)); + return *copy_source; + } + } + + static void push_copy_sources( + FixedKVNode &dest, + FixedKVNode &src) + { + ceph_assert(dest.is_initial_pending()); + if (!src.is_pending()) { + dest.copy_sources.emplace(&src); + } else if (src.is_mutation_pending()) { + dest.copy_sources.emplace( + src.get_prior_instance()->template cast()); + } else { + ceph_assert(src.is_initial_pending()); + dest.copy_sources.insert( + src.copy_sources.begin(), + src.copy_sources.end()); + } + } + + virtual uint16_t get_node_split_pivot() = 0; + + static void move_child_ptrs( + FixedKVNode &dest, + FixedKVNode &src, + size_t dest_start, + size_t src_start, + size_t src_end) + { + std::memmove( + dest.children.data() + dest_start, + src.children.data() + src_start, + (src_end - src_start) * sizeof(ChildableCachedExtent*)); + + ceph_assert(src_start < src_end); + ceph_assert(src.children.size() >= src_end); + for (auto it = src.children.begin() + src_start; + it != src.children.begin() + src_end; + it++) + { + auto child = *it; + if (is_valid_child_ptr(child)) { + dest.set_child_ptracker(child); + } + } + } + + void split_child_ptrs( + FixedKVNode &left, + FixedKVNode &right) + { + assert(!left.my_tracker); + assert(!right.my_tracker); + push_copy_sources(left, *this); + push_copy_sources(right, *this); + if (is_pending()) { + uint16_t pivot = get_node_split_pivot(); + move_child_ptrs(left, *this, 0, 0, pivot); + move_child_ptrs(right, *this, 0, pivot, get_node_size()); + my_tracker = nullptr; + } + } + + void merge_child_ptrs( + FixedKVNode &left, + FixedKVNode &right) + { + ceph_assert(!my_tracker); + push_copy_sources(*this, left); + push_copy_sources(*this, right); + + if (left.is_pending()) { + move_child_ptrs(*this, left, 0, 0, left.get_node_size()); + left.my_tracker = nullptr; + } + + if (right.is_pending()) { + 
move_child_ptrs(*this, right, left.get_node_size(), 0, right.get_node_size()); + right.my_tracker = nullptr; + } + } + + static void balance_child_ptrs( + FixedKVNode &left, + FixedKVNode &right, + bool prefer_left, + FixedKVNode &replacement_left, + FixedKVNode &replacement_right) + { + size_t l_size = left.get_node_size(); + size_t r_size = right.get_node_size(); + size_t total = l_size + r_size; + size_t pivot_idx = (l_size + r_size) / 2; + if (total % 2 && prefer_left) { + pivot_idx++; + } + + assert(!replacement_left.my_tracker); + assert(!replacement_right.my_tracker); + if (pivot_idx < l_size) { + // deal with left + push_copy_sources(replacement_left, left); + push_copy_sources(replacement_right, left); + if (left.is_pending()) { + move_child_ptrs(replacement_left, left, 0, 0, pivot_idx); + move_child_ptrs(replacement_right, left, 0, pivot_idx, l_size); + left.my_tracker = nullptr; + } + + // deal with right + push_copy_sources(replacement_right, right); + if (right.is_pending()) { + move_child_ptrs(replacement_right, right, l_size - pivot_idx, 0, r_size); + right.my_tracker= nullptr; + } + } else { + // deal with left + push_copy_sources(replacement_left, left); + if (left.is_pending()) { + move_child_ptrs(replacement_left, left, 0, 0, l_size); + left.my_tracker = nullptr; + } + + // deal with right + push_copy_sources(replacement_left, right); + push_copy_sources(replacement_right, right); + if (right.is_pending()) { + move_child_ptrs(replacement_left, right, l_size, 0, pivot_idx - l_size); + move_child_ptrs(replacement_right, right, 0, pivot_idx - l_size, r_size); + right.my_tracker= nullptr; + } + } + } + + void set_parent_tracker_from_prior_instance() { + if (pin.is_root()) { + return; + } + assert(is_mutation_pending()); + auto &prior = (FixedKVNode&)(*get_prior_instance()); + parent_tracker = prior.parent_tracker; + auto &parent = parent_tracker->parent; + assert(parent); + assert(parent->is_valid()); + //TODO: can this search be avoided? 
+ auto off = parent->lower_bound_offset(get_node_meta().begin); + assert(parent->get_key_from_idx(off) == get_node_meta().begin); + parent->children[off] = this; + } + + bool is_children_empty() const { + for (auto it = children.begin(); + it != children.begin() + get_node_size(); + it++) { + if (is_valid_child_ptr(*it) + && (*it)->is_valid()) { + return false; + } + } + return true; + } + + void set_children_from_prior_instance() { + assert(get_prior_instance()); + auto &prior = (FixedKVNode&)(*get_prior_instance()); + assert(prior.my_tracker || prior.is_children_empty()); + + if (prior.my_tracker) { + prior.my_tracker->parent.reset(this); + my_tracker = prior.my_tracker; + // All my initial pending children is pointing to the original + // tracker which has been dropped by the above line, so need + // to adjust them to point to the new tracker + adjust_ptracker_for_children(); + } + assert(my_tracker || is_children_empty()); + } + + void adjust_ptracker_for_children() { + auto begin = children.begin(); + auto end = begin + get_node_size(); + ceph_assert(end <= children.end()); + for (auto it = begin; it != end; it++) { + auto child = *it; + if (child) { + set_child_ptracker((FixedKVNode*)child); + } + } + } void on_delta_write(paddr_t record_block_offset) final { // All in-memory relative addrs are necessarily record-relative assert(get_prior_instance()); + assert(pending_for_transaction); pin.take_pin(get_prior_instance()->template cast()->pin); resolve_relative_addrs(record_block_offset); } + virtual uint16_t lower_bound_offset(node_key_t) const = 0; + virtual uint16_t upper_bound_offset(node_key_t) const = 0; + virtual uint16_t child_pos_for_key(node_key_t) const = 0; + + virtual bool validate_stable_children() = 0; + + template + uint16_t copy_children_from_stable_source( + FixedKVNode &source, + iter_t foreign_start_it, + iter_t foreign_end_it, + iter_t local_start_it) { + auto foreign_it = foreign_start_it, local_it = local_start_it; + while (foreign_it != 
foreign_end_it + && local_it.get_offset() < get_node_size()) + { + auto &child = children[local_it.get_offset()]; + if (foreign_it.get_key() == local_it.get_key()) { + // the foreign key is preserved + if (!child) { + child = source.children[foreign_it.get_offset()]; + } + foreign_it++; + local_it++; + } else if (foreign_it.get_key() < local_it.get_key()) { + // the foreign key has been removed, because, if it hasn't, + // there must have been a local key before the one pointed + // by the current "local_it" that's equal to this foreign key + // and has pushed the foreign_it forward. + foreign_it++; + } else { + // the local key must be a newly inserted one. + local_it++; + } + } + return local_it.get_offset(); + } + + template + void copy_children_from_stable_sources(Func &&get_iter) { + if (!copy_sources.empty()) { + auto it = --copy_sources.upper_bound(get_node_meta().begin); + auto &cs = *it; + uint16_t start_pos = cs->lower_bound_offset( + get_node_meta().begin); + if (start_pos == cs->get_node_size()) { + it++; + start_pos = 0; + } + uint16_t local_next_pos = 0; + for (; it != copy_sources.end(); it++) { + auto& copy_source = *it; + auto end_pos = copy_source->get_node_size(); + if (copy_source->get_node_meta().is_in_range(get_node_meta().end)) { + end_pos = copy_source->upper_bound_offset(get_node_meta().end); + } + auto local_start_iter = get_iter(*this, local_next_pos); + auto foreign_start_iter = get_iter(*copy_source, start_pos); + auto foreign_end_iter = get_iter(*copy_source, end_pos); + local_next_pos = copy_children_from_stable_source( + *copy_source, foreign_start_iter, foreign_end_iter, local_start_iter); + if (end_pos != copy_source->get_node_size()) { + break; + } + start_pos = 0; + } + } + } + + void on_invalidated(Transaction &t) final { + parent_tracker.reset(); + } + + bool is_rewrite() { + return is_initial_pending() && get_prior_instance(); + } + void on_initial_write() final { // All in-memory relative addrs are necessarily block-relative 
resolve_relative_addrs(get_paddr()); + ceph_assert( + parent_tracker + ? (parent_tracker->parent && parent_tracker->parent->is_valid()) + : true); + } + + void set_child_ptracker(FixedKVNode *child) { + if (!my_tracker) { + my_tracker = new parent_tracker_t(this); + } + child->parent_tracker.reset(my_tracker); } void on_clean_read() final { @@ -81,6 +489,8 @@ struct FixedKVInternalNode NODE_KEY, NODE_KEY_LE, paddr_t, paddr_le_t> { using Ref = TCachedExtentRef; + using base_t = FixedKVNode; + using base_ref = typename FixedKVNode::FixedKVNodeRef; using node_layout_t = common::FixedKVNodeLayout< CAPACITY, @@ -92,17 +502,101 @@ struct FixedKVInternalNode paddr_le_t>; using internal_const_iterator_t = typename node_layout_t::const_iterator; using internal_iterator_t = typename node_layout_t::iterator; - template - FixedKVInternalNode(T&&... t) : - FixedKVNode(std::forward(t)...), - node_layout_t(this->get_bptr().c_str()) {} + using this_type_t = FixedKVInternalNode< + CAPACITY, + NODE_KEY, + NODE_KEY_LE, + node_size, + node_type_t>; - virtual ~FixedKVInternalNode() {} + FixedKVInternalNode(ceph::bufferptr &&ptr) + : FixedKVNode(CAPACITY, std::move(ptr)), + node_layout_t(this->get_bptr().c_str()) {} + FixedKVInternalNode(const FixedKVInternalNode &rhs) + : FixedKVNode(rhs), + node_layout_t(this->get_bptr().c_str()) {} + + uint16_t get_node_split_pivot() final { + return this->get_split_pivot().get_offset(); + } + + void prepare_write() final { + if (this->is_initial_pending()) { + if (this->is_rewrite()) { + this->set_children_from_prior_instance(); + } + this->copy_children_from_stable_sources( + [this](base_t &node, uint16_t pos) { + ceph_assert(node.get_type() == this->get_type()); + auto &n = static_cast(node); + return n.iter_idx(pos); + } + ); + if (this->is_rewrite()) { + this->reset_prior_instance(); + } else { + this->adjust_ptracker_for_children(); + } + assert(this->validate_stable_children()); + this->copy_sources.clear(); + } + } + + bool 
validate_stable_children() final { + LOG_PREFIX(FixedKVInternalNode::validate_stable_children); + if (this->children.empty()) { + return false; + } + + for (auto i : *this) { + auto child = (FixedKVNode*)this->children[i.get_offset()]; + if (child && child->range.begin != i.get_key()) { + SUBERROR(seastore_fixedkv_tree, + "stable child not valid: child {}, child meta{}, key {}", + *child, + child->get_node_meta(), + i.get_key()); + ceph_abort(); + return false; + } + } + return true; + } + + virtual ~FixedKVInternalNode() { + if (!this->pin.is_root() + && this->is_valid() + && !this->is_pending()) { + ceph_assert(this->parent_tracker); + auto &parent = this->parent_tracker->parent; + ceph_assert(parent); + auto off = parent->lower_bound_offset(this->get_meta().begin); + assert(parent->get_key_from_idx(off) == get_node_meta().begin); + assert(parent->children[off] == this); + parent->children[off] = nullptr; + } + } + + uint16_t lower_bound_offset(NODE_KEY key) const final { + return this->lower_bound(key).get_offset(); + } + + uint16_t upper_bound_offset(NODE_KEY key) const final { + return this->upper_bound(key).get_offset(); + } + + NODE_KEY get_key_from_idx(uint16_t idx) const final { + return this->iter_idx(idx).get_key(); + } fixed_kv_node_meta_t get_node_meta() const { return this->get_meta(); } + uint16_t get_node_size() const final { + return this->get_size(); + } + typename node_layout_t::delta_buffer_t delta_buffer; typename node_layout_t::delta_buffer_t *maybe_get_delta_buffer() { return this->is_mutation_pending() @@ -114,9 +608,30 @@ struct FixedKVInternalNode return CachedExtentRef(new node_type_t(*this)); }; + void on_replace_prior(Transaction&) final { + ceph_assert(!this->is_rewrite()); + this->set_children_from_prior_instance(); + auto &prior = (this_type_t&)(*this->get_prior_instance()); + auto copied = this->copy_children_from_stable_source( + prior, + prior.begin(), + prior.end(), + this->begin()); + ceph_assert(copied <= get_node_size()); + 
assert(this->validate_stable_children()); + this->set_parent_tracker_from_prior_instance(); + } + void update( internal_const_iterator_t iter, - paddr_t addr) { + paddr_t addr, + FixedKVNode* nextent) { + LOG_PREFIX(FixedKVInternalNode::update); + SUBTRACE(seastore_fixedkv_tree, "trans.{}, pos {}, {}", + this->pending_for_transaction, + iter.get_offset(), + *nextent); + this->update_child_ptr(iter, nextent); return this->journal_update( iter, this->maybe_generate_relative(addr), @@ -126,7 +641,15 @@ struct FixedKVInternalNode void insert( internal_const_iterator_t iter, NODE_KEY pivot, - paddr_t addr) { + paddr_t addr, + FixedKVNode* nextent) { + LOG_PREFIX(FixedKVInternalNode::insert); + SUBTRACE(seastore_fixedkv_tree, "trans.{}, pos {}, key {}, {}", + this->pending_for_transaction, + iter.get_offset(), + pivot, + *nextent); + this->insert_child_ptr(iter, nextent); return this->journal_insert( iter, pivot, @@ -135,6 +658,12 @@ struct FixedKVInternalNode } void remove(internal_const_iterator_t iter) { + LOG_PREFIX(FixedKVInternalNode::remove); + SUBTRACE(seastore_fixedkv_tree, "trans.{}, pos {}, key {}", + this->pending_for_transaction, + iter.get_offset(), + iter.get_key()); + this->remove_child_ptr(iter); return this->journal_remove( iter, maybe_get_delta_buffer()); @@ -143,7 +672,16 @@ struct FixedKVInternalNode void replace( internal_const_iterator_t iter, NODE_KEY pivot, - paddr_t addr) { + paddr_t addr, + FixedKVNode* nextent) { + LOG_PREFIX(FixedKVInternalNode::replace); + SUBTRACE(seastore_fixedkv_tree, "trans.{}, pos {}, old key {}, key {}, {}", + this->pending_for_transaction, + iter.get_offset(), + iter.get_key(), + pivot, + *nextent); + this->update_child_ptr(iter, nextent); return this->journal_replace( iter, pivot, @@ -157,6 +695,7 @@ struct FixedKVInternalNode c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION); auto right = c.cache.template alloc_new_extent( c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION); + 
this->split_child_ptrs(*left, *right); auto pivot = this->split_into(*left, *right); left->pin.set_range(left->get_meta()); right->pin.set_range(right->get_meta()); @@ -171,6 +710,7 @@ struct FixedKVInternalNode Ref &right) { auto replacement = c.cache.template alloc_new_extent( c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION); + replacement->merge_child_ptrs(*this, *right); replacement->merge_from(*this, *right->template cast()); replacement->pin.set_range(replacement->get_meta()); return replacement; @@ -194,6 +734,12 @@ struct FixedKVInternalNode prefer_left, *replacement_left, *replacement_right); + this->balance_child_ptrs( + *this, + right, + prefer_left, + *replacement_left, + *replacement_right); replacement_left->pin.set_range(replacement_left->get_meta()); replacement_right->pin.set_range(replacement_right->get_meta()); @@ -251,8 +797,17 @@ struct FixedKVInternalNode std::ostream &print_detail(std::ostream &out) const { - return out << ", size=" << this->get_size() - << ", meta=" << this->get_meta(); + out << ", size=" << this->get_size() + << ", meta=" << this->get_meta() + << ", parent_tracker=" << (void*)this->parent_tracker.get(); + if (this->parent_tracker) { + out << ", parent=" << (void*)this->parent_tracker->parent.get(); + } + out << ", my_tracker=" << (void*)this->my_tracker; + if (this->my_tracker) { + out << ", my_tracker->parent=" << (void*)this->my_tracker->parent.get(); + } + return out; } ceph::bufferlist get_delta() { @@ -322,17 +877,60 @@ struct FixedKVLeafNode VAL, VAL_LE>; using internal_const_iterator_t = typename node_layout_t::const_iterator; - template - FixedKVLeafNode(T&&... 
t) : - FixedKVNode(std::forward(t)...), - node_layout_t(this->get_bptr().c_str()) {} + FixedKVLeafNode(ceph::bufferptr &&ptr) + : FixedKVNode(0, std::move(ptr)), + node_layout_t(this->get_bptr().c_str()) {} + FixedKVLeafNode(const FixedKVLeafNode &rhs) + : FixedKVNode(rhs), + node_layout_t(this->get_bptr().c_str()) {} - virtual ~FixedKVLeafNode() {} + uint16_t get_node_split_pivot() final { + return this->get_split_pivot().get_offset(); + } + + bool validate_stable_children() final { + return true; + } + + virtual ~FixedKVLeafNode() { + if (!this->pin.is_root() + && this->is_valid() + && !this->is_pending()) { + ceph_assert(this->parent_tracker); + auto &parent = this->parent_tracker->parent; + ceph_assert(parent); + auto off = parent->lower_bound_offset(this->get_meta().begin); + assert(parent->get_key_from_idx(off) == get_node_meta().begin); + assert(parent->children[off] == this); + parent->children[off] = nullptr; + } + } + + void on_replace_prior(Transaction &t) final { + this->set_parent_tracker(); + assert(this->mutate_state.empty()); + } + + uint16_t lower_bound_offset(NODE_KEY key) const final { + return this->lower_bound(key).get_offset(); + } + + uint16_t upper_bound_offset(NODE_KEY key) const final { + return this->upper_bound(key).get_offset(); + } + + NODE_KEY get_key_from_idx(uint16_t idx) const final { + return this->iter_idx(idx).get_key(); + } fixed_kv_node_meta_t get_node_meta() const { return this->get_meta(); } + uint16_t get_node_size() const final { + return this->get_size(); + } + typename node_layout_t::delta_buffer_t delta_buffer; virtual typename node_layout_t::delta_buffer_t *maybe_get_delta_buffer() { return this->is_mutation_pending() ? 
&delta_buffer : nullptr; @@ -426,8 +1024,13 @@ struct FixedKVLeafNode std::ostream &print_detail(std::ostream &out) const { - return out << ", size=" << this->get_size() - << ", meta=" << this->get_meta(); + out << ", size=" << this->get_size() + << ", meta=" << this->get_meta() + << ", parent_tracker=" << (void*)this->parent_tracker.get(); + if (this->parent_tracker) { + out << ", parent=" << (void*)this->parent_tracker->parent.get(); + } + return out; } constexpr static size_t get_min_capacity() { @@ -451,3 +1054,14 @@ struct FixedKVLeafNode }; } // namespace crimson::os::seastore + +#if FMT_VERSION >= 90000 +template <> +struct fmt::formatter< + crimson::os::seastore::FixedKVNode< + crimson::os::seastore::laddr_t>> : fmt::ostream_formatter {}; +template <> +struct fmt::formatter< + crimson::os::seastore::FixedKVNode< + crimson::os::seastore::paddr_t>> : fmt::ostream_formatter {}; +#endif diff --git a/src/crimson/os/seastore/cache.cc b/src/crimson/os/seastore/cache.cc index f0c02b08b12a2..c41d29b941c6f 100644 --- a/src/crimson/os/seastore/cache.cc +++ b/src/crimson/os/seastore/cache.cc @@ -1018,6 +1018,8 @@ CachedExtentRef Cache::duplicate_for_write( ret->prior_instance = i; // duplicate_for_write won't occur after ool write finished assert(!i->prior_poffset); + auto [iter, inserted] = i->mutation_pendings.insert(*ret); + ceph_assert(inserted); t.add_mutated_extent(ret); if (ret->get_type() == extent_types_t::ROOT) { t.root = ret->cast(); diff --git a/src/crimson/os/seastore/cached_extent.cc b/src/crimson/os/seastore/cached_extent.cc index c84475344517d..f1ea69d633260 100644 --- a/src/crimson/os/seastore/cached_extent.cc +++ b/src/crimson/os/seastore/cached_extent.cc @@ -79,6 +79,17 @@ CachedExtent::~CachedExtent() } } +CachedExtent* CachedExtent::get_transactional_view(Transaction &t) { + auto it = mutation_pendings.find( + t.get_trans_id(), + trans_spec_view_t::cmp_t()); + if (it != mutation_pendings.end()) { + return (CachedExtent*)&(*it); + } else { + return 
this; + } +} + std::ostream &LogicalCachedExtent::print_detail(std::ostream &out) const { out << ", laddr=" << laddr; diff --git a/src/crimson/os/seastore/cached_extent.h b/src/crimson/os/seastore/cached_extent.h index 4603d5a2cd8c3..ea1a749b7f23a 100644 --- a/src/crimson/os/seastore/cached_extent.h +++ b/src/crimson/os/seastore/cached_extent.h @@ -501,6 +501,10 @@ class CachedExtent void set_invalid(Transaction &t); + CachedExtentRef get_prior_instance() { + return prior_instance; + } + private: template friend class read_set_item_t; @@ -585,6 +589,8 @@ class CachedExtent rewrite_gen_t rewrite_generation = NULL_GENERATION; protected: + trans_view_set_t mutation_pendings; + CachedExtent(CachedExtent &&other) = delete; CachedExtent(ceph::bufferptr &&ptr) : ptr(std::move(ptr)) {} CachedExtent(const CachedExtent &other) @@ -605,6 +611,8 @@ class CachedExtent struct retired_placeholder_t{}; CachedExtent(retired_placeholder_t) : state(extent_state_t::INVALID) {} + CachedExtent& get_transactional_view(Transaction &t); + friend class Cache; template static TCachedExtentRef make_cached_extent_ref( @@ -612,8 +620,8 @@ class CachedExtent return new T(std::forward(args)...); } - CachedExtentRef get_prior_instance() { - return prior_instance; + void reset_prior_instance() { + prior_instance.reset(); } /// Sets last_committed_crc diff --git a/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.cc b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.cc index e3e69421fc641..e3eed252ebe43 100644 --- a/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.cc +++ b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.cc @@ -29,8 +29,13 @@ std::ostream& operator<<(std::ostream& out, const lba_map_val_t& v) std::ostream &LBALeafNode::print_detail(std::ostream &out) const { - return out << ", size=" << get_size() - << ", meta=" << get_meta(); + out << ", size=" << get_size() + << ", meta=" << get_meta() + << ", parent_tracker=" << (void*)parent_tracker.get(); + if 
(parent_tracker) { + return out << ", parent=" << (void*)parent_tracker->parent.get(); + } + return out; } void LBALeafNode::resolve_relative_addrs(paddr_t base) diff --git a/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.h b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.h index f39d47735441d..571e906fa8fed 100644 --- a/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.h +++ b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.h @@ -219,6 +219,7 @@ using LBALeafNodeRef = TCachedExtentRef; } #if FMT_VERSION >= 90000 +template <> struct fmt::formatter : fmt::ostream_formatter {}; template <> struct fmt::formatter : fmt::ostream_formatter {}; template <> struct fmt::formatter : fmt::ostream_formatter {}; template <> struct fmt::formatter : fmt::ostream_formatter {}; From a86c7bd651bc7a89a49e57f9a87e8baeb4291c41 Mon Sep 17 00:00:00 2001 From: Xuehan Xu Date: Tue, 31 Jan 2023 14:36:42 +0800 Subject: [PATCH 06/21] crimson/os/seastore/btree: don't go to leaf nodes when updating internal mappings Signed-off-by: Xuehan Xu --- .../os/seastore/btree/fixed_kv_btree.h | 27 ++++++++++++------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/src/crimson/os/seastore/btree/fixed_kv_btree.h b/src/crimson/os/seastore/btree/fixed_kv_btree.h index 4a5c22d6de02a..71e77a5ae970c 100644 --- a/src/crimson/os/seastore/btree/fixed_kv_btree.h +++ b/src/crimson/os/seastore/btree/fixed_kv_btree.h @@ -335,7 +335,8 @@ class FixedKVBtree { iterator_fut lower_bound( op_context_t c, node_key_t addr, - mapped_space_visitor_t *visitor=nullptr) const + mapped_space_visitor_t *visitor=nullptr, + depth_t min_depth = 1) const { LOG_PREFIX(FixedKVBtree::lower_bound); return lookup( @@ -359,14 +360,17 @@ class FixedKVBtree { ret == leaf.end()); return ret; }, + min_depth, visitor - ).si_then([FNAME, c](auto &&ret) { + ).si_then([FNAME, c, min_depth](auto &&ret) { SUBTRACET( seastore_fixedkv_tree, "ret.leaf.pos {}", c.trans, ret.leaf.pos); - ret.assert_valid(); 
+ if (min_depth == 1) { + ret.assert_valid(); + } return std::move(ret); }); } @@ -908,7 +912,7 @@ class FixedKVBtree { *nextent); return lower_bound( - c, laddr + c, laddr, nullptr, depth + 1 ).si_then([=, this](auto iter) { assert(iter.get_depth() >= depth); if (depth == iter.get_depth()) { @@ -1312,17 +1316,19 @@ class FixedKVBtree { op_context_t c, LI &&lookup_internal, LL &&lookup_leaf, + depth_t min_depth, mapped_space_visitor_t *visitor ) const { LOG_PREFIX(FixedKVBtree::lookup); + assert(min_depth > 0); return seastar::do_with( iterator{root.get_depth()}, std::forward
  • (lookup_internal), std::forward(lookup_leaf), - [FNAME, this, visitor, c](auto &iter, auto &li, auto &ll) { + [FNAME, this, visitor, c, min_depth](auto &iter, auto &li, auto &ll) { return lookup_root( c, iter, visitor - ).si_then([FNAME, this, visitor, c, &iter, &li, &ll] { + ).si_then([FNAME, this, visitor, c, &iter, &li, &ll, min_depth] { if (iter.get_depth() > 1) { auto &root_entry = *(iter.internal.rbegin()); root_entry.pos = li(*(root_entry.node)).get_offset(); @@ -1336,12 +1342,15 @@ class FixedKVBtree { c, iter, root.get_depth() - 1, - 0, + min_depth - 1, li, ll, visitor - ).si_then([c, visitor, &iter] { - if (iter.at_boundary()) { + ).si_then([c, visitor, &iter, min_depth] { + // It's only when the lookup is triggered by + // update_internal_mapping() that min_depth is + // NOT 1 + if (min_depth == 1 && iter.at_boundary()) { return iter.handle_boundary(c, visitor); } else { return lookup_iertr::now(); From 1b4c591ef51920211de978f490b4125177571e08 Mon Sep 17 00:00:00 2001 From: Xuehan Xu Date: Wed, 12 Oct 2022 14:37:39 +0800 Subject: [PATCH 07/21] crimson/os/seastore/cached_extent: improve the representation of "has_been_invalidated" Signed-off-by: Xuehan Xu --- src/crimson/os/seastore/cached_extent.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/crimson/os/seastore/cached_extent.h b/src/crimson/os/seastore/cached_extent.h index ea1a749b7f23a..70ebc394bd584 100644 --- a/src/crimson/os/seastore/cached_extent.h +++ b/src/crimson/os/seastore/cached_extent.h @@ -402,7 +402,7 @@ class CachedExtent /// Returns true if extent or prior_instance has been invalidated bool has_been_invalidated() const { - return !is_valid() || (prior_instance && !prior_instance->is_valid()); + return !is_valid() || (is_mutation_pending() && !prior_instance->is_valid()); } /// Returns true if extent is a plcaeholder From 686d1206532200733d69b9430227bdb6bf2d9eb3 Mon Sep 17 00:00:00 2001 From: Xuehan Xu Date: Thu, 13 Oct 2022 10:57:09 +0800 Subject: [PATCH 
08/21] crimson/os/seastore/cache: invalidate out-dated extent when initiating Cache Signed-off-by: Xuehan Xu --- src/crimson/os/seastore/cache.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/crimson/os/seastore/cache.h b/src/crimson/os/seastore/cache.h index 3b73594157f8b..3b90516e614c6 100644 --- a/src/crimson/os/seastore/cache.h +++ b/src/crimson/os/seastore/cache.h @@ -815,6 +815,7 @@ class Cache { if (!is_alive) { SUBDEBUGT(seastore_cache, "extent is not alive, remove extent -- {}", t, *e); remove_extent(e); + e->set_invalid(t); } else { SUBDEBUGT(seastore_cache, "extent is alive -- {}", t, *e); } From 7c3305f0149808f800dc6c852d4f1f755a490ec4 Mon Sep 17 00:00:00 2001 From: Xuehan Xu Date: Thu, 13 Oct 2022 11:50:17 +0800 Subject: [PATCH 09/21] crimson/os/seastore/btree: search fixed-kv-btree by parent<->child pointers Signed-off-by: Xuehan Xu --- .../os/seastore/btree/fixed_kv_btree.h | 217 ++++++++++++++---- src/crimson/os/seastore/btree/fixed_kv_node.h | 52 +++++ src/crimson/os/seastore/cache.h | 25 +- src/crimson/os/seastore/cached_extent.h | 75 ++++-- src/crimson/os/seastore/transaction.h | 11 +- 5 files changed, 305 insertions(+), 75 deletions(-) diff --git a/src/crimson/os/seastore/btree/fixed_kv_btree.h b/src/crimson/os/seastore/btree/fixed_kv_btree.h index 71e77a5ae970c..ca6e9359e7a96 100644 --- a/src/crimson/os/seastore/btree/fixed_kv_btree.h +++ b/src/crimson/os/seastore/btree/fixed_kv_btree.h @@ -219,6 +219,12 @@ class FixedKVBtree { typename NodeType::Ref node; uint16_t pos = INVALID; + node_position_t() = default; + node_position_t( + typename NodeType::Ref node, + uint16_t pos) + : node(node), pos(pos) {} + void reset() { *this = node_position_t{}; } @@ -1009,6 +1015,9 @@ class FixedKVBtree { phy_tree_root_t root; bool root_dirty = false; + template + using node_position_t = typename iterator::template node_position_t; + using get_internal_node_iertr = base_iertr; using get_internal_node_ret = get_internal_node_iertr::future; static 
get_internal_node_ret get_internal_node( @@ -1016,7 +1025,8 @@ class FixedKVBtree { depth_t depth, paddr_t offset, node_key_t begin, - node_key_t end) + node_key_t end, + typename std::optional> parent_pos) { LOG_PREFIX(FixedKVBtree::get_internal_node); SUBTRACET( @@ -1028,10 +1038,16 @@ class FixedKVBtree { begin, end); assert(depth > 1); - auto init_internal = [c, depth, begin, end](internal_node_t &node) { + auto init_internal = [c, depth, begin, end, + parent_pos=std::move(parent_pos)] + (internal_node_t &node) { assert(!node.is_pending()); assert(!node.pin.is_linked()); node.pin.set_range(fixed_kv_node_meta_t{begin, end, depth}); + if (parent_pos) { + auto &parent = parent_pos->node; + parent->link_child(&node, parent_pos->pos); + } if (c.pins) { c.pins->add_pin(node.pin); } @@ -1078,7 +1094,8 @@ class FixedKVBtree { op_context_t c, paddr_t offset, node_key_t begin, - node_key_t end) + node_key_t end, + typename std::optional> parent_pos) { LOG_PREFIX(FixedKVBtree::get_leaf_node); SUBTRACET( @@ -1088,10 +1105,16 @@ class FixedKVBtree { offset, begin, end); - auto init_leaf = [c, begin, end](leaf_node_t &node) { + auto init_leaf = [c, begin, end, + parent_pos=std::move(parent_pos)] + (leaf_node_t &node) { assert(!node.is_pending()); assert(!node.pin.is_linked()); node.pin.set_range(fixed_kv_node_meta_t{begin, end, 1}); + if (parent_pos) { + auto &parent = parent_pos->node; + parent->link_child(&node, parent_pos->pos); + } if (c.pins) { c.pins->add_pin(node.pin); } @@ -1143,7 +1166,8 @@ class FixedKVBtree { root.get_depth(), root.get_location(), min_max_t::min, - min_max_t::max + min_max_t::max, + std::nullopt ).si_then([this, visitor, &iter](InternalNodeRef root_node) { iter.get_internal(root.get_depth()).node = root_node; if (visitor) (*visitor)( @@ -1158,7 +1182,8 @@ class FixedKVBtree { c, root.get_location(), min_max_t::min, - min_max_t::max + min_max_t::max, + std::nullopt ).si_then([visitor, &iter, this](LeafNodeRef root_node) { iter.leaf.node = 
root_node; if (visitor) (*visitor)( @@ -1181,22 +1206,13 @@ class FixedKVBtree { F &f, mapped_space_visitor_t *visitor ) { + LOG_PREFIX(FixedKVBtree::lookup_internal_level); assert(depth > 1); auto &parent_entry = iter.get_internal(depth + 1); auto parent = parent_entry.node; auto node_iter = parent->iter_idx(parent_entry.pos); - auto next_iter = node_iter + 1; - auto begin = node_iter->get_key(); - auto end = next_iter == parent->end() - ? parent->get_node_meta().end - : next_iter->get_key(); - return get_internal_node( - c, - depth, - node_iter->get_val().maybe_relative_to(parent->get_paddr()), - begin, - end - ).si_then([depth, visitor, &iter, &f](InternalNodeRef node) { + + auto on_found = [depth, visitor, &iter, &f](InternalNodeRef node) { auto &entry = iter.get_internal(depth); entry.node = node; auto node_iter = f(*node); @@ -1209,6 +1225,50 @@ class FixedKVBtree { depth, node->get_type()); return seastar::now(); + }; + + auto child_pos = parent->get_child(c.trans, node_iter); + auto &child = child_pos.child; + if (child) { + SUBTRACET(seastore_fixedkv_tree, + "got child on {}, pos: {}, res: {}", + c.trans, + *parent_entry.node, + parent_entry.pos, + *child); + + ceph_assert(child->is_valid()); + if (!child->is_pending_in_trans(c.trans.get_trans_id())) { + c.trans.add_to_read_set(child); + if (!child->is_mutation_pending()) { + c.cache.touch_extent(*child); + } + } + return child->wait_io().then( + [child, on_found=std::move(on_found), node_iter]() mutable { + auto &cnode = (typename internal_node_t::base_t &)*child; + assert(cnode.get_node_meta().begin == node_iter.get_key()); + assert(cnode.get_node_meta().end > node_iter.get_key()); + return on_found(child->template cast()); + }); + } + + auto next_iter = node_iter + 1; + auto begin = node_iter->get_key(); + auto end = next_iter == parent->end() + ? 
parent->get_node_meta().end + : next_iter->get_key(); + return get_internal_node( + c, + depth, + node_iter->get_val().maybe_relative_to(parent->get_paddr()), + begin, + end, + std::make_optional>( + child_pos.stable_parent->template cast(), + child_pos.pos) + ).si_then([on_found=std::move(on_found)](InternalNodeRef node) { + return on_found(node); }); } @@ -1221,22 +1281,13 @@ class FixedKVBtree { F &f, mapped_space_visitor_t *visitor ) { + LOG_PREFIX(FixedKVBtree::lookup_leaf); auto &parent_entry = iter.get_internal(2); auto parent = parent_entry.node; assert(parent); auto node_iter = parent->iter_idx(parent_entry.pos); - auto next_iter = node_iter + 1; - auto begin = node_iter->get_key(); - auto end = next_iter == parent->end() - ? parent->get_node_meta().end - : next_iter->get_key(); - return get_leaf_node( - c, - node_iter->get_val().maybe_relative_to(parent->get_paddr()), - begin, - end - ).si_then([visitor, &iter, &f](LeafNodeRef node) { + auto on_found = [visitor, &iter, &f](LeafNodeRef node) { iter.leaf.node = node; auto node_iter = f(*node); iter.leaf.pos = node_iter->get_offset(); @@ -1247,6 +1298,50 @@ class FixedKVBtree { 1, node->get_type()); return seastar::now(); + }; + + auto child_pos = parent->get_child(c.trans, node_iter); + auto &child = child_pos.child; + if (child) { + SUBTRACET(seastore_fixedkv_tree, + "got child on {}, pos: {}, res: {}", + c.trans, + *parent_entry.node, + parent_entry.pos, + *child); + + ceph_assert(child->is_valid()); + if (!child->is_pending_in_trans(c.trans.get_trans_id())) { + c.trans.add_to_read_set(child); + if (!child->is_mutation_pending()) { + c.cache.touch_extent(*child); + } + } + return child->wait_io().then( + [child, on_found=std::move(on_found), node_iter]() mutable { + auto &cnode = (typename internal_node_t::base_t &)*child; + assert(cnode.get_node_meta().begin == node_iter.get_key()); + assert(cnode.get_node_meta().end > node_iter.get_key()); + return on_found(child->template cast()); + }); + } + + auto 
next_iter = node_iter + 1; + auto begin = node_iter->get_key(); + auto end = next_iter == parent->end() + ? parent->get_node_meta().end + : next_iter->get_key(); + + return get_leaf_node( + c, + node_iter->get_val().maybe_relative_to(parent->get_paddr()), + begin, + end, + std::make_optional>( + child_pos.stable_parent->template cast(), + child_pos.pos) + ).si_then([on_found=std::move(on_found)](LeafNodeRef node) { + return on_found(node); }); } @@ -1627,9 +1722,6 @@ class FixedKVBtree { }); } - template - using node_position_t = typename iterator::template node_position_t; - template , int> = 0> base_iertr::future get_node( @@ -1637,9 +1729,10 @@ class FixedKVBtree { depth_t depth, paddr_t addr, node_key_t begin, - node_key_t end) { + node_key_t end, + typename std::optional> parent_pos) { assert(depth == 1); - return get_leaf_node(c, addr, begin, end); + return get_leaf_node(c, addr, begin, end, std::move(parent_pos)); } template > parent_pos) { + return get_internal_node(c, depth, addr, begin, end, std::move(parent_pos)); } template @@ -1678,13 +1772,7 @@ class FixedKVBtree { : next_iter->get_key(); SUBTRACET(seastore_fixedkv_tree, "parent: {}, node: {}", c.trans, *parent_pos.node, *pos.node); - return get_node( - c, - depth, - donor_iter.get_val().maybe_relative_to(parent_pos.node->get_paddr()), - begin, - end - ).si_then([c, iter, donor_iter, donor_is_left, &parent_pos, &pos]( + auto do_merge = [c, iter, donor_iter, donor_is_left, &parent_pos, &pos]( typename NodeType::Ref donor) { LOG_PREFIX(FixedKVBtree::merge_level); auto [l, r] = donor_is_left ? 
@@ -1756,6 +1844,49 @@ class FixedKVBtree { } return seastar::now(); + }; + + auto child_pos = parent_pos.node->get_child(c.trans, donor_iter); + auto &child = child_pos.child; + if (child) { + SUBTRACET(seastore_fixedkv_tree, + "got child on {}, pos: {}, res: {}", + c.trans, + *parent_pos.node, + donor_iter.get_offset(), + *child); + + ceph_assert(child->is_valid()); + if (!child->is_pending_in_trans(c.trans.get_trans_id())) { + c.trans.add_to_read_set(child); + if (!child->is_mutation_pending()) { + c.cache.touch_extent(*child); + } + } + return child->wait_io().then( + [child, do_merge=std::move(do_merge), &pos, + donor_iter, donor_is_left]() mutable { + auto &node = (typename internal_node_t::base_t&)*child; + assert(donor_is_left ? + node.get_node_meta().end == pos.node->get_node_meta().begin : + node.get_node_meta().begin == pos.node->get_node_meta().end); + assert(node.get_node_meta().begin == donor_iter.get_key()); + assert(node.get_node_meta().end > donor_iter.get_key()); + return do_merge(child->template cast()); + }); + } + + return get_node( + c, + depth, + donor_iter.get_val().maybe_relative_to(parent_pos.node->get_paddr()), + begin, + end, + std::make_optional>( + child_pos.stable_parent->template cast(), + child_pos.pos) + ).si_then([do_merge=std::move(do_merge)](typename NodeType::Ref donor) { + return do_merge(donor); }); } }; diff --git a/src/crimson/os/seastore/btree/fixed_kv_node.h b/src/crimson/os/seastore/btree/fixed_kv_node.h index 101ad4945d3f1..b171db6965b5a 100644 --- a/src/crimson/os/seastore/btree/fixed_kv_node.h +++ b/src/crimson/os/seastore/btree/fixed_kv_node.h @@ -216,6 +216,47 @@ struct FixedKVNode : CachedExtent { } } + struct child_pos_t { + FixedKVNodeRef stable_parent; + uint16_t pos = std::numeric_limits::max(); + CachedExtentRef child; + child_pos_t(CachedExtentRef child) : child(child) {} + child_pos_t(FixedKVNodeRef stable_parent, uint16_t pos) + : stable_parent(stable_parent), pos(pos) {} + }; + + void 
link_child(FixedKVNode* child, uint16_t pos) { + assert(pos < get_node_size()); + assert(child); + ceph_assert(!is_pending()); + ceph_assert(child->is_valid() && !child->is_pending()); + assert(!children[pos]); + children[pos] = child; + set_child_ptracker(child); + } + + template + child_pos_t get_child(Transaction &t, iter_t iter) { + auto pos = iter.get_offset(); + assert(children.capacity()); + auto child = children[pos]; + if (child) { + return child_pos_t(child->get_transactional_view(t)); + } else if (is_pending()) { + auto key = iter.get_key(); + auto &sparent = get_stable_for_key(key); + auto spos = sparent.child_pos_for_key(key); + auto child = sparent.children[spos]; + if (child) { + return child_pos_t(child->get_transactional_view(t)); + } else { + return child_pos_t(&sparent, spos); + } + } else { + return child_pos_t(this, pos); + } + } + void split_child_ptrs( FixedKVNode &left, FixedKVNode &right) @@ -585,6 +626,13 @@ struct FixedKVInternalNode return this->upper_bound(key).get_offset(); } + uint16_t child_pos_for_key(NODE_KEY key) const final { + auto it = this->upper_bound(key); + assert(it != this->begin()); + --it; + return it.get_offset(); + } + NODE_KEY get_key_from_idx(uint16_t idx) const final { return this->iter_idx(idx).get_key(); } @@ -919,6 +967,10 @@ struct FixedKVLeafNode return this->upper_bound(key).get_offset(); } + uint16_t child_pos_for_key(NODE_KEY key) const final { + return lower_bound_offset(key); + } + NODE_KEY get_key_from_idx(uint16_t idx) const final { return this->iter_idx(idx).get_key(); } diff --git a/src/crimson/os/seastore/cache.h b/src/crimson/os/seastore/cache.h index 3b90516e614c6..32ff392b62c1b 100644 --- a/src/crimson/os/seastore/cache.h +++ b/src/crimson/os/seastore/cache.h @@ -985,6 +985,19 @@ class Cache { uint64_t get_omap_tree_depth() { return stats.omap_tree_depth; } + + /// Update lru for access to ref + void touch_extent( + CachedExtent &ext, + const Transaction::src_t* p_src=nullptr) + { + if (p_src && 
is_background_transaction(*p_src)) + return; + if (ext.is_clean() && !ext.is_placeholder()) { + lru.move_to_top(ext); + } + } + private: ExtentPlacementManager& epm; RootBlockRef root; ///< ref to current root @@ -1268,18 +1281,6 @@ class Cache { return bp; } - /// Update lru for access to ref - void touch_extent( - CachedExtent &ext, - const Transaction::src_t* p_src=nullptr) - { - if (p_src && is_background_transaction(*p_src)) - return; - if (ext.is_clean() && !ext.is_placeholder()) { - lru.move_to_top(ext); - } - } - void backref_batch_update( std::vector &&, const journal_seq_t &); diff --git a/src/crimson/os/seastore/cached_extent.h b/src/crimson/os/seastore/cached_extent.h index 70ebc394bd584..170a81377467b 100644 --- a/src/crimson/os/seastore/cached_extent.h +++ b/src/crimson/os/seastore/cached_extent.h @@ -24,6 +24,15 @@ class SegmentedAllocator; class TransactionManager; class ExtentPlacementManager; +template < + typename node_key_t, + typename node_val_t, + typename internal_node_t, + typename leaf_node_t, + typename pin_t, + size_t node_size> +class FixedKVBtree; + // #define DEBUG_CACHED_EXTENT_REF #ifdef DEBUG_CACHED_EXTENT_REF @@ -45,11 +54,14 @@ namespace onode { template class read_set_item_t { - boost::intrusive::list_member_hook<> list_hook; - using list_hook_options = boost::intrusive::member_hook< + using set_hook_t = boost::intrusive::set_member_hook< + boost::intrusive::link_mode< + boost::intrusive::auto_unlink>>; + set_hook_t trans_hook; + using set_hook_options = boost::intrusive::member_hook< read_set_item_t, - boost::intrusive::list_member_hook<>, - &read_set_item_t::list_hook>; + set_hook_t, + &read_set_item_t::trans_hook>; public: struct cmp_t { @@ -59,9 +71,29 @@ class read_set_item_t { bool operator()(const read_set_item_t &lhs, const paddr_t &rhs) const; }; - using list = boost::intrusive::list< + struct trans_cmp_t { + bool operator()( + const read_set_item_t &lhs, + const read_set_item_t &rhs) const { + return lhs.t < rhs.t; + } 
+ bool operator()( + const Transaction *lhs, + const read_set_item_t &rhs) const { + return lhs < rhs.t; + } + bool operator()( + const read_set_item_t &lhs, + const Transaction *rhs) const { + return lhs.t < rhs; + } + }; + + using trans_set_t = boost::intrusive::set< read_set_item_t, - list_hook_options>; + set_hook_options, + boost::intrusive::constant_time_size, + boost::intrusive::compare>; T *t = nullptr; CachedExtentRef ref; @@ -69,7 +101,7 @@ class read_set_item_t { read_set_item_t(T *t, CachedExtentRef ref); read_set_item_t(const read_set_item_t &) = delete; read_set_item_t(read_set_item_t &&) = default; - ~read_set_item_t(); + ~read_set_item_t() = default; }; template using read_set_t = std::set< @@ -151,6 +183,14 @@ class CachedExtent friend class onode::DummyNodeExtent; friend class onode::TestReplayExtent; + template < + typename node_key_t, + typename node_val_t, + typename internal_node_t, + typename leaf_node_t, + typename pin_t, + size_t node_size> + friend class FixedKVBtree; uint32_t last_committed_crc = 0; // Points at current version while in state MUTATION_PENDING @@ -530,6 +570,11 @@ class CachedExtent return extent_index_hook.is_linked(); } + /// Returns true if the extent is pending in the transaction identified by id + bool is_pending_in_trans(transaction_id_t id) const { + return is_pending() && pending_for_transaction == id; + } + /// hook for intrusive ref list (mainly dirty or lru list) boost::intrusive::list_member_hook<> primary_ref_list_hook; using primary_ref_list_member_options = boost::intrusive::member_hook< @@ -580,7 +625,9 @@ class CachedExtent } } - read_set_item_t::list transactions; + CachedExtent* get_transactional_view(Transaction &t); + + read_set_item_t::trans_set_t transactions; placement_hint_t user_hint = PLACEMENT_HINT_NULL; @@ -611,8 +658,6 @@ class CachedExtent struct retired_placeholder_t{}; CachedExtent(retired_placeholder_t) : state(extent_state_t::INVALID) {} - CachedExtent& get_transactional_view(Transaction &t); - friend 
class Cache; template static TCachedExtentRef make_cached_extent_ref( @@ -1000,15 +1045,7 @@ struct ref_laddr_cmp { template read_set_item_t::read_set_item_t(T *t, CachedExtentRef ref) : t(t), ref(ref) -{ - ref->transactions.push_back(*this); -} - -template -read_set_item_t::~read_set_item_t() -{ - ref->transactions.erase(ref->transactions.s_iterator_to(*this)); -} +{} template inline bool read_set_item_t::cmp_t::operator()( diff --git a/src/crimson/os/seastore/transaction.h b/src/crimson/os/seastore/transaction.h index 14ba61cee07df..ed9d1d1a0c4ae 100644 --- a/src/crimson/os/seastore/transaction.h +++ b/src/crimson/os/seastore/transaction.h @@ -140,8 +140,16 @@ class Transaction { void add_to_read_set(CachedExtentRef ref) { if (is_weak()) return; + assert(ref->is_valid()); + + auto it = ref->transactions.lower_bound( + this, read_set_item_t::trans_cmp_t()); + if (it != ref->transactions.end() && it->t == this) return; + auto [iter, inserted] = read_set.emplace(this, ref); ceph_assert(inserted); + ref->transactions.insert_before( + it, const_cast&>(*iter)); } void add_fresh_extent( @@ -231,7 +239,8 @@ class Transaction { assert(where != read_set.end()); assert(where->ref.get() == &placeholder); where = read_set.erase(where); - read_set.emplace_hint(where, this, &extent); + auto it = read_set.emplace_hint(where, this, &extent); + extent.transactions.insert(const_cast&>(*it)); } { auto where = retired_set.find(&placeholder); From c29051c4c747c5b8415b8600affc24c0025507fb Mon Sep 17 00:00:00 2001 From: Xuehan Xu Date: Thu, 13 Oct 2022 14:27:34 +0800 Subject: [PATCH 10/21] crimson/os/seastore/btree: avoid searching transactions' read_set when retrieving btree nodes Signed-off-by: Xuehan Xu --- .../os/seastore/btree/fixed_kv_btree.h | 6 +- src/crimson/os/seastore/cache.h | 170 ++++++++++++++++-- src/crimson/os/seastore/transaction_manager.h | 4 +- 3 files changed, 156 insertions(+), 24 deletions(-) diff --git a/src/crimson/os/seastore/btree/fixed_kv_btree.h 
b/src/crimson/os/seastore/btree/fixed_kv_btree.h index ca6e9359e7a96..b7056e4657862 100644 --- a/src/crimson/os/seastore/btree/fixed_kv_btree.h +++ b/src/crimson/os/seastore/btree/fixed_kv_btree.h @@ -1052,13 +1052,13 @@ class FixedKVBtree { c.pins->add_pin(node.pin); } }; - return c.cache.template get_extent( + return c.cache.template get_absent_extent( c.trans, offset, node_size, init_internal ).si_then([FNAME, c, offset, init_internal, depth, begin, end]( - typename internal_node_t::Ref ret) { + typename internal_node_t::Ref ret) { SUBTRACET( seastore_fixedkv_tree, "read internal at offset {} {}", @@ -1119,7 +1119,7 @@ class FixedKVBtree { c.pins->add_pin(node.pin); } }; - return c.cache.template get_extent( + return c.cache.template get_absent_extent( c.trans, offset, node_size, diff --git a/src/crimson/os/seastore/cache.h b/src/crimson/os/seastore/cache.h index 32ff392b62c1b..a04693e73d3e3 100644 --- a/src/crimson/os/seastore/cache.h +++ b/src/crimson/os/seastore/cache.h @@ -424,32 +424,71 @@ class Cache { auto result = t.get_extent(offset, &ret); if (result != Transaction::get_extent_ret::ABSENT) { SUBTRACET(seastore_cache, "{} {}~{} is {} on t -- {}", - t, - T::TYPE, - offset, - length, - result == Transaction::get_extent_ret::PRESENT ? "present" : "retired", - *ret); + t, + T::TYPE, + offset, + length, + result == Transaction::get_extent_ret::PRESENT ? 
"present" : "retired", + *ret); assert(result != Transaction::get_extent_ret::RETIRED); return ret->wait_io().then([ret] { return seastar::make_ready_future>( ret->cast()); }); - } else { - SUBTRACET(seastore_cache, "{} {}~{} is absent on t, query cache ...", - t, T::TYPE, offset, length); - auto f = [&t, this](CachedExtent &ext) { - t.add_to_read_set(CachedExtentRef(&ext)); - touch_extent(ext); - }; - auto metric_key = std::make_pair(t.get_src(), T::TYPE); - return trans_intr::make_interruptible( - get_extent( - offset, length, &metric_key, - std::forward(extent_init_func), std::move(f)) - ); } + + SUBTRACET(seastore_cache, "{} {}~{} is absent on t, query cache ...", + t, T::TYPE, offset, length); + auto f = [&t, this](CachedExtent &ext) { + t.add_to_read_set(CachedExtentRef(&ext)); + touch_extent(ext); + }; + auto metric_key = std::make_pair(t.get_src(), T::TYPE); + return trans_intr::make_interruptible( + get_extent( + offset, length, &metric_key, + std::forward(extent_init_func), std::move(f)) + ); } + + /* + * get_absent_extent + * + * Mostly the same as Cache::get_extent(), with the only difference + * that get_absent_extent won't search the transaction's context for + * the specific CachedExtent + */ + template + get_extent_iertr::future> get_absent_extent( + Transaction &t, + paddr_t offset, + extent_len_t length, + Func &&extent_init_func) { + CachedExtentRef ret; + LOG_PREFIX(Cache::get_extent); + +#ifndef NDEBUG + auto r = t.get_extent(offset, &ret); + if (r != Transaction::get_extent_ret::ABSENT) { + SUBERRORT(seastore_cache, "unexpected non-absent extent {}", t, *ret); + ceph_abort(); + } +#endif + + SUBTRACET(seastore_cache, "{} {}~{} is absent on t, query cache ...", + t, T::TYPE, offset, length); + auto f = [&t, this](CachedExtent &ext) { + t.add_to_read_set(CachedExtentRef(&ext)); + touch_extent(ext); + }; + auto metric_key = std::make_pair(t.get_src(), T::TYPE); + return trans_intr::make_interruptible( + get_extent( + offset, length, &metric_key, 
+ std::forward(extent_init_func), std::move(f)) + ); + } + template get_extent_iertr::future> get_extent( Transaction &t, @@ -458,6 +497,21 @@ class Cache { return get_extent(t, offset, length, [](T &){}); } + /* + * get_absent_extent + * + * Mostly the same as Cache::get_extent(), with the only difference + * that get_absent_extent won't search the transaction's context for + * the specific CachedExtent + */ + template + get_extent_iertr::future> get_absent_extent( + Transaction &t, + paddr_t offset, + extent_len_t length) { + return get_absent_extent(t, offset, length, [](T &){}); + } + extent_len_t get_block_size() const { return epm.get_block_size(); } @@ -539,6 +593,39 @@ class Cache { } } + get_extent_by_type_ret _get_absent_extent_by_type( + Transaction &t, + extent_types_t type, + paddr_t offset, + laddr_t laddr, + extent_len_t length, + extent_init_func_t &&extent_init_func + ) { + LOG_PREFIX(Cache::_get_absent_extent_by_type); + +#ifndef NDEBUG + CachedExtentRef ret; + auto r = t.get_extent(offset, &ret); + if (r != Transaction::get_extent_ret::ABSENT) { + SUBERRORT(seastore_cache, "unexpected non-absent extent {}", t, *ret); + ceph_abort(); + } +#endif + + SUBTRACET(seastore_cache, "{} {}~{} {} is absent on t, query cache ...", + t, type, offset, length, laddr); + auto f = [&t, this](CachedExtent &ext) { + t.add_to_read_set(CachedExtentRef(&ext)); + touch_extent(ext); + }; + auto src = t.get_src(); + return trans_intr::make_interruptible( + _get_extent_by_type( + type, offset, laddr, length, &src, + std::move(extent_init_func), std::move(f)) + ); + } + backref_entryrefs_by_seq_t backref_entryrefs_by_seq; backref_entry_mset_t backref_entry_mset; @@ -603,6 +690,32 @@ class Cache { length, extent_init_func_t(std::forward(extent_init_func))); } + + /* + * get_absent_extent_by_type + * + * Mostly the same as Cache::get_extent_by_type(), with the only difference + * that get_absent_extent_by_type won't search the transaction's context for + * the specific 
CachedExtent + */ + template + get_extent_by_type_ret get_absent_extent_by_type( + Transaction &t, ///< [in] transaction + extent_types_t type, ///< [in] type tag + paddr_t offset, ///< [in] starting addr + laddr_t laddr, ///< [in] logical address if logical + extent_len_t length, ///< [in] length + Func &&extent_init_func ///< [in] extent init func + ) { + return _get_absent_extent_by_type( + t, + type, + offset, + laddr, + length, + extent_init_func_t(std::forward(extent_init_func))); + } + get_extent_by_type_ret get_extent_by_type( Transaction &t, extent_types_t type, @@ -614,6 +727,25 @@ class Cache { t, type, offset, laddr, length, [](CachedExtent &) {}); } + + /* + * get_absent_extent_by_type + * + * Mostly the same as Cache::get_extent_by_type(), with the only difference + * that get_absent_extent_by_type won't search the transaction's context for + * the specific CachedExtent + */ + get_extent_by_type_ret get_absent_extent_by_type( + Transaction &t, + extent_types_t type, + paddr_t offset, + laddr_t laddr, + extent_len_t length + ) { + return get_absent_extent_by_type( + t, type, offset, laddr, length, [](CachedExtent &) {}); + } + void trim_backref_bufs(const journal_seq_t &trim_to) { LOG_PREFIX(Cache::trim_backref_bufs); SUBDEBUG(seastore_cache, "trimming to {}", trim_to); diff --git a/src/crimson/os/seastore/transaction_manager.h b/src/crimson/os/seastore/transaction_manager.h index 6cb18c560fab4..e00290d88e2a2 100644 --- a/src/crimson/os/seastore/transaction_manager.h +++ b/src/crimson/os/seastore/transaction_manager.h @@ -134,7 +134,7 @@ class TransactionManager : public ExtentCallbackInterface { static_assert(is_logical_type(T::TYPE)); using ret = pin_to_extent_ret; auto &pref = *pin; - return cache->get_extent( + return cache->get_absent_extent( t, pref.get_val(), pref.get_length(), @@ -168,7 +168,7 @@ class TransactionManager : public ExtentCallbackInterface { SUBTRACET(seastore_tm, "getting extent {} type {}", t, *pin, type); 
assert(is_logical_type(type)); auto &pref = *pin; - return cache->get_extent_by_type( + return cache->get_absent_extent_by_type( t, type, pref.get_val(), From 45440fadd20fa21deaddbe6db4e0c4e84015c9bf Mon Sep 17 00:00:00 2001 From: Xuehan Xu Date: Wed, 17 Aug 2022 18:07:42 +0800 Subject: [PATCH 11/21] crimson/os/seastore/backref_manager: retrieve live backref extents through the backref tree After introducing intra-fixed-kv-btree parent-child pointers, we need to keep the invariant that it's only when extents are not in transactions' read_set that we can directly query cache without inspecting the transaction Signed-off-by: Xuehan Xu --- src/crimson/os/seastore/async_cleaner.cc | 10 ++++- .../seastore/backref/btree_backref_manager.cc | 44 +++++++++++++------ .../seastore/backref/btree_backref_manager.h | 5 ++- src/crimson/os/seastore/backref_manager.h | 7 ++- .../os/seastore/btree/fixed_kv_btree.h | 6 ++- src/crimson/os/seastore/cache.cc | 5 ++- src/crimson/os/seastore/cache.h | 11 +++-- .../os/seastore/transaction_manager.cc | 6 ++- 8 files changed, 71 insertions(+), 23 deletions(-) diff --git a/src/crimson/os/seastore/async_cleaner.cc b/src/crimson/os/seastore/async_cleaner.cc index 4bac744e4a067..84677747b77cd 100644 --- a/src/crimson/os/seastore/async_cleaner.cc +++ b/src/crimson/os/seastore/async_cleaner.cc @@ -1424,23 +1424,27 @@ bool SegmentCleaner::check_usage() t, [&tracker]( paddr_t paddr, + paddr_t backref_key, extent_len_t len, extent_types_t type, laddr_t laddr) { if (paddr.get_addr_type() == paddr_types_t::SEGMENT) { if (is_backref_node(type)) { - assert(laddr == L_ADDR_NULL); + assert(laddr == L_ADDR_NULL); + assert(backref_key != P_ADDR_NULL); tracker->allocate( paddr.as_seg_paddr().get_segment_id(), paddr.as_seg_paddr().get_segment_off(), len); } else if (laddr == L_ADDR_NULL) { + assert(backref_key == P_ADDR_NULL); tracker->release( paddr.as_seg_paddr().get_segment_id(), paddr.as_seg_paddr().get_segment_off(), len); } else { + assert(backref_key == 
P_ADDR_NULL); tracker->allocate( paddr.as_seg_paddr().get_segment_id(), paddr.as_seg_paddr().get_segment_off(), @@ -1724,6 +1728,7 @@ bool RBMCleaner::check_usage() t, [&tracker, &rbms]( paddr_t paddr, + paddr_t backref_key, extent_len_t len, extent_types_t type, laddr_t laddr) @@ -1732,14 +1737,17 @@ bool RBMCleaner::check_usage() if (rbm->get_device_id() == paddr.get_device_id()) { if (is_backref_node(type)) { assert(laddr == L_ADDR_NULL); + assert(backref_key != P_ADDR_NULL); tracker.allocate( paddr, len); } else if (laddr == L_ADDR_NULL) { + assert(backref_key == P_ADDR_NULL); tracker.release( paddr, len); } else { + assert(backref_key == P_ADDR_NULL); tracker.allocate( paddr, len); diff --git a/src/crimson/os/seastore/backref/btree_backref_manager.cc b/src/crimson/os/seastore/backref/btree_backref_manager.cc index c1405ebc18d46..3c8eb38c62b85 100644 --- a/src/crimson/os/seastore/backref/btree_backref_manager.cc +++ b/src/crimson/os/seastore/backref/btree_backref_manager.cc @@ -328,6 +328,7 @@ BtreeBackrefManager::scan_mapped_space( ceph_assert(pos.get_val().laddr != L_ADDR_NULL); scan_visitor( pos.get_key(), + P_ADDR_NULL, pos.get_val().len, pos.get_val().type, pos.get_val().laddr); @@ -362,6 +363,7 @@ BtreeBackrefManager::scan_mapped_space( ceph_assert(!is_backref_node(backref_entry.type)); scan_visitor( backref_entry.paddr, + P_ADDR_NULL, backref_entry.len, backref_entry.type, backref_entry.laddr); @@ -369,7 +371,7 @@ BtreeBackrefManager::scan_mapped_space( }).si_then([this, &scan_visitor, block_size, c, FNAME] { BackrefBtree::mapped_space_visitor_t f = [&scan_visitor, block_size, FNAME, c]( - paddr_t paddr, extent_len_t len, + paddr_t paddr, paddr_t key, extent_len_t len, depth_t depth, extent_types_t type) { TRACET("tree node {}~{} {}, depth={} used", c.trans, paddr, len, type, depth); @@ -377,7 +379,7 @@ BtreeBackrefManager::scan_mapped_space( ceph_assert(len > 0 && len % block_size == 0); ceph_assert(depth >= 1); ceph_assert(is_backref_node(type)); - 
return scan_visitor(paddr, len, type, L_ADDR_NULL); + return scan_visitor(paddr, key, len, type, L_ADDR_NULL); }; return seastar::do_with( std::move(f), @@ -534,9 +536,10 @@ BtreeBackrefManager::get_cached_backref_entries_in_range( void BtreeBackrefManager::cache_new_backref_extent( paddr_t paddr, + paddr_t key, extent_types_t type) { - return cache.add_backref_extent(paddr, type); + return cache.add_backref_extent(paddr, key, type); } BtreeBackrefManager::retrieve_backref_extents_in_range_ret @@ -545,10 +548,11 @@ BtreeBackrefManager::retrieve_backref_extents_in_range( paddr_t start, paddr_t end) { + auto backref_extents = cache.get_backref_extents_in_range(start, end); return seastar::do_with( std::vector(), - [this, &t, start, end](auto &extents) { - auto backref_extents = cache.get_backref_extents_in_range(start, end); + std::move(backref_extents), + [this, &t](auto &extents, auto &backref_extents) { return trans_intr::parallel_for_each( backref_extents, [this, &extents, &t](auto &ent) { @@ -556,14 +560,28 @@ BtreeBackrefManager::retrieve_backref_extents_in_range( // so it must be alive assert(is_backref_node(ent.type)); LOG_PREFIX(BtreeBackrefManager::retrieve_backref_extents_in_range); - DEBUGT("getting backref extent of type {} at {}", - t, - ent.type, - ent.paddr); - return cache.get_extent_by_type( - t, ent.type, ent.paddr, L_ADDR_NULL, BACKREF_NODE_SIZE - ).si_then([&extents](auto ext) { - extents.emplace_back(std::move(ext)); + DEBUGT("getting backref extent of type {} at {}, key {}", + t, + ent.type, + ent.paddr, + ent.key); + + auto c = get_context(t); + return with_btree_ret( + cache, + c, + [c, &ent](auto &btree) { + if (ent.type == extent_types_t::BACKREF_INTERNAL) { + return btree.get_internal_if_live( + c, ent.paddr, ent.key, BACKREF_NODE_SIZE); + } else { + assert(ent.type == extent_types_t::BACKREF_LEAF); + return btree.get_leaf_if_live( + c, ent.paddr, ent.key, BACKREF_NODE_SIZE); + } + }).si_then([&extents](auto ext) { + ceph_assert(ext); + 
extents.emplace_back(std::move(ext)); }); }).si_then([&extents] { return std::move(extents); diff --git a/src/crimson/os/seastore/backref/btree_backref_manager.h b/src/crimson/os/seastore/backref/btree_backref_manager.h index d2241a5dcc3ae..95a1c03113db8 100644 --- a/src/crimson/os/seastore/backref/btree_backref_manager.h +++ b/src/crimson/os/seastore/backref/btree_backref_manager.h @@ -108,7 +108,10 @@ class BtreeBackrefManager : public BackrefManager { paddr_t start, paddr_t end) final; - void cache_new_backref_extent(paddr_t paddr, extent_types_t type) final; + void cache_new_backref_extent( + paddr_t paddr, + paddr_t key, + extent_types_t type) final; private: Cache &cache; diff --git a/src/crimson/os/seastore/backref_manager.h b/src/crimson/os/seastore/backref_manager.h index 5db637086008f..68c02b11a812c 100644 --- a/src/crimson/os/seastore/backref_manager.h +++ b/src/crimson/os/seastore/backref_manager.h @@ -96,7 +96,10 @@ class BackrefManager { paddr_t start, paddr_t end) = 0; - virtual void cache_new_backref_extent(paddr_t paddr, extent_types_t type) = 0; + virtual void cache_new_backref_extent( + paddr_t paddr, + paddr_t key, + extent_types_t type) = 0; /** * merge in-cache paddr_t -> laddr_t mappings to the on-disk backref tree @@ -132,7 +135,7 @@ class BackrefManager { using scan_mapped_space_iertr = base_iertr; using scan_mapped_space_ret = scan_mapped_space_iertr::future<>; using scan_mapped_space_func_t = std::function< - void(paddr_t, extent_len_t, extent_types_t, laddr_t)>; + void(paddr_t, paddr_t, extent_len_t, extent_types_t, laddr_t)>; virtual scan_mapped_space_ret scan_mapped_space( Transaction &t, scan_mapped_space_func_t &&f) = 0; diff --git a/src/crimson/os/seastore/btree/fixed_kv_btree.h b/src/crimson/os/seastore/btree/fixed_kv_btree.h index b7056e4657862..47eda89782c78 100644 --- a/src/crimson/os/seastore/btree/fixed_kv_btree.h +++ b/src/crimson/os/seastore/btree/fixed_kv_btree.h @@ -58,7 +58,7 @@ class FixedKVBtree { using iterator_fut = 
base_iertr::future; using mapped_space_visitor_t = std::function< - void(paddr_t, extent_len_t, depth_t, extent_types_t)>; + void(paddr_t, node_key_t, extent_len_t, depth_t, extent_types_t)>; class iterator { public: @@ -1172,6 +1172,7 @@ class FixedKVBtree { iter.get_internal(root.get_depth()).node = root_node; if (visitor) (*visitor)( root_node->get_paddr(), + root_node->get_node_meta().begin, root_node->get_length(), root.get_depth(), internal_node_t::TYPE); @@ -1188,6 +1189,7 @@ class FixedKVBtree { iter.leaf.node = root_node; if (visitor) (*visitor)( root_node->get_paddr(), + root_node->get_node_meta().begin, root_node->get_length(), root.get_depth(), leaf_node_t::TYPE); @@ -1221,6 +1223,7 @@ class FixedKVBtree { if (visitor) (*visitor)( node->get_paddr(), + node->get_node_meta().begin, node->get_length(), depth, node->get_type()); @@ -1294,6 +1297,7 @@ class FixedKVBtree { if (visitor) (*visitor)( node->get_paddr(), + node->get_node_meta().begin, node->get_length(), 1, node->get_type()); diff --git a/src/crimson/os/seastore/cache.cc b/src/crimson/os/seastore/cache.cc index c41d29b941c6f..b189b96c23848 100644 --- a/src/crimson/os/seastore/cache.cc +++ b/src/crimson/os/seastore/cache.cc @@ -1480,7 +1480,10 @@ void Cache::complete_commit( i->get_type(), start_seq)); } else if (is_backref_node(i->get_type())) { - add_backref_extent(i->get_paddr(), i->get_type()); + add_backref_extent( + i->get_paddr(), + i->cast()->get_node_meta().begin, + i->get_type()); } else { ERRORT("{}", t, *i); ceph_abort("not possible"); diff --git a/src/crimson/os/seastore/cache.h b/src/crimson/os/seastore/cache.h index a04693e73d3e3..789b14074b882 100644 --- a/src/crimson/os/seastore/cache.h +++ b/src/crimson/os/seastore/cache.h @@ -1061,9 +1061,11 @@ class Cache { struct backref_extent_entry_t { backref_extent_entry_t( paddr_t paddr, + paddr_t key, extent_types_t type) - : paddr(paddr), type(type) {} + : paddr(paddr), key(key), type(type) {} paddr_t paddr = P_ADDR_NULL; + paddr_t key = 
P_ADDR_NULL; extent_types_t type = extent_types_t::ROOT; struct cmp_t { using is_transparent = paddr_t; @@ -1155,9 +1157,12 @@ class Cache { backref_extent_entry_t::cmp_t>; backref_extent_entry_query_set_t backref_extents; - void add_backref_extent(paddr_t paddr, extent_types_t type) { + void add_backref_extent( + paddr_t paddr, + paddr_t key, + extent_types_t type) { assert(!paddr.is_relative()); - auto [iter, inserted] = backref_extents.emplace(paddr, type); + auto [iter, inserted] = backref_extents.emplace(paddr, key, type); boost::ignore_unused(inserted); assert(inserted); } diff --git a/src/crimson/os/seastore/transaction_manager.cc b/src/crimson/os/seastore/transaction_manager.cc index 28d8346e5c731..0f083340ce27d 100644 --- a/src/crimson/os/seastore/transaction_manager.cc +++ b/src/crimson/os/seastore/transaction_manager.cc @@ -125,18 +125,22 @@ TransactionManager::mount_ertr::future<> TransactionManager::mount() t, [this]( paddr_t paddr, + paddr_t backref_key, extent_len_t len, extent_types_t type, laddr_t laddr) { if (is_backref_node(type)) { assert(laddr == L_ADDR_NULL); - backref_manager->cache_new_backref_extent(paddr, type); + assert(backref_key != P_ADDR_NULL); + backref_manager->cache_new_backref_extent(paddr, backref_key, type); cache->update_tree_extents_num(type, 1); epm->mark_space_used(paddr, len); } else if (laddr == L_ADDR_NULL) { + assert(backref_key == P_ADDR_NULL); cache->update_tree_extents_num(type, -1); epm->mark_space_free(paddr, len); } else { + assert(backref_key == P_ADDR_NULL); cache->update_tree_extents_num(type, 1); epm->mark_space_used(paddr, len); } From 25b001db29bbd58c627f3ab64428f5aaac14a399 Mon Sep 17 00:00:00 2001 From: Xuehan Xu Date: Thu, 20 Oct 2022 13:35:08 +0800 Subject: [PATCH 12/21] crimson/os/seastore: more debug logs Signed-off-by: Xuehan Xu --- src/crimson/os/seastore/backref/btree_backref_manager.cc | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git 
a/src/crimson/os/seastore/backref/btree_backref_manager.cc b/src/crimson/os/seastore/backref/btree_backref_manager.cc index 3c8eb38c62b85..341afef36406a 100644 --- a/src/crimson/os/seastore/backref/btree_backref_manager.cc +++ b/src/crimson/os/seastore/backref/btree_backref_manager.cc @@ -192,10 +192,10 @@ BtreeBackrefManager::new_mapping( state.last_end, val ).si_then([&state, c, addr, len, key](auto &&p) { - LOG_PREFIX(BtreeBackrefManager::alloc_extent); + LOG_PREFIX(BtreeBackrefManager::new_mapping); auto [iter, inserted] = std::move(p); - TRACET("{}~{}, paddr={}, inserted at {}", - c.trans, addr, len, key, state.last_end); + TRACET("{}~{}, paddr={}, inserted at {}, leaf {}", + c.trans, addr, len, key, state.last_end, *iter.get_leaf_node()); ceph_assert(inserted); state.ret = iter; }); @@ -473,7 +473,8 @@ BtreeBackrefManager::remove_mapping( -> remove_mapping_ret { if (iter.is_end() || iter.get_key() != addr) { LOG_PREFIX(BtreeBackrefManager::remove_mapping); - DEBUGT("paddr={} doesn't exist", c.trans, addr); + WARNT("paddr={} doesn't exist, state: {}, leaf {}", + c.trans, addr, iter.get_key(), *iter.get_leaf_node()); return remove_mapping_iertr::make_ready_future< remove_mapping_result_t>(remove_mapping_result_t()); } From 4d9b60e75047fa0d7e5acce601bc15f49d253349 Mon Sep 17 00:00:00 2001 From: Xuehan Xu Date: Thu, 20 Oct 2022 17:41:25 +0800 Subject: [PATCH 13/21] crimson/os/seastore/btree: link fixed-kv-btree and root_block with pointers Signed-off-by: Xuehan Xu --- src/crimson/os/seastore/CMakeLists.txt | 1 + .../seastore/backref/btree_backref_manager.cc | 54 +++- .../os/seastore/btree/fixed_kv_btree.h | 249 ++++++++++++------ src/crimson/os/seastore/btree/fixed_kv_node.h | 59 +++-- src/crimson/os/seastore/cache.h | 28 ++ src/crimson/os/seastore/cached_extent.cc | 9 +- src/crimson/os/seastore/cached_extent.h | 1 + .../lba_manager/btree/btree_lba_manager.cc | 57 +++- .../lba_manager/btree/lba_btree_node.cc | 2 +- src/crimson/os/seastore/root_block.cc | 27 ++ 
src/crimson/os/seastore/root_block.h | 17 +- .../seastore/test_btree_lba_manager.cc | 25 +- 12 files changed, 399 insertions(+), 130 deletions(-) create mode 100644 src/crimson/os/seastore/root_block.cc diff --git a/src/crimson/os/seastore/CMakeLists.txt b/src/crimson/os/seastore/CMakeLists.txt index 6dd19a1563a52..baa2e7ca954eb 100644 --- a/src/crimson/os/seastore/CMakeLists.txt +++ b/src/crimson/os/seastore/CMakeLists.txt @@ -7,6 +7,7 @@ set(crimson_seastore_srcs transaction_manager.cc transaction.cc cache.cc + root_block.cc lba_manager.cc async_cleaner.cc backref_manager.cc diff --git a/src/crimson/os/seastore/backref/btree_backref_manager.cc b/src/crimson/os/seastore/backref/btree_backref_manager.cc index 341afef36406a..7db8318cb9d78 100644 --- a/src/crimson/os/seastore/backref/btree_backref_manager.cc +++ b/src/crimson/os/seastore/backref/btree_backref_manager.cc @@ -19,6 +19,57 @@ phy_tree_root_t& get_phy_tree_root< return r.backref_root; } +template<> +const get_phy_tree_root_node_ret get_phy_tree_root_node< + crimson::os::seastore::backref::BackrefBtree>( + const RootBlockRef &root_block, op_context_t c) { + auto backref_root = root_block->backref_root_node; + if (backref_root) { + ceph_assert(backref_root->is_initial_pending() + == root_block->is_pending()); + return {true, + trans_intr::make_interruptible( + c.cache.get_extent_viewable_by_trans(c.trans, backref_root))}; + } else if (root_block->is_pending()) { + auto &prior = static_cast(*root_block->get_prior_instance()); + backref_root = prior.backref_root_node; + if (backref_root) { + return {true, + trans_intr::make_interruptible( + c.cache.get_extent_viewable_by_trans(c.trans, backref_root))}; + } else { + return {false, + trans_intr::make_interruptible( + seastar::make_ready_future< + CachedExtentRef>(CachedExtentRef()))}; + } + } else { + return {false, + trans_intr::make_interruptible( + seastar::make_ready_future< + CachedExtentRef>(CachedExtentRef()))}; + } +} + +template +void 
link_phy_tree_root_node(RootBlockRef &root_block, ROOT* backref_root) { + root_block->backref_root_node = backref_root; + ceph_assert(backref_root != nullptr); + backref_root->root_block = root_block; +} + +template void link_phy_tree_root_node( + RootBlockRef &root_block, backref::BackrefInternalNode* backref_root); +template void link_phy_tree_root_node( + RootBlockRef &root_block, backref::BackrefLeafNode* backref_root); +template void link_phy_tree_root_node( + RootBlockRef &root_block, backref::BackrefNode* backref_root); + +template <> +void unlink_phy_tree_root_node(RootBlockRef &root_block) { + root_block->backref_root_node = nullptr; +} + } namespace crimson::os::seastore::backref { @@ -36,7 +87,8 @@ BtreeBackrefManager::mkfs( LOG_PREFIX(BtreeBackrefManager::mkfs); INFOT("start", t); return cache.get_root(t).si_then([this, &t](auto croot) { - croot->get_root().backref_root = BackrefBtree::mkfs(get_context(t)); + assert(croot->is_mutation_pending()); + croot->get_root().backref_root = BackrefBtree::mkfs(croot, get_context(t)); return mkfs_iertr::now(); }).handle_error_interruptible( mkfs_iertr::pass_further{}, diff --git a/src/crimson/os/seastore/btree/fixed_kv_btree.h b/src/crimson/os/seastore/btree/fixed_kv_btree.h index 47eda89782c78..0e51d00ed7678 100644 --- a/src/crimson/os/seastore/btree/fixed_kv_btree.h +++ b/src/crimson/os/seastore/btree/fixed_kv_btree.h @@ -13,6 +13,7 @@ #include "crimson/os/seastore/cache.h" #include "crimson/os/seastore/seastore_types.h" #include "crimson/os/seastore/btree/btree_range_pin.h" +#include "crimson/os/seastore/root_block.h" namespace crimson::os::seastore::lba_manager::btree { struct lba_map_val_t; @@ -20,6 +21,9 @@ struct lba_map_val_t; namespace crimson::os::seastore { +template +phy_tree_root_t& get_phy_tree_root(root_t& r); + template struct op_context_t { Cache &cache; @@ -27,6 +31,23 @@ struct op_context_t { btree_pin_set_t *pins = nullptr; }; +using get_phy_tree_root_node_ret = + std::pair>; + +template +const 
get_phy_tree_root_node_ret get_phy_tree_root_node( + const RootBlockRef &root_block, + op_context_t c); + +template +void link_phy_tree_root_node(RootBlockRef &root_block, ROOT_T* root_node); + +template +void unlink_phy_tree_root_node(RootBlockRef &root_block); + + template Transaction::tree_stats_t& get_tree_stats(Transaction &t); @@ -303,20 +324,30 @@ class FixedKVBtree { } }; - FixedKVBtree(phy_tree_root_t root) : root(root) {} + FixedKVBtree(RootBlockRef &root_block) : root_block(root_block) {} - bool is_root_dirty() const { - return root_dirty; + auto& get_root() { + return get_phy_tree_root(root_block->get_root()); } - phy_tree_root_t get_root_undirty() { - ceph_assert(root_dirty); - root_dirty = false; - return root; + + auto& get_root() const { + return get_phy_tree_root(root_block->get_root()); + } + + template + void set_root_node(const TCachedExtentRef &root_node) { + static_assert(std::is_base_of_v); + link_phy_tree_root_node(root_block, root_node.get()); + } + + auto get_root_node(op_context_t c) const { + return get_phy_tree_root_node(root_block, c); } /// mkfs using mkfs_ret = phy_tree_root_t; - static mkfs_ret mkfs(op_context_t c) { + static mkfs_ret mkfs(RootBlockRef &root_block, op_context_t c) { + assert(root_block->is_mutation_pending()); auto root_leaf = c.cache.template alloc_new_extent( c.trans, node_size, @@ -328,6 +359,7 @@ class FixedKVBtree { root_leaf->pin.set_range(meta); get_tree_stats(c.trans).depth = 1u; get_tree_stats(c.trans).extents_num_delta++; + link_phy_tree_root_node(root_block, root_leaf.get()); return phy_tree_root_t{root_leaf->get_paddr(), 1u}; } @@ -934,11 +966,11 @@ class FixedKVBtree { depth, old_addr, new_addr, - root.get_location()); + get_root().get_location()); ceph_assert(0 == "impossible"); } - if (root.get_location() != old_addr) { + if (get_root().get_location() != old_addr) { SUBERRORT( seastore_fixedkv_tree, "updating root laddr {} at depth {} from {} to {}," @@ -948,12 +980,14 @@ class FixedKVBtree { depth, 
old_addr, new_addr, - root.get_location()); + get_root().get_location()); ceph_assert(0 == "impossible"); } - root.set_location(new_addr); - root_dirty = true; + root_block = c.cache.duplicate_for_write( + c.trans, root_block)->template cast(); + get_root().set_location(new_addr); + set_root_node(nextent); } else { auto &parent = iter.get_internal(depth + 1); assert(parent.node); @@ -1012,8 +1046,7 @@ class FixedKVBtree { private: - phy_tree_root_t root; - bool root_dirty = false; + RootBlockRef root_block; template using node_position_t = typename iterator::template node_position_t; @@ -1047,6 +1080,16 @@ class FixedKVBtree { if (parent_pos) { auto &parent = parent_pos->node; parent->link_child(&node, parent_pos->pos); + } else { + assert(node.range.is_root()); + auto root_block = c.cache.get_root_fast(c.trans); + if (root_block->is_mutation_pending()) { + auto &stable_root = (RootBlockRef&)*root_block->get_prior_instance(); + link_phy_tree_root_node(stable_root, &node); + } else { + assert(!root_block->is_pending()); + link_phy_tree_root_node(root_block, &node); + } } if (c.pins) { c.pins->add_pin(node.pin); @@ -1114,6 +1157,16 @@ class FixedKVBtree { if (parent_pos) { auto &parent = parent_pos->node; parent->link_child(&node, parent_pos->pos); + } else { + assert(node.range.is_root()); + auto root_block = c.cache.get_root_fast(c.trans); + if (root_block->is_mutation_pending()) { + auto &stable_root = (RootBlockRef&)*root_block->get_prior_instance(); + link_phy_tree_root_node(stable_root, &node); + } else { + assert(!root_block->is_pending()); + link_phy_tree_root_node(root_block, &node); + } } if (c.pins) { c.pins->add_pin(node.pin); @@ -1160,41 +1213,79 @@ class FixedKVBtree { op_context_t c, iterator &iter, mapped_space_visitor_t *visitor) const { - if (root.get_depth() > 1) { - return get_internal_node( - c, - root.get_depth(), - root.get_location(), - min_max_t::min, - min_max_t::max, - std::nullopt - ).si_then([this, visitor, &iter](InternalNodeRef 
root_node) { - iter.get_internal(root.get_depth()).node = root_node; - if (visitor) (*visitor)( - root_node->get_paddr(), - root_node->get_node_meta().begin, - root_node->get_length(), - root.get_depth(), - internal_node_t::TYPE); - return lookup_root_iertr::now(); + LOG_PREFIX(FixedKVBtree::lookup_root); + SUBTRACET(seastore_fixedkv_tree, + "looking up root on {}", + c.trans, + *root_block); + auto [found, fut] = get_root_node(c); + + auto on_found_internal = + [this, visitor, &iter](InternalNodeRef &root_node) { + iter.get_internal(get_root().get_depth()).node = root_node; + if (visitor) (*visitor)( + root_node->get_paddr(), + root_node->get_node_meta().begin, + root_node->get_length(), + get_root().get_depth(), + internal_node_t::TYPE); + return lookup_root_iertr::now(); + }; + auto on_found_leaf = + [visitor, &iter, this](LeafNodeRef root_node) { + iter.leaf.node = root_node; + if (visitor) (*visitor)( + root_node->get_paddr(), + root_node->get_node_meta().begin, + root_node->get_length(), + get_root().get_depth(), + leaf_node_t::TYPE); + return lookup_root_iertr::now(); + }; + + if (found) { + return fut.then_interruptible( + [this, c, on_found_internal=std::move(on_found_internal), + on_found_leaf=std::move(on_found_leaf)](auto root) { + LOG_PREFIX(FixedKVBtree::lookup_root); + ceph_assert(root); + SUBTRACET(seastore_fixedkv_tree, + "got root node on {}, res: {}", + c.trans, + *root_block, + *root); + + if (get_root().get_depth() > 1) { + auto root_node = root->template cast(); + return on_found_internal(root_node); + } else { + auto root_node = root->template cast(); + return on_found_leaf(root_node); + } }); } else { - return get_leaf_node( - c, - root.get_location(), - min_max_t::min, - min_max_t::max, - std::nullopt - ).si_then([visitor, &iter, this](LeafNodeRef root_node) { - iter.leaf.node = root_node; - if (visitor) (*visitor)( - root_node->get_paddr(), - root_node->get_node_meta().begin, - root_node->get_length(), - root.get_depth(), - 
leaf_node_t::TYPE); - return lookup_root_iertr::now(); - }); + if (get_root().get_depth() > 1) { + return get_internal_node( + c, + get_root().get_depth(), + get_root().get_location(), + min_max_t::min, + min_max_t::max, + std::nullopt + ).si_then([on_found=std::move(on_found_internal)](InternalNodeRef root_node) { + return on_found(root_node); + }); + } else { + return get_leaf_node( + c, + get_root().get_location(), + min_max_t::min, + min_max_t::max, + std::nullopt + ).si_then([on_found=std::move(on_found_leaf)](LeafNodeRef root_node) { + return on_found(root_node); + }); + } } } @@ -1421,7 +1512,7 @@ class FixedKVBtree { LOG_PREFIX(FixedKVBtree::lookup); assert(min_depth > 0); return seastar::do_with( - iterator{root.get_depth()}, + iterator{get_root().get_depth()}, std::forward
  • (lookup_internal), std::forward(lookup_leaf), [FNAME, this, visitor, c, min_depth](auto &iter, auto &li, auto &ll) { @@ -1436,11 +1527,12 @@ class FixedKVBtree { auto riter = ll(*(root_entry.node)); root_entry.pos = riter->get_offset(); } - SUBTRACET(seastore_fixedkv_tree, "got root, depth {}", c.trans, root.get_depth()); + SUBTRACET(seastore_fixedkv_tree, "got root, depth {}", + c.trans, get_root().get_depth()); return lookup_depth_range( c, iter, - root.get_depth() - 1, + get_root().get_depth() - 1, min_depth - 1, li, ll, @@ -1542,16 +1634,19 @@ class FixedKVBtree { nroot->journal_insert( nroot->begin(), min_max_t::min, - root.get_location(), + get_root().get_location(), nullptr); iter.internal.push_back({nroot, 0}); - root.set_location(nroot->get_paddr()); - root.set_depth(iter.get_depth()); - ceph_assert(root.get_depth() <= MAX_FIXEDKVBTREE_DEPTH); get_tree_stats(c.trans).depth = iter.get_depth(); get_tree_stats(c.trans).extents_num_delta++; - root_dirty = true; + + root_block = c.cache.duplicate_for_write( + c.trans, root_block)->template cast(); + get_root().set_location(nroot->get_paddr()); + get_root().set_depth(iter.get_depth()); + ceph_assert(get_root().get_depth() <= MAX_FIXEDKVBTREE_DEPTH); + set_root_node(nroot); } /* pos may be either node_position_t or @@ -1693,13 +1788,22 @@ class FixedKVBtree { c.cache.retire_extent(c.trans, pos.node); assert(pos.pos == 0); auto node_iter = pos.get_iter(); - root.set_location( - node_iter->get_val().maybe_relative_to(pos.node->get_paddr())); iter.internal.pop_back(); - root.set_depth(iter.get_depth()); get_tree_stats(c.trans).depth = iter.get_depth(); get_tree_stats(c.trans).extents_num_delta--; - root_dirty = true; + + root_block = c.cache.duplicate_for_write( + c.trans, root_block + )->template cast(); + get_root().set_location( + node_iter->get_val().maybe_relative_to(pos.node->get_paddr())); + get_root().set_depth(iter.get_depth()); + if (iter.get_depth() > 1) { + auto root_node = 
iter.get_internal(iter.get_depth()).node; + set_root_node(root_node); + } else { + set_root_node(iter.leaf.node); + } } else { SUBTRACET(seastore_fixedkv_tree, "no need to collapse root", c.trans); } @@ -1914,9 +2018,6 @@ struct is_fixed_kv_tree< pin_t, node_size>> : std::true_type {}; -template -phy_tree_root_t& get_phy_tree_root(root_t& r); - template < typename tree_type_t, typename node_key_t, @@ -1926,27 +2027,13 @@ auto with_btree( Cache &cache, op_context_t c, F &&f) { - using base_ertr = crimson::errorator< - crimson::ct_error::input_output_error>; - using base_iertr = trans_iertr; return cache.get_root( c.trans - ).si_then([c, f=std::forward(f), &cache](RootBlockRef croot) mutable { + ).si_then([f=std::forward(f)](RootBlockRef croot) mutable { return seastar::do_with( - tree_type_t(get_phy_tree_root(croot->get_root())), - [c, croot, f=std::move(f), &cache](auto &btree) mutable { - return f( - btree - ).si_then([c, croot, &btree, &cache] { - if (btree.is_root_dirty()) { - auto mut_croot = cache.duplicate_for_write( - c.trans, croot - )->template cast(); - get_phy_tree_root(mut_croot->get_root()) = - btree.get_root_undirty(); - } - return base_iertr::now(); - }); + tree_type_t(croot), + [f=std::move(f)](auto &btree) mutable { + return f(btree); }); }); } diff --git a/src/crimson/os/seastore/btree/fixed_kv_node.h b/src/crimson/os/seastore/btree/fixed_kv_node.h index b171db6965b5a..bee5b2fa512eb 100644 --- a/src/crimson/os/seastore/btree/fixed_kv_node.h +++ b/src/crimson/os/seastore/btree/fixed_kv_node.h @@ -18,6 +18,7 @@ #include "crimson/os/seastore/btree/btree_range_pin.h" #include "crimson/os/seastore/btree/fixed_kv_btree.h" +#include "crimson/os/seastore/root_block.h" namespace crimson::os::seastore { @@ -102,6 +103,7 @@ struct FixedKVNode : CachedExtent { uint16_t capacity = 0; parent_tracker_t* my_tracker = nullptr; parent_tracker_ref parent_tracker; + RootBlockRef root_block; FixedKVNode(uint16_t capacity, ceph::bufferptr &&ptr) : 
CachedExtent(std::move(ptr)), @@ -345,11 +347,16 @@ struct FixedKVNode : CachedExtent { } void set_parent_tracker_from_prior_instance() { + assert(is_mutation_pending()); + auto &prior = (FixedKVNode&)(*get_prior_instance()); if (pin.is_root()) { + ceph_assert(prior.root_block); + ceph_assert(pending_for_transaction); + root_block = prior.root_block; + link_phy_tree_root_node(root_block, this); return; } - assert(is_mutation_pending()); - auto &prior = (FixedKVNode&)(*get_prior_instance()); + ceph_assert(!root_block); parent_tracker = prior.parent_tracker; auto &parent = parent_tracker->parent; assert(parent); @@ -605,16 +612,19 @@ struct FixedKVInternalNode } virtual ~FixedKVInternalNode() { - if (!this->pin.is_root() - && this->is_valid() - && !this->is_pending()) { - ceph_assert(this->parent_tracker); - auto &parent = this->parent_tracker->parent; - ceph_assert(parent); - auto off = parent->lower_bound_offset(this->get_meta().begin); - assert(parent->get_key_from_idx(off) == get_node_meta().begin); - assert(parent->children[off] == this); - parent->children[off] = nullptr; + if (this->is_valid() && !this->is_pending()) { + if (this->pin.is_root()) { + ceph_assert(this->root_block); + unlink_phy_tree_root_node(this->root_block); + } else { + ceph_assert(this->parent_tracker); + auto &parent = this->parent_tracker->parent; + ceph_assert(parent); + auto off = parent->lower_bound_offset(this->get_meta().begin); + assert(parent->get_key_from_idx(off) == this->get_meta().begin); + assert(parent->children[off] == this); + parent->children[off] = nullptr; + } } } @@ -855,7 +865,7 @@ struct FixedKVInternalNode if (this->my_tracker) { out << ", my_tracker->parent=" << (void*)this->my_tracker->parent.get(); } - return out; + return out << ", root_block=" << (void*)this->root_block.get(); } ceph::bufferlist get_delta() { @@ -941,16 +951,19 @@ struct FixedKVLeafNode } virtual ~FixedKVLeafNode() { - if (!this->pin.is_root() - && this->is_valid() - && !this->is_pending()) { - 
ceph_assert(this->parent_tracker); - auto &parent = this->parent_tracker->parent; - ceph_assert(parent); - auto off = parent->lower_bound_offset(this->get_meta().begin); - assert(parent->get_key_from_idx(off) == get_node_meta().begin); - assert(parent->children[off] == this); - parent->children[off] = nullptr; + if (this->is_valid() && !this->is_pending()) { + if (this->pin.is_root()) { + ceph_assert(this->root_block); + unlink_phy_tree_root_node(this->root_block); + } else { + ceph_assert(this->parent_tracker); + auto &parent = this->parent_tracker->parent; + ceph_assert(parent); + auto off = parent->lower_bound_offset(this->get_meta().begin); + assert(parent->get_key_from_idx(off) == this->get_meta().begin); + assert(parent->children[off] == this); + parent->children[off] = nullptr; + } } } diff --git a/src/crimson/os/seastore/cache.h b/src/crimson/os/seastore/cache.h index 789b14074b882..ec3c27b160674 100644 --- a/src/crimson/os/seastore/cache.h +++ b/src/crimson/os/seastore/cache.h @@ -512,6 +512,34 @@ class Cache { return get_absent_extent(t, offset, length, [](T &){}); } + seastar::future get_extent_viewable_by_trans( + Transaction &t, + CachedExtentRef extent) + { + auto p_extent = extent->get_transactional_view(t); + if (!p_extent->is_pending_in_trans(t.get_trans_id())) { + t.add_to_read_set(p_extent); + if (!p_extent->is_mutation_pending()) { + touch_extent(*p_extent); + } + } + return p_extent->wait_io( + ).then([p_extent] { + return CachedExtentRef(p_extent); + }); + } + + template + seastar::future> get_extent_viewable_by_trans( + Transaction &t, + TCachedExtentRef extent) + { + return get_extent_viewable_by_trans(t, CachedExtentRef(extent.get()) + ).then([](auto p_extent) { + return p_extent->template cast(); + }); + } + extent_len_t get_block_size() const { return epm.get_block_size(); } diff --git a/src/crimson/os/seastore/cached_extent.cc b/src/crimson/os/seastore/cached_extent.cc index f1ea69d633260..dfa4c6561684f 100644 --- 
a/src/crimson/os/seastore/cached_extent.cc +++ b/src/crimson/os/seastore/cached_extent.cc @@ -78,11 +78,12 @@ CachedExtent::~CachedExtent() parent_index->erase(*this); } } - CachedExtent* CachedExtent::get_transactional_view(Transaction &t) { - auto it = mutation_pendings.find( - t.get_trans_id(), - trans_spec_view_t::cmp_t()); + return get_transactional_view(t.get_trans_id()); +} + +CachedExtent* CachedExtent::get_transactional_view(transaction_id_t tid) { + auto it = mutation_pendings.find(tid, trans_spec_view_t::cmp_t()); if (it != mutation_pendings.end()) { return (CachedExtent*)&(*it); } else { diff --git a/src/crimson/os/seastore/cached_extent.h b/src/crimson/os/seastore/cached_extent.h index 170a81377467b..a2544d5e996b0 100644 --- a/src/crimson/os/seastore/cached_extent.h +++ b/src/crimson/os/seastore/cached_extent.h @@ -626,6 +626,7 @@ class CachedExtent } CachedExtent* get_transactional_view(Transaction &t); + CachedExtent* get_transactional_view(transaction_id_t tid); read_set_item_t::trans_set_t transactions; diff --git a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc index 63130f7b476c0..2c159535f3874 100644 --- a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc +++ b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc @@ -33,6 +33,58 @@ phy_tree_root_t& get_phy_tree_root< return r.lba_root; } +template <> +const get_phy_tree_root_node_ret get_phy_tree_root_node< + crimson::os::seastore::lba_manager::btree::LBABtree>( + const RootBlockRef &root_block, op_context_t c) +{ + auto lba_root = root_block->lba_root_node; + if (lba_root) { + ceph_assert(lba_root->is_initial_pending() + == root_block->is_pending()); + return {true, + trans_intr::make_interruptible( + c.cache.get_extent_viewable_by_trans(c.trans, lba_root))}; + } else if (root_block->is_pending()) { + auto &prior = static_cast(*root_block->get_prior_instance()); + lba_root = 
prior.lba_root_node; + if (lba_root) { + return {true, + trans_intr::make_interruptible( + c.cache.get_extent_viewable_by_trans(c.trans, lba_root))}; + } else { + return {false, + trans_intr::make_interruptible( + seastar::make_ready_future< + CachedExtentRef>(CachedExtentRef()))}; + } + } else { + return {false, + trans_intr::make_interruptible( + seastar::make_ready_future< + CachedExtentRef>(CachedExtentRef()))}; + } +} + +template +void link_phy_tree_root_node(RootBlockRef &root_block, ROOT* lba_root) { + root_block->lba_root_node = lba_root; + ceph_assert(lba_root != nullptr); + lba_root->root_block = root_block; +} + +template void link_phy_tree_root_node( + RootBlockRef &root_block, lba_manager::btree::LBAInternalNode* lba_root); +template void link_phy_tree_root_node( + RootBlockRef &root_block, lba_manager::btree::LBALeafNode* lba_root); +template void link_phy_tree_root_node( + RootBlockRef &root_block, lba_manager::btree::LBANode* lba_root); + +template <> +void unlink_phy_tree_root_node(RootBlockRef &root_block) { + root_block->lba_root_node = nullptr; +} + } namespace crimson::os::seastore::lba_manager::btree { @@ -43,7 +95,8 @@ BtreeLBAManager::mkfs_ret BtreeLBAManager::mkfs( LOG_PREFIX(BtreeLBAManager::mkfs); INFOT("start", t); return cache.get_root(t).si_then([this, &t](auto croot) { - croot->get_root().lba_root = LBABtree::mkfs(get_context(t)); + assert(croot->is_mutation_pending()); + croot->get_root().lba_root = LBABtree::mkfs(croot, get_context(t)); return mkfs_iertr::now(); }).handle_error_interruptible( mkfs_iertr::pass_further{}, @@ -301,6 +354,8 @@ BtreeLBAManager::base_iertr::future<> _init_cached_extent( if (!iter.is_end() && iter.get_key() == logn->get_laddr() && iter.get_val().paddr == logn->get_paddr()) { + assert(!iter.get_leaf_node()->is_pending()); + iter.get_leaf_node()->link_child(logn.get(), iter.get_leaf_pos()); logn->set_pin(iter.get_pin()); ceph_assert(iter.get_val().len == e->get_length()); if (c.pins) { diff --git 
a/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.cc b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.cc index e3eed252ebe43..c502ef338a1f9 100644 --- a/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.cc +++ b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.cc @@ -35,7 +35,7 @@ std::ostream &LBALeafNode::print_detail(std::ostream &out) const if (parent_tracker) { return out << ", parent=" << (void*)parent_tracker->parent.get(); } - return out; + return out << ", root_block=" << (void*)root_block.get(); } void LBALeafNode::resolve_relative_addrs(paddr_t base) diff --git a/src/crimson/os/seastore/root_block.cc b/src/crimson/os/seastore/root_block.cc new file mode 100644 index 0000000000000..dc928e81b0fa2 --- /dev/null +++ b/src/crimson/os/seastore/root_block.cc @@ -0,0 +1,27 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "crimson/os/seastore/root_block.h" +#include "crimson/os/seastore/lba_manager/btree/lba_btree_node.h" +#include "crimson/os/seastore/backref/backref_tree_node.h" + +namespace crimson::os::seastore { + +void RootBlock::on_replace_prior(Transaction &t) { + if (!lba_root_node) { + auto &prior = static_cast(*get_prior_instance()); + lba_root_node = prior.lba_root_node; + if (lba_root_node) { + ((lba_manager::btree::LBANode*)lba_root_node)->root_block = this; + } + } + if (!backref_root_node) { + auto &prior = static_cast(*get_prior_instance()); + backref_root_node = prior.backref_root_node; + if (backref_root_node) { + ((backref::BackrefNode*)backref_root_node)->root_block = this; + } + } +} + +} // namespace crimson::os::seastore diff --git a/src/crimson/os/seastore/root_block.h b/src/crimson/os/seastore/root_block.h index 435860ff36bb1..bf3dfb5426568 100644 --- a/src/crimson/os/seastore/root_block.h +++ b/src/crimson/os/seastore/root_block.h @@ -38,9 +38,17 @@ struct RootBlock : CachedExtent { root_t root; + CachedExtent* lba_root_node = nullptr; + 
CachedExtent* backref_root_node = nullptr; + RootBlock() : CachedExtent(0) {} - RootBlock(const RootBlock &rhs) = default; + RootBlock(const RootBlock &rhs) + : CachedExtent(rhs), + root(rhs.root), + lba_root_node(nullptr), + backref_root_node(nullptr) + {} CachedExtentRef duplicate_for_write(Transaction&) final { return CachedExtentRef(new RootBlock(*this)); @@ -51,6 +59,8 @@ struct RootBlock : CachedExtent { return extent_types_t::ROOT; } + void on_replace_prior(Transaction &t) final; + /// dumps root as delta ceph::bufferlist get_delta() final { ceph::bufferlist bl; @@ -84,6 +94,11 @@ struct RootBlock : CachedExtent { root_t &get_root() { return root; } + std::ostream &print_detail(std::ostream &out) const final { + return out << ", root_block(lba_root_node=" << (void*)lba_root_node + << ", backref_root_node=" << (void*)backref_root_node + << ")"; + } }; using RootBlockRef = RootBlock::Ref; diff --git a/src/test/crimson/seastore/test_btree_lba_manager.cc b/src/test/crimson/seastore/test_btree_lba_manager.cc index ab9ef79745987..3853d53e7a4ff 100644 --- a/src/test/crimson/seastore/test_btree_lba_manager.cc +++ b/src/test/crimson/seastore/test_btree_lba_manager.cc @@ -198,19 +198,11 @@ struct lba_btree_test : btree_test_base { auto mut_croot = cache->duplicate_for_write( t, croot )->cast(); - mut_croot->root.lba_root = LBABtree::mkfs(get_op_context(t)); + mut_croot->root.lba_root = + LBABtree::mkfs(mut_croot, get_op_context(t)); }); } - void update_if_dirty(Transaction &t, LBABtree &btree, RootBlockRef croot) { - if (btree.is_root_dirty()) { - auto mut_croot = cache->duplicate_for_write( - t, croot - )->cast(); - mut_croot->root.lba_root = btree.get_root_undirty(); - } - } - template auto lba_btree_update(F &&f) { auto tref = cache->create_transaction( @@ -221,16 +213,13 @@ struct lba_btree_test : btree_test_base { [this, tref=std::move(tref), f=std::forward(f)](auto &t) mutable { return cache->get_root( t - ).si_then([this, f=std::move(f), &t](RootBlockRef croot) 
{ + ).si_then([f=std::move(f), &t](RootBlockRef croot) { return seastar::do_with( - LBABtree(croot->root.lba_root), - [this, croot, f=std::move(f), &t](auto &btree) mutable { + LBABtree(croot), + [f=std::move(f), &t](auto &btree) mutable { return std::invoke( std::move(f), btree, t - ).si_then([this, croot, &t, &btree] { - update_if_dirty(t, btree, croot); - return seastar::now(); - }); + ); }); }).si_then([this, tref=std::move(tref)]() mutable { return submit_transaction(std::move(tref)); @@ -249,7 +238,7 @@ struct lba_btree_test : btree_test_base { t ).si_then([f=std::move(f), &t](RootBlockRef croot) mutable { return seastar::do_with( - LBABtree(croot->root.lba_root), + LBABtree(croot), [f=std::move(f), &t](auto &btree) mutable { return std::invoke( std::move(f), btree, t From 55e1924e3818e7fb574893372ba7dfad4fa54014 Mon Sep 17 00:00:00 2001 From: Xuehan Xu Date: Thu, 27 Oct 2022 15:21:32 +0800 Subject: [PATCH 14/21] crimson/os/seastore/btree: "templatize" btree leaf node to distinguish leaf nodes with(out) children Signed-off-by: Xuehan Xu --- .../os/seastore/backref/backref_tree_node.h | 3 +- .../seastore/backref/btree_backref_manager.h | 2 +- .../os/seastore/btree/fixed_kv_btree.h | 15 ++- src/crimson/os/seastore/btree/fixed_kv_node.h | 5 +- src/crimson/os/seastore/cache.cc | 4 +- src/crimson/os/seastore/cache.h | 2 + src/crimson/os/seastore/cached_extent.h | 6 +- src/crimson/os/seastore/lba_manager.cc | 6 +- src/crimson/os/seastore/lba_manager.h | 1 + .../lba_manager/btree/btree_lba_manager.cc | 98 ++++++++++--------- .../lba_manager/btree/btree_lba_manager.h | 16 ++- .../lba_manager/btree/lba_btree_node.h | 3 +- src/crimson/os/seastore/seastore_types.h | 30 +++--- .../seastore/test_btree_lba_manager.cc | 2 +- 14 files changed, 115 insertions(+), 78 deletions(-) diff --git a/src/crimson/os/seastore/backref/backref_tree_node.h b/src/crimson/os/seastore/backref/backref_tree_node.h index bebbc0aa4b2f5..db9f1febff125 100644 --- 
a/src/crimson/os/seastore/backref/backref_tree_node.h +++ b/src/crimson/os/seastore/backref/backref_tree_node.h @@ -76,7 +76,8 @@ class BackrefLeafNode paddr_t, paddr_le_t, backref_map_val_t, backref_map_val_le_t, BACKREF_NODE_SIZE, - BackrefLeafNode> { + BackrefLeafNode, + false> { public: template BackrefLeafNode(T&&... t) : diff --git a/src/crimson/os/seastore/backref/btree_backref_manager.h b/src/crimson/os/seastore/backref/btree_backref_manager.h index 95a1c03113db8..0306d0e8bbe30 100644 --- a/src/crimson/os/seastore/backref/btree_backref_manager.h +++ b/src/crimson/os/seastore/backref/btree_backref_manager.h @@ -33,7 +33,7 @@ class BtreeBackrefPin : public BtreeNodePin { using BackrefBtree = FixedKVBtree< paddr_t, backref_map_val_t, BackrefInternalNode, - BackrefLeafNode, BtreeBackrefPin, BACKREF_BLOCK_SIZE>; + BackrefLeafNode, BtreeBackrefPin, BACKREF_BLOCK_SIZE, false>; class BtreeBackrefManager : public BackrefManager { public: diff --git a/src/crimson/os/seastore/btree/fixed_kv_btree.h b/src/crimson/os/seastore/btree/fixed_kv_btree.h index 0e51d00ed7678..6c3372819f324 100644 --- a/src/crimson/os/seastore/btree/fixed_kv_btree.h +++ b/src/crimson/os/seastore/btree/fixed_kv_btree.h @@ -57,7 +57,8 @@ template < typename internal_node_t, typename leaf_node_t, typename pin_t, - size_t node_size> + size_t node_size, + bool leaf_has_children> class FixedKVBtree { static constexpr size_t MAX_DEPTH = 16; using self_type = FixedKVBtree< @@ -66,7 +67,8 @@ class FixedKVBtree { internal_node_t, leaf_node_t, pin_t, - node_size>; + node_size, + leaf_has_children>; public: using InternalNodeRef = TCachedExtentRef; using LeafNodeRef = TCachedExtentRef; @@ -866,7 +868,8 @@ class FixedKVBtree { n_fixed_kv_extent->set_modify_time(fixed_kv_extent.get_modify_time()); n_fixed_kv_extent->pin.set_range(n_fixed_kv_extent->get_node_meta()); - if (fixed_kv_extent.get_type() == internal_node_t::TYPE) { + if (fixed_kv_extent.get_type() == internal_node_t::TYPE || + 
leaf_node_t::do_has_children) { if (!fixed_kv_extent.is_pending()) { n_fixed_kv_extent->copy_sources.emplace(&fixed_kv_extent); n_fixed_kv_extent->prior_instance = &fixed_kv_extent; @@ -2008,7 +2011,8 @@ template < typename internal_node_t, typename leaf_node_t, typename pin_t, - size_t node_size> + size_t node_size, + bool leaf_has_children> struct is_fixed_kv_tree< FixedKVBtree< node_key_t, @@ -2016,7 +2020,8 @@ struct is_fixed_kv_tree< internal_node_t, leaf_node_t, pin_t, - node_size>> : std::true_type {}; + node_size, + leaf_has_children>> : std::true_type {}; template < typename tree_type_t, diff --git a/src/crimson/os/seastore/btree/fixed_kv_node.h b/src/crimson/os/seastore/btree/fixed_kv_node.h index bee5b2fa512eb..202a270a33602 100644 --- a/src/crimson/os/seastore/btree/fixed_kv_node.h +++ b/src/crimson/os/seastore/btree/fixed_kv_node.h @@ -915,7 +915,8 @@ template < typename VAL, typename VAL_LE, size_t node_size, - typename node_type_t> + typename node_type_t, + bool has_children> struct FixedKVLeafNode : FixedKVNode, common::FixedKVNodeLayout< @@ -942,6 +943,8 @@ struct FixedKVLeafNode : FixedKVNode(rhs), node_layout_t(this->get_bptr().c_str()) {} + static constexpr bool do_has_children = has_children; + uint16_t get_node_split_pivot() final { return this->get_split_pivot().get_offset(); } diff --git a/src/crimson/os/seastore/cache.cc b/src/crimson/os/seastore/cache.cc index b189b96c23848..65f7f1d400f3d 100644 --- a/src/crimson/os/seastore/cache.cc +++ b/src/crimson/os/seastore/cache.cc @@ -146,6 +146,7 @@ void Cache::register_metrics() {extent_types_t::ROOT, sm::label_instance("ext", "ROOT")}, {extent_types_t::LADDR_INTERNAL, sm::label_instance("ext", "LADDR_INTERNAL")}, {extent_types_t::LADDR_LEAF, sm::label_instance("ext", "LADDR_LEAF")}, + {extent_types_t::DINK_LADDR_LEAF, sm::label_instance("ext", "DINK_LADDR_LEAF")}, {extent_types_t::OMAP_INNER, sm::label_instance("ext", "OMAP_INNER")}, {extent_types_t::OMAP_LEAF, sm::label_instance("ext", 
"OMAP_LEAF")}, {extent_types_t::ONODE_BLOCK_STAGED, sm::label_instance("ext", "ONODE_BLOCK_STAGED")}, @@ -969,7 +970,8 @@ CachedExtentRef Cache::alloc_new_extent_by_type( case extent_types_t::LADDR_INTERNAL: return alloc_new_extent(t, length, hint, gen); case extent_types_t::LADDR_LEAF: - return alloc_new_extent(t, length, hint, gen); + return alloc_new_extent( + t, length, hint, gen); case extent_types_t::ONODE_BLOCK_STAGED: return alloc_new_extent(t, length, hint, gen); case extent_types_t::OMAP_INNER: diff --git a/src/crimson/os/seastore/cache.h b/src/crimson/os/seastore/cache.h index ec3c27b160674..0e004761bab89 100644 --- a/src/crimson/os/seastore/cache.h +++ b/src/crimson/os/seastore/cache.h @@ -1119,6 +1119,8 @@ class Cache { switch (type) { case extent_types_t::LADDR_INTERNAL: [[fallthrough]]; + case extent_types_t::DINK_LADDR_LEAF: + [[fallthrough]]; case extent_types_t::LADDR_LEAF: stats.lba_tree_extents_num += delta; ceph_assert(stats.lba_tree_extents_num >= 0); diff --git a/src/crimson/os/seastore/cached_extent.h b/src/crimson/os/seastore/cached_extent.h index a2544d5e996b0..5ee08f9bbb609 100644 --- a/src/crimson/os/seastore/cached_extent.h +++ b/src/crimson/os/seastore/cached_extent.h @@ -30,7 +30,8 @@ template < typename internal_node_t, typename leaf_node_t, typename pin_t, - size_t node_size> + size_t node_size, + bool leaf_has_children> class FixedKVBtree; // #define DEBUG_CACHED_EXTENT_REF @@ -189,7 +190,8 @@ class CachedExtent typename internal_node_t, typename leaf_node_t, typename pin_t, - size_t node_size> + size_t node_size, + bool leaf_has_children> friend class FixedKVBtree; uint32_t last_committed_crc = 0; diff --git a/src/crimson/os/seastore/lba_manager.cc b/src/crimson/os/seastore/lba_manager.cc index 3e3251f062a52..b35e2d0ead8c3 100644 --- a/src/crimson/os/seastore/lba_manager.cc +++ b/src/crimson/os/seastore/lba_manager.cc @@ -22,8 +22,12 @@ LBAManager::update_mappings( }); } +template LBAManagerRef 
lba_manager::create_lba_manager(Cache &cache) { - return LBAManagerRef(new btree::BtreeLBAManager(cache)); + return LBAManagerRef(new btree::BtreeLBAManager(cache)); } +template LBAManagerRef lba_manager::create_lba_manager(Cache &cache); +template LBAManagerRef lba_manager::create_lba_manager(Cache &cache); + } diff --git a/src/crimson/os/seastore/lba_manager.h b/src/crimson/os/seastore/lba_manager.h index f512b73f677e1..f495eb0753483 100644 --- a/src/crimson/os/seastore/lba_manager.h +++ b/src/crimson/os/seastore/lba_manager.h @@ -206,6 +206,7 @@ using LBAManagerRef = std::unique_ptr; class Cache; namespace lba_manager { +template LBAManagerRef create_lba_manager(Cache &cache); } diff --git a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc index 2c159535f3874..64dd3103ce2bf 100644 --- a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc +++ b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc @@ -21,18 +21,27 @@ SET_SUBSYS(seastore_lba); namespace crimson::os::seastore { -template<> -Transaction::tree_stats_t& get_tree_stats< - crimson::os::seastore::lba_manager::btree::LBABtree>(Transaction &t) { +template +Transaction::tree_stats_t& get_tree_stats(Transaction &t) +{ return t.get_lba_tree_stats(); } -template<> -phy_tree_root_t& get_phy_tree_root< - crimson::os::seastore::lba_manager::btree::LBABtree>(root_t &r) { +template Transaction::tree_stats_t& +get_tree_stats< + crimson::os::seastore::lba_manager::btree::LBABtree>( + Transaction &t); + +template +phy_tree_root_t& get_phy_tree_root(root_t &r) +{ return r.lba_root; } +template phy_tree_root_t& +get_phy_tree_root< + crimson::os::seastore::lba_manager::btree::LBABtree>(root_t &r); + template <> const get_phy_tree_root_node_ret get_phy_tree_root_node< crimson::os::seastore::lba_manager::btree::LBABtree>( @@ -89,7 +98,8 @@ void unlink_phy_tree_root_node(RootBlockRef &root_block) { namespace 
crimson::os::seastore::lba_manager::btree { -BtreeLBAManager::mkfs_ret BtreeLBAManager::mkfs( +BtreeLBAManager::mkfs_ret +BtreeLBAManager::mkfs( Transaction &t) { LOG_PREFIX(BtreeLBAManager::mkfs); @@ -125,7 +135,7 @@ BtreeLBAManager::get_mappings( if (pos.is_end() || pos.get_key() >= (offset + length)) { TRACET("{}~{} done with {} results", c.trans, offset, length, ret.size()); - return LBABtree::iterate_repeat_ret_inner( + return typename LBABtree::iterate_repeat_ret_inner( interruptible::ready_future_marker{}, seastar::stop_iteration::yes); } @@ -133,14 +143,13 @@ BtreeLBAManager::get_mappings( c.trans, offset, length, pos.get_key(), pos.get_val()); ceph_assert((pos.get_key() + pos.get_val().len) > offset); ret.push_back(pos.get_pin()); - return LBABtree::iterate_repeat_ret_inner( + return typename LBABtree::iterate_repeat_ret_inner( interruptible::ready_future_marker{}, seastar::stop_iteration::no); }); }); } - BtreeLBAManager::get_mappings_ret BtreeLBAManager::get_mappings( Transaction &t, @@ -155,7 +164,7 @@ BtreeLBAManager::get_mappings( l->begin(), l->end(), [this, &t, &ret](const auto &p) { - return get_mappings(t, p.first, p.second).si_then( + return this->get_mappings(t, p.first, p.second).si_then( [&ret](auto res) { ret.splice(ret.end(), res, res.begin(), res.end()); return get_mappings_iertr::now(); @@ -205,8 +214,8 @@ BtreeLBAManager::alloc_extent( struct state_t { laddr_t last_end; - std::optional insert_iter; - std::optional ret; + std::optional insert_iter; + std::optional ret; state_t(laddr_t hint) : last_end(hint) {} }; @@ -232,7 +241,7 @@ BtreeLBAManager::alloc_extent( stats.num_alloc_extents_iter_nexts - lookup_attempts, state.last_end); state.insert_iter = pos; - return LBABtree::iterate_repeat_ret_inner( + return typename LBABtree::iterate_repeat_ret_inner( interruptible::ready_future_marker{}, seastar::stop_iteration::yes); } else if (pos.get_key() >= (state.last_end + len)) { @@ -243,7 +252,7 @@ BtreeLBAManager::alloc_extent( 
state.last_end, pos.get_val()); state.insert_iter = pos; - return LBABtree::iterate_repeat_ret_inner( + return typename LBABtree::iterate_repeat_ret_inner( interruptible::ready_future_marker{}, seastar::stop_iteration::yes); } else { @@ -252,7 +261,7 @@ BtreeLBAManager::alloc_extent( t, addr, len, hint, pos.get_key(), pos.get_val().len, pos.get_val()); - return LBABtree::iterate_repeat_ret_inner( + return typename LBABtree::iterate_repeat_ret_inner( interruptible::ready_future_marker{}, seastar::stop_iteration::no); } @@ -280,7 +289,8 @@ static bool is_lba_node(const CachedExtent &e) return is_lba_node(e.get_type()); } -btree_range_pin_t &BtreeLBAManager::get_pin(CachedExtent &e) +btree_range_pin_t &BtreeLBAManager::get_pin( + CachedExtent &e) { if (is_lba_node(e)) { return e.cast()->pin; @@ -338,7 +348,8 @@ void BtreeLBAManager::complete_transaction( } } -BtreeLBAManager::base_iertr::future<> _init_cached_extent( +BtreeLBAManager::base_iertr::template future<> +_init_cached_extent( op_context_t c, const CachedExtentRef &e, LBABtree &btree, @@ -377,7 +388,8 @@ BtreeLBAManager::base_iertr::future<> _init_cached_extent( } } -BtreeLBAManager::init_cached_extent_ret BtreeLBAManager::init_cached_extent( +BtreeLBAManager::init_cached_extent_ret +BtreeLBAManager::init_cached_extent( Transaction &t, CachedExtentRef e) { @@ -385,16 +397,19 @@ BtreeLBAManager::init_cached_extent_ret BtreeLBAManager::init_cached_extent( TRACET("{}", t, *e); return seastar::do_with(bool(), [this, e, &t](bool &ret) { auto c = get_context(t); - return with_btree(cache, c, [c, e, &ret](auto &btree) - -> base_iertr::future<> { - LOG_PREFIX(BtreeLBAManager::init_cached_extent); - DEBUGT("extent {}", c.trans, *e); - return _init_cached_extent(c, e, btree, ret); - }).si_then([&ret] { return ret; }); + return with_btree( + cache, c, + [c, e, &ret](auto &btree) -> base_iertr::future<> { + LOG_PREFIX(BtreeLBAManager::init_cached_extent); + DEBUGT("extent {}", c.trans, *e); + return 
_init_cached_extent(c, e, btree, ret); + } + ).si_then([&ret] { return ret; }); }); } -BtreeLBAManager::scan_mappings_ret BtreeLBAManager::scan_mappings( +BtreeLBAManager::scan_mappings_ret +BtreeLBAManager::scan_mappings( Transaction &t, laddr_t begin, laddr_t end, @@ -413,20 +428,21 @@ BtreeLBAManager::scan_mappings_ret BtreeLBAManager::scan_mappings( btree.upper_bound_right(c, begin), [f=std::move(f), begin, end](auto &pos) { if (pos.is_end() || pos.get_key() >= end) { - return LBABtree::iterate_repeat_ret_inner( + return typename LBABtree::iterate_repeat_ret_inner( interruptible::ready_future_marker{}, seastar::stop_iteration::yes); } ceph_assert((pos.get_key() + pos.get_val().len) > begin); f(pos.get_key(), pos.get_val().paddr, pos.get_val().len); - return LBABtree::iterate_repeat_ret_inner( + return typename LBABtree::iterate_repeat_ret_inner( interruptible::ready_future_marker{}, seastar::stop_iteration::no); }); }); } -BtreeLBAManager::rewrite_extent_ret BtreeLBAManager::rewrite_extent( +BtreeLBAManager::rewrite_extent_ret +BtreeLBAManager::rewrite_extent( Transaction &t, CachedExtentRef extent) { @@ -504,18 +520,13 @@ BtreeLBAManager::get_physical_extent_if_live( if (type == extent_types_t::LADDR_INTERNAL) { return btree.get_internal_if_live(c, addr, laddr, len); } else { - assert(type == extent_types_t::LADDR_LEAF); + assert(type == extent_types_t::LADDR_LEAF || + type == extent_types_t::DINK_LADDR_LEAF); return btree.get_leaf_if_live(c, addr, laddr, len); } }); } -BtreeLBAManager::BtreeLBAManager(Cache &cache) - : cache(cache) -{ - register_metrics(); -} - void BtreeLBAManager::register_metrics() { LOG_PREFIX(BtreeLBAManager::register_metrics); @@ -539,7 +550,8 @@ void BtreeLBAManager::register_metrics() ); } -BtreeLBAManager::update_refcount_ret BtreeLBAManager::update_refcount( +BtreeLBAManager::update_refcount_ret +BtreeLBAManager::update_refcount( Transaction &t, laddr_t addr, int delta) @@ -565,7 +577,8 @@ BtreeLBAManager::update_refcount_ret 
BtreeLBAManager::update_refcount( }); } -BtreeLBAManager::_update_mapping_ret BtreeLBAManager::_update_mapping( +BtreeLBAManager::_update_mapping_ret +BtreeLBAManager::_update_mapping( Transaction &t, laddr_t addr, update_func_t &&f) @@ -606,13 +619,4 @@ BtreeLBAManager::_update_mapping_ret BtreeLBAManager::_update_mapping( }); } -BtreeLBAManager::~BtreeLBAManager() -{ - pin_set.scan([](auto &i) { - LOG_PREFIX(BtreeLBAManager::~BtreeLBAManager); - ERROR("Found {}, has_ref={} -- {}", - i, i.has_ref(), i.get_extent()); - }); -} - } diff --git a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h index 3b0aa4aafd4ed..58dbe1e0581f3 100644 --- a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h +++ b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h @@ -42,7 +42,7 @@ class BtreeLBAPin : public BtreeNodePin { using LBABtree = FixedKVBtree< laddr_t, lba_map_val_t, LBAInternalNode, - LBALeafNode, BtreeLBAPin, LBA_BLOCK_SIZE>; + LBALeafNode, BtreeLBAPin, LBA_BLOCK_SIZE, true>; /** * BtreeLBAManager @@ -63,7 +63,11 @@ using LBABtree = FixedKVBtree< */ class BtreeLBAManager : public LBAManager { public: - BtreeLBAManager(Cache &cache); + BtreeLBAManager(Cache &cache) + : cache(cache) + { + register_metrics(); + } mkfs_ret mkfs( Transaction &t) final; @@ -144,7 +148,13 @@ class BtreeLBAManager : public LBAManager { bpin->set_parent(nullptr); } - ~BtreeLBAManager(); + ~BtreeLBAManager() { + pin_set.scan([](auto &i) { + LOG_PREFIX(BtreeLBAManager::~BtreeLBAManager); + SUBERROR(seastore_lba, "Found {}, has_ref={} -- {}", + i, i.has_ref(), i.get_extent()); + }); + } private: Cache &cache; diff --git a/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.h b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.h index 571e906fa8fed..ff61829cb2e94 100644 --- a/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.h +++ 
b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.h @@ -142,7 +142,8 @@ struct LBALeafNode laddr_t, laddr_le_t, lba_map_val_t, lba_map_val_le_t, LBA_BLOCK_SIZE, - LBALeafNode> { + LBALeafNode, + true> { using Ref = TCachedExtentRef; using internal_iterator_t = const_iterator; template diff --git a/src/crimson/os/seastore/seastore_types.h b/src/crimson/os/seastore/seastore_types.h index a1efc729b96fa..9b5e8801e3144 100644 --- a/src/crimson/os/seastore/seastore_types.h +++ b/src/crimson/os/seastore/seastore_types.h @@ -1062,23 +1062,24 @@ enum class extent_types_t : uint8_t { ROOT = 0, LADDR_INTERNAL = 1, LADDR_LEAF = 2, - OMAP_INNER = 3, - OMAP_LEAF = 4, - ONODE_BLOCK_STAGED = 5, - COLL_BLOCK = 6, - OBJECT_DATA_BLOCK = 7, - RETIRED_PLACEHOLDER = 8, + DINK_LADDR_LEAF = 3, + OMAP_INNER = 4, + OMAP_LEAF = 5, + ONODE_BLOCK_STAGED = 6, + COLL_BLOCK = 7, + OBJECT_DATA_BLOCK = 8, + RETIRED_PLACEHOLDER = 9, // the following two types are not extent types, // they are just used to indicates paddr allocation deltas - ALLOC_INFO = 9, - JOURNAL_TAIL = 10, + ALLOC_INFO = 10, + JOURNAL_TAIL = 11, // Test Block Types - TEST_BLOCK = 11, - TEST_BLOCK_PHYSICAL = 12, - BACKREF_INTERNAL = 13, - BACKREF_LEAF = 14, + TEST_BLOCK = 12, + TEST_BLOCK_PHYSICAL = 13, + BACKREF_INTERNAL = 14, + BACKREF_LEAF = 15, // None and the number of valid extent_types_t - NONE = 15, + NONE = 16, }; using extent_types_le_t = uint8_t; constexpr auto EXTENT_TYPES_MAX = static_cast(extent_types_t::NONE); @@ -1108,7 +1109,8 @@ constexpr bool is_retired_placeholder(extent_types_t type) constexpr bool is_lba_node(extent_types_t type) { return type == extent_types_t::LADDR_INTERNAL || - type == extent_types_t::LADDR_LEAF; + type == extent_types_t::LADDR_LEAF || + type == extent_types_t::DINK_LADDR_LEAF; } constexpr bool is_backref_node(extent_types_t type) diff --git a/src/test/crimson/seastore/test_btree_lba_manager.cc b/src/test/crimson/seastore/test_btree_lba_manager.cc index 
3853d53e7a4ff..64a847f30a729 100644 --- a/src/test/crimson/seastore/test_btree_lba_manager.cc +++ b/src/test/crimson/seastore/test_btree_lba_manager.cc @@ -324,7 +324,7 @@ TEST_F(lba_btree_test, basic) } struct btree_lba_manager_test : btree_test_base { - BtreeLBAManagerRef lba_manager; + BtreeLBAManagerRef lba_manager; btree_lba_manager_test() = default; From cce850d75609c7c34bce0920e4e12ba9b9513229 Mon Sep 17 00:00:00 2001 From: Xuehan Xu Date: Tue, 25 Oct 2022 14:03:43 +0800 Subject: [PATCH 15/21] crimson/os/seastore/lba_manager: link lba leaf nodes with logical extents by pointers Signed-off-by: Xuehan Xu --- src/crimson/os/seastore/CMakeLists.txt | 2 + .../os/seastore/backref/backref_tree_node.h | 6 +- .../seastore/backref/btree_backref_manager.cc | 3 +- .../seastore/backref/btree_backref_manager.h | 2 + .../os/seastore/btree/btree_range_pin.cc | 36 ++++ .../os/seastore/btree/btree_range_pin.h | 12 +- .../os/seastore/btree/fixed_kv_btree.h | 29 ++- .../os/seastore/btree/fixed_kv_node.cc | 12 ++ src/crimson/os/seastore/btree/fixed_kv_node.h | 203 ++++++++++++------ src/crimson/os/seastore/cached_extent.cc | 49 ++++- src/crimson/os/seastore/cached_extent.h | 91 +++++++- src/crimson/os/seastore/lba_manager.cc | 10 +- src/crimson/os/seastore/lba_manager.h | 7 +- .../lba_manager/btree/btree_lba_manager.cc | 30 ++- .../lba_manager/btree/btree_lba_manager.h | 11 +- .../lba_manager/btree/lba_btree_node.cc | 14 +- .../lba_manager/btree/lba_btree_node.h | 107 +++++++-- src/crimson/os/seastore/seastore_types.cc | 2 + src/crimson/os/seastore/seastore_types.h | 2 +- .../os/seastore/transaction_manager.cc | 3 +- src/crimson/os/seastore/transaction_manager.h | 10 +- .../seastore/test_btree_lba_manager.cc | 6 +- 22 files changed, 494 insertions(+), 153 deletions(-) create mode 100644 src/crimson/os/seastore/btree/btree_range_pin.cc create mode 100644 src/crimson/os/seastore/btree/fixed_kv_node.cc diff --git a/src/crimson/os/seastore/CMakeLists.txt 
b/src/crimson/os/seastore/CMakeLists.txt index baa2e7ca954eb..5b1c6187ca2a2 100644 --- a/src/crimson/os/seastore/CMakeLists.txt +++ b/src/crimson/os/seastore/CMakeLists.txt @@ -18,6 +18,8 @@ set(crimson_seastore_srcs omap_manager.cc omap_manager/btree/btree_omap_manager.cc omap_manager/btree/omap_btree_node_impl.cc + btree/btree_range_pin.cc + btree/fixed_kv_node.cc onode.cc onode_manager/staged-fltree/node.cc onode_manager/staged-fltree/node_extent_manager.cc diff --git a/src/crimson/os/seastore/backref/backref_tree_node.h b/src/crimson/os/seastore/backref/backref_tree_node.h index db9f1febff125..c3ff52520ce2a 100644 --- a/src/crimson/os/seastore/backref/backref_tree_node.h +++ b/src/crimson/os/seastore/backref/backref_tree_node.h @@ -92,7 +92,8 @@ class BackrefLeafNode const_iterator insert( const_iterator iter, paddr_t key, - backref_map_val_t val) final { + backref_map_val_t val, + LogicalCachedExtent*) final { journal_insert( iter, key, @@ -103,7 +104,8 @@ class BackrefLeafNode void update( const_iterator iter, - backref_map_val_t val) final { + backref_map_val_t val, + LogicalCachedExtent*) final { return journal_update( iter, val, diff --git a/src/crimson/os/seastore/backref/btree_backref_manager.cc b/src/crimson/os/seastore/backref/btree_backref_manager.cc index 7db8318cb9d78..0980cb2ed2b5a 100644 --- a/src/crimson/os/seastore/backref/btree_backref_manager.cc +++ b/src/crimson/os/seastore/backref/btree_backref_manager.cc @@ -242,7 +242,8 @@ BtreeBackrefManager::new_mapping( c, *state.insert_iter, state.last_end, - val + val, + nullptr ).si_then([&state, c, addr, len, key](auto &&p) { LOG_PREFIX(BtreeBackrefManager::new_mapping); auto [iter, inserted] = std::move(p); diff --git a/src/crimson/os/seastore/backref/btree_backref_manager.h b/src/crimson/os/seastore/backref/btree_backref_manager.h index 0306d0e8bbe30..1f3347c8cdd1f 100644 --- a/src/crimson/os/seastore/backref/btree_backref_manager.h +++ b/src/crimson/os/seastore/backref/btree_backref_manager.h @@ 
-17,10 +17,12 @@ class BtreeBackrefPin : public BtreeNodePin { BtreeBackrefPin() = default; BtreeBackrefPin( CachedExtentRef parent, + uint16_t pos, backref_map_val_t &val, backref_node_meta_t &&meta) : BtreeNodePin( parent, + pos, val.laddr, val.len, std::forward(meta)), diff --git a/src/crimson/os/seastore/btree/btree_range_pin.cc b/src/crimson/os/seastore/btree/btree_range_pin.cc new file mode 100644 index 0000000000000..9565a853b83c8 --- /dev/null +++ b/src/crimson/os/seastore/btree/btree_range_pin.cc @@ -0,0 +1,36 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "crimson/os/seastore/btree/btree_range_pin.h" +#include "crimson/os/seastore/btree/fixed_kv_node.h" + +namespace crimson::os::seastore { + +template +void BtreeNodePin::link_extent(LogicalCachedExtent *ref) { + assert(ref->is_valid()); + // it's only when reading logical extents from disk that we need to + // link them to lba leaves + if (!ref->is_pending() && !ref->is_exist_clean()) { + assert(parent); + assert(pos != std::numeric_limits::max()); + if (parent->is_initial_pending()) { + auto &p = ((FixedKVNode&)*parent).get_stable_for_key( + pin.range.begin); + p.link_child(ref, pos); + } else if (parent->is_mutation_pending()) { + auto &p = (FixedKVNode&)*parent->get_prior_instance(); + p.link_child(ref, pos); + } else { + assert(!parent->is_pending() && parent->is_valid()); + auto &p = (FixedKVNode&)*parent; + p.link_child(ref, pos); + } + pos = std::numeric_limits::max(); + } + pin.set_extent(ref); +} + +template void BtreeNodePin::link_extent(LogicalCachedExtent*); +template void BtreeNodePin::link_extent(LogicalCachedExtent*); +} // namespace crimson::os::seastore diff --git a/src/crimson/os/seastore/btree/btree_range_pin.h b/src/crimson/os/seastore/btree/btree_range_pin.h index 5942e85f3175f..c1d116fc9866c 100644 --- a/src/crimson/os/seastore/btree/btree_range_pin.h +++ b/src/crimson/os/seastore/btree/btree_range_pin.h @@ -453,6 
+453,7 @@ class BtreeNodePin : public PhysicalNodePin { val_t value; extent_len_t len; btree_range_pin_t pin; + uint16_t pos = std::numeric_limits::max(); public: using val_type = val_t; @@ -460,13 +461,18 @@ class BtreeNodePin : public PhysicalNodePin { BtreeNodePin( CachedExtentRef parent, + uint16_t pos, val_t &value, extent_len_t len, fixed_kv_node_meta_t &&meta) - : parent(parent), value(value), len(len) { + : parent(parent), value(value), len(len), pos(pos) { pin.set_range(std::move(meta)); } + CachedExtentRef get_parent() const final { + return parent; + } + btree_range_pin_t& get_range_pin() { return pin; } @@ -479,9 +485,7 @@ class BtreeNodePin : public PhysicalNodePin { parent = pin; } - void link_extent(LogicalCachedExtent *ref) final { - pin.set_extent(ref); - } + void link_extent(LogicalCachedExtent *ref) final; extent_len_t get_length() const final { ceph_assert(pin.range.end > pin.range.begin); diff --git a/src/crimson/os/seastore/btree/fixed_kv_btree.h b/src/crimson/os/seastore/btree/fixed_kv_btree.h index 6c3372819f324..9044d7d5936d1 100644 --- a/src/crimson/os/seastore/btree/fixed_kv_btree.h +++ b/src/crimson/os/seastore/btree/fixed_kv_btree.h @@ -15,12 +15,16 @@ #include "crimson/os/seastore/btree/btree_range_pin.h" #include "crimson/os/seastore/root_block.h" +#define RESERVATION_PTR reinterpret_cast(0x1) + namespace crimson::os::seastore::lba_manager::btree { struct lba_map_val_t; } namespace crimson::os::seastore { +bool is_valid_child_ptr(ChildableCachedExtent* child); + template phy_tree_root_t& get_phy_tree_root(root_t& r); @@ -223,6 +227,7 @@ class FixedKVBtree { auto key = get_key(); return std::make_unique( leaf.node, + leaf.pos, val, fixed_kv_node_meta_t{ key, key + val.len, 0 }); } @@ -545,7 +550,8 @@ class FixedKVBtree { op_context_t c, iterator iter, node_key_t laddr, - node_val_t val + node_val_t val, + LogicalCachedExtent* nextent ) { LOG_PREFIX(FixedKVBtree::insert); SUBTRACET( @@ -556,10 +562,10 @@ class FixedKVBtree { 
iter.is_end() ? min_max_t::max : iter.get_key()); return seastar::do_with( iter, - [this, c, laddr, val](auto &ret) { + [this, c, laddr, val, nextent](auto &ret) { return find_insertion( c, laddr, ret - ).si_then([this, c, laddr, val, &ret] { + ).si_then([this, c, laddr, val, &ret, nextent] { if (!ret.at_boundary() && ret.get_key() == laddr) { return insert_ret( interruptible::ready_future_marker{}, @@ -568,7 +574,7 @@ class FixedKVBtree { ++(get_tree_stats(c.trans).num_inserts); return handle_split( c, ret - ).si_then([c, laddr, val, &ret] { + ).si_then([c, laddr, val, &ret, nextent] { if (!ret.leaf.node->is_mutable()) { CachedExtentRef mut = c.cache.duplicate_for_write( c.trans, ret.leaf.node @@ -581,7 +587,7 @@ class FixedKVBtree { assert(iter == ret.leaf.node->end() || iter->get_key() > laddr); assert(laddr >= ret.leaf.node->get_meta().begin && laddr < ret.leaf.node->get_meta().end); - ret.leaf.node->insert(iter, laddr, val); + ret.leaf.node->insert(iter, laddr, val, nextent); return insert_ret( interruptible::ready_future_marker{}, std::make_pair(ret, true)); @@ -594,11 +600,12 @@ class FixedKVBtree { insert_ret insert( op_context_t c, node_key_t laddr, - node_val_t val) { + node_val_t val, + LogicalCachedExtent* nextent) { return lower_bound( c, laddr - ).si_then([this, c, laddr, val](auto iter) { - return this->insert(c, iter, laddr, val); + ).si_then([this, c, laddr, val, nextent](auto iter) { + return this->insert(c, iter, laddr, val, nextent); }); } @@ -617,7 +624,8 @@ class FixedKVBtree { update_ret update( op_context_t c, iterator iter, - node_val_t val) + node_val_t val, + LogicalCachedExtent* nextent) { LOG_PREFIX(FixedKVBtree::update); SUBTRACET( @@ -634,7 +642,8 @@ class FixedKVBtree { ++(get_tree_stats(c.trans).num_updates); iter.leaf.node->update( iter.leaf.node->iter_idx(iter.leaf.pos), - val); + val, + nextent); return update_ret( interruptible::ready_future_marker{}, iter); diff --git a/src/crimson/os/seastore/btree/fixed_kv_node.cc 
b/src/crimson/os/seastore/btree/fixed_kv_node.cc new file mode 100644 index 0000000000000..00aceab92b382 --- /dev/null +++ b/src/crimson/os/seastore/btree/fixed_kv_node.cc @@ -0,0 +1,12 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "crimson/os/seastore/btree/fixed_kv_node.h" + +namespace crimson::os::seastore { + +bool is_valid_child_ptr(ChildableCachedExtent* child) { + return child != nullptr && child != RESERVATION_PTR; +} + +} // namespace crimson::os::seastore diff --git a/src/crimson/os/seastore/btree/fixed_kv_node.h b/src/crimson/os/seastore/btree/fixed_kv_node.h index 202a270a33602..70135210af03c 100644 --- a/src/crimson/os/seastore/btree/fixed_kv_node.h +++ b/src/crimson/os/seastore/btree/fixed_kv_node.h @@ -28,25 +28,8 @@ namespace crimson::os::seastore { * Base class enabling recursive lookup between internal and leaf nodes. */ template -struct FixedKVNode : CachedExtent { +struct FixedKVNode : ChildableCachedExtent { using FixedKVNodeRef = TCachedExtentRef; - struct parent_tracker_t - : public boost::intrusive_ref_counter< - parent_tracker_t, boost::thread_unsafe_counter> { - parent_tracker_t(FixedKVNodeRef parent) - : parent(parent) {} - parent_tracker_t(FixedKVNode* parent) - : parent(parent) {} - FixedKVNodeRef parent = nullptr; - ~parent_tracker_t() { - // this is parent's tracker, reset it - if (parent->my_tracker == this) { - parent->my_tracker = nullptr; - } - } - }; - - using parent_tracker_ref = boost::intrusive_ptr; btree_range_pin_t pin; struct copy_source_cmp_t { @@ -98,20 +81,24 @@ struct FixedKVNode : CachedExtent { * its "prior_instance" if the node is the result of a rewrite), with which * the lba range of this node overlaps. 
*/ - std::vector children; + std::vector children; std::set copy_sources; uint16_t capacity = 0; parent_tracker_t* my_tracker = nullptr; - parent_tracker_ref parent_tracker; RootBlockRef root_block; + bool is_linked() { + assert(!has_parent_tracker() || !(bool)root_block); + return (bool)has_parent_tracker() || (bool)root_block; + } + FixedKVNode(uint16_t capacity, ceph::bufferptr &&ptr) - : CachedExtent(std::move(ptr)), + : ChildableCachedExtent(std::move(ptr)), pin(this), children(capacity, nullptr), capacity(capacity) {} FixedKVNode(const FixedKVNode &rhs) - : CachedExtent(rhs), + : ChildableCachedExtent(rhs), pin(rhs.pin, this), children(rhs.capacity, nullptr), capacity(rhs.capacity) {} @@ -128,6 +115,8 @@ struct FixedKVNode : CachedExtent { set_child_ptracker(child); } + virtual bool is_leaf_and_has_children() const = 0; + template void insert_child_ptr(iter_t iter, ChildableCachedExtent* child) { auto raw_children = children.data(); @@ -136,8 +125,18 @@ struct FixedKVNode : CachedExtent { &raw_children[offset + 1], &raw_children[offset], (get_node_size() - offset) * sizeof(ChildableCachedExtent*)); - children[offset] = child; - set_child_ptracker(child); + if (child) { + children[offset] = child; + set_child_ptracker(child); + } else { + // this can only happen when reserving lba spaces + ceph_assert(is_leaf_and_has_children()); + // this is to avoid mistakenly copying pointers from + // copy sources when committing this lba node, because + // we rely on pointers' "nullness" to avoid copying + // pointers for updated values + children[offset] = RESERVATION_PTR; + } } template @@ -227,7 +226,7 @@ struct FixedKVNode : CachedExtent { : stable_parent(stable_parent), pos(pos) {} }; - void link_child(FixedKVNode* child, uint16_t pos) { + void link_child(ChildableCachedExtent* child, uint16_t pos) { assert(pos < get_node_size()); assert(child); ceph_assert(!is_pending()); @@ -242,14 +241,14 @@ struct FixedKVNode : CachedExtent { auto pos = iter.get_offset(); 
assert(children.capacity()); auto child = children[pos]; - if (child) { + if (is_valid_child_ptr(child)) { return child_pos_t(child->get_transactional_view(t)); } else if (is_pending()) { auto key = iter.get_key(); auto &sparent = get_stable_for_key(key); auto spos = sparent.child_pos_for_key(key); auto child = sparent.children[spos]; - if (child) { + if (is_valid_child_ptr(child)) { return child_pos_t(child->get_transactional_view(t)); } else { return child_pos_t(&sparent, spos); @@ -357,10 +356,9 @@ struct FixedKVNode : CachedExtent { return; } ceph_assert(!root_block); - parent_tracker = prior.parent_tracker; - auto &parent = parent_tracker->parent; - assert(parent); - assert(parent->is_valid()); + take_prior_parent_tracker(); + assert(is_parent_valid()); + auto parent = get_parent_node(); //TODO: can this search be avoided? auto off = parent->lower_bound_offset(get_node_meta().begin); assert(parent->get_key_from_idx(off) == get_node_meta().begin); @@ -385,7 +383,7 @@ struct FixedKVNode : CachedExtent { assert(prior.my_tracker || prior.is_children_empty()); if (prior.my_tracker) { - prior.my_tracker->parent.reset(this); + prior.my_tracker->reset_parent(this); my_tracker = prior.my_tracker; // All my initial pending children is pointing to the original // tracker which has been dropped by the above line, so need @@ -401,8 +399,8 @@ struct FixedKVNode : CachedExtent { ceph_assert(end <= children.end()); for (auto it = begin; it != end; it++) { auto child = *it; - if (child) { - set_child_ptracker((FixedKVNode*)child); + if (is_valid_child_ptr(child)) { + set_child_ptracker(child); } } } @@ -485,7 +483,7 @@ struct FixedKVNode : CachedExtent { } void on_invalidated(Transaction &t) final { - parent_tracker.reset(); + reset_parent_tracker(); } bool is_rewrite() { @@ -495,17 +493,17 @@ struct FixedKVNode : CachedExtent { void on_initial_write() final { // All in-memory relative addrs are necessarily block-relative resolve_relative_addrs(get_paddr()); - ceph_assert( - 
parent_tracker - ? (parent_tracker->parent && parent_tracker->parent->is_valid()) - : true); + if (pin.is_root()) { + reset_parent_tracker(); + } + assert(has_parent_tracker() ? (is_parent_valid()) : true); } - void set_child_ptracker(FixedKVNode *child) { - if (!my_tracker) { - my_tracker = new parent_tracker_t(this); + void set_child_ptracker(ChildableCachedExtent *child) { + if (!this->my_tracker) { + this->my_tracker = new parent_tracker_t(this); } - child->parent_tracker.reset(my_tracker); + child->reset_parent_tracker(this->my_tracker); } void on_clean_read() final { @@ -564,6 +562,10 @@ struct FixedKVInternalNode : FixedKVNode(rhs), node_layout_t(this->get_bptr().c_str()) {} + bool is_leaf_and_has_children() const final { + return false; + } + uint16_t get_node_split_pivot() final { return this->get_split_pivot().get_offset(); } @@ -617,9 +619,8 @@ struct FixedKVInternalNode ceph_assert(this->root_block); unlink_phy_tree_root_node(this->root_block); } else { - ceph_assert(this->parent_tracker); - auto &parent = this->parent_tracker->parent; - ceph_assert(parent); + ceph_assert(this->is_parent_valid()); + auto parent = this->template get_parent_node>(); auto off = parent->lower_bound_offset(this->get_meta().begin); assert(parent->get_key_from_idx(off) == this->get_meta().begin); assert(parent->children[off] == this); @@ -853,17 +854,13 @@ struct FixedKVInternalNode } } - std::ostream &print_detail(std::ostream &out) const + std::ostream &_print_detail(std::ostream &out) const { out << ", size=" << this->get_size() << ", meta=" << this->get_meta() - << ", parent_tracker=" << (void*)this->parent_tracker.get(); - if (this->parent_tracker) { - out << ", parent=" << (void*)this->parent_tracker->parent.get(); - } - out << ", my_tracker=" << (void*)this->my_tracker; + << ", my_tracker=" << (void*)this->my_tracker; if (this->my_tracker) { - out << ", my_tracker->parent=" << (void*)this->my_tracker->parent.get(); + out << ", my_tracker->parent=" << 
(void*)this->my_tracker->get_parent().get(); } return out << ", root_block=" << (void*)this->root_block.get(); } @@ -936,8 +933,18 @@ struct FixedKVLeafNode VAL, VAL_LE>; using internal_const_iterator_t = typename node_layout_t::const_iterator; + using this_type_t = FixedKVLeafNode< + CAPACITY, + NODE_KEY, + NODE_KEY_LE, + VAL, + VAL_LE, + node_size, + node_type_t, + has_children>; + using base_t = FixedKVNode; FixedKVLeafNode(ceph::bufferptr &&ptr) - : FixedKVNode(0, std::move(ptr)), + : FixedKVNode(has_children ? CAPACITY : 0, std::move(ptr)), node_layout_t(this->get_bptr().c_str()) {} FixedKVLeafNode(const FixedKVLeafNode &rhs) : FixedKVNode(rhs), @@ -945,11 +952,15 @@ struct FixedKVLeafNode static constexpr bool do_has_children = has_children; + bool is_leaf_and_has_children() const final { + return has_children; + } + uint16_t get_node_split_pivot() final { return this->get_split_pivot().get_offset(); } - bool validate_stable_children() final { + bool validate_stable_children() override { return true; } @@ -959,9 +970,8 @@ struct FixedKVLeafNode ceph_assert(this->root_block); unlink_phy_tree_root_node(this->root_block); } else { - ceph_assert(this->parent_tracker); - auto &parent = this->parent_tracker->parent; - ceph_assert(parent); + ceph_assert(this->is_parent_valid()); + auto parent = this->template get_parent_node>(); auto off = parent->lower_bound_offset(this->get_meta().begin); assert(parent->get_key_from_idx(off) == this->get_meta().begin); assert(parent->children[off] == this); @@ -970,9 +980,49 @@ struct FixedKVLeafNode } } - void on_replace_prior(Transaction &t) final { - this->set_parent_tracker(); - assert(this->mutate_state.empty()); + void prepare_write() final { + if constexpr (has_children) { + if (this->is_initial_pending()) { + if (this->is_rewrite()) { + this->set_children_from_prior_instance(); + } + this->copy_children_from_stable_sources( + [this](base_t &node, uint16_t pos) { + ceph_assert(node.get_type() == this->get_type()); + auto &n 
= static_cast(node); + return n.iter_idx(pos); + } + ); + if (this->is_rewrite()) { + this->reset_prior_instance(); + } else { + this->adjust_ptracker_for_children(); + } + assert(this->validate_stable_children()); + this->copy_sources.clear(); + } + } + assert(this->is_initial_pending() + ? this->copy_sources.empty(): + true); + } + + void on_replace_prior(Transaction&) final { + ceph_assert(!this->is_rewrite()); + if constexpr (has_children) { + this->set_children_from_prior_instance(); + auto &prior = (this_type_t&)(*this->get_prior_instance()); + auto copied = this->copy_children_from_stable_source( + prior, + prior.begin(), + prior.end(), + this->begin()); + ceph_assert(copied <= get_node_size()); + assert(this->validate_stable_children()); + this->set_parent_tracker_from_prior_instance(); + } else { + this->set_parent_tracker_from_prior_instance(); + } } uint16_t lower_bound_offset(NODE_KEY key) const final { @@ -1011,11 +1061,13 @@ struct FixedKVLeafNode virtual void update( internal_const_iterator_t iter, - VAL val) = 0; + VAL val, + LogicalCachedExtent* nextent) = 0; virtual internal_const_iterator_t insert( internal_const_iterator_t iter, NODE_KEY addr, - VAL val) = 0; + VAL val, + LogicalCachedExtent* nextent) = 0; virtual void remove(internal_const_iterator_t iter) = 0; std::tuple @@ -1024,6 +1076,9 @@ struct FixedKVLeafNode c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION); auto right = c.cache.template alloc_new_extent( c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION); + if constexpr (has_children) { + this->split_child_ptrs(*left, *right); + } auto pivot = this->split_into(*left, *right); left->pin.set_range(left->get_meta()); right->pin.set_range(right->get_meta()); @@ -1038,6 +1093,9 @@ struct FixedKVLeafNode Ref &right) { auto replacement = c.cache.template alloc_new_extent( c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION); + if constexpr (has_children) { + replacement->merge_child_ptrs(*this, *right); + } 
replacement->merge_from(*this, *right->template cast()); replacement->pin.set_range(replacement->get_meta()); return replacement; @@ -1061,6 +1119,14 @@ struct FixedKVLeafNode prefer_left, *replacement_left, *replacement_right); + if constexpr (has_children) { + this->balance_child_ptrs( + *this, + right, + prefer_left, + *replacement_left, + *replacement_right); + } replacement_left->pin.set_range(replacement_left->get_meta()); replacement_right->pin.set_range(replacement_right->get_meta()); @@ -1090,15 +1156,10 @@ struct FixedKVLeafNode this->resolve_relative_addrs(base); } - std::ostream &print_detail(std::ostream &out) const + std::ostream &_print_detail(std::ostream &out) const { - out << ", size=" << this->get_size() - << ", meta=" << this->get_meta() - << ", parent_tracker=" << (void*)this->parent_tracker.get(); - if (this->parent_tracker) { - out << ", parent=" << (void*)this->parent_tracker->parent.get(); - } - return out; + return out << ", size=" << this->get_size() + << ", meta=" << this->get_meta(); } constexpr static size_t get_min_capacity() { diff --git a/src/crimson/os/seastore/cached_extent.cc b/src/crimson/os/seastore/cached_extent.cc index dfa4c6561684f..78ea5a465bfc7 100644 --- a/src/crimson/os/seastore/cached_extent.cc +++ b/src/crimson/os/seastore/cached_extent.cc @@ -6,6 +6,8 @@ #include "crimson/common/log.h" +#include "crimson/os/seastore/btree/fixed_kv_node.h" + namespace { [[maybe_unused]] seastar::logger& logger() { return crimson::get_logger(ceph_subsys_seastore_tm); @@ -91,7 +93,22 @@ CachedExtent* CachedExtent::get_transactional_view(transaction_id_t tid) { } } -std::ostream &LogicalCachedExtent::print_detail(std::ostream &out) const +std::ostream &operator<<(std::ostream &out, const parent_tracker_t &tracker) { + return out << "parent_tracker=" << (void*)&tracker + << ", parent=" << (void*)tracker.get_parent().get(); +} + +std::ostream &ChildableCachedExtent::print_detail(std::ostream &out) const { + if (parent_tracker) { + out << 
*parent_tracker; + } else { + out << ", parent_tracker=" << (void*)nullptr; + } + _print_detail(out); + return out; +} + +std::ostream &LogicalCachedExtent::_print_detail(std::ostream &out) const { out << ", laddr=" << laddr; if (pin) { @@ -110,6 +127,36 @@ void CachedExtent::set_invalid(Transaction &t) { on_invalidated(t); } +LogicalCachedExtent::~LogicalCachedExtent() { + if (has_parent_tracker() && is_valid() && !is_pending()) { + assert(get_parent_node()); + auto parent = get_parent_node>(); + auto off = parent->lower_bound_offset(laddr); + assert(parent->get_key_from_idx(off) == laddr); + assert(parent->children[off] == this); + parent->children[off] = nullptr; + } +} + +void LogicalCachedExtent::on_replace_prior(Transaction &t) { + assert(is_mutation_pending()); + take_prior_parent_tracker(); + assert(get_parent_node()); + auto parent = get_parent_node>(); + //TODO: can this search be avoided? + auto off = parent->lower_bound_offset(laddr); + assert(parent->get_key_from_idx(off) == laddr); + parent->children[off] = this; +} + +parent_tracker_t::~parent_tracker_t() { + // this is parent's tracker, reset it + auto &p = (FixedKVNode&)*parent; + if (p.my_tracker == this) { + p.my_tracker = nullptr; + } +} + std::ostream &operator<<(std::ostream &out, const LBAPin &rhs) { return out << "LBAPin(" << rhs.get_key() << "~" << rhs.get_length() diff --git a/src/crimson/os/seastore/cached_extent.h b/src/crimson/os/seastore/cached_extent.h index 5ee08f9bbb609..10161fe220194 100644 --- a/src/crimson/os/seastore/cached_extent.h +++ b/src/crimson/os/seastore/cached_extent.h @@ -33,6 +33,8 @@ template < size_t node_size, bool leaf_has_children> class FixedKVBtree; +template +class BtreeNodePin; // #define DEBUG_CACHED_EXTENT_REF #ifdef DEBUG_CACHED_EXTENT_REF @@ -543,6 +545,8 @@ class CachedExtent void set_invalid(Transaction &t); + // a rewrite extent has an invalid prior_instance, + // and a mutation_pending extent has a valid prior_instance CachedExtentRef 
get_prior_instance() { return prior_instance; } @@ -715,6 +719,8 @@ class CachedExtent friend class crimson::os::seastore::SegmentedAllocator; friend class crimson::os::seastore::TransactionManager; friend class crimson::os::seastore::ExtentPlacementManager; + template + friend class BtreeNodePin; }; std::ostream &operator<<(std::ostream &, CachedExtent::extent_state_t); @@ -885,6 +891,7 @@ class PhysicalNodePin { virtual key_t get_key() const = 0; virtual PhysicalNodePinRef duplicate() const = 0; virtual bool has_been_invalidated() const = 0; + virtual CachedExtentRef get_parent() const = 0; virtual ~PhysicalNodePin() {} }; @@ -957,6 +964,67 @@ class RetiredExtentPlaceholder : public CachedExtent { } }; +class parent_tracker_t + : public boost::intrusive_ref_counter< + parent_tracker_t, boost::thread_unsafe_counter> { +public: + parent_tracker_t(CachedExtentRef parent) + : parent(parent) {} + parent_tracker_t(CachedExtent* parent) + : parent(parent) {} + ~parent_tracker_t(); + template + TCachedExtentRef get_parent() const { + ceph_assert(parent); + if constexpr (std::is_same_v) { + return parent; + } else { + return parent->template cast(); + } + } + void reset_parent(CachedExtentRef p) { + parent = p; + } + bool is_valid() const { + return parent && parent->is_valid(); + } +private: + CachedExtentRef parent; +}; + +std::ostream &operator<<(std::ostream &, const parent_tracker_t &); + +using parent_tracker_ref = boost::intrusive_ptr; + +class ChildableCachedExtent : public CachedExtent { +public: + template + ChildableCachedExtent(T&&... t) : CachedExtent(std::forward(t)...) 
{} + bool has_parent_tracker() const { + return (bool)parent_tracker; + } + void reset_parent_tracker(parent_tracker_t *p = nullptr) { + parent_tracker.reset(p); + } + bool is_parent_valid() const { + return parent_tracker && parent_tracker->is_valid(); + } + template + TCachedExtentRef get_parent_node() const { + assert(parent_tracker); + return parent_tracker->template get_parent(); + } + void take_prior_parent_tracker() { + auto &prior = (ChildableCachedExtent&)(*get_prior_instance()); + parent_tracker = prior.parent_tracker; + } + std::ostream &print_detail(std::ostream &out) const final; +private: + parent_tracker_ref parent_tracker; + virtual std::ostream &_print_detail(std::ostream &out) const { + return out; + } +}; /** * LogicalCachedExtent * @@ -965,10 +1033,12 @@ class RetiredExtentPlaceholder : public CachedExtent { * Users of TransactionManager should be using extents derived from * LogicalCachedExtent. */ -class LogicalCachedExtent : public CachedExtent { +class LogicalCachedExtent : public ChildableCachedExtent { public: template - LogicalCachedExtent(T&&... t) : CachedExtent(std::forward(t)...) {} + LogicalCachedExtent(T&&... t) + : ChildableCachedExtent(std::forward(t)...) 
+ {} void set_pin(LBAPinRef &&npin) { assert(!pin); @@ -1005,8 +1075,13 @@ class LogicalCachedExtent : public CachedExtent { return true; } - std::ostream &print_detail(std::ostream &out) const final; + std::ostream &_print_detail(std::ostream &out) const final; + + void on_replace_prior(Transaction &t) final; + + virtual ~LogicalCachedExtent(); protected: + virtual void apply_delta(const ceph::bufferlist &bl) = 0; virtual std::ostream &print_detail_l(std::ostream &out) const { return out; @@ -1026,6 +1101,16 @@ class LogicalCachedExtent : public CachedExtent { private: laddr_t laddr = L_ADDR_NULL; LBAPinRef pin; + + template < + typename node_key_t, + typename node_val_t, + typename internal_node_t, + typename leaf_node_t, + typename pin_t, + size_t node_size, + bool leaf_has_children> + friend class FixedKVBtree; }; using LogicalCachedExtentRef = TCachedExtentRef; diff --git a/src/crimson/os/seastore/lba_manager.cc b/src/crimson/os/seastore/lba_manager.cc index b35e2d0ead8c3..d113bbd1e957c 100644 --- a/src/crimson/os/seastore/lba_manager.cc +++ b/src/crimson/os/seastore/lba_manager.cc @@ -17,17 +17,15 @@ LBAManager::update_mappings( t, extent->get_laddr(), extent->get_prior_paddr_and_reset(), - extent->get_paddr() + extent->get_paddr(), + nullptr // all the extents should have already been + // added to the fixed_kv_btree ); }); } -template LBAManagerRef lba_manager::create_lba_manager(Cache &cache) { - return LBAManagerRef(new btree::BtreeLBAManager(cache)); + return LBAManagerRef(new btree::BtreeLBAManager(cache)); } -template LBAManagerRef lba_manager::create_lba_manager(Cache &cache); -template LBAManagerRef lba_manager::create_lba_manager(Cache &cache); - } diff --git a/src/crimson/os/seastore/lba_manager.h b/src/crimson/os/seastore/lba_manager.h index f495eb0753483..af11cac7cc4e0 100644 --- a/src/crimson/os/seastore/lba_manager.h +++ b/src/crimson/os/seastore/lba_manager.h @@ -80,7 +80,8 @@ class LBAManager { Transaction &t, laddr_t hint, extent_len_t len, 
- paddr_t addr) = 0; + paddr_t addr, + LogicalCachedExtent *nextent) = 0; struct ref_update_result_t { unsigned refcount = 0; @@ -166,7 +167,8 @@ class LBAManager { Transaction& t, laddr_t laddr, paddr_t prev_addr, - paddr_t paddr) = 0; + paddr_t paddr, + LogicalCachedExtent *nextent) = 0; /** * update_mappings @@ -206,7 +208,6 @@ using LBAManagerRef = std::unique_ptr; class Cache; namespace lba_manager { -template LBAManagerRef create_lba_manager(Cache &cache); } diff --git a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc index 64dd3103ce2bf..df123d2ee0555 100644 --- a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc +++ b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc @@ -209,7 +209,8 @@ BtreeLBAManager::alloc_extent( Transaction &t, laddr_t hint, extent_len_t len, - paddr_t addr) + paddr_t addr, + LogicalCachedExtent* nextent) { struct state_t { laddr_t last_end; @@ -229,7 +230,8 @@ BtreeLBAManager::alloc_extent( cache, c, hint, - [this, FNAME, c, hint, len, addr, lookup_attempts, &t](auto &btree, auto &state) { + [this, FNAME, c, hint, len, addr, lookup_attempts, + &t, nextent](auto &btree, auto &state) { return LBABtree::iterate_repeat( c, btree.upper_bound_right(c, hint), @@ -265,12 +267,13 @@ BtreeLBAManager::alloc_extent( interruptible::ready_future_marker{}, seastar::stop_iteration::no); } - }).si_then([FNAME, c, addr, len, hint, &btree, &state] { + }).si_then([FNAME, c, addr, len, hint, &btree, &state, nextent] { return btree.insert( c, *state.insert_iter, state.last_end, - lba_map_val_t{len, addr, 1, 0} + lba_map_val_t{len, addr, 1, 0}, + nextent ).si_then([&state, FNAME, c, addr, len, hint](auto &&p) { auto [iter, inserted] = std::move(p); TRACET("{}~{}, hint={}, inserted at {}", @@ -473,7 +476,8 @@ BtreeLBAManager::update_mapping( Transaction& t, laddr_t laddr, paddr_t prev_addr, - paddr_t addr) + paddr_t addr, + LogicalCachedExtent *nextent) 
{ LOG_PREFIX(BtreeLBAManager::update_mapping); TRACET("laddr={}, paddr {} => {}", t, laddr, prev_addr, addr); @@ -487,7 +491,8 @@ BtreeLBAManager::update_mapping( ceph_assert(in.paddr == prev_addr); ret.paddr = addr; return ret; - } + }, + nextent ).si_then([&t, laddr, prev_addr, addr, FNAME](auto result) { DEBUGT("laddr={}, paddr {} => {} done -- {}", t, laddr, prev_addr, addr, result); @@ -566,7 +571,8 @@ BtreeLBAManager::update_refcount( ceph_assert((int)out.refcount + delta >= 0); out.refcount += delta; return out; - } + }, + nullptr ).si_then([&t, addr, delta, FNAME](auto result) { DEBUGT("laddr={}, delta={} done -- {}", t, addr, delta, result); return ref_update_result_t{ @@ -581,16 +587,17 @@ BtreeLBAManager::_update_mapping_ret BtreeLBAManager::_update_mapping( Transaction &t, laddr_t addr, - update_func_t &&f) + update_func_t &&f, + LogicalCachedExtent* nextent) { auto c = get_context(t); return with_btree_ret( cache, c, - [f=std::move(f), c, addr](auto &btree) mutable { + [f=std::move(f), c, addr, nextent](auto &btree) mutable { return btree.lower_bound( c, addr - ).si_then([&btree, f=std::move(f), c, addr](auto iter) + ).si_then([&btree, f=std::move(f), c, addr, nextent](auto iter) -> _update_mapping_ret { if (iter.is_end() || iter.get_key() != addr) { LOG_PREFIX(BtreeLBAManager::_update_mapping); @@ -610,7 +617,8 @@ BtreeLBAManager::_update_mapping( return btree.update( c, iter, - ret + ret, + nextent ).si_then([ret](auto) { return ret; }); diff --git a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h index 58dbe1e0581f3..1535ef9312958 100644 --- a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h +++ b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h @@ -30,10 +30,12 @@ class BtreeLBAPin : public BtreeNodePin { BtreeLBAPin() = default; BtreeLBAPin( CachedExtentRef parent, + uint16_t pos, lba_map_val_t &val, lba_node_meta_t &&meta) : BtreeNodePin( 
parent, + pos, val.paddr, val.len, std::forward(meta)) @@ -88,7 +90,8 @@ class BtreeLBAManager : public LBAManager { Transaction &t, laddr_t hint, extent_len_t len, - paddr_t addr) final; + paddr_t addr, + LogicalCachedExtent*) final; ref_ret decref_extent( Transaction &t, @@ -133,7 +136,8 @@ class BtreeLBAManager : public LBAManager { Transaction& t, laddr_t laddr, paddr_t prev_addr, - paddr_t paddr) final; + paddr_t paddr, + LogicalCachedExtent*) final; get_physical_extent_if_live_ret get_physical_extent_if_live( Transaction &t, @@ -198,7 +202,8 @@ class BtreeLBAManager : public LBAManager { _update_mapping_ret _update_mapping( Transaction &t, laddr_t addr, - update_func_t &&f); + update_func_t &&f, + LogicalCachedExtent*); }; using BtreeLBAManagerRef = std::unique_ptr; diff --git a/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.cc b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.cc index c502ef338a1f9..a33f75917c1f9 100644 --- a/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.cc +++ b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.cc @@ -27,15 +27,15 @@ std::ostream& operator<<(std::ostream& out, const lba_map_val_t& v) << ")"; } -std::ostream &LBALeafNode::print_detail(std::ostream &out) const +std::ostream &LBALeafNode::_print_detail(std::ostream &out) const { - out << ", size=" << get_size() - << ", meta=" << get_meta() - << ", parent_tracker=" << (void*)parent_tracker.get(); - if (parent_tracker) { - return out << ", parent=" << (void*)parent_tracker->parent.get(); + out << ", size=" << this->get_size() + << ", meta=" << this->get_meta() + << ", my_tracker=" << (void*)this->my_tracker; + if (this->my_tracker) { + out << ", my_tracker->parent=" << (void*)this->my_tracker->get_parent().get(); } - return out << ", root_block=" << (void*)root_block.get(); + return out << ", root_block=" << (void*)this->root_block.get(); } void LBALeafNode::resolve_relative_addrs(paddr_t base) diff --git 
a/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.h b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.h index ff61829cb2e94..62ceae6cc462a 100644 --- a/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.h +++ b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.h @@ -145,64 +145,125 @@ struct LBALeafNode LBALeafNode, true> { using Ref = TCachedExtentRef; - using internal_iterator_t = const_iterator; + using parent_type_t = FixedKVLeafNode< + LEAF_NODE_CAPACITY, + laddr_t, laddr_le_t, + lba_map_val_t, lba_map_val_le_t, + LBA_BLOCK_SIZE, + LBALeafNode, + true>; + using internal_const_iterator_t = + typename parent_type_t::node_layout_t::const_iterator; + using internal_iterator_t = + typename parent_type_t::node_layout_t::iterator; template LBALeafNode(T&&... t) : - FixedKVLeafNode(std::forward(t)...) {} + parent_type_t(std::forward(t)...) {} static constexpr extent_types_t TYPE = extent_types_t::LADDR_LEAF; + bool validate_stable_children() final { + LOG_PREFIX(LBALeafNode::validate_stable_children); + if (this->children.empty()) { + return false; + } + + for (auto i : *this) { + auto child = (LogicalCachedExtent*)this->children[i.get_offset()]; + if (is_valid_child_ptr(child) && child->get_laddr() != i.get_key()) { + SUBERROR(seastore_fixedkv_tree, + "stable child not valid: child {}, key {}", + *child, + i.get_key()); + ceph_abort(); + return false; + } + } + return true; + } + void update( - const_iterator iter, - lba_map_val_t val) final { - val.paddr = maybe_generate_relative(val.paddr); - return journal_update( + internal_const_iterator_t iter, + lba_map_val_t val, + LogicalCachedExtent* nextent) final { + LOG_PREFIX(LBALeafNode::update); + if (nextent) { + SUBTRACE(seastore_fixedkv_tree, "trans.{}, pos {}, {}", + this->pending_for_transaction, + iter.get_offset(), + *nextent); + // child-ptr may already be correct, see LBAManager::update_mappings() + this->update_child_ptr(iter, nextent); + } + val.paddr = 
this->maybe_generate_relative(val.paddr); + return this->journal_update( iter, val, - maybe_get_delta_buffer()); + this->maybe_get_delta_buffer()); } - const_iterator insert( - const_iterator iter, + internal_const_iterator_t insert( + internal_const_iterator_t iter, laddr_t addr, - lba_map_val_t val) final { - val.paddr = maybe_generate_relative(val.paddr); - journal_insert( + lba_map_val_t val, + LogicalCachedExtent* nextent) final { + LOG_PREFIX(LBALeafNode::insert); + SUBTRACE(seastore_fixedkv_tree, "trans.{}, pos {}, key {}, extent {}", + this->pending_for_transaction, + iter.get_offset(), + addr, + (void*)nextent); + this->insert_child_ptr(iter, nextent); + val.paddr = this->maybe_generate_relative(val.paddr); + this->journal_insert( iter, addr, val, - maybe_get_delta_buffer()); + this->maybe_get_delta_buffer()); return iter; } - void remove(const_iterator iter) final { - return journal_remove( + void remove(internal_const_iterator_t iter) final { + LOG_PREFIX(LBALeafNode::remove); + SUBTRACE(seastore_fixedkv_tree, "trans.{}, pos {}, key {}", + this->pending_for_transaction, + iter.get_offset(), + iter.get_key()); + assert(iter != this->end()); + this->remove_child_ptr(iter); + return this->journal_remove( iter, - maybe_get_delta_buffer()); + this->maybe_get_delta_buffer()); } // See LBAInternalNode, same concept void resolve_relative_addrs(paddr_t base); - void node_resolve_vals(iterator from, iterator to) const final { - if (is_initial_pending()) { + void node_resolve_vals( + internal_iterator_t from, + internal_iterator_t to) const final + { + if (this->is_initial_pending()) { for (auto i = from; i != to; ++i) { auto val = i->get_val(); if (val.paddr.is_relative()) { assert(val.paddr.is_block_relative()); - val.paddr = get_paddr().add_relative(val.paddr); + val.paddr = this->get_paddr().add_relative(val.paddr); i->set_val(val); } } } } - void node_unresolve_vals(iterator from, iterator to) const final { - if (is_initial_pending()) { + void 
node_unresolve_vals( + internal_iterator_t from, + internal_iterator_t to) const final + { + if (this->is_initial_pending()) { for (auto i = from; i != to; ++i) { auto val = i->get_val(); if (val.paddr.is_relative()) { auto val = i->get_val(); assert(val.paddr.is_record_relative()); - val.paddr = val.paddr.block_relative_to(get_paddr()); + val.paddr = val.paddr.block_relative_to(this->get_paddr()); i->set_val(val); } } @@ -213,7 +274,7 @@ struct LBALeafNode return TYPE; } - std::ostream &print_detail(std::ostream &out) const final; + std::ostream &_print_detail(std::ostream &out) const final; }; using LBALeafNodeRef = TCachedExtentRef; diff --git a/src/crimson/os/seastore/seastore_types.cc b/src/crimson/os/seastore/seastore_types.cc index a14201cbcf71c..9328a03094c56 100644 --- a/src/crimson/os/seastore/seastore_types.cc +++ b/src/crimson/os/seastore/seastore_types.cc @@ -219,6 +219,8 @@ std::ostream &operator<<(std::ostream &out, extent_types_t t) return out << "LADDR_INTERNAL"; case extent_types_t::LADDR_LEAF: return out << "LADDR_LEAF"; + case extent_types_t::DINK_LADDR_LEAF: + return out << "LADDR_LEAF"; case extent_types_t::ONODE_BLOCK_STAGED: return out << "ONODE_BLOCK_STAGED"; case extent_types_t::OMAP_INNER: diff --git a/src/crimson/os/seastore/seastore_types.h b/src/crimson/os/seastore/seastore_types.h index 9b5e8801e3144..61ddfe5633718 100644 --- a/src/crimson/os/seastore/seastore_types.h +++ b/src/crimson/os/seastore/seastore_types.h @@ -1062,7 +1062,7 @@ enum class extent_types_t : uint8_t { ROOT = 0, LADDR_INTERNAL = 1, LADDR_LEAF = 2, - DINK_LADDR_LEAF = 3, + DINK_LADDR_LEAF = 3, // should only be used for unittests OMAP_INNER = 4, OMAP_LEAF = 5, ONODE_BLOCK_STAGED = 6, diff --git a/src/crimson/os/seastore/transaction_manager.cc b/src/crimson/os/seastore/transaction_manager.cc index 0f083340ce27d..bde4df6ba3848 100644 --- a/src/crimson/os/seastore/transaction_manager.cc +++ b/src/crimson/os/seastore/transaction_manager.cc @@ -486,7 +486,8 @@
TransactionManager::rewrite_logical_extent( t, lextent->get_laddr(), lextent->get_paddr(), - nlextent->get_paddr()); + nlextent->get_paddr(), + nlextent.get()); } TransactionManager::rewrite_extent_ret TransactionManager::rewrite_extent( diff --git a/src/crimson/os/seastore/transaction_manager.h b/src/crimson/os/seastore/transaction_manager.h index e00290d88e2a2..aa4127db46be6 100644 --- a/src/crimson/os/seastore/transaction_manager.h +++ b/src/crimson/os/seastore/transaction_manager.h @@ -142,6 +142,7 @@ class TransactionManager : public ExtentCallbackInterface { assert(!extent.has_pin()); assert(!extent.has_been_invalidated()); assert(!pin->has_been_invalidated()); + assert(pin->get_parent()); extent.set_pin(std::move(pin)); lba_manager->add_pin(extent.get_pin()); } @@ -325,7 +326,8 @@ class TransactionManager : public ExtentCallbackInterface { t, laddr_hint, len, - ext->get_paddr() + ext->get_paddr(), + ext.get() ).si_then([ext=std::move(ext), laddr_hint, &t, FNAME](auto &&ref) mutable { ext->set_pin(std::move(ref)); SUBDEBUGT(seastore_tm, "new extent: {}, laddr_hint: {}", t, *ext, laddr_hint); @@ -380,7 +382,8 @@ class TransactionManager : public ExtentCallbackInterface { t, laddr_hint, length, - existing_paddr + existing_paddr, + ext.get() ).si_then([ext=std::move(ext), laddr_hint, this](auto &&ref) { ceph_assert(laddr_hint == ref->get_key()); ext->set_pin(std::move(ref)); @@ -409,7 +412,8 @@ class TransactionManager : public ExtentCallbackInterface { t, hint, len, - P_ADDR_ZERO); + P_ADDR_ZERO, + nullptr); } /* alloc_extents diff --git a/src/test/crimson/seastore/test_btree_lba_manager.cc b/src/test/crimson/seastore/test_btree_lba_manager.cc index 64a847f30a729..8ca18fe3b9502 100644 --- a/src/test/crimson/seastore/test_btree_lba_manager.cc +++ b/src/test/crimson/seastore/test_btree_lba_manager.cc @@ -257,7 +257,7 @@ struct lba_btree_test : btree_test_base { check.emplace(addr, get_map_val(len)); lba_btree_update([=, this](auto &btree, auto &t) { return 
btree.insert( - get_op_context(t), addr, get_map_val(len) + get_op_context(t), addr, get_map_val(len), nullptr ).si_then([](auto){}); }); } @@ -324,7 +324,7 @@ TEST_F(lba_btree_test, basic) } struct btree_lba_manager_test : btree_test_base { - BtreeLBAManagerRef lba_manager; + BtreeLBAManagerRef lba_manager; btree_lba_manager_test() = default; @@ -426,7 +426,7 @@ struct btree_lba_manager_test : btree_test_base { auto ret = with_trans_intr( *t.t, [=, this](auto &t) { - return lba_manager->alloc_extent(t, hint, len, paddr); + return lba_manager->alloc_extent(t, hint, len, paddr, nullptr); }).unsafe_get0(); logger().debug("alloc'd: {}", *ret); EXPECT_EQ(len, ret->get_length()); From 89c2d0b3af7feb4124b7a5ac7f54026c205663b5 Mon Sep 17 00:00:00 2001 From: Xuehan Xu Date: Sat, 6 May 2023 17:26:18 +0800 Subject: [PATCH 16/21] crimson/os/seastore/transaction_manager: follow leaf<->logical extent pointers to read extent Signed-off-by: Xuehan Xu --- .../seastore/backref/btree_backref_manager.cc | 8 +- .../seastore/backref/btree_backref_manager.h | 5 +- .../os/seastore/btree/btree_range_pin.cc | 38 ++-- .../os/seastore/btree/btree_range_pin.h | 36 +++- .../os/seastore/btree/fixed_kv_btree.h | 126 +++++------- src/crimson/os/seastore/btree/fixed_kv_node.h | 52 +++-- src/crimson/os/seastore/cache.h | 1 + src/crimson/os/seastore/cached_extent.cc | 4 + src/crimson/os/seastore/cached_extent.h | 55 +++++ .../lba_manager/btree/btree_lba_manager.cc | 10 +- .../lba_manager/btree/btree_lba_manager.h | 5 +- .../os/seastore/object_data_handler.cc | 30 +-- .../os/seastore/transaction_manager.cc | 2 +- src/crimson/os/seastore/transaction_manager.h | 194 +++++++++++------- 14 files changed, 341 insertions(+), 225 deletions(-) diff --git a/src/crimson/os/seastore/backref/btree_backref_manager.cc b/src/crimson/os/seastore/backref/btree_backref_manager.cc index 0980cb2ed2b5a..28bf85567ebec 100644 --- a/src/crimson/os/seastore/backref/btree_backref_manager.cc +++ 
b/src/crimson/os/seastore/backref/btree_backref_manager.cc @@ -120,7 +120,7 @@ BtreeBackrefManager::get_mapping( } else { TRACET("{} got {}, {}", c.trans, offset, iter.get_key(), iter.get_val()); - auto e = iter.get_pin(); + auto e = iter.get_pin(c); return get_mapping_ret( interruptible::ready_future_marker{}, std::move(e)); @@ -157,7 +157,7 @@ BtreeBackrefManager::get_mappings( TRACET("{}~{} got {}, {}, repeat ...", c.trans, offset, end, pos.get_key(), pos.get_val()); ceph_assert((pos.get_key().add_offset(pos.get_val().len)) > offset); - ret.push_back(pos.get_pin()); + ret.push_back(pos.get_pin(c)); return BackrefBtree::iterate_repeat_ret_inner( interruptible::ready_future_marker{}, seastar::stop_iteration::no); @@ -253,8 +253,8 @@ BtreeBackrefManager::new_mapping( state.ret = iter; }); }); - }).si_then([](auto &&state) { - return state.ret->get_pin(); + }).si_then([c](auto &&state) { + return state.ret->get_pin(c); }); } diff --git a/src/crimson/os/seastore/backref/btree_backref_manager.h b/src/crimson/os/seastore/backref/btree_backref_manager.h index 1f3347c8cdd1f..9a067f8988514 100644 --- a/src/crimson/os/seastore/backref/btree_backref_manager.h +++ b/src/crimson/os/seastore/backref/btree_backref_manager.h @@ -14,13 +14,16 @@ constexpr size_t BACKREF_BLOCK_SIZE = 4096; class BtreeBackrefPin : public BtreeNodePin { extent_types_t type; public: - BtreeBackrefPin() = default; + BtreeBackrefPin(op_context_t ctx) + : BtreeNodePin(ctx) {} BtreeBackrefPin( + op_context_t ctx, CachedExtentRef parent, uint16_t pos, backref_map_val_t &val, backref_node_meta_t &&meta) : BtreeNodePin( + ctx, parent, pos, val.laddr, diff --git a/src/crimson/os/seastore/btree/btree_range_pin.cc b/src/crimson/os/seastore/btree/btree_range_pin.cc index 9565a853b83c8..adb84ed06950a 100644 --- a/src/crimson/os/seastore/btree/btree_range_pin.cc +++ b/src/crimson/os/seastore/btree/btree_range_pin.cc @@ -7,30 +7,22 @@ namespace crimson::os::seastore { template -void 
BtreeNodePin::link_extent(LogicalCachedExtent *ref) { - assert(ref->is_valid()); - // it's only when reading logical extents from disk that we need to - // link them to lba leaves - if (!ref->is_pending() && !ref->is_exist_clean()) { - assert(parent); - assert(pos != std::numeric_limits::max()); - if (parent->is_initial_pending()) { - auto &p = ((FixedKVNode&)*parent).get_stable_for_key( - pin.range.begin); - p.link_child(ref, pos); - } else if (parent->is_mutation_pending()) { - auto &p = (FixedKVNode&)*parent->get_prior_instance(); - p.link_child(ref, pos); - } else { - assert(!parent->is_pending() && parent->is_valid()); - auto &p = (FixedKVNode&)*parent; - p.link_child(ref, pos); - } - pos = std::numeric_limits::max(); +get_child_ret_t +BtreeNodePin::get_logical_extent( + Transaction &t) +{ + assert(parent); + assert(parent->is_valid()); + assert(pos != std::numeric_limits::max()); + auto &p = (FixedKVNode&)*parent; + auto v = p.get_logical_child(ctx, pos); + if (!v.has_child()) { + this->child_pos = v.get_child_pos(); } - pin.set_extent(ref); + return v; } -template void BtreeNodePin::link_extent(LogicalCachedExtent*); -template void BtreeNodePin::link_extent(LogicalCachedExtent*); +template class BtreeNodePin; +template class BtreeNodePin; + } // namespace crimson::os::seastore diff --git a/src/crimson/os/seastore/btree/btree_range_pin.h b/src/crimson/os/seastore/btree/btree_range_pin.h index c1d116fc9866c..29dfa476c6e12 100644 --- a/src/crimson/os/seastore/btree/btree_range_pin.h +++ b/src/crimson/os/seastore/btree/btree_range_pin.h @@ -7,11 +7,19 @@ #include "crimson/common/log.h" +#include "crimson/os/seastore/cache.h" #include "crimson/os/seastore/cached_extent.h" #include "crimson/os/seastore/seastore_types.h" namespace crimson::os::seastore { +template +struct op_context_t { + Cache &cache; + Transaction &trans; + btree_pin_set_t *pins = nullptr; +}; + constexpr uint16_t MAX_FIXEDKVBTREE_DEPTH = 8; template @@ -442,6 +450,7 @@ class btree_pin_set_t { 
template class BtreeNodePin : public PhysicalNodePin { + op_context_t ctx; /** * parent * @@ -457,16 +466,25 @@ class BtreeNodePin : public PhysicalNodePin { public: using val_type = val_t; - BtreeNodePin() = default; + BtreeNodePin(op_context_t ctx) : ctx(ctx) {} BtreeNodePin( + op_context_t ctx, CachedExtentRef parent, uint16_t pos, val_t &value, extent_len_t len, fixed_kv_node_meta_t &&meta) - : parent(parent), value(value), len(len), pos(pos) { + : ctx(ctx), + parent(parent), + value(value), + len(len), + pos(pos) + { pin.set_range(std::move(meta)); + if (!parent->is_pending()) { + this->child_pos = {parent, pos}; + } } CachedExtentRef get_parent() const final { @@ -485,7 +503,14 @@ class BtreeNodePin : public PhysicalNodePin { parent = pin; } - void link_extent(LogicalCachedExtent *ref) final; + void link_extent(LogicalCachedExtent *ref) final { + pin.set_extent(ref); + pos = std::numeric_limits::max(); + } + + uint16_t get_pos() const final { + return pos; + } extent_len_t get_length() const final { ceph_assert(pin.range.end > pin.range.begin); @@ -507,11 +532,12 @@ class BtreeNodePin : public PhysicalNodePin { PhysicalNodePinRef duplicate() const final { auto ret = std::unique_ptr>( - new BtreeNodePin); + new BtreeNodePin(ctx)); ret->pin.set_range(pin.range); ret->value = value; ret->parent = parent; ret->len = len; + ret->pos = pos; return ret; } @@ -522,6 +548,8 @@ class BtreeNodePin : public PhysicalNodePin { bool has_been_invalidated() const final { return parent->has_been_invalidated(); } + + get_child_ret_t get_logical_extent(Transaction&) final; }; } diff --git a/src/crimson/os/seastore/btree/fixed_kv_btree.h b/src/crimson/os/seastore/btree/fixed_kv_btree.h index 9044d7d5936d1..8b0256bd1f46e 100644 --- a/src/crimson/os/seastore/btree/fixed_kv_btree.h +++ b/src/crimson/os/seastore/btree/fixed_kv_btree.h @@ -28,13 +28,6 @@ bool is_valid_child_ptr(ChildableCachedExtent* child); template phy_tree_root_t& get_phy_tree_root(root_t& r); -template -struct 
op_context_t { - Cache &cache; - Transaction &trans; - btree_pin_set_t *pins = nullptr; -}; - using get_phy_tree_root_node_ret = std::pair void unlink_phy_tree_root_node(RootBlockRef &root_block); - template Transaction::tree_stats_t& get_tree_stats(Transaction &t); @@ -221,11 +213,13 @@ class FixedKVBtree { return leaf.pos == 0; } - PhysicalNodePinRef get_pin() const { + PhysicalNodePinRef + get_pin(op_context_t ctx) const { assert(!is_end()); auto val = get_val(); auto key = get_key(); return std::make_unique( + ctx, leaf.node, leaf.pos, val, @@ -236,6 +230,9 @@ class FixedKVBtree { return leaf.node; } + uint16_t get_leaf_pos() { + return leaf.pos; + } private: iterator() noexcept {} iterator(depth_t depth) noexcept : internal(depth - 1) {} @@ -1311,7 +1308,6 @@ class FixedKVBtree { F &f, mapped_space_visitor_t *visitor ) { - LOG_PREFIX(FixedKVBtree::lookup_internal_level); assert(depth > 1); auto &parent_entry = iter.get_internal(depth + 1); auto parent = parent_entry.node; @@ -1333,25 +1329,18 @@ class FixedKVBtree { return seastar::now(); }; - auto child_pos = parent->get_child(c.trans, node_iter); - auto &child = child_pos.child; - if (child) { - SUBTRACET(seastore_fixedkv_tree, - "got child on {}, pos: {}, res: {}", - c.trans, - *parent_entry.node, - parent_entry.pos, - *child); - - ceph_assert(child->is_valid()); - if (!child->is_pending_in_trans(c.trans.get_trans_id())) { - c.trans.add_to_read_set(child); - if (!child->is_mutation_pending()) { - c.cache.touch_extent(*child); - } - } - return child->wait_io().then( - [child, on_found=std::move(on_found), node_iter]() mutable { + auto v = parent->template get_child(c, node_iter); + if (v.has_child()) { + return v.get_child_fut().then( + [on_found=std::move(on_found), node_iter, c, + parent_entry](auto child) mutable { + LOG_PREFIX(FixedKVBtree::lookup_internal_level); + SUBTRACET(seastore_fixedkv_tree, + "got child on {}, pos: {}, res: {}", + c.trans, + *parent_entry.node, + parent_entry.pos, + *child); auto 
&cnode = (typename internal_node_t::base_t &)*child; assert(cnode.get_node_meta().begin == node_iter.get_key()); assert(cnode.get_node_meta().end > node_iter.get_key()); @@ -1359,6 +1348,7 @@ class FixedKVBtree { }); } + auto child_pos = v.get_child_pos(); auto next_iter = node_iter + 1; auto begin = node_iter->get_key(); auto end = next_iter == parent->end() @@ -1371,8 +1361,8 @@ class FixedKVBtree { begin, end, std::make_optional>( - child_pos.stable_parent->template cast(), - child_pos.pos) + child_pos.template get_parent(), + child_pos.get_pos()) ).si_then([on_found=std::move(on_found)](InternalNodeRef node) { return on_found(node); }); @@ -1387,7 +1377,6 @@ class FixedKVBtree { F &f, mapped_space_visitor_t *visitor ) { - LOG_PREFIX(FixedKVBtree::lookup_leaf); auto &parent_entry = iter.get_internal(2); auto parent = parent_entry.node; assert(parent); @@ -1407,25 +1396,18 @@ class FixedKVBtree { return seastar::now(); }; - auto child_pos = parent->get_child(c.trans, node_iter); - auto &child = child_pos.child; - if (child) { - SUBTRACET(seastore_fixedkv_tree, - "got child on {}, pos: {}, res: {}", - c.trans, - *parent_entry.node, - parent_entry.pos, - *child); - - ceph_assert(child->is_valid()); - if (!child->is_pending_in_trans(c.trans.get_trans_id())) { - c.trans.add_to_read_set(child); - if (!child->is_mutation_pending()) { - c.cache.touch_extent(*child); - } - } - return child->wait_io().then( - [child, on_found=std::move(on_found), node_iter]() mutable { + auto v = parent->template get_child(c, node_iter); + if (v.has_child()) { + return v.get_child_fut().then( + [on_found=std::move(on_found), node_iter, c, + parent_entry](auto child) mutable { + LOG_PREFIX(FixedKVBtree::lookup_leaf); + SUBTRACET(seastore_fixedkv_tree, + "got child on {}, pos: {}, res: {}", + c.trans, + *parent_entry.node, + parent_entry.pos, + *child); auto &cnode = (typename internal_node_t::base_t &)*child; assert(cnode.get_node_meta().begin == node_iter.get_key()); 
assert(cnode.get_node_meta().end > node_iter.get_key()); @@ -1433,6 +1415,7 @@ class FixedKVBtree { }); } + auto child_pos = v.get_child_pos(); auto next_iter = node_iter + 1; auto begin = node_iter->get_key(); auto end = next_iter == parent->end() @@ -1445,8 +1428,8 @@ class FixedKVBtree { begin, end, std::make_optional>( - child_pos.stable_parent->template cast(), - child_pos.pos) + child_pos.template get_parent(), + child_pos.get_pos()) ).si_then([on_found=std::move(on_found)](LeafNodeRef node) { return on_found(node); }); @@ -1966,26 +1949,18 @@ class FixedKVBtree { return seastar::now(); }; - auto child_pos = parent_pos.node->get_child(c.trans, donor_iter); - auto &child = child_pos.child; - if (child) { - SUBTRACET(seastore_fixedkv_tree, - "got child on {}, pos: {}, res: {}", - c.trans, - *parent_pos.node, - donor_iter.get_offset(), - *child); - - ceph_assert(child->is_valid()); - if (!child->is_pending_in_trans(c.trans.get_trans_id())) { - c.trans.add_to_read_set(child); - if (!child->is_mutation_pending()) { - c.cache.touch_extent(*child); - } - } - return child->wait_io().then( - [child, do_merge=std::move(do_merge), &pos, - donor_iter, donor_is_left]() mutable { + auto v = parent_pos.node->template get_child(c, donor_iter); + if (v.has_child()) { + return v.get_child_fut().then( + [do_merge=std::move(do_merge), &pos, + donor_iter, donor_is_left, c, parent_pos](auto child) mutable { + LOG_PREFIX(FixedKVBtree::merge_level); + SUBTRACET(seastore_fixedkv_tree, + "got child on {}, pos: {}, res: {}", + c.trans, + *parent_pos.node, + donor_iter.get_offset(), + *child); auto &node = (typename internal_node_t::base_t&)*child; assert(donor_is_left ? 
node.get_node_meta().end == pos.node->get_node_meta().begin : @@ -1996,6 +1971,7 @@ class FixedKVBtree { }); } + auto child_pos = v.get_child_pos(); return get_node( c, depth, @@ -2003,8 +1979,8 @@ class FixedKVBtree { begin, end, std::make_optional>( - child_pos.stable_parent->template cast(), - child_pos.pos) + child_pos.template get_parent(), + child_pos.get_pos()) ).si_then([do_merge=std::move(do_merge)](typename NodeType::Ref donor) { return do_merge(donor); }); diff --git a/src/crimson/os/seastore/btree/fixed_kv_node.h b/src/crimson/os/seastore/btree/fixed_kv_node.h index 70135210af03c..3997be0b904dc 100644 --- a/src/crimson/os/seastore/btree/fixed_kv_node.h +++ b/src/crimson/os/seastore/btree/fixed_kv_node.h @@ -217,15 +217,6 @@ struct FixedKVNode : ChildableCachedExtent { } } - struct child_pos_t { - FixedKVNodeRef stable_parent; - uint16_t pos = std::numeric_limits::max(); - CachedExtentRef child; - child_pos_t(CachedExtentRef child) : child(child) {} - child_pos_t(FixedKVNodeRef stable_parent, uint16_t pos) - : stable_parent(stable_parent), pos(pos) {} - }; - void link_child(ChildableCachedExtent* child, uint16_t pos) { assert(pos < get_node_size()); assert(child); @@ -236,20 +227,25 @@ struct FixedKVNode : ChildableCachedExtent { set_child_ptracker(child); } - template - child_pos_t get_child(Transaction &t, iter_t iter) { + virtual get_child_ret_t + get_logical_child(op_context_t c, uint16_t pos) = 0; + + template + get_child_ret_t get_child(op_context_t c, iter_t iter) { auto pos = iter.get_offset(); assert(children.capacity()); auto child = children[pos]; if (is_valid_child_ptr(child)) { - return child_pos_t(child->get_transactional_view(t)); + ceph_assert(child->get_type() == T::TYPE); + return c.cache.template get_extent_viewable_by_trans(c.trans, (T*)child); } else if (is_pending()) { auto key = iter.get_key(); auto &sparent = get_stable_for_key(key); auto spos = sparent.child_pos_for_key(key); auto child = sparent.children[spos]; if 
(is_valid_child_ptr(child)) { - return child_pos_t(child->get_transactional_view(t)); + ceph_assert(child->get_type() == T::TYPE); + return c.cache.template get_extent_viewable_by_trans(c.trans, (T*)child); } else { return child_pos_t(&sparent, spos); } @@ -592,6 +588,12 @@ struct FixedKVInternalNode } } + get_child_ret_t + get_logical_child(op_context_t, uint16_t pos) final { + ceph_abort("impossible"); + return get_child_ret_t(child_pos_t(nullptr, 0)); + } + bool validate_stable_children() final { LOG_PREFIX(FixedKVInternalNode::validate_stable_children); if (this->children.empty()) { @@ -960,6 +962,30 @@ struct FixedKVLeafNode return this->get_split_pivot().get_offset(); } + get_child_ret_t + get_logical_child(op_context_t c, uint16_t pos) final { + auto child = this->children[pos]; + if (is_valid_child_ptr(child)) { + ceph_assert(child->is_logical()); + return c.cache.template get_extent_viewable_by_trans< + LogicalCachedExtent>(c.trans, (LogicalCachedExtent*)child); + } else if (this->is_pending()) { + auto key = this->iter_idx(pos).get_key(); + auto &sparent = this->get_stable_for_key(key); + auto spos = sparent.child_pos_for_key(key); + auto child = sparent.children[spos]; + if (is_valid_child_ptr(child)) { + ceph_assert(child->is_logical()); + return c.cache.template get_extent_viewable_by_trans< + LogicalCachedExtent>(c.trans, (LogicalCachedExtent*)child); + } else { + return child_pos_t(&sparent, spos); + } + } else { + return child_pos_t(this, pos); + } + } + bool validate_stable_children() override { return true; } diff --git a/src/crimson/os/seastore/cache.h b/src/crimson/os/seastore/cache.h index 0e004761bab89..b13875f9c467f 100644 --- a/src/crimson/os/seastore/cache.h +++ b/src/crimson/os/seastore/cache.h @@ -350,6 +350,7 @@ class Cache { [](T &){}, [](T &) {}); } + /** * get_extent_if_cached * diff --git a/src/crimson/os/seastore/cached_extent.cc b/src/crimson/os/seastore/cached_extent.cc index 78ea5a465bfc7..93fc701bb0555 100644 --- 
a/src/crimson/os/seastore/cached_extent.cc +++ b/src/crimson/os/seastore/cached_extent.cc @@ -119,6 +119,10 @@ std::ostream &LogicalCachedExtent::_print_detail(std::ostream &out) const return print_detail_l(out); } +void child_pos_t::link_child(ChildableCachedExtent *c) { + get_parent>()->link_child(c, pos); +} + void CachedExtent::set_invalid(Transaction &t) { state = extent_state_t::INVALID; if (trans_view_hook.is_linked()) { diff --git a/src/crimson/os/seastore/cached_extent.h b/src/crimson/os/seastore/cached_extent.h index 10161fe220194..12b189fea549c 100644 --- a/src/crimson/os/seastore/cached_extent.h +++ b/src/crimson/os/seastore/cached_extent.h @@ -13,6 +13,7 @@ #include "include/buffer.h" #include "crimson/common/errorator.h" +#include "crimson/common/interruptible_future.h" #include "crimson/os/seastore/seastore_types.h" namespace crimson::os::seastore { @@ -872,8 +873,51 @@ class ExtentIndex { uint64_t bytes = 0; }; +class ChildableCachedExtent; class LogicalCachedExtent; +class child_pos_t { +public: + child_pos_t(CachedExtentRef stable_parent, uint16_t pos) + : stable_parent(stable_parent), pos(pos) {} + + template + TCachedExtentRef get_parent() { + ceph_assert(stable_parent); + return stable_parent->template cast(); + } + uint16_t get_pos() { + return pos; + } + void link_child(ChildableCachedExtent *c); +private: + CachedExtentRef stable_parent; + uint16_t pos = std::numeric_limits::max(); +}; + +template +struct get_child_ret_t { + std::variant>> ret; + get_child_ret_t(child_pos_t pos) + : ret(std::move(pos)) {} + get_child_ret_t(seastar::future> child) + : ret(std::move(child)) {} + + bool has_child() const { + return ret.index() == 1; + } + + child_pos_t &get_child_pos() { + ceph_assert(ret.index() == 0); + return std::get<0>(ret); + } + + seastar::future> &get_child_fut() { + ceph_assert(ret.index() == 1); + return std::get<1>(ret); + } +}; + template class PhysicalNodePin; @@ -892,8 +936,19 @@ class PhysicalNodePin { virtual PhysicalNodePinRef 
duplicate() const = 0; virtual bool has_been_invalidated() const = 0; virtual CachedExtentRef get_parent() const = 0; + virtual uint16_t get_pos() const = 0; + + virtual get_child_ret_t + get_logical_extent(Transaction &t) = 0; + + void link_child(ChildableCachedExtent *c) { + ceph_assert(child_pos); + child_pos->link_child(c); + } virtual ~PhysicalNodePin() {} +protected: + std::optional child_pos = std::nullopt; }; using LBAPin = PhysicalNodePin; diff --git a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc index df123d2ee0555..0e0e069b4c396 100644 --- a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc +++ b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc @@ -142,7 +142,7 @@ BtreeLBAManager::get_mappings( TRACET("{}~{} got {}, {}, repeat ...", c.trans, offset, length, pos.get_key(), pos.get_val()); ceph_assert((pos.get_key() + pos.get_val().len) > offset); - ret.push_back(pos.get_pin()); + ret.push_back(pos.get_pin(c)); return typename LBABtree::iterate_repeat_ret_inner( interruptible::ready_future_marker{}, seastar::stop_iteration::no); @@ -195,7 +195,7 @@ BtreeLBAManager::get_mapping( } else { TRACET("{} got {}, {}", c.trans, offset, iter.get_key(), iter.get_val()); - auto e = iter.get_pin(); + auto e = iter.get_pin(c); return get_mapping_ret( interruptible::ready_future_marker{}, std::move(e)); @@ -282,8 +282,8 @@ BtreeLBAManager::alloc_extent( state.ret = iter; }); }); - }).si_then([](auto &&state) { - return state.ret->get_pin(); + }).si_then([c](auto &&state) { + return state.ret->get_pin(c); }); } @@ -370,7 +370,7 @@ _init_cached_extent( iter.get_val().paddr == logn->get_paddr()) { assert(!iter.get_leaf_node()->is_pending()); iter.get_leaf_node()->link_child(logn.get(), iter.get_leaf_pos()); - logn->set_pin(iter.get_pin()); + logn->set_pin(iter.get_pin(c)); ceph_assert(iter.get_val().len == e->get_length()); if (c.pins) { c.pins->add_pin( diff --git 
a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h index 1535ef9312958..884af688da660 100644 --- a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h +++ b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h @@ -27,13 +27,16 @@ namespace crimson::os::seastore::lba_manager::btree { class BtreeLBAPin : public BtreeNodePin { public: - BtreeLBAPin() = default; + BtreeLBAPin(op_context_t ctx) + : BtreeNodePin(ctx) {} BtreeLBAPin( + op_context_t c, CachedExtentRef parent, uint16_t pos, lba_map_val_t &val, lba_node_meta_t &&meta) : BtreeNodePin( + c, parent, pos, val.paddr, diff --git a/src/crimson/os/seastore/object_data_handler.cc b/src/crimson/os/seastore/object_data_handler.cc index d25feba5b57fd..fc9cd33af98ef 100644 --- a/src/crimson/os/seastore/object_data_handler.cc +++ b/src/crimson/os/seastore/object_data_handler.cc @@ -22,18 +22,6 @@ namespace crimson::os::seastore { using context_t = ObjectDataHandler::context_t; using get_iertr = ObjectDataHandler::write_iertr; -auto read_pin( - context_t ctx, - LBAPinRef pin) { - return ctx.tm.pin_to_extent( - ctx.t, - std::move(pin) - ).handle_error_interruptible( - get_iertr::pass_further{}, - crimson::ct_error::assert_all{ "read_pin: invalid error" } - ); -} - /** * extent_to_write_t * @@ -518,7 +506,8 @@ operate_ret operate_left(context_t ctx, LBAPinRef &pin, const overwrite_plan_t & std::nullopt, std::nullopt); } else { - return read_pin(ctx, pin->duplicate() + return ctx.tm.read_pin( + ctx.t, pin->duplicate() ).si_then([prepend_len](auto left_extent) { return get_iertr::make_ready_future( std::nullopt, @@ -545,7 +534,8 @@ operate_ret operate_left(context_t ctx, LBAPinRef &pin, const overwrite_plan_t & left_to_write_extent, std::nullopt); } else { - return read_pin(ctx, pin->duplicate() + return ctx.tm.read_pin( + ctx.t, pin->duplicate() ).si_then([prepend_offset=extent_len, prepend_len, 
left_to_write_extent=std::move(left_to_write_extent)] (auto left_extent) mutable { @@ -598,7 +588,8 @@ operate_ret operate_right(context_t ctx, LBAPinRef &pin, const overwrite_plan_t std::nullopt); } else { auto append_offset = overwrite_plan.data_end - right_pin_begin; - return read_pin(ctx, pin->duplicate() + return ctx.tm.read_pin( + ctx.t, pin->duplicate() ).si_then([append_offset, append_len](auto right_extent) { return get_iertr::make_ready_future( std::nullopt, @@ -626,7 +617,8 @@ operate_ret operate_right(context_t ctx, LBAPinRef &pin, const overwrite_plan_t std::nullopt); } else { auto append_offset = overwrite_plan.data_end - right_pin_begin; - return read_pin(ctx, pin->duplicate() + return ctx.tm.read_pin( + ctx.t, pin->duplicate() ).si_then([append_offset, append_len, right_to_write_extent=std::move(right_to_write_extent)] (auto right_extent) mutable { @@ -731,8 +723,8 @@ ObjectDataHandler::clear_ret ObjectDataHandler::trim_data_reservation( } else { /* First pin overlaps the boundary and has data, read in extent * and rewrite portion prior to size */ - return read_pin( - ctx, + return ctx.tm.read_pin( + ctx.t, pin.duplicate() ).si_then([ctx, size, pin_offset, &pin, &object_data, &to_write]( auto extent) { @@ -1069,7 +1061,7 @@ ObjectDataHandler::read_ret ObjectDataHandler::read( current = end; return seastar::now(); } else { - return ctx.tm.pin_to_extent( + return ctx.tm.read_pin( ctx.t, std::move(pin) ).si_then([&ret, &current, end](auto extent) { diff --git a/src/crimson/os/seastore/transaction_manager.cc b/src/crimson/os/seastore/transaction_manager.cc index bde4df6ba3848..d63af2d57d4c9 100644 --- a/src/crimson/os/seastore/transaction_manager.cc +++ b/src/crimson/os/seastore/transaction_manager.cc @@ -593,7 +593,7 @@ TransactionManager::get_extents_if_live( // Only extent split can happen during the lookup ceph_assert(pin_seg_paddr >= paddr && pin_seg_paddr.add_offset(pin_len) <= paddr.add_offset(len)); - return pin_to_extent_by_type(t, std::move(pin), 
type + return read_pin_by_type(t, std::move(pin), type ).si_then([&list](auto ret) { list.emplace_back(std::move(ret)); return seastar::now(); diff --git a/src/crimson/os/seastore/transaction_manager.h b/src/crimson/os/seastore/transaction_manager.h index aa4127db46be6..e5f71352724e7 100644 --- a/src/crimson/os/seastore/transaction_manager.h +++ b/src/crimson/os/seastore/transaction_manager.h @@ -116,81 +116,6 @@ class TransactionManager : public ExtentCallbackInterface { t, offset, length); } - /** - * pin_to_extent - * - * Get extent mapped at pin. - */ - using pin_to_extent_iertr = base_iertr; - template - using pin_to_extent_ret = pin_to_extent_iertr::future< - TCachedExtentRef>; - template - pin_to_extent_ret pin_to_extent( - Transaction &t, - LBAPinRef pin) { - LOG_PREFIX(TransactionManager::pin_to_extent); - SUBTRACET(seastore_tm, "getting extent {}", t, *pin); - static_assert(is_logical_type(T::TYPE)); - using ret = pin_to_extent_ret; - auto &pref = *pin; - return cache->get_absent_extent( - t, - pref.get_val(), - pref.get_length(), - [this, pin=std::move(pin)](T &extent) mutable { - assert(!extent.has_pin()); - assert(!extent.has_been_invalidated()); - assert(!pin->has_been_invalidated()); - assert(pin->get_parent()); - extent.set_pin(std::move(pin)); - lba_manager->add_pin(extent.get_pin()); - } - ).si_then([FNAME, &t](auto ref) mutable -> ret { - SUBTRACET(seastore_tm, "got extent -- {}", t, *ref); - return pin_to_extent_ret( - interruptible::ready_future_marker{}, - std::move(ref)); - }); - } - - /** - * pin_to_extent_by_type - * - * Get extent mapped at pin. 
- */ - using pin_to_extent_by_type_ret = pin_to_extent_iertr::future< - LogicalCachedExtentRef>; - pin_to_extent_by_type_ret pin_to_extent_by_type( - Transaction &t, - LBAPinRef pin, - extent_types_t type) { - LOG_PREFIX(TransactionManager::pin_to_extent_by_type); - SUBTRACET(seastore_tm, "getting extent {} type {}", t, *pin, type); - assert(is_logical_type(type)); - auto &pref = *pin; - return cache->get_absent_extent_by_type( - t, - type, - pref.get_val(), - pref.get_key(), - pref.get_length(), - [this, pin=std::move(pin)](CachedExtent &extent) mutable { - auto &lextent = static_cast(extent); - assert(!lextent.has_pin()); - assert(!lextent.has_been_invalidated()); - assert(!pin->has_been_invalidated()); - lextent.set_pin(std::move(pin)); - lba_manager->add_pin(lextent.get_pin()); - } - ).si_then([FNAME, &t](auto ref) { - SUBTRACET(seastore_tm, "got extent -- {}", t, *ref); - return pin_to_extent_by_type_ret( - interruptible::ready_future_marker{}, - std::move(ref->template cast())); - }); - } - /** * read_extent * @@ -209,14 +134,15 @@ class TransactionManager : public ExtentCallbackInterface { SUBTRACET(seastore_tm, "{}~{}", t, offset, length); return get_pin( t, offset - ).si_then([this, FNAME, &t, offset, length] (auto pin) { + ).si_then([this, FNAME, &t, offset, length] (auto pin) + -> read_extent_ret { if (length != pin->get_length() || !pin->get_val().is_real()) { SUBERRORT(seastore_tm, "offset {} len {} got wrong pin {}", t, offset, length, *pin); ceph_assert(0 == "Should be impossible"); } - return this->pin_to_extent(t, std::move(pin)); + return this->read_pin(t, std::move(pin)); }); } @@ -233,17 +159,46 @@ class TransactionManager : public ExtentCallbackInterface { SUBTRACET(seastore_tm, "{}", t, offset); return get_pin( t, offset - ).si_then([this, FNAME, &t, offset] (auto pin) { + ).si_then([this, FNAME, &t, offset] (auto pin) + -> read_extent_ret { if (!pin->get_val().is_real()) { SUBERRORT(seastore_tm, "offset {} got wrong pin {}", t, offset, *pin); 
ceph_assert(0 == "Should be impossible"); } - return this->pin_to_extent(t, std::move(pin)); + return this->read_pin(t, std::move(pin)); }); } + template + base_iertr::future> read_pin( + Transaction &t, + LBAMappingRef pin) + { + auto v = pin->get_logical_extent(t); + if (v.has_child()) { + return v.get_child_fut().then([](auto extent) { + return extent->template cast(); + }); + } else { + return pin_to_extent(t, std::move(pin)); + } + } + + base_iertr::future read_pin_by_type( + Transaction &t, + LBAMappingRef pin, + extent_types_t type) + { + auto v = pin->get_logical_extent(t); + if (v.has_child()) { + return std::move(v.get_child_fut()); + } else { + return pin_to_extent_by_type(t, std::move(pin), type); + } + } + /// Obtain mutable copy of extent LogicalCachedExtentRef get_mutable_extent(Transaction &t, LogicalCachedExtentRef ref) { LOG_PREFIX(TransactionManager::get_mutable_extent); @@ -648,6 +603,87 @@ class TransactionManager : public ExtentCallbackInterface { ExtentPlacementManager::dispatch_result_t dispatch_result, std::optional seq_to_trim = std::nullopt); + /** + * pin_to_extent + * + * Get extent mapped at pin. 
+ */ + using pin_to_extent_iertr = base_iertr; + template + using pin_to_extent_ret = pin_to_extent_iertr::future< + TCachedExtentRef>; + template + pin_to_extent_ret pin_to_extent( + Transaction &t, + LBAMappingRef pin) { + LOG_PREFIX(TransactionManager::pin_to_extent); + SUBTRACET(seastore_tm, "getting extent {}", t, *pin); + static_assert(is_logical_type(T::TYPE)); + using ret = pin_to_extent_ret; + auto &pref = *pin; + return cache->get_absent_extent( + t, + pref.get_val(), + pref.get_length(), + [pin=std::move(pin)] + (T &extent) mutable { + assert(!extent.has_laddr()); + assert(!extent.has_been_invalidated()); + assert(!pin->has_been_invalidated()); + assert(pin->get_parent()); + pin->link_child(&extent); + extent.set_laddr(pin->get_key()); + } + ).si_then([FNAME, &t](auto ref) mutable -> ret { + SUBTRACET(seastore_tm, "got extent -- {}", t, *ref); + return pin_to_extent_ret( + interruptible::ready_future_marker{}, + std::move(ref)); + }); + } + + /** + * pin_to_extent_by_type + * + * Get extent mapped at pin. 
+ */ + using pin_to_extent_by_type_ret = pin_to_extent_iertr::future< + LogicalCachedExtentRef>; + pin_to_extent_by_type_ret pin_to_extent_by_type( + Transaction &t, + LBAMappingRef pin, + extent_types_t type) + { + LOG_PREFIX(TransactionManager::pin_to_extent_by_type); + SUBTRACET(seastore_tm, "getting extent {} type {}", t, *pin, type); + assert(is_logical_type(type)); + auto &pref = *pin; + return cache->get_absent_extent_by_type( + t, + type, + pref.get_val(), + pref.get_key(), + pref.get_length(), + [pin=std::move(pin)](CachedExtent &extent) mutable { + auto &lextent = static_cast(extent); + assert(!lextent.has_laddr()); + assert(!lextent.has_been_invalidated()); + assert(!pin->has_been_invalidated()); + assert(pin->get_parent()); + assert(!pin->get_parent()->is_pending()); + pin->link_child(&lextent); + lextent.set_pin(std::move(pin)); + lba_manager->add_pin(lextent.get_pin()); + } + ).si_then([FNAME, &t](auto ref) { + SUBTRACET(seastore_tm, "got extent -- {}", t, *ref); + return pin_to_extent_by_type_ret( + interruptible::ready_future_marker{}, + std::move(ref->template cast())); + }); + } + + public: // Testing interfaces auto get_epm() { From 4a3dfc0f630d6e635bd82801e0107be78d3d2c6d Mon Sep 17 00:00:00 2001 From: Xuehan Xu Date: Mon, 27 Mar 2023 02:20:59 +0000 Subject: [PATCH 17/21] crimson/os/seastore/btree: drop btree_pin_set_t Signed-off-by: Xuehan Xu --- .../seastore/backref/btree_backref_manager.cc | 42 +- .../seastore/backref/btree_backref_manager.h | 31 +- src/crimson/os/seastore/backref_manager.h | 15 +- .../os/seastore/btree/btree_range_pin.cc | 7 +- .../os/seastore/btree/btree_range_pin.h | 375 +----------------- .../os/seastore/btree/fixed_kv_btree.h | 172 +++++++- src/crimson/os/seastore/btree/fixed_kv_node.h | 48 ++- src/crimson/os/seastore/cached_extent.cc | 9 +- src/crimson/os/seastore/cached_extent.h | 53 +-- src/crimson/os/seastore/lba_manager.h | 18 +- .../lba_manager/btree/btree_lba_manager.cc | 72 +--- 
.../lba_manager/btree/btree_lba_manager.h | 36 +- .../os/seastore/object_data_handler.cc | 4 +- .../os/seastore/transaction_manager.cc | 27 +- src/crimson/os/seastore/transaction_manager.h | 16 +- .../seastore/test_btree_lba_manager.cc | 18 +- .../seastore/test_object_data_handler.cc | 2 +- 17 files changed, 254 insertions(+), 691 deletions(-) diff --git a/src/crimson/os/seastore/backref/btree_backref_manager.cc b/src/crimson/os/seastore/backref/btree_backref_manager.cc index 28bf85567ebec..eab7fb9709e5a 100644 --- a/src/crimson/os/seastore/backref/btree_backref_manager.cc +++ b/src/crimson/os/seastore/backref/btree_backref_manager.cc @@ -74,12 +74,6 @@ void unlink_phy_tree_root_node(RootBlockRef &root_block) { namespace crimson::os::seastore::backref { -static depth_t get_depth(const CachedExtent &e) -{ - assert(is_backref_node(e.get_type())); - return e.cast()->get_node_meta().depth; -} - BtreeBackrefManager::mkfs_ret BtreeBackrefManager::mkfs( Transaction &t) @@ -106,7 +100,7 @@ BtreeBackrefManager::get_mapping( LOG_PREFIX(BtreeBackrefManager::get_mapping); TRACET("{}", t, offset); auto c = get_context(t); - return with_btree_ret( + return with_btree_ret( cache, c, [c, offset](auto &btree) { @@ -546,40 +540,6 @@ BtreeBackrefManager::remove_mapping( }); } -void BtreeBackrefManager::complete_transaction( - Transaction &t, - std::vector &to_clear, - std::vector &to_link) -{ - LOG_PREFIX(BtreeBackrefManager::complete_transaction); - DEBUGT("start", t); - // need to call check_parent from leaf->parent - std::sort( - to_clear.begin(), to_clear.end(), - [](auto &l, auto &r) { return get_depth(*l) < get_depth(*r); }); - - for (auto &e: to_clear) { - auto &pin = e->cast()->pin; - DEBUGT("retiring extent {} -- {}", t, pin, *e); - pin_set.retire(pin); - } - - std::sort( - to_link.begin(), to_link.end(), - [](auto &l, auto &r) -> bool { return get_depth(*l) > get_depth(*r); }); - - for (auto &e : to_link) { - DEBUGT("linking extent -- {}", t, *e); - 
pin_set.add_pin(e->cast()->pin); - } - - for (auto &e: to_clear) { - auto &pin = e->cast()->pin; - TRACET("checking extent {} -- {}", t, pin, *e); - pin_set.check_parent(pin); - } -} - Cache::backref_entry_query_mset_t BtreeBackrefManager::get_cached_backref_entries_in_range( paddr_t start, diff --git a/src/crimson/os/seastore/backref/btree_backref_manager.h b/src/crimson/os/seastore/backref/btree_backref_manager.h index 9a067f8988514..48ef4d8319171 100644 --- a/src/crimson/os/seastore/backref/btree_backref_manager.h +++ b/src/crimson/os/seastore/backref/btree_backref_manager.h @@ -11,18 +11,18 @@ namespace crimson::os::seastore::backref { constexpr size_t BACKREF_BLOCK_SIZE = 4096; -class BtreeBackrefPin : public BtreeNodePin { +class BtreeBackrefMapping : public BtreeNodeMapping { extent_types_t type; public: - BtreeBackrefPin(op_context_t ctx) - : BtreeNodePin(ctx) {} - BtreeBackrefPin( + BtreeBackrefMapping(op_context_t ctx) + : BtreeNodeMapping(ctx) {} + BtreeBackrefMapping( op_context_t ctx, CachedExtentRef parent, uint16_t pos, backref_map_val_t &val, backref_node_meta_t &&meta) - : BtreeNodePin( + : BtreeNodeMapping( ctx, parent, pos, @@ -38,7 +38,7 @@ class BtreeBackrefPin : public BtreeNodePin { using BackrefBtree = FixedKVBtree< paddr_t, backref_map_val_t, BackrefInternalNode, - BackrefLeafNode, BtreeBackrefPin, BACKREF_BLOCK_SIZE, false>; + BackrefLeafNode, BtreeBackrefMapping, BACKREF_BLOCK_SIZE, false>; class BtreeBackrefManager : public BackrefManager { public: @@ -83,25 +83,10 @@ class BtreeBackrefManager : public BackrefManager { Transaction &t, CachedExtentRef e) final; - void complete_transaction( - Transaction &t, - std::vector &, - std::vector &) final; - rewrite_extent_ret rewrite_extent( Transaction &t, CachedExtentRef extent) final; - void add_pin(BackrefPin &pin) final { - auto *bpin = reinterpret_cast(&pin); - pin_set.add_pin(bpin->get_range_pin()); - bpin->set_parent(nullptr); - } - void remove_pin(BackrefPin &pin) final { - auto *bpin = 
reinterpret_cast(&pin); - pin_set.retire(bpin->get_range_pin()); - } - Cache::backref_entry_query_mset_t get_cached_backref_entries_in_range( paddr_t start, @@ -121,10 +106,8 @@ class BtreeBackrefManager : public BackrefManager { private: Cache &cache; - btree_pin_set_t pin_set; - op_context_t get_context(Transaction &t) { - return op_context_t{cache, t, &pin_set}; + return op_context_t{cache, t}; } }; diff --git a/src/crimson/os/seastore/backref_manager.h b/src/crimson/os/seastore/backref_manager.h index 68c02b11a812c..3feedb997b4c3 100644 --- a/src/crimson/os/seastore/backref_manager.h +++ b/src/crimson/os/seastore/backref_manager.h @@ -42,7 +42,7 @@ class BackrefManager { */ using get_mapping_iertr = base_iertr::extend< crimson::ct_error::enoent>; - using get_mapping_ret = get_mapping_iertr::future; + using get_mapping_ret = get_mapping_iertr::future; virtual get_mapping_ret get_mapping( Transaction &t, paddr_t offset) = 0; @@ -62,7 +62,7 @@ class BackrefManager { * Insert new paddr_t -> laddr_t mapping */ using new_mapping_iertr = base_iertr; - using new_mapping_ret = new_mapping_iertr::future; + using new_mapping_ret = new_mapping_iertr::future; virtual new_mapping_ret new_mapping( Transaction &t, paddr_t key, @@ -140,17 +140,6 @@ class BackrefManager { Transaction &t, scan_mapped_space_func_t &&f) = 0; - virtual void complete_transaction( - Transaction &t, - std::vector &to_clear, ///< extents whose pins are to be cleared, - // as the results of their retirements - std::vector &to_link ///< fresh extents whose pins are to be inserted - // into backref manager's pin set - ) = 0; - - virtual void add_pin(BackrefPin &pin) = 0; - virtual void remove_pin(BackrefPin &pin) = 0; - virtual ~BackrefManager() {} }; diff --git a/src/crimson/os/seastore/btree/btree_range_pin.cc b/src/crimson/os/seastore/btree/btree_range_pin.cc index adb84ed06950a..2f801dcf1ec50 100644 --- a/src/crimson/os/seastore/btree/btree_range_pin.cc +++ 
b/src/crimson/os/seastore/btree/btree_range_pin.cc @@ -8,7 +8,7 @@ namespace crimson::os::seastore { template get_child_ret_t -BtreeNodePin::get_logical_extent( +BtreeNodeMapping::get_logical_extent( Transaction &t) { assert(parent); @@ -22,7 +22,6 @@ BtreeNodePin::get_logical_extent( return v; } -template class BtreeNodePin; -template class BtreeNodePin; - +template class BtreeNodeMapping; +template class BtreeNodeMapping; } // namespace crimson::os::seastore diff --git a/src/crimson/os/seastore/btree/btree_range_pin.h b/src/crimson/os/seastore/btree/btree_range_pin.h index 29dfa476c6e12..fef89197fd9ec 100644 --- a/src/crimson/os/seastore/btree/btree_range_pin.h +++ b/src/crimson/os/seastore/btree/btree_range_pin.h @@ -17,7 +17,6 @@ template struct op_context_t { Cache &cache; Transaction &trans; - btree_pin_set_t *pins = nullptr; }; constexpr uint16_t MAX_FIXEDKVBTREE_DEPTH = 8; @@ -116,339 +115,8 @@ struct fixed_kv_node_meta_le_t { } }; - -/** - * btree_range_pin_t - * - * Element tracked by btree_pin_set_t below. Encapsulates the intrusive_set - * hook, the fixed_kv_node_meta_t representing the key range covered by a node, - * and extent and ref members intended to hold a reference when the extent - * should be pinned. 
- */ -template -class btree_pin_set_t; - -template -class FixedKVNode; - -template -class btree_range_pin_t : public boost::intrusive::set_base_hook<> { - friend class btree_pin_set_t; - friend class FixedKVNode; - fixed_kv_node_meta_t range; - - btree_pin_set_t *pins = nullptr; - - // We need to be able to remember extent without holding a reference, - // but we can do it more compactly -- TODO - CachedExtent *extent = nullptr; - CachedExtentRef ref; - - using index_t = boost::intrusive::set; - - void acquire_ref() { - ref = CachedExtentRef(extent); - } - - void drop_ref() { - ref.reset(); - } - -public: - btree_range_pin_t() = default; - btree_range_pin_t(CachedExtent *extent) - : extent(extent) {} - btree_range_pin_t(const btree_range_pin_t &rhs, CachedExtent *extent) - : range(rhs.range), extent(extent) {} - - bool has_ref() const { - return !!ref; - } - - bool is_root() const { - return range.is_root(); - } - - void set_range(const fixed_kv_node_meta_t &nrange) { - range = nrange; - } - void set_extent(CachedExtent *nextent) { - ceph_assert(!extent); - extent = nextent; - } - - CachedExtent &get_extent() { - assert(extent); - return *extent; - } - - bool has_ref() { - return !!ref; - } - - void take_pin(btree_range_pin_t &other) - { - ceph_assert(other.extent); - if (other.pins) { - other.pins->replace_pin(*this, other); - pins = other.pins; - other.pins = nullptr; - - if (other.has_ref()) { - other.drop_ref(); - acquire_ref(); - } - } - } - - friend bool operator<( - const btree_range_pin_t &lhs, const btree_range_pin_t &rhs) { - assert(lhs.range.depth == rhs.range.depth); - return lhs.range.begin < rhs.range.begin; - } - friend bool operator>( - const btree_range_pin_t &lhs, const btree_range_pin_t &rhs) { - assert(lhs.range.depth == rhs.range.depth); - return lhs.range.begin > rhs.range.begin; - } - friend bool operator==( - const btree_range_pin_t &lhs, const btree_range_pin_t &rhs) { - assert(lhs.range.depth == rhs.range.depth); - return lhs.range.begin 
== rhs.range.begin; - } - - struct meta_cmp_t { - bool operator()( - const btree_range_pin_t &lhs, const fixed_kv_node_meta_t &rhs) const { - assert(lhs.range.depth == rhs.depth); - return lhs.range.begin < rhs.begin; - } - bool operator()( - const fixed_kv_node_meta_t &lhs, const btree_range_pin_t &rhs) const { - assert(lhs.depth == rhs.range.depth); - return lhs.begin < rhs.range.begin; - } - }; - - friend std::ostream &operator<<( - std::ostream &lhs, - const btree_range_pin_t &rhs) { - return lhs << "btree_range_pin_t(" - << "begin=" << rhs.range.begin - << ", end=" << rhs.range.end - << ", depth=" << rhs.range.depth - << ", extent=" << rhs.extent - << ")"; - } - - template - friend class BtreeNodePin; - ~btree_range_pin_t() - { - ceph_assert(!pins == !is_linked()); - ceph_assert(!ref); - if (pins) { - crimson::get_logger(ceph_subsys_seastore_lba - ).debug("{}: removing {}", __func__, *this); - pins->remove_pin(*this, true); - } - extent = nullptr; - } - -}; - -/** - * btree_pin_set_t - * - * Ensures that for every cached node, all parent btree nodes required - * to map it are present in cache. Relocating these nodes can - * therefore be done without further reads or cache space. - * - * Contains a btree_range_pin_t for every clean or dirty btree node - * or LogicalCachedExtent instance in cache at any point in time. - * For any btree node, the contained btree_range_pin_t will hold - * a reference to that node pinning it in cache as long as that - * node has children in the set. This invariant can be violated - * only by calling retire_extent and is repaired by calling - * check_parent synchronously after adding any new extents. 
- */ -template -class btree_pin_set_t { - friend class btree_range_pin_t; - using pins_by_depth_t = std::array< - typename btree_range_pin_t::index_t, - MAX_FIXEDKVBTREE_DEPTH>; - pins_by_depth_t pins_by_depth; - - /// Removes pin from set optionally checking whether parent has other children - void remove_pin(btree_range_pin_t &pin, bool do_check_parent) - { - crimson::get_logger(ceph_subsys_seastore_lba).debug("{}: {}", __func__, pin); - ceph_assert(pin.is_linked()); - ceph_assert(pin.pins); - ceph_assert(!pin.ref); - - auto &layer = pins_by_depth[pin.range.depth]; - layer.erase(layer.s_iterator_to(pin)); - pin.pins = nullptr; - - if (do_check_parent) { - check_parent(pin); - } - } - - void replace_pin( - btree_range_pin_t &to, - btree_range_pin_t &from) - { - assert(to.range.depth == from.range.depth); - pins_by_depth[from.range.depth].replace_node( - btree_range_pin_t::index_t::s_iterator_to(from), to); - } - - /// Returns parent pin if exists - btree_range_pin_t *maybe_get_parent( - const fixed_kv_node_meta_t &meta) - { - auto cmeta = meta; - cmeta.depth++; - auto &layer = pins_by_depth[cmeta.depth]; - auto iter = layer.upper_bound( - cmeta, - typename btree_range_pin_t::meta_cmp_t()); - if (iter == layer.begin()) { - return nullptr; - } else { - --iter; - if (iter->range.is_parent_of(meta)) { - return &*iter; - } else { - return nullptr; - } - } - } - - /// Returns earliest child pin if exist - const btree_range_pin_t - *maybe_get_first_child(const fixed_kv_node_meta_t &meta) const - { - if (meta.depth == 0) { - return nullptr; - } - - auto cmeta = meta; - cmeta.depth--; - - auto &layer = pins_by_depth[cmeta.depth]; - auto iter = layer.lower_bound( - cmeta, - typename btree_range_pin_t::meta_cmp_t()); - if (iter == layer.end()) { - return nullptr; - } else if (meta.is_parent_of(iter->range)) { - return &*iter; - } else { - return nullptr; - } - } - - /// Releases pin if it has no children - void release_if_no_children(btree_range_pin_t &pin) - { - 
ceph_assert(pin.is_linked()); - if (maybe_get_first_child(pin.range) == nullptr) { - pin.drop_ref(); - } - } - -public: - btree_pin_set_t() {} - /// Adds pin to set, assumes set is consistent - void add_pin(btree_range_pin_t &pin) - { - ceph_assert(!pin.is_linked()); - ceph_assert(!pin.pins); - ceph_assert(!pin.ref); - - auto &layer = pins_by_depth[pin.range.depth]; - auto [prev, inserted] = layer.insert(pin); - if (!inserted) { - crimson::get_logger(ceph_subsys_seastore_lba).error( - "{}: unable to add {} ({}), found {} ({})", - __func__, - pin, - *(pin.extent), - *prev, - *(prev->extent)); - ceph_assert(0 == "impossible"); - return; - } - pin.pins = this; - if (!pin.is_root()) { - auto *parent = maybe_get_parent(pin.range); - ceph_assert(parent); - if (!parent->has_ref()) { - crimson::get_logger(ceph_subsys_seastore_lba - ).debug("{}: acquiring parent {}", __func__, - static_cast(parent)); - parent->acquire_ref(); - } else { - crimson::get_logger(ceph_subsys_seastore_lba).debug( - "{}: parent has ref {}", __func__, - static_cast(parent)); - } - } - if (maybe_get_first_child(pin.range) != nullptr) { - crimson::get_logger(ceph_subsys_seastore_lba).debug( - "{}: acquiring self {}", __func__, pin); - pin.acquire_ref(); - } - } - - - /** - * retire/check_parent - * - * See BtreeLBAManager::complete_transaction. - * retire removes the specified pin from the set, but does not - * check parents. After any new extents are added to the set, - * the caller is required to call check_parent to restore the - * invariant. 
- */ - void retire(btree_range_pin_t &pin) - { - pin.drop_ref(); - remove_pin(pin, false); - } - - void check_parent(btree_range_pin_t &pin) - { - auto parent = maybe_get_parent(pin.range); - if (parent) { - crimson::get_logger(ceph_subsys_seastore_lba - ).debug("{}: releasing parent {}", __func__, *parent); - release_if_no_children(*parent); - } - } - - template - void scan(F &&f) { - for (auto &layer : pins_by_depth) { - for (auto &i : layer) { - std::invoke(f, i); - } - } - } - - ~btree_pin_set_t() { - for (auto &layer : pins_by_depth) { - ceph_assert(layer.empty()); - } - } -}; - template -class BtreeNodePin : public PhysicalNodePin { +class BtreeNodeMapping : public PhysicalNodeMapping { op_context_t ctx; /** @@ -461,14 +129,14 @@ class BtreeNodePin : public PhysicalNodePin { val_t value; extent_len_t len; - btree_range_pin_t pin; + fixed_kv_node_meta_t range; uint16_t pos = std::numeric_limits::max(); public: using val_type = val_t; - BtreeNodePin(op_context_t ctx) : ctx(ctx) {} + BtreeNodeMapping(op_context_t ctx) : ctx(ctx) {} - BtreeNodePin( + BtreeNodeMapping( op_context_t ctx, CachedExtentRef parent, uint16_t pos, @@ -479,9 +147,9 @@ class BtreeNodePin : public PhysicalNodePin { parent(parent), value(value), len(len), + range(std::move(meta)), pos(pos) { - pin.set_range(std::move(meta)); if (!parent->is_pending()) { this->child_pos = {parent, pos}; } @@ -491,21 +159,12 @@ class BtreeNodePin : public PhysicalNodePin { return parent; } - btree_range_pin_t& get_range_pin() { - return pin; - } - CachedExtentRef get_parent() { return parent; } - void set_parent(CachedExtentRef pin) { - parent = pin; - } - - void link_extent(LogicalCachedExtent *ref) final { - pin.set_extent(ref); - pos = std::numeric_limits::max(); + void set_parent(CachedExtentRef ext) { + parent = ext; } uint16_t get_pos() const final { @@ -513,7 +172,7 @@ class BtreeNodePin : public PhysicalNodePin { } extent_len_t get_length() const final { - ceph_assert(pin.range.end > pin.range.begin); 
+ ceph_assert(range.end > range.begin); return len; } @@ -527,13 +186,13 @@ class BtreeNodePin : public PhysicalNodePin { } key_t get_key() const final { - return pin.range.begin; + return range.begin; } - PhysicalNodePinRef duplicate() const final { - auto ret = std::unique_ptr>( - new BtreeNodePin(ctx)); - ret->pin.set_range(pin.range); + PhysicalNodeMappingRef duplicate() const final { + auto ret = std::unique_ptr>( + new BtreeNodeMapping(ctx)); + ret->range = range; ret->value = value; ret->parent = parent; ret->len = len; @@ -541,10 +200,6 @@ class BtreeNodePin : public PhysicalNodePin { return ret; } - void take_pin(PhysicalNodePin &opin) final { - pin.take_pin(static_cast&>(opin).pin); - } - bool has_been_invalidated() const final { return parent->has_been_invalidated(); } @@ -553,7 +208,3 @@ class BtreeNodePin : public PhysicalNodePin { }; } - -#if FMT_VERSION >= 90000 -template struct fmt::formatter> : fmt::ostream_formatter {}; -#endif diff --git a/src/crimson/os/seastore/btree/fixed_kv_btree.h b/src/crimson/os/seastore/btree/fixed_kv_btree.h index 8b0256bd1f46e..7248e67a0503f 100644 --- a/src/crimson/os/seastore/btree/fixed_kv_btree.h +++ b/src/crimson/os/seastore/btree/fixed_kv_btree.h @@ -213,7 +213,7 @@ class FixedKVBtree { return leaf.pos == 0; } - PhysicalNodePinRef + PhysicalNodeMappingRef get_pin(op_context_t ctx) const { assert(!is_end()); auto val = get_val(); @@ -360,7 +360,7 @@ class FixedKVBtree { root_leaf->set_size(0); fixed_kv_node_meta_t meta{min_max_t::min, min_max_t::max, 1}; root_leaf->set_meta(meta); - root_leaf->pin.set_range(meta); + root_leaf->range = meta; get_tree_stats(c.trans).depth = 1u; get_tree_stats(c.trans).extents_num_delta++; link_phy_tree_root_node(root_block, root_leaf.get()); @@ -485,6 +485,152 @@ class FixedKVBtree { return upper_bound(c, min_max_t::max); } + template + void check_node( + op_context_t c, + TCachedExtentRef node) + { + for (auto i : *node) { + CachedExtentRef child_node; + Transaction::get_extent_ret 
ret; + + if constexpr (std::is_base_of_v) { + ret = c.trans.get_extent( + i->get_val().maybe_relative_to(node->get_paddr()), + &child_node); + } else { + if constexpr (leaf_has_children) { + ret = c.trans.get_extent( + i->get_val().paddr.maybe_relative_to(node->get_paddr()), + &child_node); + } + } + if (ret == Transaction::get_extent_ret::PRESENT) { + if (child_node->is_mutation_pending()) { + auto &prior = (child_node_t &)*child_node->prior_instance; + assert(prior.is_valid()); + assert(prior.is_parent_valid()); + if (node->is_mutation_pending()) { + auto &n = node->get_stable_for_key(i->get_key()); + assert(prior.get_parent_node().get() == &n); + auto pos = n.lower_bound_offset(i->get_key()); + assert(pos < n.get_node_size()); + assert(n.children[pos] == &prior); + } else { + assert(prior.get_parent_node().get() == node.get()); + assert(node->children[i->get_offset()] == &prior); + } + } else if (child_node->is_initial_pending()) { + auto cnode = child_node->template cast(); + auto pos = node->find(i->get_key()).get_offset(); + auto child = node->children[pos]; + assert(child); + assert(child == cnode.get()); + assert(cnode->is_parent_valid()); + } else { + assert(child_node->is_valid()); + auto cnode = child_node->template cast(); + assert(cnode->has_parent_tracker()); + if (node->is_pending()) { + auto &n = node->get_stable_for_key(i->get_key()); + assert(cnode->get_parent_node().get() == &n); + auto pos = n.lower_bound_offset(i->get_key()); + assert(pos < n.get_node_size()); + assert(n.children[pos] == cnode.get()); + } else { + assert(cnode->get_parent_node().get() == node.get()); + assert(node->children[i->get_offset()] == cnode.get()); + } + } + } else if (ret == Transaction::get_extent_ret::ABSENT) { + ChildableCachedExtent* child = nullptr; + if (node->is_pending()) { + auto &n = node->get_stable_for_key(i->get_key()); + auto pos = n.lower_bound_offset(i->get_key()); + assert(pos < n.get_node_size()); + child = n.children[pos]; + if 
(is_valid_child_ptr(child)) { + auto c = (child_node_t*)child; + assert(c->has_parent_tracker()); + assert(c->get_parent_node().get() == &n); + } + } else { + child = node->children[i->get_offset()]; + if (is_valid_child_ptr(child)) { + auto c = (child_node_t*)child; + assert(c->has_parent_tracker()); + assert(c->get_parent_node().get() == node.get()); + } + } + + if (!is_valid_child_ptr(child)) { + if constexpr ( + std::is_base_of_v) + { + assert(!c.cache.query_cache(i->get_val(), nullptr)); + } else { + if constexpr (leaf_has_children) { + assert(!c.cache.query_cache(i->get_val().paddr, nullptr)); + } + } + } + } else { + ceph_abort("impossible"); + } + } + } + + using check_child_trackers_ret = base_iertr::future<>; + check_child_trackers_ret check_child_trackers( + op_context_t c) { + mapped_space_visitor_t checker = [c, this]( + paddr_t, + node_key_t, + extent_len_t, + depth_t depth, + extent_types_t, + iterator& iter) { + if constexpr (!leaf_has_children) { + if (depth == 1) { + return seastar::now(); + } + } + if (depth > 1) { + auto &node = iter.get_internal(depth).node; + assert(node->is_valid()); + check_node(c, node); + } else { + assert(depth == 1); + auto &node = iter.leaf.node; + assert(node->is_valid()); + check_node(c, node); + } + return seastar::now(); + }; + + return seastar::do_with( + std::move(checker), + [this, c](auto &checker) { + return iterate_repeat( + c, + lower_bound( + c, + min_max_t::min, + &checker), + [](auto &pos) { + if (pos.is_end()) { + return base_iertr::make_ready_future< + seastar::stop_iteration>( + seastar::stop_iteration::yes); + } + return base_iertr::make_ready_future< + seastar::stop_iteration>( + seastar::stop_iteration::no); + }, + &checker); + }); + } + using iterate_repeat_ret_inner = base_iertr::future< seastar::stop_iteration>; template @@ -872,7 +1018,7 @@ class FixedKVBtree { fixed_kv_extent.get_length(), n_fixed_kv_extent->get_bptr().c_str()); 
n_fixed_kv_extent->set_modify_time(fixed_kv_extent.get_modify_time()); - n_fixed_kv_extent->pin.set_range(n_fixed_kv_extent->get_node_meta()); + n_fixed_kv_extent->range = n_fixed_kv_extent->get_node_meta(); if (fixed_kv_extent.get_type() == internal_node_t::TYPE || leaf_node_t::do_has_children) { @@ -1084,8 +1230,8 @@ class FixedKVBtree { parent_pos=std::move(parent_pos)] (internal_node_t &node) { assert(!node.is_pending()); - assert(!node.pin.is_linked()); - node.pin.set_range(fixed_kv_node_meta_t{begin, end, depth}); + assert(!node.is_linked()); + node.range = fixed_kv_node_meta_t{begin, end, depth}; if (parent_pos) { auto &parent = parent_pos->node; parent->link_child(&node, parent_pos->pos); @@ -1100,9 +1246,6 @@ class FixedKVBtree { link_phy_tree_root_node(root_block, &node); } } - if (c.pins) { - c.pins->add_pin(node.pin); - } }; return c.cache.template get_absent_extent( c.trans, @@ -1119,7 +1262,7 @@ class FixedKVBtree { *ret); // This can only happen during init_cached_extent // or when backref extent being rewritten by gc space reclaiming - if (c.pins && !ret->is_pending() && !ret->pin.is_linked()) { + if (!ret->is_pending() && !ret->is_linked()) { assert(ret->is_dirty() || (is_backref_node(ret->get_type()) && ret->is_clean())); @@ -1161,8 +1304,8 @@ class FixedKVBtree { parent_pos=std::move(parent_pos)] (leaf_node_t &node) { assert(!node.is_pending()); - assert(!node.pin.is_linked()); - node.pin.set_range(fixed_kv_node_meta_t{begin, end, 1}); + assert(!node.is_linked()); + node.range = fixed_kv_node_meta_t{begin, end, 1}; if (parent_pos) { auto &parent = parent_pos->node; parent->link_child(&node, parent_pos->pos); @@ -1177,9 +1320,6 @@ class FixedKVBtree { link_phy_tree_root_node(root_block, &node); } } - if (c.pins) { - c.pins->add_pin(node.pin); - } }; return c.cache.template get_absent_extent( c.trans, @@ -1196,7 +1336,7 @@ class FixedKVBtree { *ret); // This can only happen during init_cached_extent // or when backref extent being rewritten by gc 
space reclaiming - if (c.pins && !ret->is_pending() && !ret->pin.is_linked()) { + if (!ret->is_pending() && !ret->is_linked()) { assert(ret->is_dirty() || (is_backref_node(ret->get_type()) && ret->is_clean())); @@ -1625,7 +1765,7 @@ class FixedKVBtree { fixed_kv_node_meta_t meta{ min_max_t::min, min_max_t::max, iter.get_depth() + 1}; nroot->set_meta(meta); - nroot->pin.set_range(meta); + nroot->range = meta; nroot->journal_insert( nroot->begin(), min_max_t::min, diff --git a/src/crimson/os/seastore/btree/fixed_kv_node.h b/src/crimson/os/seastore/btree/fixed_kv_node.h index 3997be0b904dc..fe5052824dc8e 100644 --- a/src/crimson/os/seastore/btree/fixed_kv_node.h +++ b/src/crimson/os/seastore/btree/fixed_kv_node.h @@ -30,22 +30,22 @@ namespace crimson::os::seastore { template struct FixedKVNode : ChildableCachedExtent { using FixedKVNodeRef = TCachedExtentRef; - btree_range_pin_t pin; + fixed_kv_node_meta_t range; struct copy_source_cmp_t { using is_transparent = node_key_t; bool operator()(const FixedKVNodeRef &l, const FixedKVNodeRef &r) const { - assert(l->pin.range.end <= r->pin.range.begin - || r->pin.range.end <= l->pin.range.begin - || (l->pin.range.begin == r->pin.range.begin - && l->pin.range.end == r->pin.range.end)); - return l->pin.range.begin < r->pin.range.begin; + assert(l->range.end <= r->range.begin + || r->range.end <= l->range.begin + || (l->range.begin == r->range.begin + && l->range.end == r->range.end)); + return l->range.begin < r->range.begin; } bool operator()(const node_key_t &l, const FixedKVNodeRef &r) const { - return l < r->pin.range.begin; + return l < r->range.begin; } bool operator()(const FixedKVNodeRef &l, const node_key_t &r) const { - return l->pin.range.begin < r; + return l->range.begin < r; } }; @@ -94,12 +94,11 @@ struct FixedKVNode : ChildableCachedExtent { FixedKVNode(uint16_t capacity, ceph::bufferptr &&ptr) : ChildableCachedExtent(std::move(ptr)), - pin(this), children(capacity, nullptr), capacity(capacity) {} 
FixedKVNode(const FixedKVNode &rhs) : ChildableCachedExtent(rhs), - pin(rhs.pin, this), + range(rhs.range), children(rhs.capacity, nullptr), capacity(rhs.capacity) {} @@ -344,7 +343,7 @@ struct FixedKVNode : ChildableCachedExtent { void set_parent_tracker_from_prior_instance() { assert(is_mutation_pending()); auto &prior = (FixedKVNode&)(*get_prior_instance()); - if (pin.is_root()) { + if (range.is_root()) { ceph_assert(prior.root_block); ceph_assert(pending_for_transaction); root_block = prior.root_block; @@ -405,7 +404,6 @@ struct FixedKVNode : ChildableCachedExtent { // All in-memory relative addrs are necessarily record-relative assert(get_prior_instance()); assert(pending_for_transaction); - pin.take_pin(get_prior_instance()->template cast()->pin); resolve_relative_addrs(record_block_offset); } @@ -489,7 +487,7 @@ struct FixedKVNode : ChildableCachedExtent { void on_initial_write() final { // All in-memory relative addrs are necessarily block-relative resolve_relative_addrs(get_paddr()); - if (pin.is_root()) { + if (range.is_root()) { reset_parent_tracker(); } assert(has_parent_tracker() ? 
(is_parent_valid()) : true); @@ -617,7 +615,7 @@ struct FixedKVInternalNode virtual ~FixedKVInternalNode() { if (this->is_valid() && !this->is_pending()) { - if (this->pin.is_root()) { + if (this->range.is_root()) { ceph_assert(this->root_block); unlink_phy_tree_root_node(this->root_block); } else { @@ -758,8 +756,8 @@ struct FixedKVInternalNode c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION); this->split_child_ptrs(*left, *right); auto pivot = this->split_into(*left, *right); - left->pin.set_range(left->get_meta()); - right->pin.set_range(right->get_meta()); + left->range = left->get_meta(); + right->range = right->get_meta(); return std::make_tuple( left, right, @@ -773,7 +771,7 @@ struct FixedKVInternalNode c.trans, node_size, placement_hint_t::HOT, INIT_GENERATION); replacement->merge_child_ptrs(*this, *right); replacement->merge_from(*this, *right->template cast()); - replacement->pin.set_range(replacement->get_meta()); + replacement->range = replacement->get_meta(); return replacement; } @@ -802,8 +800,8 @@ struct FixedKVInternalNode *replacement_left, *replacement_right); - replacement_left->pin.set_range(replacement_left->get_meta()); - replacement_right->pin.set_range(replacement_right->get_meta()); + replacement_left->range = replacement_left->get_meta(); + replacement_right->range = replacement_right->get_meta(); return std::make_tuple( replacement_left, replacement_right, @@ -992,7 +990,7 @@ struct FixedKVLeafNode virtual ~FixedKVLeafNode() { if (this->is_valid() && !this->is_pending()) { - if (this->pin.is_root()) { + if (this->range.is_root()) { ceph_assert(this->root_block); unlink_phy_tree_root_node(this->root_block); } else { @@ -1106,8 +1104,8 @@ struct FixedKVLeafNode this->split_child_ptrs(*left, *right); } auto pivot = this->split_into(*left, *right); - left->pin.set_range(left->get_meta()); - right->pin.set_range(right->get_meta()); + left->range = left->get_meta(); + right->range = right->get_meta(); return std::make_tuple( left, 
right, @@ -1123,7 +1121,7 @@ struct FixedKVLeafNode replacement->merge_child_ptrs(*this, *right); } replacement->merge_from(*this, *right->template cast()); - replacement->pin.set_range(replacement->get_meta()); + replacement->range = replacement->get_meta(); return replacement; } @@ -1154,8 +1152,8 @@ struct FixedKVLeafNode *replacement_right); } - replacement_left->pin.set_range(replacement_left->get_meta()); - replacement_right->pin.set_range(replacement_right->get_meta()); + replacement_left->range = replacement_left->get_meta(); + replacement_right->range = replacement_right->get_meta(); return std::make_tuple( replacement_left, replacement_right, diff --git a/src/crimson/os/seastore/cached_extent.cc b/src/crimson/os/seastore/cached_extent.cc index 93fc701bb0555..769b0446a5d6b 100644 --- a/src/crimson/os/seastore/cached_extent.cc +++ b/src/crimson/os/seastore/cached_extent.cc @@ -111,11 +111,6 @@ std::ostream &ChildableCachedExtent::print_detail(std::ostream &out) const { std::ostream &LogicalCachedExtent::_print_detail(std::ostream &out) const { out << ", laddr=" << laddr; - if (pin) { - out << ", pin=" << *pin; - } else { - out << ", pin=empty"; - } return print_detail_l(out); } @@ -161,9 +156,9 @@ parent_tracker_t::~parent_tracker_t() { } } -std::ostream &operator<<(std::ostream &out, const LBAPin &rhs) +std::ostream &operator<<(std::ostream &out, const LBAMapping &rhs) { - return out << "LBAPin(" << rhs.get_key() << "~" << rhs.get_length() + return out << "LBAMapping(" << rhs.get_key() << "~" << rhs.get_length() << "->" << rhs.get_val(); } diff --git a/src/crimson/os/seastore/cached_extent.h b/src/crimson/os/seastore/cached_extent.h index 12b189fea549c..3c4d79e0ca1e1 100644 --- a/src/crimson/os/seastore/cached_extent.h +++ b/src/crimson/os/seastore/cached_extent.h @@ -35,7 +35,7 @@ template < bool leaf_has_children> class FixedKVBtree; template -class BtreeNodePin; +class BtreeNodeMapping; // #define DEBUG_CACHED_EXTENT_REF #ifdef DEBUG_CACHED_EXTENT_REF 
@@ -721,7 +721,8 @@ class CachedExtent friend class crimson::os::seastore::TransactionManager; friend class crimson::os::seastore::ExtentPlacementManager; template - friend class BtreeNodePin; + friend class BtreeNodeMapping; + friend class ::btree_lba_manager_test; }; std::ostream &operator<<(std::ostream &, CachedExtent::extent_state_t); @@ -919,21 +920,19 @@ struct get_child_ret_t { }; template -class PhysicalNodePin; +class PhysicalNodeMapping; template -using PhysicalNodePinRef = std::unique_ptr>; +using PhysicalNodeMappingRef = std::unique_ptr>; template -class PhysicalNodePin { +class PhysicalNodeMapping { public: - virtual void link_extent(LogicalCachedExtent *ref) = 0; - virtual void take_pin(PhysicalNodePin &pin) = 0; virtual extent_len_t get_length() const = 0; virtual extent_types_t get_type() const = 0; virtual val_t get_val() const = 0; virtual key_t get_key() const = 0; - virtual PhysicalNodePinRef duplicate() const = 0; + virtual PhysicalNodeMappingRef duplicate() const = 0; virtual bool has_been_invalidated() const = 0; virtual CachedExtentRef get_parent() const = 0; virtual uint16_t get_pos() const = 0; @@ -946,24 +945,24 @@ class PhysicalNodePin { child_pos->link_child(c); } - virtual ~PhysicalNodePin() {} + virtual ~PhysicalNodeMapping() {} protected: std::optional child_pos = std::nullopt; }; -using LBAPin = PhysicalNodePin; -using LBAPinRef = PhysicalNodePinRef; +using LBAMapping = PhysicalNodeMapping; +using LBAMappingRef = PhysicalNodeMappingRef; -std::ostream &operator<<(std::ostream &out, const LBAPin &rhs); +std::ostream &operator<<(std::ostream &out, const LBAMapping &rhs); -using lba_pin_list_t = std::list; +using lba_pin_list_t = std::list; std::ostream &operator<<(std::ostream &out, const lba_pin_list_t &rhs); -using BackrefPin = PhysicalNodePin; -using BackrefPinRef = PhysicalNodePinRef; +using BackrefMapping = PhysicalNodeMapping; +using BackrefMappingRef = PhysicalNodeMappingRef; -using backref_pin_list_t = std::list; +using 
backref_pin_list_t = std::list; /** * RetiredExtentPlaceholder @@ -1095,20 +1094,8 @@ class LogicalCachedExtent : public ChildableCachedExtent { : ChildableCachedExtent(std::forward(t)...) {} - void set_pin(LBAPinRef &&npin) { - assert(!pin); - pin = std::move(npin); - laddr = pin->get_key(); - pin->link_extent(this); - } - - bool has_pin() const { - return !!pin; - } - - LBAPin &get_pin() { - assert(pin); - return *pin; + bool has_laddr() const { + return laddr != L_ADDR_NULL; } laddr_t get_laddr() const { @@ -1147,15 +1134,11 @@ class LogicalCachedExtent : public ChildableCachedExtent { void on_delta_write(paddr_t record_block_offset) final { assert(is_exist_mutation_pending() || get_prior_instance()); - if (get_prior_instance()) { - pin->take_pin(*(get_prior_instance()->cast()->pin)); - } logical_on_delta_write(); } private: laddr_t laddr = L_ADDR_NULL; - LBAPinRef pin; template < typename node_key_t, @@ -1222,5 +1205,5 @@ using lextent_list_t = addr_extent_list_base_t< template <> struct fmt::formatter : fmt::ostream_formatter {}; template <> struct fmt::formatter : fmt::ostream_formatter {}; template <> struct fmt::formatter : fmt::ostream_formatter {}; -template <> struct fmt::formatter : fmt::ostream_formatter {}; +template <> struct fmt::formatter : fmt::ostream_formatter {}; #endif diff --git a/src/crimson/os/seastore/lba_manager.h b/src/crimson/os/seastore/lba_manager.h index af11cac7cc4e0..d79f72a6a7bc6 100644 --- a/src/crimson/os/seastore/lba_manager.h +++ b/src/crimson/os/seastore/lba_manager.h @@ -62,7 +62,7 @@ class LBAManager { */ using get_mapping_iertr = base_iertr::extend< crimson::ct_error::enoent>; - using get_mapping_ret = get_mapping_iertr::future; + using get_mapping_ret = get_mapping_iertr::future; virtual get_mapping_ret get_mapping( Transaction &t, laddr_t offset) = 0; @@ -72,10 +72,10 @@ class LBAManager { * * Offset will be relative to the block offset of the record * This mapping will block from transaction submission until set_paddr - 
* is called on the LBAPin. + * is called on the LBAMapping. */ using alloc_extent_iertr = base_iertr; - using alloc_extent_ret = alloc_extent_iertr::future; + using alloc_extent_ret = alloc_extent_iertr::future; virtual alloc_extent_ret alloc_extent( Transaction &t, laddr_t hint, @@ -110,17 +110,9 @@ class LBAManager { Transaction &t, laddr_t addr) = 0; - virtual void complete_transaction( - Transaction &t, - std::vector &to_clear, ///< extents whose pins are to be cleared, - // as the results of their retirements - std::vector &to_link ///< fresh extents whose pins are to be inserted - // into backref manager's pin set - ) = 0; - /** * Should be called after replay on each cached extent. - * Implementation must initialize the LBAPin on any + * Implementation must initialize the LBAMapping on any * LogicalCachedExtent's and may also read in any dependent * structures, etc. * @@ -200,8 +192,6 @@ class LBAManager { laddr_t laddr, extent_len_t len) = 0; - virtual void add_pin(LBAPin &pin) = 0; - virtual ~LBAManager() {} }; using LBAManagerRef = std::unique_ptr; diff --git a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc index 0e0e069b4c396..c4756dc083c16 100644 --- a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc +++ b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc @@ -182,7 +182,7 @@ BtreeLBAManager::get_mapping( LOG_PREFIX(BtreeLBAManager::get_mapping); TRACET("{}", t, offset); auto c = get_context(t); - return with_btree_ret( + return with_btree_ret( cache, c, [FNAME, c, offset](auto &btree) { @@ -274,10 +274,13 @@ BtreeLBAManager::alloc_extent( state.last_end, lba_map_val_t{len, addr, 1, 0}, nextent - ).si_then([&state, FNAME, c, addr, len, hint](auto &&p) { + ).si_then([&state, FNAME, c, addr, len, hint, nextent](auto &&p) { auto [iter, inserted] = std::move(p); TRACET("{}~{}, hint={}, inserted at {}", c.trans, addr, len, hint, state.last_end); + if 
(nextent) { + nextent->set_laddr(iter.get_key()); + } ceph_assert(inserted); state.ret = iter; }); @@ -292,65 +295,6 @@ static bool is_lba_node(const CachedExtent &e) return is_lba_node(e.get_type()); } -btree_range_pin_t &BtreeLBAManager::get_pin( - CachedExtent &e) -{ - if (is_lba_node(e)) { - return e.cast()->pin; - } else if (e.is_logical()) { - return static_cast( - e.cast()->get_pin()).get_range_pin(); - } else { - ceph_abort_msg("impossible"); - } -} - -static depth_t get_depth(const CachedExtent &e) -{ - if (is_lba_node(e)) { - return e.cast()->get_node_meta().depth; - } else if (e.is_logical()) { - return 0; - } else { - ceph_assert(0 == "currently impossible"); - return 0; - } -} - -void BtreeLBAManager::complete_transaction( - Transaction &t, - std::vector &to_clear, - std::vector &to_link) -{ - LOG_PREFIX(BtreeLBAManager::complete_transaction); - DEBUGT("start", t); - // need to call check_parent from leaf->parent - std::sort( - to_clear.begin(), to_clear.end(), - [](auto &l, auto &r) { return get_depth(*l) < get_depth(*r); }); - - for (auto &e: to_clear) { - auto &pin = get_pin(*e); - DEBUGT("retiring extent {} -- {}", t, pin, *e); - pin_set.retire(pin); - } - - std::sort( - to_link.begin(), to_link.end(), - [](auto &l, auto &r) -> bool { return get_depth(*l) > get_depth(*r); }); - - for (auto &e : to_link) { - DEBUGT("linking extent -- {}", t, *e); - pin_set.add_pin(get_pin(*e)); - } - - for (auto &e: to_clear) { - auto &pin = get_pin(*e); - TRACET("checking extent {} -- {}", t, pin, *e); - pin_set.check_parent(pin); - } -} - BtreeLBAManager::base_iertr::template future<> _init_cached_extent( op_context_t c, @@ -370,12 +314,8 @@ _init_cached_extent( iter.get_val().paddr == logn->get_paddr()) { assert(!iter.get_leaf_node()->is_pending()); iter.get_leaf_node()->link_child(logn.get(), iter.get_leaf_pos()); - logn->set_pin(iter.get_pin(c)); + logn->set_laddr(iter.get_pin(c)->get_key()); ceph_assert(iter.get_val().len == e->get_length()); - if (c.pins) { - 
c.pins->add_pin( - static_cast(logn->get_pin()).get_range_pin()); - } DEBUGT("logical extent {} live", c.trans, *logn); ret = true; } else { diff --git a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h index 884af688da660..6dcdbb568b2b7 100644 --- a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h +++ b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h @@ -25,17 +25,17 @@ namespace crimson::os::seastore::lba_manager::btree { -class BtreeLBAPin : public BtreeNodePin { +class BtreeLBAMapping : public BtreeNodeMapping { public: - BtreeLBAPin(op_context_t ctx) - : BtreeNodePin(ctx) {} - BtreeLBAPin( + BtreeLBAMapping(op_context_t ctx) + : BtreeNodeMapping(ctx) {} + BtreeLBAMapping( op_context_t c, CachedExtentRef parent, uint16_t pos, lba_map_val_t &val, lba_node_meta_t &&meta) - : BtreeNodePin( + : BtreeNodeMapping( c, parent, pos, @@ -47,7 +47,7 @@ class BtreeLBAPin : public BtreeNodePin { using LBABtree = FixedKVBtree< laddr_t, lba_map_val_t, LBAInternalNode, - LBALeafNode, BtreeLBAPin, LBA_BLOCK_SIZE, true>; + LBALeafNode, BtreeLBAMapping, LBA_BLOCK_SIZE, true>; /** * BtreeLBAManager @@ -108,11 +108,6 @@ class BtreeLBAManager : public LBAManager { return update_refcount(t, addr, 1); } - void complete_transaction( - Transaction &t, - std::vector &, - std::vector &) final; - /** * init_cached_extent * @@ -148,24 +143,9 @@ class BtreeLBAManager : public LBAManager { paddr_t addr, laddr_t laddr, extent_len_t len) final; - - void add_pin(LBAPin &pin) final { - auto *bpin = reinterpret_cast(&pin); - pin_set.add_pin(bpin->get_range_pin()); - bpin->set_parent(nullptr); - } - - ~BtreeLBAManager() { - pin_set.scan([](auto &i) { - LOG_PREFIX(BtreeLBAManager::~BtreeLBAManager); - SUBERROR(seastore_lba, "Found {}, has_ref={} -- {}", - i, i.has_ref(), i.get_extent()); - }); - } private: Cache &cache; - btree_pin_set_t pin_set; struct { uint64_t num_alloc_extents = 0; @@ -173,11 
+153,9 @@ class BtreeLBAManager : public LBAManager { } stats; op_context_t get_context(Transaction &t) { - return op_context_t{cache, t, &pin_set}; + return op_context_t{cache, t}; } - static btree_range_pin_t &get_pin(CachedExtent &e); - seastar::metrics::metric_group metrics; void register_metrics(); diff --git a/src/crimson/os/seastore/object_data_handler.cc b/src/crimson/os/seastore/object_data_handler.cc index fc9cd33af98ef..76e179e2414e4 100644 --- a/src/crimson/os/seastore/object_data_handler.cc +++ b/src/crimson/os/seastore/object_data_handler.cc @@ -476,7 +476,7 @@ using operate_ret_bare = std::pair< std::optional, std::optional>; using operate_ret = get_iertr::future; -operate_ret operate_left(context_t ctx, LBAPinRef &pin, const overwrite_plan_t &overwrite_plan) +operate_ret operate_left(context_t ctx, LBAMappingRef &pin, const overwrite_plan_t &overwrite_plan) { if (overwrite_plan.get_left_size() == 0) { return get_iertr::make_ready_future( @@ -555,7 +555,7 @@ operate_ret operate_left(context_t ctx, LBAPinRef &pin, const overwrite_plan_t & * * Proceed overwrite_plan.right_operation. 
*/ -operate_ret operate_right(context_t ctx, LBAPinRef &pin, const overwrite_plan_t &overwrite_plan) +operate_ret operate_right(context_t ctx, LBAMappingRef &pin, const overwrite_plan_t &overwrite_plan) { if (overwrite_plan.get_right_size() == 0) { return get_iertr::make_ready_future( diff --git a/src/crimson/os/seastore/transaction_manager.cc b/src/crimson/os/seastore/transaction_manager.cc index d63af2d57d4c9..eda9ca1c56fb0 100644 --- a/src/crimson/os/seastore/transaction_manager.cc +++ b/src/crimson/os/seastore/transaction_manager.cc @@ -377,30 +377,6 @@ TransactionManager::do_submit_transaction( backref_to_clear.push_back(e); } - // ...but add_pin from parent->leaf - std::vector lba_to_link; - std::vector backref_to_link; - lba_to_link.reserve(tref.get_fresh_block_stats().num + - tref.get_existing_block_stats().valid_num); - backref_to_link.reserve(tref.get_fresh_block_stats().num); - tref.for_each_fresh_block([&](auto &e) { - if (e->is_valid()) { - if (is_lba_node(e->get_type()) || e->is_logical()) - lba_to_link.push_back(e); - else if (is_backref_node(e->get_type())) - backref_to_link.push_back(e); - } - }); - - for (auto &e: tref.get_existing_block_list()) { - if (e->is_valid()) { - lba_to_link.push_back(e); - } - } - - lba_manager->complete_transaction(tref, lba_to_clear, lba_to_link); - backref_manager->complete_transaction(tref, backref_to_clear, backref_to_link); - journal->get_trimmer().update_journal_tails( cache->get_oldest_dirty_from().value_or(start_seq), cache->get_oldest_backref_dirty_from().value_or(start_seq)); @@ -473,7 +449,6 @@ TransactionManager::rewrite_logical_extent( lextent->get_length(), nlextent->get_bptr().c_str()); nlextent->set_laddr(lextent->get_laddr()); - nlextent->set_pin(lextent->get_pin().duplicate()); nlextent->set_modify_time(lextent->get_modify_time()); DEBUGT("rewriting logical extent -- {} to {}", t, *lextent, *nlextent); @@ -581,7 +556,7 @@ TransactionManager::get_extents_if_live( return trans_intr::parallel_for_each( 
pin_list, [=, this, &list, &t]( - LBAPinRef &pin) -> Cache::get_extent_iertr::future<> + LBAMappingRef &pin) -> Cache::get_extent_iertr::future<> { auto pin_paddr = pin->get_val(); auto &pin_seg_paddr = pin_paddr.as_seg_paddr(); diff --git a/src/crimson/os/seastore/transaction_manager.h b/src/crimson/os/seastore/transaction_manager.h index e5f71352724e7..7a67d4efe9c4d 100644 --- a/src/crimson/os/seastore/transaction_manager.h +++ b/src/crimson/os/seastore/transaction_manager.h @@ -90,7 +90,7 @@ class TransactionManager : public ExtentCallbackInterface { * Get the logical pin at offset */ using get_pin_iertr = LBAManager::get_mapping_iertr; - using get_pin_ret = LBAManager::get_mapping_iertr::future; + using get_pin_ret = LBAManager::get_mapping_iertr::future; get_pin_ret get_pin( Transaction &t, laddr_t offset) { @@ -205,13 +205,13 @@ class TransactionManager : public ExtentCallbackInterface { auto ret = cache->duplicate_for_write( t, ref)->cast(); - if (!ret->has_pin()) { + if (!ret->has_laddr()) { SUBDEBUGT(seastore_tm, "duplicating extent for write -- {} -> {}", t, *ref, *ret); - ret->set_pin(ref->get_pin().duplicate()); + ret->set_laddr(ref->get_laddr()); } else { SUBTRACET(seastore_tm, "extent is already duplicated -- {}", @@ -283,8 +283,8 @@ class TransactionManager : public ExtentCallbackInterface { len, ext->get_paddr(), ext.get() - ).si_then([ext=std::move(ext), laddr_hint, &t, FNAME](auto &&ref) mutable { - ext->set_pin(std::move(ref)); + ).si_then([ext=std::move(ext), laddr_hint, &t](auto &&) mutable { + LOG_PREFIX(TransactionManager::alloc_extent); SUBDEBUGT(seastore_tm, "new extent: {}, laddr_hint: {}", t, *ext, laddr_hint); return alloc_extent_iertr::make_ready_future>( std::move(ext)); @@ -341,7 +341,6 @@ class TransactionManager : public ExtentCallbackInterface { ext.get() ).si_then([ext=std::move(ext), laddr_hint, this](auto &&ref) { ceph_assert(laddr_hint == ref->get_key()); - ext->set_pin(std::move(ref)); return epm->read( ext->get_paddr(), 
ext->get_length(), @@ -355,7 +354,7 @@ class TransactionManager : public ExtentCallbackInterface { using reserve_extent_iertr = alloc_extent_iertr; - using reserve_extent_ret = reserve_extent_iertr::future; + using reserve_extent_ret = reserve_extent_iertr::future; reserve_extent_ret reserve_region( Transaction &t, laddr_t hint, @@ -672,8 +671,7 @@ class TransactionManager : public ExtentCallbackInterface { assert(pin->get_parent()); assert(!pin->get_parent()->is_pending()); pin->link_child(&lextent); - lextent.set_pin(std::move(pin)); - lba_manager->add_pin(lextent.get_pin()); + lextent.set_laddr(pin->get_key()); } ).si_then([FNAME, &t](auto ref) { SUBTRACET(seastore_tm, "got extent -- {}", t, *ref); diff --git a/src/test/crimson/seastore/test_btree_lba_manager.cc b/src/test/crimson/seastore/test_btree_lba_manager.cc index 8ca18fe3b9502..f3cb83324bcf8 100644 --- a/src/test/crimson/seastore/test_btree_lba_manager.cc +++ b/src/test/crimson/seastore/test_btree_lba_manager.cc @@ -328,23 +328,7 @@ struct btree_lba_manager_test : btree_test_base { btree_lba_manager_test() = default; - void complete_commit(Transaction &t) final { - std::vector lba_to_clear; - lba_to_clear.reserve(t.get_retired_set().size()); - for (auto &e: t.get_retired_set()) { - if (e->is_logical() || is_lba_node(e->get_type())) - lba_to_clear.push_back(e); - } - std::vector lba_to_link; - lba_to_link.reserve(t.get_fresh_block_stats().num); - t.for_each_fresh_block([&](auto &e) { - if (e->is_valid() && - (is_lba_node(e->get_type()) || e->is_logical())) - lba_to_link.push_back(e); - }); - - lba_manager->complete_transaction(t, lba_to_clear, lba_to_link); - } + void complete_commit(Transaction &t) final {} LBAManager::mkfs_ret test_structure_setup(Transaction &t) final { lba_manager.reset(new BtreeLBAManager(*cache)); diff --git a/src/test/crimson/seastore/test_object_data_handler.cc b/src/test/crimson/seastore/test_object_data_handler.cc index 0697e13aab3df..11b9ed0e6cca7 100644 --- 
a/src/test/crimson/seastore/test_object_data_handler.cc +++ b/src/test/crimson/seastore/test_object_data_handler.cc @@ -135,7 +135,7 @@ struct object_data_handler_test_t: } } } - std::list get_mappings(objaddr_t offset, extent_len_t length) { + std::list get_mappings(objaddr_t offset, extent_len_t length) { auto t = create_mutate_transaction(); auto ret = with_trans_intr(*t, [&](auto &t) { return tm->get_pins(t, offset, length); From 302bc3c2d95cd36012008e334444f689ddb6694f Mon Sep 17 00:00:00 2001 From: Xuehan Xu Date: Mon, 29 Aug 2022 16:12:00 +0800 Subject: [PATCH 18/21] test/crimson/seastore: check intra-fixedkv-btree parent->child trackers during unittests Signed-off-by: Xuehan Xu --- .../seastore/backref/btree_backref_manager.cc | 13 ++++++++++++- .../seastore/backref/btree_backref_manager.h | 2 ++ src/crimson/os/seastore/backref_manager.h | 3 +++ src/crimson/os/seastore/btree/fixed_kv_btree.h | 14 +++++++++----- src/crimson/os/seastore/cache.h | 18 ++++++++++++++++++ src/crimson/os/seastore/cached_extent.h | 10 ---------- src/crimson/os/seastore/lba_manager.h | 3 +++ .../lba_manager/btree/btree_lba_manager.cc | 11 +++++++++++ .../lba_manager/btree/btree_lba_manager.h | 2 ++ .../crimson/seastore/test_btree_lba_manager.cc | 5 +++++ .../seastore/test_transaction_manager.cc | 5 +++++ 11 files changed, 70 insertions(+), 16 deletions(-) diff --git a/src/crimson/os/seastore/backref/btree_backref_manager.cc b/src/crimson/os/seastore/backref/btree_backref_manager.cc index eab7fb9709e5a..ce3f737b21a8e 100644 --- a/src/crimson/os/seastore/backref/btree_backref_manager.cc +++ b/src/crimson/os/seastore/backref/btree_backref_manager.cc @@ -332,6 +332,17 @@ BtreeBackrefManager::merge_cached_backrefs( }); } +BtreeBackrefManager::check_child_trackers_ret +BtreeBackrefManager::check_child_trackers( + Transaction &t) { + auto c = get_context(t); + return with_btree( + cache, c, + [c](auto &btree) { + return btree.check_child_trackers(c); + }); +} + 
BtreeBackrefManager::scan_mapped_space_ret BtreeBackrefManager::scan_mapped_space( Transaction &t, @@ -419,7 +430,7 @@ BtreeBackrefManager::scan_mapped_space( BackrefBtree::mapped_space_visitor_t f = [&scan_visitor, block_size, FNAME, c]( paddr_t paddr, paddr_t key, extent_len_t len, - depth_t depth, extent_types_t type) { + depth_t depth, extent_types_t type, BackrefBtree::iterator&) { TRACET("tree node {}~{} {}, depth={} used", c.trans, paddr, len, type, depth); ceph_assert(paddr.is_absolute()); diff --git a/src/crimson/os/seastore/backref/btree_backref_manager.h b/src/crimson/os/seastore/backref/btree_backref_manager.h index 48ef4d8319171..e19d9ce7b065b 100644 --- a/src/crimson/os/seastore/backref/btree_backref_manager.h +++ b/src/crimson/os/seastore/backref/btree_backref_manager.h @@ -75,6 +75,8 @@ class BtreeBackrefManager : public BackrefManager { Transaction &t, paddr_t offset) final; + check_child_trackers_ret check_child_trackers(Transaction &t) final; + scan_mapped_space_ret scan_mapped_space( Transaction &t, scan_mapped_space_func_t &&f) final; diff --git a/src/crimson/os/seastore/backref_manager.h b/src/crimson/os/seastore/backref_manager.h index 3feedb997b4c3..4a354bdca8798 100644 --- a/src/crimson/os/seastore/backref_manager.h +++ b/src/crimson/os/seastore/backref_manager.h @@ -127,6 +127,9 @@ class BackrefManager { Transaction &t, paddr_t offset) = 0; + using check_child_trackers_ret = base_iertr::future<>; + virtual check_child_trackers_ret check_child_trackers(Transaction &t) = 0; + /** * scan all extents in both tree and cache, * including backref extents, logical extents and lba extents, diff --git a/src/crimson/os/seastore/btree/fixed_kv_btree.h b/src/crimson/os/seastore/btree/fixed_kv_btree.h index 7248e67a0503f..2aaf1620fcb4d 100644 --- a/src/crimson/os/seastore/btree/fixed_kv_btree.h +++ b/src/crimson/os/seastore/btree/fixed_kv_btree.h @@ -77,7 +77,7 @@ class FixedKVBtree { using iterator_fut = base_iertr::future; using mapped_space_visitor_t 
= std::function< - void(paddr_t, node_key_t, extent_len_t, depth_t, extent_types_t)>; + void(paddr_t, node_key_t, extent_len_t, depth_t, extent_types_t, iterator&)>; class iterator { public: @@ -1377,7 +1377,8 @@ class FixedKVBtree { root_node->get_node_meta().begin, root_node->get_length(), get_root().get_depth(), - internal_node_t::TYPE); + internal_node_t::TYPE, + iter); return lookup_root_iertr::now(); }; auto on_found_leaf = @@ -1388,7 +1389,8 @@ class FixedKVBtree { root_node->get_node_meta().begin, root_node->get_length(), get_root().get_depth(), - leaf_node_t::TYPE); + leaf_node_t::TYPE, + iter); return lookup_root_iertr::now(); }; @@ -1465,7 +1467,8 @@ class FixedKVBtree { node->get_node_meta().begin, node->get_length(), depth, - node->get_type()); + node->get_type(), + iter); return seastar::now(); }; @@ -1532,7 +1535,8 @@ class FixedKVBtree { node->get_node_meta().begin, node->get_length(), 1, - node->get_type()); + node->get_type(), + iter); return seastar::now(); }; diff --git a/src/crimson/os/seastore/cache.h b/src/crimson/os/seastore/cache.h index b13875f9c467f..2b2b66fd9231d 100644 --- a/src/crimson/os/seastore/cache.h +++ b/src/crimson/os/seastore/cache.h @@ -25,6 +25,15 @@ class BtreeBackrefManager; namespace crimson::os::seastore { +template < + typename node_key_t, + typename node_val_t, + typename internal_node_t, + typename leaf_node_t, + typename pin_t, + size_t node_size, + bool leaf_has_children> +class FixedKVBtree; class BackrefManager; class SegmentProvider; @@ -1540,6 +1549,15 @@ class Cache { } } + template < + typename node_key_t, + typename node_val_t, + typename internal_node_t, + typename leaf_node_t, + typename pin_t, + size_t node_size, + bool leaf_has_children> + friend class FixedKVBtree; }; using CacheRef = std::unique_ptr; diff --git a/src/crimson/os/seastore/cached_extent.h b/src/crimson/os/seastore/cached_extent.h index 3c4d79e0ca1e1..3d7af9fcbdc52 100644 --- a/src/crimson/os/seastore/cached_extent.h +++ 
b/src/crimson/os/seastore/cached_extent.h @@ -1139,16 +1139,6 @@ class LogicalCachedExtent : public ChildableCachedExtent { private: laddr_t laddr = L_ADDR_NULL; - - template < - typename node_key_t, - typename node_val_t, - typename internal_node_t, - typename leaf_node_t, - typename pin_t, - size_t node_size, - bool leaf_has_children> - friend class FixedKVBtree; }; using LogicalCachedExtentRef = TCachedExtentRef; diff --git a/src/crimson/os/seastore/lba_manager.h b/src/crimson/os/seastore/lba_manager.h index d79f72a6a7bc6..f36a788344ac0 100644 --- a/src/crimson/os/seastore/lba_manager.h +++ b/src/crimson/os/seastore/lba_manager.h @@ -124,6 +124,9 @@ class LBAManager { Transaction &t, CachedExtentRef e) = 0; + using check_child_trackers_ret = base_iertr::future<>; + virtual check_child_trackers_ret check_child_trackers(Transaction &t) = 0; + /** * Calls f for each mapping in [begin, end) */ diff --git a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc index c4756dc083c16..296af756b756a 100644 --- a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc +++ b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc @@ -351,6 +351,17 @@ BtreeLBAManager::init_cached_extent( }); } +BtreeLBAManager::check_child_trackers_ret +BtreeLBAManager::check_child_trackers( + Transaction &t) { + auto c = get_context(t); + return with_btree( + cache, c, + [c](auto &btree) { + return btree.check_child_trackers(c); + }); +} + BtreeLBAManager::scan_mappings_ret BtreeLBAManager::scan_mappings( Transaction &t, diff --git a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h index 6dcdbb568b2b7..b48abf9456bbe 100644 --- a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h +++ b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h @@ -120,6 +120,8 @@ class BtreeLBAManager : public LBAManager { 
Transaction &t, CachedExtentRef e) final; + check_child_trackers_ret check_child_trackers(Transaction &t) final; + scan_mappings_ret scan_mappings( Transaction &t, laddr_t begin, diff --git a/src/test/crimson/seastore/test_btree_lba_manager.cc b/src/test/crimson/seastore/test_btree_lba_manager.cc index f3cb83324bcf8..67e187465616a 100644 --- a/src/test/crimson/seastore/test_btree_lba_manager.cc +++ b/src/test/crimson/seastore/test_btree_lba_manager.cc @@ -499,6 +499,11 @@ struct btree_lba_manager_test : btree_test_base { } void check_mappings(test_transaction_t &t) { + (void)with_trans_intr( + *t.t, + [=, this](auto &t) { + return lba_manager->check_child_trackers(t); + }).unsafe_get0(); for (auto &&i: t.mappings) { auto laddr = i.first; auto len = i.second.len; diff --git a/src/test/crimson/seastore/test_transaction_manager.cc b/src/test/crimson/seastore/test_transaction_manager.cc index baea0358e766d..d0bc2bd3f8f19 100644 --- a/src/test/crimson/seastore/test_transaction_manager.cc +++ b/src/test/crimson/seastore/test_transaction_manager.cc @@ -561,6 +561,11 @@ struct transaction_manager_test_t : ++iter; }); }).unsafe_get0(); + (void)with_trans_intr( + *t.t, + [=, this](auto &t) { + return lba_manager->check_child_trackers(t); + }).unsafe_get0(); } bool try_submit_transaction(test_transaction_t t) { From 3c4f8c761333dcc1a4e24e73f37808720c8f684f Mon Sep 17 00:00:00 2001 From: Xuehan Xu Date: Sat, 11 Mar 2023 03:46:14 +0000 Subject: [PATCH 19/21] test/crimson/seastore: complement lba test with logical extents Signed-off-by: Xuehan Xu --- src/crimson/os/seastore/cache.cc | 2 +- src/crimson/os/seastore/cache.h | 14 ++++- src/crimson/os/seastore/cached_extent.h | 2 + .../os/seastore/extent_placement_manager.h | 12 ++++ src/crimson/os/seastore/seastore_types.h | 4 ++ src/crimson/os/seastore/transaction.h | 7 ++- .../seastore/test_btree_lba_manager.cc | 60 ++++++++++++++----- 7 files changed, 80 insertions(+), 21 deletions(-) diff --git a/src/crimson/os/seastore/cache.cc 
b/src/crimson/os/seastore/cache.cc index 65f7f1d400f3d..d6c9fdce3aa63 100644 --- a/src/crimson/os/seastore/cache.cc +++ b/src/crimson/os/seastore/cache.cc @@ -1196,7 +1196,7 @@ record_t Cache::prepare_record( fresh_stat.increment(i->get_length()); get_by_ext(efforts.fresh_inline_by_ext, i->get_type()).increment(i->get_length()); - assert(i->is_inline()); + assert(i->is_inline() || i->get_paddr().is_fake()); bufferlist bl; i->prepare_write(); diff --git a/src/crimson/os/seastore/cache.h b/src/crimson/os/seastore/cache.h index 2b2b66fd9231d..3abb7bb9360c1 100644 --- a/src/crimson/os/seastore/cache.h +++ b/src/crimson/os/seastore/cache.h @@ -804,19 +804,29 @@ class Cache { /** * alloc_new_extent * - * Allocates a fresh extent. if delayed is true, addr will be alloc'd later + * Allocates a fresh extent. if delayed is true, addr will be alloc'd later. + * Note that epaddr can only be fed by the btree lba unittest for now */ template TCachedExtentRef alloc_new_extent( Transaction &t, ///< [in, out] current transaction extent_len_t length, ///< [in] length placement_hint_t hint, ///< [in] user hint - rewrite_gen_t gen ///< [in] rewrite generation +#ifdef UNIT_TESTS_BUILT + rewrite_gen_t gen, ///< [in] rewrite generation + std::optional epaddr = std::nullopt ///< [in] paddr fed by callers +#else + rewrite_gen_t gen +#endif ) { LOG_PREFIX(Cache::alloc_new_extent); SUBTRACET(seastore_cache, "allocate {} {}B, hint={}, gen={}", t, T::TYPE, length, hint, rewrite_gen_printer_t{gen}); +#ifdef UNIT_TESTS_BUILT + auto result = epm.alloc_new_extent(t, T::TYPE, length, hint, gen, epaddr); +#else auto result = epm.alloc_new_extent(t, T::TYPE, length, hint, gen); +#endif auto ret = CachedExtent::make_cached_extent_ref(std::move(result.bp)); ret->init(CachedExtent::extent_state_t::INITIAL_WRITE_PENDING, result.paddr, diff --git a/src/crimson/os/seastore/cached_extent.h b/src/crimson/os/seastore/cached_extent.h index 3d7af9fcbdc52..464f34d79fdc7 100644 --- 
a/src/crimson/os/seastore/cached_extent.h +++ b/src/crimson/os/seastore/cached_extent.h @@ -16,6 +16,8 @@ #include "crimson/common/interruptible_future.h" #include "crimson/os/seastore/seastore_types.h" +struct btree_lba_manager_test; + namespace crimson::os::seastore { class Transaction; diff --git a/src/crimson/os/seastore/extent_placement_manager.h b/src/crimson/os/seastore/extent_placement_manager.h index 9ab9ce7fe9f3e..b94c03ec34ada 100644 --- a/src/crimson/os/seastore/extent_placement_manager.h +++ b/src/crimson/os/seastore/extent_placement_manager.h @@ -246,7 +246,12 @@ class ExtentPlacementManager { extent_types_t type, extent_len_t length, placement_hint_t hint, +#ifdef UNIT_TESTS_BUILT + rewrite_gen_t gen, + std::optional external_paddr = std::nullopt +#else rewrite_gen_t gen +#endif ) { assert(hint < placement_hint_t::NUM_HINTS); assert(is_target_rewrite_generation(gen)); @@ -261,7 +266,14 @@ class ExtentPlacementManager { buffer::create_page_aligned(length)); bp.zero(); paddr_t addr; +#ifdef UNIT_TESTS_BUILT + if (unlikely(external_paddr.has_value())) { + assert(external_paddr->is_fake()); + addr = *external_paddr; + } else if (gen == INLINE_GENERATION) { +#else if (gen == INLINE_GENERATION) { +#endif addr = make_record_relative_paddr(0); } else if (category == data_category_t::DATA) { assert(data_writers_by_gen[generation_to_writer(gen)]); diff --git a/src/crimson/os/seastore/seastore_types.h b/src/crimson/os/seastore/seastore_types.h index 61ddfe5633718..55d8eb4a260ac 100644 --- a/src/crimson/os/seastore/seastore_types.h +++ b/src/crimson/os/seastore/seastore_types.h @@ -625,6 +625,10 @@ struct paddr_t { return get_addr_type() != paddr_types_t::RESERVED; } + bool is_fake() const { + return get_device_id() == DEVICE_ID_FAKE; + } + auto operator<=>(const paddr_t &) const = default; DENC(paddr_t, v, p) { diff --git a/src/crimson/os/seastore/transaction.h b/src/crimson/os/seastore/transaction.h index ed9d1d1a0c4ae..d423196feba70 100644 --- 
a/src/crimson/os/seastore/transaction.h +++ b/src/crimson/os/seastore/transaction.h @@ -169,8 +169,11 @@ class Transaction { pre_alloc_list.emplace_back(ref->cast()); fresh_block_stats.increment(ref->get_length()); } else { - assert(ref->get_paddr() == make_record_relative_paddr(0)); - ref->set_paddr(make_record_relative_paddr(offset)); + if (likely(ref->get_paddr() == make_record_relative_paddr(0))) { + ref->set_paddr(make_record_relative_paddr(offset)); + } else { + ceph_assert(ref->get_paddr().is_fake()); + } offset += ref->get_length(); inline_block_list.push_back(ref); fresh_block_stats.increment(ref->get_length()); diff --git a/src/test/crimson/seastore/test_btree_lba_manager.cc b/src/test/crimson/seastore/test_btree_lba_manager.cc index 67e187465616a..0635358463aa5 100644 --- a/src/test/crimson/seastore/test_btree_lba_manager.cc +++ b/src/test/crimson/seastore/test_btree_lba_manager.cc @@ -252,13 +252,29 @@ struct lba_btree_test : btree_test_base { return lba_map_val_t{0, P_ADDR_NULL, len, 0}; } + device_off_t next_off = 0; + paddr_t get_paddr() { + next_off += block_size; + return make_fake_paddr(next_off); + } + void insert(laddr_t addr, extent_len_t len) { ceph_assert(check.count(addr) == 0); check.emplace(addr, get_map_val(len)); lba_btree_update([=, this](auto &btree, auto &t) { + auto extent = cache->alloc_new_extent( + t, + TestBlock::SIZE, + placement_hint_t::HOT, + 0, + get_paddr()); return btree.insert( - get_op_context(t), addr, get_map_val(len), nullptr - ).si_then([](auto){}); + get_op_context(t), addr, get_map_val(len), extent.get() + ).si_then([addr, extent](auto p){ + auto& [iter, inserted] = p; + assert(inserted); + extent->set_laddr(addr); + }); }); } @@ -405,12 +421,18 @@ struct btree_lba_manager_test : btree_test_base { auto alloc_mapping( test_transaction_t &t, laddr_t hint, - size_t len, - paddr_t paddr) { + size_t len) { auto ret = with_trans_intr( *t.t, [=, this](auto &t) { - return lba_manager->alloc_extent(t, hint, len, paddr, 
nullptr); + auto extent = cache->alloc_new_extent( + t, + TestBlock::SIZE, + placement_hint_t::HOT, + 0, + get_paddr()); + return lba_manager->alloc_extent( + t, hint, len, extent->get_paddr(), extent.get()); }).unsafe_get0(); logger().debug("alloc'd: {}", *ret); EXPECT_EQ(len, ret->get_length()); @@ -441,14 +463,20 @@ struct btree_lba_manager_test : btree_test_base { ceph_assert(target->second.refcount > 0); target->second.refcount--; - auto refcnt = with_trans_intr( + (void) with_trans_intr( *t.t, [=, this](auto &t) { return lba_manager->decref_extent( t, - target->first); - }).unsafe_get0().refcount; - EXPECT_EQ(refcnt, target->second.refcount); + target->first + ).si_then([this, &t, target](auto result) { + EXPECT_EQ(result.refcount, target->second.refcount); + if (result.refcount == 0) { + return cache->retire_extent_addr(t, result.addr, result.length); + } + return Cache::retire_extent_iertr::now(); + }); + }).unsafe_get0(); if (target->second.refcount == 0) { t.mappings.erase(target); } @@ -557,7 +585,7 @@ TEST_F(btree_lba_manager_test, basic) auto t = create_transaction(); check_mappings(t); // check in progress transaction sees mapping check_mappings(); // check concurrent does not - auto ret = alloc_mapping(t, laddr, block_size, get_paddr()); + auto ret = alloc_mapping(t, laddr, block_size); submit_test_transaction(std::move(t)); } check_mappings(); // check new transaction post commit sees it @@ -571,7 +599,7 @@ TEST_F(btree_lba_manager_test, force_split) auto t = create_transaction(); logger().debug("opened transaction"); for (unsigned j = 0; j < 5; ++j) { - auto ret = alloc_mapping(t, 0, block_size, get_paddr()); + auto ret = alloc_mapping(t, 0, block_size); if ((i % 10 == 0) && (j == 3)) { check_mappings(t); check_mappings(); @@ -591,7 +619,7 @@ TEST_F(btree_lba_manager_test, force_split_merge) auto t = create_transaction(); logger().debug("opened transaction"); for (unsigned j = 0; j < 5; ++j) { - auto ret = alloc_mapping(t, 0, block_size, 
get_paddr()); + auto ret = alloc_mapping(t, 0, block_size); // just to speed things up a bit if ((i % 100 == 0) && (j == 3)) { check_mappings(t); @@ -648,7 +676,7 @@ TEST_F(btree_lba_manager_test, single_transaction_split_merge) { auto t = create_transaction(); for (unsigned i = 0; i < 400; ++i) { - alloc_mapping(t, 0, block_size, get_paddr()); + alloc_mapping(t, 0, block_size); } check_mappings(t); submit_test_transaction(std::move(t)); @@ -671,7 +699,7 @@ TEST_F(btree_lba_manager_test, single_transaction_split_merge) { auto t = create_transaction(); for (unsigned i = 0; i < 600; ++i) { - alloc_mapping(t, 0, block_size, get_paddr()); + alloc_mapping(t, 0, block_size); } auto addresses = get_mapped_addresses(t); for (unsigned i = 0; i != addresses.size(); ++i) { @@ -699,7 +727,7 @@ TEST_F(btree_lba_manager_test, split_merge_multi) } }; iterate([&](auto &t, auto idx) { - alloc_mapping(t, idx * block_size, block_size, get_paddr()); + alloc_mapping(t, idx * block_size, block_size); }); check_mappings(); iterate([&](auto &t, auto idx) { @@ -710,7 +738,7 @@ TEST_F(btree_lba_manager_test, split_merge_multi) check_mappings(); iterate([&](auto &t, auto idx) { if ((idx % 32) > 0) { - alloc_mapping(t, idx * block_size, block_size, get_paddr()); + alloc_mapping(t, idx * block_size, block_size); } }); check_mappings(); From 62974a65897edce2afdb16ba99baea0193a0d4a6 Mon Sep 17 00:00:00 2001 From: Xuehan Xu Date: Thu, 23 Mar 2023 09:59:12 +0000 Subject: [PATCH 20/21] crimson/os/seastore/cache: add comment about backref_extent_entry_t Signed-off-by: Xuehan Xu --- src/crimson/os/seastore/cache.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/crimson/os/seastore/cache.h b/src/crimson/os/seastore/cache.h index 3abb7bb9360c1..9289dda0881b5 100644 --- a/src/crimson/os/seastore/cache.h +++ b/src/crimson/os/seastore/cache.h @@ -1106,6 +1106,14 @@ class Cache { /// Dump live extents void dump_contents(); + /** + * backref_extent_entry_t + * + * All the backref extent 
entries have to be indexed by paddr in memory, + * so they can be retrieved by range during cleaning. + * + * See BtreeBackrefManager::retrieve_backref_extents_in_range() + */ struct backref_extent_entry_t { backref_extent_entry_t( paddr_t paddr, From 33b56a04d51443f5f035bfc267da16d4a5dfe28c Mon Sep 17 00:00:00 2001 From: Xuehan Xu Date: Mon, 8 May 2023 08:15:55 +0000 Subject: [PATCH 21/21] crimson/tools/store_nbd: read logical extents via TransactionManager::read_pin() Signed-off-by: Xuehan Xu --- src/crimson/tools/store_nbd/tm_driver.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/crimson/tools/store_nbd/tm_driver.cc b/src/crimson/tools/store_nbd/tm_driver.cc index 310e5ed00ca61..bd216fd58ec4d 100644 --- a/src/crimson/tools/store_nbd/tm_driver.cc +++ b/src/crimson/tools/store_nbd/tm_driver.cc @@ -71,7 +71,7 @@ TMDriver::read_extents_ret TMDriver::read_extents( "read_extents: get_extent {}~{}", pin->get_val(), pin->get_length()); - return tm->pin_to_extent( + return tm->read_pin( t, std::move(pin) ).si_then([&ret](auto ref) mutable {