diff --git a/src/os/bluestore/BitmapFreelistManager.cc b/src/os/bluestore/BitmapFreelistManager.cc index 7ecd326887523c..70cd11445a8ce1 100644 --- a/src/os/bluestore/BitmapFreelistManager.cc +++ b/src/os/bluestore/BitmapFreelistManager.cc @@ -125,7 +125,6 @@ int BitmapFreelistManager::_expand(uint64_t old_size, KeyValueDB* db) << old_size << " to 0x" << (blocks0 * bytes_per_block) << " (0x" << blocks0 << " blocks)" << std::dec << dendl; // reset past-eof blocks to unallocated - // TBD:: GBH :: do we need _sync/_expand with NULL FM ?? _xor(old_size, blocks0 * bytes_per_block - old_size, txn); } @@ -308,7 +307,6 @@ void BitmapFreelistManager::_sync(KeyValueDB* kvdb, bool read_only) if (!read_only && size_db < size) { dout(1) << __func__ << " committing new size 0x" << std::hex << size << std::dec << dendl; - // TBD:: GBH :: do we need _sync/_expand with NULL FM ?? r = _expand(size_db, kvdb); ceph_assert(r == 0); } else if (size_db > size) { @@ -488,7 +486,7 @@ void BitmapFreelistManager::allocate( { dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length << std::dec << dendl; - if (!null_manager) { + if (is_null_manager() == false) { _xor(offset, length, txn); } } @@ -499,7 +497,7 @@ void BitmapFreelistManager::release( { dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length << std::dec << dendl; - if (!null_manager) { + if (is_null_manager() == false) { _xor(offset, length, txn); } } diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index e0deee42875793..3d3ec675040335 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -5968,11 +5968,10 @@ int BlueStore::_open_db_and_around(bool read_only, bool to_repair) goto out_db; } - // don't read allocation meta data from rocksdb r = _open_fm(nullptr, true); if (r < 0) goto out_db; - + r = _init_alloc(read_only); if (r < 0) goto out_fm; @@ -5988,8 +5987,7 @@ int BlueStore::_open_db_and_around(bool read_only, bool to_repair) r = _open_db(false, to_repair, read_only); if (r < 0) { goto out_alloc; - } - + } return 0; out_alloc: @@ -7148,6 +7146,17 @@ int BlueStore::_mount() //================================================================================================================ +// BlueStore is committing all allocation information (alloc/release) into RocksDB before the client Write is performed. +// This cause a delay in write path and add significant load to the CPU/Memory/Disk. +// This allows Ceph to survive any failure without losing the allocation state. +// +// This new code skips the RocksDB updates on allocation time and instead perform a full desatge of the allocator object +// with all the OSD allocation state in a single step during umount(). +// This result with an 25% increase in IOPS and reduce latency in small random-write workload, but exposes the system +// to losing allocation info in failure cases where we don't call umount. +// We add code to perform a full allocation-map rebuild from information stored inside the ONode which is used in failure cases. +// When we perform a graceful shutdown there is no need for recovery and we simply read the allocation-map from a flat file +// where we store the allocation-map during umount(). //================================================================================================================ static const std::string allocator_dir = "ALLOCATOR_DIR_GBH2"; static const std::string allocator_file = "ALLOCATOR_FILE_GBH2"; @@ -8148,7 +8157,6 @@ int BlueStore::umount() dout(20) << __func__ << " closing" << dendl; } - _close_db_and_around(false); if (cct->_conf->bluestore_fsck_on_umount) { @@ -8160,7 +8168,6 @@ int BlueStore::umount() return -EIO; } } - return 0; } @@ -9943,7 +9950,7 @@ void BlueStore::inject_broken_shared_blob_key(const string& key, db->submit_transaction_sync(txn); }; -void BlueStore::inject_leaked_f(uint64_t len) +void BlueStore::inject_leaked(uint64_t len) { ceph_assert(!fm->is_null_manager()); @@ -9960,7 +9967,7 @@ void BlueStore::inject_leaked_f(uint64_t len) db->submit_transaction_sync(txn); } -void BlueStore::inject_false_free_f(coll_t cid, ghobject_t oid) +void BlueStore::inject_false_free(coll_t cid, ghobject_t oid) { ceph_assert(!fm->is_null_manager()); diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h index f69dffd02a4a61..c6c26ec874d24a 100644 --- a/src/os/bluestore/BlueStore.h +++ b/src/os/bluestore/BlueStore.h @@ -2971,8 +2971,8 @@ class BlueStore : public ObjectStore, /// methods to inject various errors fsck can repair void inject_broken_shared_blob_key(const std::string& key, const ceph::buffer::list& bl); - void inject_leaked_f(uint64_t len); - void inject_false_free_f(coll_t cid, ghobject_t oid); + void inject_leaked(uint64_t len); + void inject_false_free(coll_t cid, ghobject_t oid); void inject_statfs(const std::string& key, const store_statfs_t& new_statfs); void inject_global_statfs(const store_statfs_t& new_statfs); void inject_misreference(coll_t cid1, ghobject_t oid1, diff --git a/src/os/bluestore/FreelistManager.cc b/src/os/bluestore/FreelistManager.cc index 04ffdde3ab1ca9..5907df443e00b9 100644 --- a/src/os/bluestore/FreelistManager.cc +++ b/src/os/bluestore/FreelistManager.cc @@ -17,9 +17,8 @@ FreelistManager *FreelistManager::create( // op is per prefix, has to done pre-db-open, and we don't know the // freelist type until after we open the db. ceph_assert(prefix == "B"); - if (type == "bitmap") { + if (type == "bitmap") return new BitmapFreelistManager(cct, "B", "b"); - } #ifdef HAVE_LIBZBD // With zoned drives there is only one FreelistManager implementation that we diff --git a/src/os/bluestore/FreelistManager.h b/src/os/bluestore/FreelistManager.h index 8e4edb1fd15a2e..8b18cc4b04f4d9 100644 --- a/src/os/bluestore/FreelistManager.h +++ b/src/os/bluestore/FreelistManager.h @@ -13,12 +13,11 @@ #include "zoned_types.h" class FreelistManager { + bool null_manager = false; public: CephContext* cct; - std::string type; - bool null_manager = false; - FreelistManager(CephContext* cct) : cct(cct), null_manager(false) {} - virtual ~FreelistManager() { std::cout << __func__ << std::endl; } + FreelistManager(CephContext* cct) : null_manager(false), cct(cct) {} + virtual ~FreelistManager() {} static FreelistManager *create( CephContext* cct, @@ -50,6 +49,7 @@ class FreelistManager { virtual uint64_t get_size() const = 0; virtual uint64_t get_alloc_units() const = 0; virtual uint64_t get_alloc_size() const = 0; + virtual void get_meta(uint64_t target_size, std::vector>*) const = 0; diff --git a/src/os/bluestore/bluestore_tool.cc b/src/os/bluestore/bluestore_tool.cc index 53a9854cae0b03..62c0886c6c3295 100644 --- a/src/os/bluestore/bluestore_tool.cc +++ b/src/os/bluestore/bluestore_tool.cc @@ -454,7 +454,6 @@ int main(int argc, char **argv) validate_path(cct.get(), path, false); BlueStore bluestore(cct.get(), path); int r = bluestore.read_allocation_from_drive_for_bluestore_tool(); - cout << action << " bluestore.allocmap ret_code=" << r << std::endl; if (r < 0) { cerr << action << " failed: " << cpp_strerror(r) << std::endl; exit(EXIT_FAILURE); diff --git a/src/test/objectstore/store_test.cc b/src/test/objectstore/store_test.cc index 28a9fbf3853a19..3db6f40358b245 100644 --- a/src/test/objectstore/store_test.cc +++ b/src/test/objectstore/store_test.cc @@ -8213,7 +8213,7 @@ TEST_P(StoreTestSpecificAUSize, BluestoreRepairTest) { ASSERT_EQ(bstore->fsck(false), 0); ASSERT_EQ(bstore->repair(false), 0); bstore->mount(); - bstore->inject_leaked_f(0x30000); + bstore->inject_leaked(0x30000); bstore->umount(); ASSERT_EQ(bstore->fsck(false), 1); ASSERT_EQ(bstore->repair(false), 0); @@ -8222,7 +8222,7 @@ TEST_P(StoreTestSpecificAUSize, BluestoreRepairTest) { //////////// false free fix //////////// cerr << "fix false free pextents" << std::endl; bstore->mount(); - bstore->inject_false_free_f(cid, hoid); + bstore->inject_false_free(cid, hoid); bstore->umount(); ASSERT_EQ(bstore->fsck(false), 2); ASSERT_EQ(bstore->repair(false), 0);