From bd207412f11685297aef2b54502290a1c29bc83b Mon Sep 17 00:00:00 2001
From: Igor Fedotov <igor.fedotov@croit.io>
Date: Mon, 10 Oct 2022 14:41:43 +0300
Subject: [PATCH 01/13] os/bluestore: unify allocation functions' signature at
 BlueFS.

Signed-off-by: Igor Fedotov <ifedotov@croit.io>
---
 src/os/bluestore/BlueFS.cc | 110 +++++++++++++++++--------------------
 src/os/bluestore/BlueFS.h  |   2 +-
 2 files changed, 52 insertions(+), 60 deletions(-)

diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc
index e72f24bbc7048..142a249f949d1 100644
--- a/src/os/bluestore/BlueFS.cc
+++ b/src/os/bluestore/BlueFS.cc
@@ -1160,11 +1160,6 @@ int BlueFS::_replay(bool noop, bool to_stdout)
   if (!noop) {
     log_file->vselector_hint =
       vselector->get_hint_for_log();
-  } else {
-    // do not use fnode from superblock in 'noop' mode - log_file's one should
-    // be fine and up-to-date
-    ceph_assert(log_file->fnode.ino == 1);
-    ceph_assert(log_file->fnode.extents.size() != 0);
   }
   dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl;
   if (unlikely(to_stdout)) {
@@ -1725,26 +1720,27 @@ int BlueFS::device_migrate_to_existing(
 
   for (auto& [ino, file_ref] : nodes.file_map) {
     //do not copy log
-    if (file_ref->fnode.ino == 1) {
+    if (ino == 1) {
       continue;
     }
     dout(10) << __func__ << " " << ino << " " << file_ref->fnode << dendl;
 
-    auto& fnode_extents = file_ref->fnode.extents;
     vselector->sub_usage(file_ref->vselector_hint, file_ref->fnode);
 
     bool rewrite = std::any_of(
-      fnode_extents.begin(),
-      fnode_extents.end(),
+      file_ref->fnode.extents.begin(),
+      file_ref->fnode.extents.end(),
       [=](auto& ext) {
 	return ext.bdev != dev_target && devs_source.count(ext.bdev);
       });
     if (rewrite) {
       dout(10) << __func__ << "  migrating" << dendl;
-
+      bluefs_fnode_t old_fnode;
+      old_fnode.swap_extents(file_ref->fnode);
+      auto& old_fnode_extents = old_fnode.extents;
       // read entire file
       bufferlist bl;
-      for (auto old_ext : fnode_extents) {
+      for (const auto &old_ext : old_fnode_extents) {
 	buf.resize(old_ext.length);
 	int r = _bdev_read_random(old_ext.bdev,
 	  old_ext.offset,
@@ -1761,8 +1757,8 @@ int BlueFS::device_migrate_to_existing(
       }
 
       // write entire file
-      PExtentVector extents;
-      auto l = _allocate_without_fallback(dev_target, bl.length(), &extents);
+      auto l = _allocate_without_fallback(dev_target, bl.length(),
+        &file_ref->fnode);
       if (l < 0) {
 	derr << __func__ << " unable to allocate len 0x" << std::hex
 	     << bl.length() << std::dec << " from " << (int)dev_target
@@ -1771,7 +1767,7 @@ int BlueFS::device_migrate_to_existing(
       }
 
       uint64_t off = 0;
-      for (auto& i : extents) {
+      for (auto& i : file_ref->fnode.extents) {
 	bufferlist cur;
 	uint64_t cur_len = std::min<uint64_t>(i.length, bl.length() - off);
 	ceph_assert(cur_len > 0);
@@ -1782,7 +1778,7 @@ int BlueFS::device_migrate_to_existing(
       }
 
       // release old extents
-      for (auto old_ext : fnode_extents) {
+      for (const auto &old_ext : old_fnode_extents) {
 	PExtentVector to_release;
 	to_release.emplace_back(old_ext.offset, old_ext.length);
 	alloc[old_ext.bdev]->release(to_release);
@@ -1792,12 +1788,11 @@ int BlueFS::device_migrate_to_existing(
       }
 
       // update fnode
-      fnode_extents.clear();
-      for (auto& i : extents) {
-	fnode_extents.emplace_back(dev_target_new, i.offset, i.length);
+      for (auto& i : file_ref->fnode.extents) {
+	i.bdev = dev_target_new;
       }
     } else {
-      for (auto& ext : fnode_extents) {
+      for (auto& ext : file_ref->fnode.extents) {
 	if (dev_target != dev_target_new && ext.bdev == dev_target) {
 	  dout(20) << __func__ << "  " << " ... adjusting extent 0x"
 		   << std::hex << ext.offset << std::dec
@@ -1863,30 +1858,29 @@ int BlueFS::device_migrate_to_new(
   flags |= devs_source.count(BDEV_WAL) ? REMOVE_WAL : 0;
   int dev_target_new = dev_target; //FIXME: remove, makes no sense
 
-  for (auto& p : nodes.file_map) {
+  for (auto& [ino, file_ref] : nodes.file_map) {
     //do not copy log
-    if (p.second->fnode.ino == 1) {
+    if (ino == 1) {
       continue;
     }
-    dout(10) << __func__ << " " << p.first << " " << p.second->fnode << dendl;
+    dout(10) << __func__ << " " << ino << " " << file_ref->fnode << dendl;
 
-    auto& fnode_extents = p.second->fnode.extents;
+    vselector->sub_usage(file_ref->vselector_hint, file_ref->fnode);
 
-    bool rewrite = false;
-    for (auto ext_it = fnode_extents.begin();
-	 ext_it != p.second->fnode.extents.end();
-	 ++ext_it) {
-      if (ext_it->bdev != dev_target && devs_source.count(ext_it->bdev)) {
-	rewrite = true;
-	break;
-      }
-    }
+    bool rewrite = std::any_of(
+      file_ref->fnode.extents.begin(),
+      file_ref->fnode.extents.end(),
+      [=](auto& ext) {
+	return ext.bdev != dev_target && devs_source.count(ext.bdev);
+      });
     if (rewrite) {
       dout(10) << __func__ << "  migrating" << dendl;
-
+      bluefs_fnode_t old_fnode;
+      old_fnode.swap_extents(file_ref->fnode);
+      auto& old_fnode_extents = old_fnode.extents;
       // read entire file
       bufferlist bl;
-      for (auto old_ext : fnode_extents) {
+      for (const auto &old_ext : old_fnode_extents) {
 	buf.resize(old_ext.length);
 	int r = _bdev_read_random(old_ext.bdev,
 	  old_ext.offset,
@@ -1903,8 +1897,8 @@ int BlueFS::device_migrate_to_new(
       }
 
       // write entire file
-      PExtentVector extents;
-      auto l = _allocate_without_fallback(dev_target, bl.length(), &extents);
+      auto l = _allocate_without_fallback(dev_target, bl.length(),
+        &file_ref->fnode);
       if (l < 0) {
 	derr << __func__ << " unable to allocate len 0x" << std::hex
 	     << bl.length() << std::dec << " from " << (int)dev_target
@@ -1913,7 +1907,7 @@ int BlueFS::device_migrate_to_new(
       }
 
       uint64_t off = 0;
-      for (auto& i : extents) {
+      for (auto& i : file_ref->fnode.extents) {
 	bufferlist cur;
 	uint64_t cur_len = std::min<uint64_t>(i.length, bl.length() - off);
 	ceph_assert(cur_len > 0);
@@ -1924,7 +1918,7 @@ int BlueFS::device_migrate_to_new(
       }
 
       // release old extents
-      for (auto old_ext : fnode_extents) {
+      for (const auto &old_ext : old_fnode_extents) {
 	PExtentVector to_release;
 	to_release.emplace_back(old_ext.offset, old_ext.length);
 	alloc[old_ext.bdev]->release(to_release);
@@ -1934,9 +1928,8 @@ int BlueFS::device_migrate_to_new(
       }
 
       // update fnode
-      fnode_extents.clear();
-      for (auto& i : extents) {
-	fnode_extents.emplace_back(dev_target_new, i.offset, i.length);
+      for (auto& i : file_ref->fnode.extents) {
+	i.bdev = dev_target_new;
       }
     }
   }
@@ -2475,20 +2468,10 @@ void BlueFS::_rewrite_log_and_layout_sync_LNF_LD(bool allocate_with_fallback,
   int r;
   vselector->sub_usage(log_file->vselector_hint, log_file->fnode);
   log_file->fnode.swap_extents(old_fnode);
-  if (allocate_with_fallback) {
-    r = _allocate(log_dev, need, &log_file->fnode);
-    ceph_assert(r == 0);
-  } else {
-    PExtentVector extents;
-    r = _allocate_without_fallback(log_dev,
-			       need,
-			       &extents);
-    ceph_assert(r == 0);
-    for (auto& p : extents) {
-      log_file->fnode.append_extent(
-	bluefs_extent_t(log_dev, p.offset, p.length));
-    }
-  }
+  r = allocate_with_fallback ?
+    _allocate(log_dev, need, &log_file->fnode) :
+    _allocate_without_fallback(log_dev, need, &log_file->fnode);
+  ceph_assert(r == 0);
 
   _close_writer(log.writer);
 
@@ -2664,6 +2647,11 @@ void BlueFS::_compact_log_async_LD_LNF_D() //also locks FW for new_writer
   // todo - maybe improve _allocate so we will give clear set of new allocations
   uint64_t processed = 0;
   mempool::bluefs::vector<bluefs_extent_t> old_extents;
+dout(0) << __func__ << " " << std::hex
+        << log.writer->pos << " "
+        << log.writer->file->fnode.size << " "
+        << old_log_jump_to
+        << std::dec << dendl;
   for (auto& e : log_file->fnode.extents) {
     if (processed + e.length <= old_log_jump_to) {
       // drop whole extent
@@ -3494,7 +3482,7 @@ const char* BlueFS::get_device_name(unsigned id)
 }
 
 int BlueFS::_allocate_without_fallback(uint8_t id, uint64_t len,
-		      PExtentVector* extents)
+		      bluefs_fnode_t* node)
 {
   dout(10) << __func__ << " len 0x" << std::hex << len << std::dec
            << " from " << (int)id << dendl;
@@ -3502,12 +3490,13 @@ int BlueFS::_allocate_without_fallback(uint8_t id, uint64_t len,
   if (!alloc[id]) {
     return -ENOENT;
   }
-  extents->reserve(4);  // 4 should be (more than) enough for most allocations
+  PExtentVector extents;
+  extents.reserve(4);  // 4 should be (more than) enough for most allocations
   int64_t need = round_up_to(len, alloc_size[id]);
-  int64_t alloc_len = alloc[id]->allocate(need, alloc_size[id], 0, extents);
+  int64_t alloc_len = alloc[id]->allocate(need, alloc_size[id], 0, &extents);
   if (alloc_len < 0 || alloc_len < need) {
     if (alloc_len > 0) {
-      alloc[id]->release(*extents);
+      alloc[id]->release(extents);
     }
     derr << __func__ << " unable to allocate 0x" << std::hex << need
 	 << " on bdev " << (int)id
@@ -3526,6 +3515,9 @@ int BlueFS::_allocate_without_fallback(uint8_t id, uint64_t len,
   if (is_shared_alloc(id)) {
     shared_alloc->bluefs_used += alloc_len;
   }
+  for (auto& p : extents) {
+    node->append_extent(bluefs_extent_t(id, p.offset, p.length));
+  }
 
   return 0;
 }
diff --git a/src/os/bluestore/BlueFS.h b/src/os/bluestore/BlueFS.h
index 7132e1e94c929..627f74a06bd4d 100644
--- a/src/os/bluestore/BlueFS.h
+++ b/src/os/bluestore/BlueFS.h
@@ -420,7 +420,7 @@ class BlueFS {
   int _allocate(uint8_t bdev, uint64_t len,
 		bluefs_fnode_t* node);
   int _allocate_without_fallback(uint8_t id, uint64_t len,
-				 PExtentVector* extents);
+				 bluefs_fnode_t* node);
 
   /* signal replay log to include h->file in nearest log flush */
   int _signal_dirty_to_log_D(FileWriter *h);

From 285df4b43e8096cb1b3c9c3c5b380cad759ea52c Mon Sep 17 00:00:00 2001
From: Igor Fedotov <igor.fedotov@croit.io>
Date: Tue, 8 Nov 2022 17:51:12 +0300
Subject: [PATCH 02/13] os/bluestore: get rid of
 BlueFS::_compact_log_async_dump_metadata_NF()

We can reuse _compact_log_dump_metadata_NF() instead

Signed-off-by: Igor Fedotov <igor.fedotov@croit.io>
---
 src/os/bluestore/BlueFS.cc | 92 ++++++++++++++------------------------
 src/os/bluestore/BlueFS.h  |  5 +--
 2 files changed, 35 insertions(+), 62 deletions(-)

diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc
index 142a249f949d1..45c67cd8b3e47 100644
--- a/src/os/bluestore/BlueFS.cc
+++ b/src/os/bluestore/BlueFS.cc
@@ -2325,76 +2325,46 @@ bool BlueFS::_should_start_compact_log_L_N()
 }
 
 void BlueFS::_compact_log_dump_metadata_NF(bluefs_transaction_t *t,
-					int flags)
+					int bdev_update_flags,
+                                        uint64_t capture_before_seq)
 {
   std::lock_guard nl(nodes.lock);
 
-  t->seq = 1;
-  t->uuid = super.uuid;
   dout(20) << __func__ << " op_init" << dendl;
-
-  t->op_init();
   for (auto& [ino, file_ref] : nodes.file_map) {
     if (ino == 1)
       continue;
     ceph_assert(ino > 1);
     std::lock_guard fl(file_ref->lock);
-    for(auto& e : file_ref->fnode.extents) {
-      auto bdev = e.bdev;
-      auto bdev_new = bdev;
-      ceph_assert(!((flags & REMOVE_WAL) && bdev == BDEV_WAL));
-      if ((flags & RENAME_SLOW2DB) && bdev == BDEV_SLOW) {
-	bdev_new = BDEV_DB;
-      }
-      if ((flags & RENAME_DB2SLOW) && bdev == BDEV_DB) {
-	bdev_new = BDEV_SLOW;
-      }
-      if (bdev == BDEV_NEWDB) {
-	// REMOVE_DB xor RENAME_DB
-	ceph_assert(!(flags & REMOVE_DB) != !(flags & RENAME_DB2SLOW));
-	ceph_assert(!(flags & RENAME_SLOW2DB));
-	bdev_new = BDEV_DB;
-      }
-      if (bdev == BDEV_NEWWAL) {
-	ceph_assert(flags & REMOVE_WAL);
-	bdev_new = BDEV_WAL;
+    if (bdev_update_flags) {
+      for(auto& e : file_ref->fnode.extents) {
+        auto bdev = e.bdev;
+        auto bdev_new = bdev;
+        ceph_assert(!((bdev_update_flags & REMOVE_WAL) && bdev == BDEV_WAL));
+        if ((bdev_update_flags & RENAME_SLOW2DB) && bdev == BDEV_SLOW) {
+	  bdev_new = BDEV_DB;
+        }
+        if ((bdev_update_flags & RENAME_DB2SLOW) && bdev == BDEV_DB) {
+	  bdev_new = BDEV_SLOW;
+        }
+        if (bdev == BDEV_NEWDB) {
+	  // REMOVE_DB xor RENAME_DB
+	  ceph_assert(!(bdev_update_flags & REMOVE_DB) != !(bdev_update_flags & RENAME_DB2SLOW));
+	  ceph_assert(!(bdev_update_flags & RENAME_SLOW2DB));
+	  bdev_new = BDEV_DB;
+        }
+        if (bdev == BDEV_NEWWAL) {
+	  ceph_assert(bdev_update_flags & REMOVE_WAL);
+	  bdev_new = BDEV_WAL;
+        }
+        e.bdev = bdev_new;
       }
-      e.bdev = bdev_new;
     }
-    dout(20) << __func__ << " op_file_update " << file_ref->fnode << dendl;
-    t->op_file_update(file_ref->fnode);
-  }
-  for (auto& [path, dir_ref] : nodes.dir_map) {
-    dout(20) << __func__ << " op_dir_create " << path << dendl;
-    t->op_dir_create(path);
-    for (auto& [fname, file_ref] : dir_ref->file_map) {
-      dout(20) << __func__ << " op_dir_link " << path << "/" << fname
-	       << " to " << file_ref->fnode.ino << dendl;
-      t->op_dir_link(path, fname, file_ref->fnode.ino);
-    }
-  }
-}
-/* Streams to t files modified before *capture_before_seq* and all dirs */
-void BlueFS::_compact_log_async_dump_metadata_NF(bluefs_transaction_t *t,
-						 uint64_t capture_before_seq)
-{
-  std::lock_guard nl(nodes.lock);
-
-  t->seq = 1;
-  t->uuid = super.uuid;
-  dout(20) << __func__ << " op_init" << dendl;
-
-  t->op_init();
-  for (auto& [ino, file_ref] : nodes.file_map) {
-    if (ino == 1)
-      continue;
-    ceph_assert(ino > 1);
-    std::lock_guard fl(file_ref->lock);
-    if (file_ref->dirty_seq < capture_before_seq) {
+    if (capture_before_seq == 0 || file_ref->dirty_seq < capture_before_seq) {
       dout(20) << __func__ << " op_file_update " << file_ref->fnode << dendl;
     } else {
       dout(20) << __func__ << " op_file_update just modified, dirty_seq="
-	       << file_ref->dirty_seq << " " << file_ref->fnode << dendl;
+               << file_ref->dirty_seq << " " << file_ref->fnode << dendl;
     }
     t->op_file_update(file_ref->fnode);
   }
@@ -2452,8 +2422,9 @@ void BlueFS::_rewrite_log_and_layout_sync_LNF_LD(bool allocate_with_fallback,
 		       << " flags:" << flags
 		       << dendl;
   bluefs_transaction_t t;
-  _compact_log_dump_metadata_NF(&t, flags);
-
+  t.seq = 2;
+  t.uuid = super.uuid;
+  _compact_log_dump_metadata_NF(&t, flags, 0);
   dout(20) << __func__ << " op_jump_seq " << log.seq_live << dendl;
   t.op_jump_seq(log.seq_live);
 
@@ -2604,7 +2575,10 @@ void BlueFS::_compact_log_async_LD_LNF_D() //also locks FW for new_writer
 
   // 2. prepare compacted log
   bluefs_transaction_t t;
-  _compact_log_async_dump_metadata_NF(&t, seq_now);
+  t.seq = 1;
+  t.uuid = super.uuid;
+  t.op_init();
+  _compact_log_dump_metadata_NF(&t, 0, seq_now);
 
   // now state is captured to bufferlist
   // log can be used to write to, ops in log will be continuation of captured state
diff --git a/src/os/bluestore/BlueFS.h b/src/os/bluestore/BlueFS.h
index 627f74a06bd4d..658bca7042d12 100644
--- a/src/os/bluestore/BlueFS.h
+++ b/src/os/bluestore/BlueFS.h
@@ -457,9 +457,8 @@ class BlueFS {
     RENAME_DB2SLOW = 8,
   };
   void _compact_log_dump_metadata_NF(bluefs_transaction_t *t,
-				 int flags);
-  void _compact_log_async_dump_metadata_NF(bluefs_transaction_t *t,
-					   uint64_t capture_before_seq);
+				 int flags,
+				 uint64_t capture_before_seq);
 
   void _compact_log_sync_LNF_LD();
   void _compact_log_async_LD_LNF_D();

From 0fc0ced22c49c80dc5c0b972fd6e9465252f1909 Mon Sep 17 00:00:00 2001
From: Igor Fedotov <igor.fedotov@croit.io>
Date: Tue, 8 Nov 2022 19:21:08 +0300
Subject: [PATCH 03/13] os/bluestore: simplify and clean up
 BlueFS::_compact_log_async_...()

Signed-off-by: Igor Fedotov <igor.fedotov@croit.io>
---
 src/os/bluestore/BlueFS.cc      | 64 +++++++++------------------------
 src/os/bluestore/bluefs_types.h |  5 +++
 2 files changed, 21 insertions(+), 48 deletions(-)

diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc
index 45c67cd8b3e47..9f8ada16d9db1 100644
--- a/src/os/bluestore/BlueFS.cc
+++ b/src/os/bluestore/BlueFS.cc
@@ -2551,17 +2551,22 @@ void BlueFS::_compact_log_async_LD_LNF_D() //also locks FW for new_writer
   uint64_t runway = log_file->fnode.get_allocated() - log.writer->get_effective_write_pos();
   dout(10) << __func__ << " old_log_jump_to 0x" << std::hex << old_log_jump_to
            << " need 0x" << (old_log_jump_to + cct->_conf->bluefs_max_log_runway) << std::dec << dendl;
+  bluefs_fnode_t new_log_tail_fnode;
+  bluefs_fnode_t old_log_snapshot_fnode;
   int r = _allocate(vselector->select_prefer_bdev(log_file->vselector_hint),
 		    cct->_conf->bluefs_max_log_runway,
-                    &log_file->fnode);
+                    &new_log_tail_fnode);
   ceph_assert(r == 0);
+  old_log_snapshot_fnode.clone_extents(log_file->fnode);
+  log_file->fnode.clone_extents(new_log_tail_fnode);
+
   //adjust usage as flush below will need it
   vselector->add_usage(log_file->vselector_hint, log_file->fnode);
   dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl;
 
   // update the log file change and log a jump to the offset where we want to
   // write the new entries
-  log.t.op_file_update(log_file->fnode);
+  log.t.op_file_update_inc(log_file->fnode);
   // jump to new position should mean next seq
   log.t.op_jump(log.seq_live + 1, old_log_jump_to);
   uint64_t seq_now = log.seq_live;
@@ -2595,9 +2600,9 @@ void BlueFS::_compact_log_async_LD_LNF_D() //also locks FW for new_writer
   t.op_jump(seq_now, new_log_jump_to);
 
   // allocate
-  //FIXME: check if we want DB here?
-  r = _allocate(BlueFS::BDEV_DB, new_log_jump_to,
-                    &new_log->fnode);
+  r = _allocate(vselector->select_prefer_bdev(log_file->vselector_hint),
+                new_log_jump_to,
+                &new_log->fnode);
   ceph_assert(r == 0);
 
   bufferlist bl;
@@ -2617,42 +2622,7 @@ void BlueFS::_compact_log_async_LD_LNF_D() //also locks FW for new_writer
   _flush_bdev(new_log_writer);
   // 5. update our log fnode
   // we need to append to new_log the extents that were allocated in step 1.1
-  // we do it by inverse logic - we drop 'old_log_jump_to' bytes and keep rest
-  // todo - maybe improve _allocate so we will give clear set of new allocations
-  uint64_t processed = 0;
-  mempool::bluefs::vector<bluefs_extent_t> old_extents;
-dout(0) << __func__ << " " << std::hex
-        << log.writer->pos << " "
-        << log.writer->file->fnode.size << " "
-        << old_log_jump_to
-        << std::dec << dendl;
-  for (auto& e : log_file->fnode.extents) {
-    if (processed + e.length <= old_log_jump_to) {
-      // drop whole extent
-      dout(10) << __func__ << " remove old log extent " << e << dendl;
-      old_extents.push_back(e);
-    } else {
-      // keep, but how much?
-      if (processed < old_log_jump_to) {
-	ceph_assert(processed + e.length > old_log_jump_to);
-	ceph_assert(old_log_jump_to - processed <= std::numeric_limits<uint32_t>::max());
-	uint32_t cut_at = uint32_t(old_log_jump_to - processed);
-	// need to cut, first half gets dropped
-	bluefs_extent_t retire(e.bdev, e.offset, cut_at);
-	old_extents.push_back(retire);
-	// second half goes to new log
-	bluefs_extent_t keep(e.bdev, e.offset + cut_at, e.length - cut_at);
-	new_log->fnode.append_extent(keep);
-	dout(10) << __func__ << " kept " << keep << " removed " << retire << dendl;
-      } else {
-	// take entire extent
-	ceph_assert(processed >= old_log_jump_to);
-	new_log->fnode.append_extent(e);
-	dout(10) << __func__ << " kept " << e << dendl;
-      }
-    }
-    processed += e.length;
-  }
+  new_log->fnode.claim_extents(new_log_tail_fnode.extents);
   // we will write it to super
   new_log->fnode.reset_delta();
 
@@ -2667,12 +2637,8 @@ dout(0) << __func__ << " " << std::hex
   _flush_bdev();
 
   log.lock.lock();
-  // swapping log_file and new_log
+  // swapping log_file and new_log, new log file is the log file now.
   vselector->sub_usage(log_file->vselector_hint, log_file->fnode);
-
-  // clear the extents from old log file, they are added to new log
-  log_file->fnode.clear_extents();
-  // swap the log files. New log file is the log file now.
   new_log->fnode.swap_extents(log_file->fnode);
 
   log.writer->pos = log.writer->file->fnode.size =
@@ -2688,10 +2654,12 @@ dout(0) << __func__ << " " << std::hex
   log_cond.notify_all();
 
   // 7. release old space
-  dout(10) << __func__ << " release old log extents " << old_extents << dendl;
+  dout(10) << __func__
+           << " release old log extents " << old_log_snapshot_fnode.extents
+           << dendl;
   {
     std::lock_guard dl(dirty.lock);
-    for (auto& r : old_extents) {
+    for (auto& r : old_log_snapshot_fnode.extents) {
       dirty.pending_release[r.bdev].insert(r.offset, r.length);
     }
   }
diff --git a/src/os/bluestore/bluefs_types.h b/src/os/bluestore/bluefs_types.h
index b53000188ae77..2134b559a20df 100644
--- a/src/os/bluestore/bluefs_types.h
+++ b/src/os/bluestore/bluefs_types.h
@@ -115,6 +115,11 @@ struct bluefs_fnode_t {
   void reset_delta() {
     allocated_commited = allocated;
   }
+  void clone_extents(const bluefs_fnode_t& fnode) {
+    for (const auto& p : fnode.extents) {
+      append_extent(p);
+    }
+  }
   void claim_extents(mempool::bluefs::vector<bluefs_extent_t>& extents) {
     for (const auto& p : extents) {
       append_extent(p);

From 05478fc46bc6437e8e57642a5ffdedde851f08f3 Mon Sep 17 00:00:00 2001
From: Igor Fedotov <igor.fedotov@croit.io>
Date: Tue, 8 Nov 2022 18:16:21 +0300
Subject: [PATCH 04/13] os/bluestore: introduce method to estimate BlueFS
 transaction size

Signed-off-by: Igor Fedotov <igor.fedotov@croit.io>
---
 src/os/bluestore/BlueFS.cc | 28 +++++++++++++++++-----------
 src/os/bluestore/BlueFS.h  |  4 +++-
 2 files changed, 20 insertions(+), 12 deletions(-)

diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc
index 9f8ada16d9db1..3c9abee517a93 100644
--- a/src/os/bluestore/BlueFS.cc
+++ b/src/os/bluestore/BlueFS.cc
@@ -2277,6 +2277,17 @@ void BlueFS::invalidate_cache(FileRef f, uint64_t offset, uint64_t length)
   }
 }
 
+
+uint64_t BlueFS::_estimate_transaction_size(bluefs_transaction_t* t)
+{
+  uint64_t max_alloc_size = std::max(alloc_size[BDEV_WAL],
+				     std::max(alloc_size[BDEV_DB],
+					      alloc_size[BDEV_SLOW]));
+
+  // conservative estimate for final encoded size
+  return round_up_to(t->op_bl.length() + super.block_size * 2, max_alloc_size);
+}
+
 uint64_t BlueFS::_estimate_log_size_N()
 {
   std::lock_guard nl(nodes.lock);
@@ -2589,13 +2600,7 @@ void BlueFS::_compact_log_async_LD_LNF_D() //also locks FW for new_writer
   // log can be used to write to, ops in log will be continuation of captured state
   log.lock.unlock();
 
-  uint64_t max_alloc_size = std::max(alloc_size[BDEV_WAL],
-				     std::max(alloc_size[BDEV_DB],
-					      alloc_size[BDEV_SLOW]));
-
-  // conservative estimate for final encoded size
-  new_log_jump_to = round_up_to(t.op_bl.length() + super.block_size * 2,
-                                max_alloc_size);
+  new_log_jump_to = _estimate_transaction_size(&t);
   //newly constructed log head will jump to what we had before
   t.op_jump(seq_now, new_log_jump_to);
 
@@ -2677,13 +2682,14 @@ void BlueFS::_compact_log_async_LD_LNF_D() //also locks FW for new_writer
   ceph_assert(old_is_comp);
 }
 
-void BlueFS::_pad_bl(bufferlist& bl)
+void BlueFS::_pad_bl(bufferlist& bl, uint64_t pad_size)
 {
-  uint64_t partial = bl.length() % super.block_size;
+  pad_size = std::max(pad_size, uint64_t(super.block_size));
+  uint64_t partial = bl.length() % pad_size;
   if (partial) {
     dout(10) << __func__ << " padding with 0x" << std::hex
-	     << super.block_size - partial << " zeros" << std::dec << dendl;
-    bl.append_zero(super.block_size - partial);
+	     << pad_size - partial << " zeros" << std::dec << dendl;
+    bl.append_zero(pad_size - partial);
   }
 }
 
diff --git a/src/os/bluestore/BlueFS.h b/src/os/bluestore/BlueFS.h
index 658bca7042d12..d960018e39881 100644
--- a/src/os/bluestore/BlueFS.h
+++ b/src/os/bluestore/BlueFS.h
@@ -404,7 +404,8 @@ class BlueFS {
   void _init_alloc();
   void _stop_alloc();
 
-  void _pad_bl(ceph::buffer::list& bl);  ///< pad ceph::buffer::list to block size w/ zeros
+  ///< pad ceph::buffer::list to max(block size, pad_size) w/ zeros
+  void _pad_bl(ceph::buffer::list& bl, uint64_t pad_size = 0);
 
   uint64_t _get_used(unsigned id) const;
   uint64_t _get_total(unsigned id) const;
@@ -447,6 +448,7 @@ class BlueFS {
 			       int64_t available_runway);
   int _flush_and_sync_log_LD(uint64_t want_seq = 0);
 
+  uint64_t _estimate_transaction_size(bluefs_transaction_t* t);
   uint64_t _estimate_log_size_N();
   bool _should_start_compact_log_L_N();
 

From 0bfc42ac8d3586367e55e71e978d3eb4f62cf5b3 Mon Sep 17 00:00:00 2001
From: Igor Fedotov <igor.fedotov@croit.io>
Date: Wed, 9 Nov 2022 04:39:44 +0300
Subject: [PATCH 05/13] os/bluestore: increment BlueFS::super.version at
 _write_super

Signed-off-by: Igor Fedotov <igor.fedotov@croit.io>
---
 src/os/bluestore/BlueFS.cc | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc
index 3c9abee517a93..ff6412b43bd36 100644
--- a/src/os/bluestore/BlueFS.cc
+++ b/src/os/bluestore/BlueFS.cc
@@ -565,7 +565,7 @@ int BlueFS::mkfs(uuid_d osd_uuid, const bluefs_layout_t& layout)
   _init_logger();
   _init_alloc();
 
-  super.version = 1;
+  super.version = 0;
   super.block_size = bdev[BDEV_DB]->get_block_size();
   super.osd_uuid = osd_uuid;
   super.uuid.generate_random();
@@ -1028,6 +1028,7 @@ int BlueFS::fsck()
 
 int BlueFS::_write_super(int dev)
 {
+  ++super.version;
   // build superblock
   bufferlist bl;
   encode(super, bl);
@@ -2489,7 +2490,6 @@ void BlueFS::_rewrite_log_and_layout_sync_LNF_LD(bool allocate_with_fallback,
   }
   dout(10) << __func__ << " writing super, log fnode: " << super.log_fnode << dendl;
 
-  ++super.version;
   _write_super(super_dev);
   _flush_bdev();
 
@@ -2637,7 +2637,6 @@ void BlueFS::_compact_log_async_LD_LNF_D() //also locks FW for new_writer
   new_log->fnode.size = 0;
   new_log->fnode.mtime = ceph_clock_now();
   super.log_fnode = new_log->fnode;
-  ++super.version;
   _write_super(BDEV_DB);
   _flush_bdev();
 

From 0af28582453122ccd87389261a45127d399caf7d Mon Sep 17 00:00:00 2001
From: Igor Fedotov <igor.fedotov@croit.io>
Date: Wed, 9 Nov 2022 05:24:00 +0300
Subject: [PATCH 06/13] os/bluestore: introduce bluefs_fnode_t::swap method

+ minor refactoring.

Signed-off-by: Igor Fedotov <igor.fedotov@croit.io>
---
 src/os/bluestore/bluefs_types.cc | 1 -
 src/os/bluestore/bluefs_types.h  | 9 ++++++++-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/os/bluestore/bluefs_types.cc b/src/os/bluestore/bluefs_types.cc
index 3a812cf5f018d..c8d2ede7bed92 100644
--- a/src/os/bluestore/bluefs_types.cc
+++ b/src/os/bluestore/bluefs_types.cc
@@ -167,7 +167,6 @@ bluefs_fnode_delta_t* bluefs_fnode_t::make_delta(bluefs_fnode_delta_t* delta) {
       delta->extents.push_back(*p);
       ++p;
     }
-    reset_delta();
   }
   return delta;
 }
diff --git a/src/os/bluestore/bluefs_types.h b/src/os/bluestore/bluefs_types.h
index 2134b559a20df..cbf0ed1def939 100644
--- a/src/os/bluestore/bluefs_types.h
+++ b/src/os/bluestore/bluefs_types.h
@@ -149,6 +149,12 @@ struct bluefs_fnode_t {
     extents.erase(it);
   }
   
+  void swap(bluefs_fnode_t& other) {
+    std::swap(ino, other.ino);
+    std::swap(size, other.size);
+    std::swap(mtime, other.mtime);
+    swap_extents(other);
+  }
   void swap_extents(bluefs_fnode_t& other) {
     other.extents.swap(extents);
     other.extents_index.swap(extents_index);
@@ -290,9 +296,10 @@ struct bluefs_transaction_t {
   void op_file_update_inc(bluefs_fnode_t& file) {
     using ceph::encode;
     bluefs_fnode_delta_t delta;
-    file.make_delta(&delta); //also resets delta to zero
+    file.make_delta(&delta);
     encode((__u8)OP_FILE_UPDATE_INC, op_bl);
     encode(delta, op_bl);
+    file.reset_delta();
   }
   void op_file_remove(uint64_t ino) {
     using ceph::encode;

From b65c780a3b524a44d0f860b0edda3baaac13c539 Mon Sep 17 00:00:00 2001
From: Igor Fedotov <igor.fedotov@croit.io>
Date: Wed, 2 Nov 2022 19:39:14 +0300
Subject: [PATCH 07/13] os/bluestore: prepend compacted BlueFS log with a
 starter part.

The rationale is to keep the initial log fnode after compaction small
enough to fit into the 4K superblock. Without that, compacted metadata
might require an fnode longer than 4K, which goes beyond the existing
4K superblock. BlueFS asserts in this case for now.
Hence the resulting log allocation layout is as follows:
- superblock(4K) keeps initial log fnode which refers:
  op_init, op_update_inc(log), op_jump(next seq)
- updated log fnode built from superblock + above op_update_inc refers:
  compacted meta (a bunch of op_update and others)
- *
- more op_update_inc(log) to follow if log is extended
- *

Signed-off-by: Igor Fedotov <igor.fedotov@croit.io>
---
 src/os/bluestore/BlueFS.cc      | 571 +++++++++++++++++++++++---------
 src/os/bluestore/BlueFS.h       |  13 +-
 src/os/bluestore/bluefs_types.h |  13 +-
 3 files changed, 440 insertions(+), 157 deletions(-)

diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc
index ff6412b43bd36..064b058e4f404 100644
--- a/src/os/bluestore/BlueFS.cc
+++ b/src/os/bluestore/BlueFS.cc
@@ -988,7 +988,6 @@ int BlueFS::prepare_new_device(int id, const bluefs_layout_t& layout)
       new_log_dev_next,
       RENAME_DB2SLOW,
       layout);
-    //}
   } else if(id == BDEV_NEWWAL) {
     _rewrite_log_and_layout_sync_LNF_LD(false,
       BDEV_DB,
@@ -2289,6 +2288,33 @@ uint64_t BlueFS::_estimate_transaction_size(bluefs_transaction_t* t)
   return round_up_to(t->op_bl.length() + super.block_size * 2, max_alloc_size);
 }
 
+uint64_t BlueFS::_make_initial_transaction(uint64_t start_seq,
+                                           bluefs_fnode_t& fnode,
+                                           uint64_t expected_final_size,
+                                           bufferlist* out)
+{
+  bluefs_transaction_t t0;
+  t0.seq = start_seq;
+  t0.uuid = super.uuid;
+  t0.op_init();
+  t0.op_file_update_inc(fnode);
+  t0.op_jump(start_seq, expected_final_size); // this is a fixed size op,
+                                              // hence it's valid with fake
+                                              // params for overall txc size
+                                              // estimation
+  if (!out) {
+    return _estimate_transaction_size(&t0);
+  }
+
+  ceph_assert(expected_final_size > 0);
+  out->reserve(expected_final_size);
+  encode(t0, *out);
+  // make sure we're not wrong aboth the size
+  ceph_assert(out->length() <= expected_final_size);
+  _pad_bl(*out, expected_final_size);
+  return expected_final_size;
+}
+
 uint64_t BlueFS::_estimate_log_size_N()
 {
   std::lock_guard nl(nodes.lock);
@@ -2336,13 +2362,17 @@ bool BlueFS::_should_start_compact_log_L_N()
   return true;
 }
 
-void BlueFS::_compact_log_dump_metadata_NF(bluefs_transaction_t *t,
+void BlueFS::_compact_log_dump_metadata_NF(uint64_t start_seq,
+                                        bluefs_transaction_t *t,
 					int bdev_update_flags,
                                         uint64_t capture_before_seq)
 {
+  dout(20) << __func__ << dendl;
+  t->seq = start_seq;
+  t->uuid = super.uuid;
+
   std::lock_guard nl(nodes.lock);
 
-  dout(20) << __func__ << " op_init" << dendl;
   for (auto& [ino, file_ref] : nodes.file_map) {
     if (ino == 1)
       continue;
@@ -2409,6 +2439,21 @@ void BlueFS::_compact_log_sync_LNF_LD()
   logger->inc(l_bluefs_log_compactions);
 }
 
+/*
+ * SYNC LOG COMPACTION
+ *
+ * 0. Lock the log completely through the whole procedure
+ *
+ * 1. Build new log. It will include log's starter and compacted metadata
+ *    body. Jump op appended to the starter will link the pieces together.
+ *
+ * 2. Write out new log's content
+ *
+ * 3. Write out new superblock. This includes relevant device layout update.
+ *
+ * 4. Finalization. Old space release.
+ */
+
 void BlueFS::_rewrite_log_and_layout_sync_LNF_LD(bool allocate_with_fallback,
 					 int super_dev,
 					 int log_dev,
@@ -2416,10 +2461,26 @@ void BlueFS::_rewrite_log_and_layout_sync_LNF_LD(bool allocate_with_fallback,
 					 int flags,
 					 std::optional<bluefs_layout_t> layout)
 {
+  // we substitute log_dev with log_dev_new for new allocations below
+  // and permitting fallback allocations prevents such a substitution
+  ceph_assert((allocate_with_fallback && log_dev == log_dev_new) ||
+              !allocate_with_fallback);
+
+  dout(10) << __func__ << " super_dev:" << super_dev
+                       << " log_dev:" << log_dev
+                       << " log_dev_new:" << log_dev_new
+		       << " flags:" << flags
+		       << " seq:" << log.seq_live
+		       << dendl;
+  utime_t mtime = ceph_clock_now();
+  uint64_t starter_seq = 1;
+
+  // Part 0.
+  // Lock the log totally till the end of the procedure
   std::lock_guard ll(log.lock);
 
   File *log_file = log.writer->file.get();
-
+  bluefs_fnode_t fnode_tail;
   // log.t.seq is always set to current live seq
   ceph_assert(log.t.seq == log.seq_live);
   // Capturing entire state. Dump anything that has been stored there.
@@ -2428,44 +2489,147 @@ void BlueFS::_rewrite_log_and_layout_sync_LNF_LD(bool allocate_with_fallback,
   // From now on, no changes to log.t are permitted until we finish rewriting log.
   // Can allow dirty to remain dirty - log.seq_live will not change.
 
-  dout(20) << __func__ << " super_dev:" << super_dev
-                       << " log_dev:" << log_dev
-                       << " log_dev_new:" << log_dev_new
-		       << " flags:" << flags
-		       << dendl;
-  bluefs_transaction_t t;
-  t.seq = 2;
-  t.uuid = super.uuid;
-  _compact_log_dump_metadata_NF(&t, flags, 0);
-  dout(20) << __func__ << " op_jump_seq " << log.seq_live << dendl;
-  t.op_jump_seq(log.seq_live);
+  //
+  // Part 1.
+  // Build new log starter and compacted metadata body
+  // 1.1. Build full compacted meta transaction.
+  //      Encode a bluefs transaction that dumps all of the in-memory fnodes
+  //      and names.
+  //      This might be pretty large and its allocation map can exceed
+  //      superblock size. Hence we need a log starter part which
+  //      goes to superblock and refers to that new meta through op_update_inc.
+  // 1.2.  Allocate space for the above transaction
+  //       using its size estimation.
+  // 1.3.  Allocate the space required for the starter part of the new log.
+  //       It should be small enough to fit into superblock.
+  // 1.4   Building new log persistent fnode representation which will
+  //       finally land to disk.
+  //       Depending on input parameters we might need to perform device ids
+  //       rename - runtime and persistent replicas should be different when we
+  //       are in the device migration process.
+  // 1.5   Store starter fnode to run-time superblock, to be written out later.
+  //       It doesn't contain compacted meta to fit relevant allocation map into
+  //       superblock.
+  // 1.6   Proceed building new log persistent fnode representation.
+  //       Will add log tail with compacted meta extents from 1.1.
+  //       Device rename applied as well
+  //
+  // 1.7.  Encode new log fnode starter,
+  //       It will include op_init, new log's op_update_inc
+  //       and jump to the compacted meta transaction beginning.
+  //       Superblock will reference this starter part
+  //
+  // 1.8.  Encode compacted meta transaction,
+  //       extend the transaction with a jump to proper sequence no
+  //
+
+
+  // 1.1 Build full compacted meta transaction
+  bluefs_transaction_t compacted_meta_t;
+  _compact_log_dump_metadata_NF(starter_seq + 1, &compacted_meta_t, flags, 0);
+
+  // 1.2 Allocate the space required for the compacted meta transaction
+  uint64_t compacted_meta_need =
+    _estimate_transaction_size(&compacted_meta_t) +
+      cct->_conf->bluefs_max_log_runway;
+
+  dout(20) << __func__ << " compacted_meta_need " << compacted_meta_need << dendl;
+
+  int r = allocate_with_fallback ?
+    _allocate(log_dev, compacted_meta_need, &fnode_tail) :
+    _allocate_without_fallback(log_dev, compacted_meta_need, &fnode_tail);
+  ceph_assert(r == 0);
 
-  bufferlist bl;
-  encode(t, bl);
-  _pad_bl(bl);
 
-  uint64_t need = bl.length() + cct->_conf->bluefs_max_log_runway;
-  dout(20) << __func__ << " need " << need << dendl;
+  // 1.3 Allocate the space required for the starter part of the new log.
+  // estimate new log fnode size to be referenced from superblock
+  // hence use dummy fnode and jump parameters
+  uint64_t starter_need = _make_initial_transaction(starter_seq, fnode_tail, 0, nullptr);
 
-  bluefs_fnode_t old_fnode;
-  int r;
-  vselector->sub_usage(log_file->vselector_hint, log_file->fnode);
-  log_file->fnode.swap_extents(old_fnode);
+  bluefs_fnode_t fnode_starter(log_file->fnode.ino, 0, mtime);
   r = allocate_with_fallback ?
-    _allocate(log_dev, need, &log_file->fnode) :
-    _allocate_without_fallback(log_dev, need, &log_file->fnode);
+    _allocate(log_dev, starter_need, &fnode_starter) :
+    _allocate_without_fallback(log_dev, starter_need, &fnode_starter);
   ceph_assert(r == 0);
 
-  _close_writer(log.writer);
+  // 1.4 Building starter fnode
+  bluefs_fnode_t fnode_persistent(fnode_starter.ino, 0, mtime);
+  for (auto p : fnode_starter.extents) {
+    // rename device if needed - this is possible when fallback allocations
+    // are prohibited only. Which means every extent is targeted to the same
+    // device and we can unconditionally update them.
+    if (log_dev != log_dev_new) {
+      dout(10) << __func__ << " renaming log extents to "
+               << log_dev_new << dendl;
+      p.bdev = log_dev_new;
+    }
+    fnode_persistent.append_extent(p);
+  }
+
+  // 1.5 Store starter fnode to run-time superblock, to be written out later
+  super.log_fnode = fnode_persistent;
+
+  // 1.6 Proceed building new log persistent fnode representation
+  // we'll build incremental update starting from this point
+  fnode_persistent.reset_delta();
+  for (auto p : fnode_tail.extents) {
+    // rename device if needed - this is possible when fallback allocations
+    // are prohibited only. Which means every extent is targeted to the same
+    // device and we can unconditionally update them.
+    if (log_dev != log_dev_new) {
+      dout(10) << __func__ << " renaming log extents to "
+               << log_dev_new << dendl;
+      p.bdev = log_dev_new;
+    }
+    fnode_persistent.append_extent(p);
+  }
 
-  // we will write it to super
-  log_file->fnode.reset_delta();
-  log_file->fnode.size = bl.length();
+  // 1.7 Encode new log fnode
+  // This will flush incremental part of fnode_persistent only.
+  bufferlist starter_bl;
+  _make_initial_transaction(starter_seq, fnode_persistent, starter_need, &starter_bl);
 
+  // 1.8 Encode compacted meta transaction
+  dout(20) << __func__ << " op_jump_seq " << log.seq_live << dendl;
+  // hopefully "compacted_meta_need" estimation provides enough extra space
+  // for this op, assert below if not
+  compacted_meta_t.op_jump_seq(log.seq_live);
+
+  bufferlist compacted_meta_bl;
+  encode(compacted_meta_t, compacted_meta_bl);
+  _pad_bl(compacted_meta_bl);
+  ceph_assert(compacted_meta_bl.length() <= compacted_meta_need);
+
+  //
+  // Part 2
+  // Write out new log's content
+  // 2.1. Build the full runtime new log's fnode
+  //
+  // 2.2. Write out new log's content
+  //
+  // 2.3. Do flush and wait for completion through flush_bdev()
+  //
+  // 2.4. Finalize log update
+  //      Update all sequence numbers
+  //
+
+  // 2.1 Build the full runtime new log's fnode
+  bluefs_fnode_t old_log_fnode;
+  old_log_fnode.swap(fnode_starter);
+  old_log_fnode.clone_extents(fnode_tail);
+  old_log_fnode.reset_delta();
+  log_file->fnode.swap(old_log_fnode);
+
+  // 2.2 Write out new log's content
+  // Get rid of the old writer
+  _close_writer(log.writer);
+  // Make new log writer and stage new log's content writing
   log.writer = _create_writer(log_file);
-  log.writer->append(bl);
+  log.writer->append(starter_bl);
+  log.writer->append(compacted_meta_bl);
+
+  // 2.3 Do flush and wait for completion through flush_bdev()
   _flush_special(log.writer);
-  vselector->add_usage(log_file->vselector_hint, log_file->fnode);
 #ifdef HAVE_LIBAIO
   if (!cct->_conf->bluefs_sync_write) {
     list<aio_t> completed_ios;
@@ -2475,110 +2639,123 @@ void BlueFS::_rewrite_log_and_layout_sync_LNF_LD(bool allocate_with_fallback,
   }
 #endif
   _flush_bdev();
+
+  // 2.4 Finalize log update
   ++log.seq_live;
   dirty.seq_live = log.seq_live;
   log.t.seq = log.seq_live;
+  vselector->sub_usage(log_file->vselector_hint, old_log_fnode);
+  vselector->add_usage(log_file->vselector_hint, log_file->fnode);
 
-  super.memorized_layout = layout;
-  super.log_fnode = log_file->fnode;
-  // rename device if needed
-  if (log_dev != log_dev_new) {
-    dout(10) << __func__ << " renaming log extents to " << log_dev_new << dendl;
-    for (auto& p : super.log_fnode.extents) {
-      p.bdev = log_dev_new;
-    }
-  }
-  dout(10) << __func__ << " writing super, log fnode: " << super.log_fnode << dendl;
+  // Part 3.
+  // Write out new superblock to reflect all the changes.
+  //
 
+  super.memorized_layout = layout;
   _write_super(super_dev);
   _flush_bdev();
 
-  dout(10) << __func__ << " release old log extents " << old_fnode.extents << dendl;
-  std::lock_guard dl(dirty.lock);
-  for (auto& r : old_fnode.extents) {
-    dirty.pending_release[r.bdev].insert(r.offset, r.length);
+  // we're mostly done
+  dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl;
+  logger->inc(l_bluefs_log_compactions);
+
+  // Part 4
+  // Finalization. Release old space.
+  //
+  {
+    dout(10) << __func__
+             << " release old log extents " << old_log_fnode.extents
+             << dendl;
+    std::lock_guard dl(dirty.lock);
+    for (auto& r : old_log_fnode.extents) {
+      dirty.pending_release[r.bdev].insert(r.offset, r.length);
+    }
   }
 }
 
 /*
- * 1. Allocate a new extent to continue the log, and then log an event
- * that jumps the log write position to the new extent.  At this point, the
- * old extent(s) won't be written to, and reflect everything to compact.
- * New events will be written to the new region that we'll keep.
+ * ASYNC LOG COMPACTION
  *
- * 2. While still holding the lock, encode a bufferlist that dumps all of the
- * in-memory fnodes and names.  This will become the new beginning of the
- * log.  The last event will jump to the log continuation extent from #1.
- *
- * 3. Queue a write to a new extent for the new beginnging of the log.
+ * 0. Lock the log and forbid its extension. The former covers just
+ *    a part of the below procedure while the latter spans over it
+ *    completely.
+ * 1. Allocate a new extent to continue the log, and then log an event
+ *    that jumps the log write position to the new extent.  At this point, the
+ *    old extent(s) won't be written to, and reflect everything to compact.
+ *    New events will be written to the new region that we'll keep.
+ *    The latter will finally become new log tail on compaction completion.
  *
- * 4. Drop lock and wait
+ * 2. Build new log. It will include log's starter, compacted metadata
+ *    body and the above tail. Jump ops appended to the starter and meta body
+ *    will link the pieces together. Log's lock is released in the middle of
+ *    the process to permit parallel access to it.
  *
- * 5. Retake the lock.
+ * 3. Write out new log's content.
  *
- * 6. Update the log_fnode to splice in the new beginning.
+ * 4. Write out new superblock to reflect all the changes.
  *
- * 7. Write the new superblock.
+ * 5. Apply new log fnode, log is locked for a while.
  *
- * 8. Release the old log space.  Clean up.
+ * 6. Finalization. Clean up, old space release and total unlocking.
  */
 
 void BlueFS::_compact_log_async_LD_LNF_D() //also locks FW for new_writer
 {
   dout(10) << __func__ << dendl;
+  utime_t mtime = ceph_clock_now();
+  uint64_t starter_seq = 1;
+  uint64_t old_log_jump_to = 0;
+
+  // Part 0.
+  // Lock the log and forbid its expansion and other compactions
+
   // only one compaction allowed at one time
   bool old_is_comp = std::atomic_exchange(&log_is_compacting, true);
   if (old_is_comp) {
     dout(10) << __func__ << " ongoing" <<dendl;
     return;
   }
-
+  // lock log's run-time structures for a while
   log.lock.lock();
-  File *log_file = log.writer->file.get();
-  FileWriter *new_log_writer = nullptr;
-  FileRef new_log = nullptr;
-  uint64_t new_log_jump_to = 0;
-  uint64_t old_log_jump_to = 0;
-
-  new_log = ceph::make_ref<File>();
-  new_log->fnode.ino = 0;   // we use _flush_special to avoid log of the fnode
+  //signal _maybe_extend_log that expansion of log is temporary inacceptable
+  bool old_forbidden = atomic_exchange(&log_forbidden_to_expand, true);
+  ceph_assert(old_forbidden == false);
 
+  //
   // Part 1.
   // Prepare current log for jumping into it.
-  // 1. Allocate extent
-  // 2. Update op to log
-  // 3. Jump op to log
+  // 1.1. Allocate extent
+  // 1.2. Save log's fnode extents and add new extents
+  // 1.3. Update op to log
+  // 1.4. Jump op to log
   // During that, no one else can write to log, otherwise we risk jumping backwards.
   // We need to sync log, because we are injecting discontinuity, and writer is not prepared for that.
 
-  //signal _maybe_extend_log that expansion of log is temporary inacceptable
-  bool old_forbidden = atomic_exchange(&log_forbidden_to_expand, true);
-  ceph_assert(old_forbidden == false);
-
-  vselector->sub_usage(log_file->vselector_hint, log_file->fnode);
-
-  // 1.1 allocate new log space and jump to it.
+  // 1.1 allocate new log extents and store them at fnode_tail
+  File *log_file = log.writer->file.get();
   old_log_jump_to = log_file->fnode.get_allocated();
+  bluefs_fnode_t fnode_tail;
   uint64_t runway = log_file->fnode.get_allocated() - log.writer->get_effective_write_pos();
   dout(10) << __func__ << " old_log_jump_to 0x" << std::hex << old_log_jump_to
-           << " need 0x" << (old_log_jump_to + cct->_conf->bluefs_max_log_runway) << std::dec << dendl;
-  bluefs_fnode_t new_log_tail_fnode;
-  bluefs_fnode_t old_log_snapshot_fnode;
+           << " need 0x" << cct->_conf->bluefs_max_log_runway << std::dec << dendl;
   int r = _allocate(vselector->select_prefer_bdev(log_file->vselector_hint),
 		    cct->_conf->bluefs_max_log_runway,
-                    &new_log_tail_fnode);
+                    &fnode_tail);
   ceph_assert(r == 0);
-  old_log_snapshot_fnode.clone_extents(log_file->fnode);
-  log_file->fnode.clone_extents(new_log_tail_fnode);
 
+  // 1.2 save log's fnode extents and add new extents
+  bluefs_fnode_t old_log_fnode(log_file->fnode);
+  log_file->fnode.clone_extents(fnode_tail);
   //adjust usage as flush below will need it
+  vselector->sub_usage(log_file->vselector_hint, old_log_fnode);
   vselector->add_usage(log_file->vselector_hint, log_file->fnode);
   dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl;
 
-  // update the log file change and log a jump to the offset where we want to
+  // 1.3 update the log file change and log a jump to the offset where we want to
   // write the new entries
   log.t.op_file_update_inc(log_file->fnode);
-  // jump to new position should mean next seq
+
+  // 1.4 jump to new position should mean next seq
   log.t.op_jump(log.seq_live + 1, old_log_jump_to);
   uint64_t seq_now = log.seq_live;
   // we need to flush all bdev because we will be streaming all dirty files to log
@@ -2587,96 +2764,188 @@ void BlueFS::_compact_log_async_LD_LNF_D() //also locks FW for new_writer
   _flush_bdev();
   _flush_and_sync_log_jump_D(old_log_jump_to, runway);
 
-  // out of jump section
-
-  // 2. prepare compacted log
-  bluefs_transaction_t t;
-  t.seq = 1;
-  t.uuid = super.uuid;
-  t.op_init();
-  _compact_log_dump_metadata_NF(&t, 0, seq_now);
-
-  // now state is captured to bufferlist
-  // log can be used to write to, ops in log will be continuation of captured state
+  //
+  // Part 2.
+  // Build new log starter and compacted metadata body
+  // 2.1.  Build full compacted meta transaction.
+  //       While still holding the lock, encode a bluefs transaction
+  //       that dumps all of the in-memory fnodes and names.
+  //       This might be pretty large and its allocation map can exceed
+  //       superblock size. Hence we need a log starter part which
+  //       goes to superblock and refers to that new meta through op_update_inc.
+  // 2.2.  After releasing the lock allocate space for the above transaction
+  //       using its size estimation.
+  //       Then build tailing list of extents which consists of these
+  //       newly allocated extents followed by ones from Part 1.
+  // 2.3.  Allocate the space required for the starter part of the new log.
+  //       It should be small enough to fit into superblock.
+  //       Effectively we start building new log fnode here.
+  // 2.4.  Store starter fnode to run-time superblock, to be written out later
+  // 2.5.  Finalize new log's fnode building
+  //       This will include log's starter and tailing extents built at 2.2
+  // 2.6.  Encode new log fnode starter,
+  //       It will include op_init, new log's op_update_inc
+  //       and jump to the compacted meta transaction beginning.
+  //       Superblock will reference this starter part
+  // 2.7.  Encode compacted meta transaction,
+  //       extend the transaction with a jump to the log tail from 1.1 before
+  //       encoding.
+  //
+
+  // 2.1 Build full compacted meta transaction
+  bluefs_transaction_t compacted_meta_t;
+  _compact_log_dump_metadata_NF(starter_seq + 1, &compacted_meta_t, 0, seq_now);
+
+  // now state is captured to compacted_meta_t,
+  // current log can be used to write to,
+  //ops in log will be continuation of captured state
   log.lock.unlock();
 
-  new_log_jump_to = _estimate_transaction_size(&t);
-  //newly constructed log head will jump to what we had before
-  t.op_jump(seq_now, new_log_jump_to);
+  // 2.2 Allocate the space required for the compacted meta transaction
+  uint64_t compacted_meta_need = _estimate_transaction_size(&compacted_meta_t);
+  dout(20) << __func__ << " compacted_meta_need " << compacted_meta_need
+           << dendl;
+  {
+    bluefs_fnode_t fnode_pre_tail;
+    // do allocate
+    r = _allocate(vselector->select_prefer_bdev(log_file->vselector_hint),
+                  compacted_meta_need,
+                  &fnode_pre_tail);
+    ceph_assert(r == 0);
+    // build trailing list of extents in fnode_tail,
+    // this will include newly allocated extents for compacted meta
+    // and aux extents allocated at step 1.1
+    fnode_pre_tail.claim_extents(fnode_tail.extents);
+    fnode_tail.swap_extents(fnode_pre_tail);
+  }
 
-  // allocate
+  // 2.3 Allocate the space required for the starter part of the new log.
+  // Start building New log fnode
+  FileRef new_log = nullptr;
+  new_log = ceph::make_ref<File>();
+  new_log->fnode.ino = log_file->fnode.ino;
+  new_log->fnode.mtime = mtime;
+  // Estimate the required space
+  uint64_t starter_need =
+    _make_initial_transaction(starter_seq, fnode_tail, 0, nullptr);
+  // and now allocate and store at new_log_fnode
   r = _allocate(vselector->select_prefer_bdev(log_file->vselector_hint),
-                new_log_jump_to,
+                starter_need,
                 &new_log->fnode);
   ceph_assert(r == 0);
 
-  bufferlist bl;
-  encode(t, bl);
-  _pad_bl(bl);
+  // 2.4 Store starter fnode to run-time superblock, to be written out later
+  super.log_fnode = new_log->fnode;
 
-  dout(10) << __func__ << " new_log_jump_to 0x" << std::hex << new_log_jump_to
-	   << std::dec << dendl;
+  // 2.5 Finalize new log's fnode building
+  // start collecting new log fnode updates (to make op_update_inc later)
+  // since this point. This will include compacted meta from 2.2 and aux
+  // extents from 1.1.
+  new_log->fnode.reset_delta();
+  new_log->fnode.claim_extents(fnode_tail.extents);
 
-  new_log_writer = _create_writer(new_log);
+  // 2.6 Encode new log fnode
+  bufferlist starter_bl;
+  _make_initial_transaction(starter_seq, new_log->fnode, starter_need,
+    &starter_bl);
 
-  new_log_writer->append(bl);
-  // 3. flush
+  // 2.7 Encode compacted meta transaction,
+  dout(20) << __func__
+           << " new_log jump seq " << seq_now
+           << std::hex << " offset 0x" << starter_need + compacted_meta_need
+	   << std::dec << dendl;
+  // Extend compacted_meta transaction with a jump to the new log tail.
+  // Hopefully "compacted_meta_need" estimation provides enough extra space
+  // for this new jump, assert below if not
+  compacted_meta_t.op_jump(seq_now, starter_need + compacted_meta_need);
+  // Now do encoding and padding
+  bufferlist compacted_meta_bl;
+  compacted_meta_bl.reserve(compacted_meta_need);
+  encode(compacted_meta_t, compacted_meta_bl);
+  ceph_assert(compacted_meta_bl.length() <= compacted_meta_need);
+  _pad_bl(compacted_meta_bl, compacted_meta_need);
+
+  //
+  // Part 3.
+  // Write out new log's content
+  // 3.1 Stage new log's content writing
+  // 3.2 Do flush and wait for completion through flush_bdev()
+  //
+
+  // 3.1 Stage new log's content writing
+  // Make new log writer and append bufferlists to write out.
+  FileWriter *new_log_writer = _create_writer(new_log);
+  // And append all new log's bufferlists to write out.
+  new_log_writer->append(starter_bl);
+  new_log_writer->append(compacted_meta_bl);
+
+  // 3.2. flush and wait
   _flush_special(new_log_writer);
+  _flush_bdev(new_log_writer, false); // do not check log.lock is locked
 
-  // 4. wait
-  _flush_bdev(new_log_writer);
-  // 5. update our log fnode
-  // we need to append to new_log the extents that were allocated in step 1.1
-  new_log->fnode.claim_extents(new_log_tail_fnode.extents);
-  // we will write it to super
-  new_log->fnode.reset_delta();
+  // Part 4.
+  // Write out new superblock to reflect all the changes.
+  //
 
-  // 6. write the super block to reflect the changes
-  dout(10) << __func__ << " writing super" << dendl;
-  new_log->fnode.ino = log_file->fnode.ino;
-  new_log->fnode.size = 0;
-  new_log->fnode.mtime = ceph_clock_now();
-  super.log_fnode = new_log->fnode;
   _write_super(BDEV_DB);
   _flush_bdev();
 
+  // Part 5.
+  // Apply new log fnode
+  //
+
+  // we need to acquire log's lock back at this point
   log.lock.lock();
-  // swapping log_file and new_log, new log file is the log file now.
+  // Reconstruct actual log object from the new one.
   vselector->sub_usage(log_file->vselector_hint, log_file->fnode);
-  new_log->fnode.swap_extents(log_file->fnode);
-
-  log.writer->pos = log.writer->file->fnode.size =
-    log.writer->pos - old_log_jump_to + new_log_jump_to;
-
+  log_file->fnode.size =
+    log.writer->pos - old_log_jump_to + starter_need + compacted_meta_need;
+  log_file->fnode.mtime = std::max(mtime, log_file->fnode.mtime);
+  log_file->fnode.swap_extents(new_log->fnode);
+  // update log's writer
+  log.writer->pos = log.writer->file->fnode.size;
   vselector->add_usage(log_file->vselector_hint, log_file->fnode);
-
+  // and unlock
   log.lock.unlock();
 
+  // we're mostly done
+  dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl;
+  logger->inc(l_bluefs_log_compactions);
+
+  //Part 6.
+  // Finalization
+  // 6.1 Permit log's extension, forbidden at step 0.
+  //
+  // 6.2 Release the new log writer
+  //
+  // 6.3 Release old space
+  //
+  // 6.4. Enable other compactions
+  //
+
+  // 6.1 Permit log's extension, forbidden at step 0.
   old_forbidden = atomic_exchange(&log_forbidden_to_expand, false);
   ceph_assert(old_forbidden == true);
   //to wake up if someone was in need of expanding log
   log_cond.notify_all();
 
-  // 7. release old space
-  dout(10) << __func__
-           << " release old log extents " << old_log_snapshot_fnode.extents
-           << dendl;
+  // 6.2 Release the new log writer
+  _close_writer(new_log_writer);
+  new_log_writer = nullptr;
+  new_log = nullptr;
+
+  // 6.3 Release old space
   {
+    dout(10) << __func__
+             << " release old log extents " << old_log_fnode.extents
+             << dendl;
     std::lock_guard dl(dirty.lock);
-    for (auto& r : old_log_snapshot_fnode.extents) {
+    for (auto& r : old_log_fnode.extents) {
       dirty.pending_release[r.bdev].insert(r.offset, r.length);
     }
   }
 
-  // delete the new log, remove from the dirty files list
-  _close_writer(new_log_writer);
-  new_log_writer = nullptr;
-  new_log = nullptr;
-  log_cond.notify_all();
-
-  dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl;
-  logger->inc(l_bluefs_log_compactions);
-
+  // 6.4. Enable other compactions
   old_is_comp = atomic_exchange(&log_is_compacting, false);
   ceph_assert(old_is_comp);
 }
@@ -3378,12 +3647,14 @@ int BlueFS::fsync(FileWriter *h)/*_WF_WD_WLD_WLNF_WNF*/
 }
 
 // be careful - either h->file->lock or log.lock must be taken
-void BlueFS::_flush_bdev(FileWriter *h)
+void BlueFS::_flush_bdev(FileWriter *h, bool check_mutext_locked)
 {
-  if (h->file->fnode.ino > 1) {
-    ceph_assert(ceph_mutex_is_locked(h->lock));
-  } else if (h->file->fnode.ino == 1) {
-    ceph_assert(ceph_mutex_is_locked(log.lock));
+  if (check_mutext_locked) {
+    if (h->file->fnode.ino > 1) {
+      ceph_assert(ceph_mutex_is_locked(h->lock));
+    } else if (h->file->fnode.ino == 1) {
+      ceph_assert(ceph_mutex_is_locked(log.lock));
+    }
   }
   std::array<bool, MAX_BDEV> flush_devs = h->dirty_devs;
   h->dirty_devs.fill(false);
diff --git a/src/os/bluestore/BlueFS.h b/src/os/bluestore/BlueFS.h
index d960018e39881..eb85939eb1171 100644
--- a/src/os/bluestore/BlueFS.h
+++ b/src/os/bluestore/BlueFS.h
@@ -449,6 +449,10 @@ class BlueFS {
   int _flush_and_sync_log_LD(uint64_t want_seq = 0);
 
   uint64_t _estimate_transaction_size(bluefs_transaction_t* t);
+  uint64_t _make_initial_transaction(uint64_t start_seq,
+                                     bluefs_fnode_t& fnode,
+                                     uint64_t expected_final_size,
+                                     bufferlist* out);
   uint64_t _estimate_log_size_N();
   bool _should_start_compact_log_L_N();
 
@@ -458,9 +462,10 @@ class BlueFS {
     RENAME_SLOW2DB = 4,
     RENAME_DB2SLOW = 8,
   };
-  void _compact_log_dump_metadata_NF(bluefs_transaction_t *t,
-				 int flags,
-				 uint64_t capture_before_seq);
+  void _compact_log_dump_metadata_NF(uint64_t start_seq,
+                                     bluefs_transaction_t *t,
+				     int flags,
+				     uint64_t capture_before_seq);
 
   void _compact_log_sync_LNF_LD();
   void _compact_log_async_LD_LNF_D();
@@ -474,7 +479,7 @@ class BlueFS {
 
   //void _aio_finish(void *priv);
 
-  void _flush_bdev(FileWriter *h);
+  void _flush_bdev(FileWriter *h, bool check_mutex_locked = true);
   void _flush_bdev();  // this is safe to call without a lock
   void _flush_bdev(std::array<bool, MAX_BDEV>& dirty_bdevs);  // this is safe to call without a lock
 
diff --git a/src/os/bluestore/bluefs_types.h b/src/os/bluestore/bluefs_types.h
index cbf0ed1def939..d5d8ee5a62826 100644
--- a/src/os/bluestore/bluefs_types.h
+++ b/src/os/bluestore/bluefs_types.h
@@ -62,7 +62,7 @@ struct bluefs_fnode_t {
   uint64_t ino;
   uint64_t size;
   utime_t mtime;
-  uint8_t __unused__; // was prefer_bdev
+  uint8_t __unused__ = 0; // was prefer_bdev
   mempool::bluefs::vector<bluefs_extent_t> extents;
 
   // precalculated logical offsets for extents vector entries
@@ -72,7 +72,15 @@ struct bluefs_fnode_t {
   uint64_t allocated;
   uint64_t allocated_commited;
 
-  bluefs_fnode_t() : ino(0), size(0), __unused__(0), allocated(0), allocated_commited(0) {}
+  bluefs_fnode_t() : ino(0), size(0), allocated(0), allocated_commited(0) {}
+  bluefs_fnode_t(uint64_t _ino, uint64_t _size, utime_t _mtime) :
+    ino(_ino), size(_size), mtime(_mtime), allocated(0), allocated_commited(0) {}
+  bluefs_fnode_t(const bluefs_fnode_t& other) :
+    ino(other.ino), size(other.size), mtime(other.mtime),
+    allocated(other.allocated),
+    allocated_commited(other.allocated_commited) {
+    clone_extents(other);
+  }
 
   uint64_t get_allocated() const {
     return allocated;
@@ -111,7 +119,6 @@ struct bluefs_fnode_t {
     denc(v.extents, p);
     DENC_FINISH(p);
   }
-
   void reset_delta() {
     allocated_commited = allocated;
   }

From 228c0532d2886b7662343b68d4c27e6917e2c753 Mon Sep 17 00:00:00 2001
From: Igor Fedotov <igor.fedotov@croit.io>
Date: Wed, 9 Nov 2022 16:45:03 +0300
Subject: [PATCH 08/13] test/test_bluefs: get rid of build warning

Signed-off-by: Igor Fedotov <igor.fedotov@croit.io>
---
 src/test/objectstore/test_bluefs.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/test/objectstore/test_bluefs.cc b/src/test/objectstore/test_bluefs.cc
index ca0afa49f7fb9..0dda9b0c4224c 100644
--- a/src/test/objectstore/test_bluefs.cc
+++ b/src/test/objectstore/test_bluefs.cc
@@ -234,7 +234,7 @@ TEST(BlueFS, very_large_write) {
     delete h;
     ASSERT_EQ(0, fs.open_for_read("dir", "bigfile", &h));
     ASSERT_EQ(h->file->fnode.size, total_written);
-    unique_ptr<char> huge_buf(new char[h->file->fnode.size]);
+    auto huge_buf = std::make_unique<char[]>(h->file->fnode.size);
     auto l = h->file->fnode.size;
     int64_t r = fs.read(h, 0, l, NULL, huge_buf.get());
     ASSERT_EQ(r, l);

From d4a556128e2df1a495dd7897c6a1b0c66a285aa4 Mon Sep 17 00:00:00 2001
From: Igor Fedotov <igor.fedotov@croit.io>
Date: Wed, 9 Nov 2022 17:36:07 +0300
Subject: [PATCH 09/13] os/bluestore: new BlueFS perf counters on compaction.

Signed-off-by: Igor Fedotov <igor.fedotov@croit.io>
---
 src/os/bluestore/BlueFS.cc | 23 +++++++++++++++++++++++
 src/os/bluestore/BlueFS.h  |  2 ++
 2 files changed, 25 insertions(+)

diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc
index 064b058e4f404..457dda8a2a358 100644
--- a/src/os/bluestore/BlueFS.cc
+++ b/src/os/bluestore/BlueFS.cc
@@ -369,6 +369,14 @@ void BlueFS::_init_logger()
 		    "Bytes requested in prefetch read mode",
 		     NULL,
 		    PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+ b.add_time_avg     (l_bluefs_compaction_lat, "compact_lat",
+                    "Average bluefs log compaction latency",
+                    "c__t",
+                    PerfCountersBuilder::PRIO_INTERESTING);
+ b.add_time_avg     (l_bluefs_compaction_lock_lat, "compact_lock_lat",
+                    "Average lock duration while compacting bluefs log",
+                    "c_lt",
+                    PerfCountersBuilder::PRIO_INTERESTING);
   b.add_u64(l_bluefs_read_zeros_candidate, "read_zeros_candidate",
 	    "How many times bluefs read found page with all 0s");
   b.add_u64(l_bluefs_read_zeros_errors, "read_zeros_errors",
@@ -2478,6 +2486,7 @@ void BlueFS::_rewrite_log_and_layout_sync_LNF_LD(bool allocate_with_fallback,
   // Part 0.
   // Lock the log totally till the end of the procedure
   std::lock_guard ll(log.lock);
+  auto t0 = mono_clock::now();
 
   File *log_file = log.writer->file.get();
   bluefs_fnode_t fnode_tail;
@@ -2671,6 +2680,7 @@ void BlueFS::_rewrite_log_and_layout_sync_LNF_LD(bool allocate_with_fallback,
       dirty.pending_release[r.bdev].insert(r.offset, r.length);
     }
   }
+  logger->tinc(l_bluefs_compaction_lock_lat, mono_clock::now() - t0);
 }
 
 /*
@@ -2717,6 +2727,16 @@ void BlueFS::_compact_log_async_LD_LNF_D() //also locks FW for new_writer
   }
   // lock log's run-time structures for a while
   log.lock.lock();
+  auto t0 = mono_clock::now();
+
+  // Part 1.
+  // Prepare current log for jumping into it.
+  // 1. Allocate extent
+  // 2. Update op to log
+  // 3. Jump op to log
+  // During that, no one else can write to log, otherwise we risk jumping backwards.
+  // We need to sync log, because we are injecting discontinuity, and writer is not prepared for that.
+
   //signal _maybe_extend_log that expansion of log is temporary inacceptable
   bool old_forbidden = atomic_exchange(&log_forbidden_to_expand, true);
   ceph_assert(old_forbidden == false);
@@ -2799,6 +2819,7 @@ void BlueFS::_compact_log_async_LD_LNF_D() //also locks FW for new_writer
   // now state is captured to compacted_meta_t,
   // current log can be used to write to,
   //ops in log will be continuation of captured state
+  logger->tinc(l_bluefs_compaction_lock_lat, mono_clock::now() - t0);
   log.lock.unlock();
 
   // 2.2 Allocate the space required for the compacted meta transaction
@@ -3863,11 +3884,13 @@ void BlueFS::_maybe_compact_log_LNF_NF_LD_D()
 {
   if (!cct->_conf->bluefs_replay_recovery_disable_compact &&
       _should_start_compact_log_L_N()) {
+    auto t0 = mono_clock::now();
     if (cct->_conf->bluefs_compact_log_sync) {
       _compact_log_sync_LNF_LD();
     } else {
       _compact_log_async_LD_LNF_D();
     }
+    logger->tinc(l_bluefs_compaction_lat, mono_clock::now() - t0);
   }
 }
 
diff --git a/src/os/bluestore/BlueFS.h b/src/os/bluestore/BlueFS.h
index eb85939eb1171..458a39cf58b8d 100644
--- a/src/os/bluestore/BlueFS.h
+++ b/src/os/bluestore/BlueFS.h
@@ -61,6 +61,8 @@ enum {
   l_bluefs_read_disk_bytes_slow,
   l_bluefs_read_prefetch_count,
   l_bluefs_read_prefetch_bytes,
+  l_bluefs_compaction_lat,
+  l_bluefs_compaction_lock_lat,
   l_bluefs_read_zeros_candidate,
   l_bluefs_read_zeros_errors,
 

From e5b7ba93191ad41c4a4beae802b7e86c8febec04 Mon Sep 17 00:00:00 2001
From: Igor Fedotov <igor.fedotov@croit.io>
Date: Wed, 9 Nov 2022 17:58:52 +0300
Subject: [PATCH 10/13] os/bluestore: output cosmetics for BlueFS

This includes finer position specification during replay
and logging read size in hex.

Signed-off-by: Igor Fedotov <igor.fedotov@croit.io>
---
 src/os/bluestore/BlueFS.cc | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc
index 457dda8a2a358..529a34fc22972 100644
--- a/src/os/bluestore/BlueFS.cc
+++ b/src/os/bluestore/BlueFS.cc
@@ -1310,7 +1310,9 @@ int BlueFS::_replay(bool noop, bool to_stdout)
     }
 
     auto p = t.op_bl.cbegin();
+    auto pos0 = pos;
     while (!p.end()) {
+      pos = pos0 + p.get_off();
       __u8 op;
       decode(op, p);
       switch (op) {
@@ -2132,7 +2134,9 @@ int64_t BlueFS::_read_random(
       buf->pos += r;
     }
   }
-  dout(20) << __func__ << " got " << ret << dendl;
+  dout(20) << __func__ << std::hex
+           << " got 0x" << ret
+           << std::dec  << dendl;
   --h->file->num_reading;
   return ret;
 }
@@ -2257,7 +2261,9 @@ int64_t BlueFS::_read(
     buf->pos += r;
   }
 
-  dout(20) << __func__ << " got " << ret << dendl;
+  dout(20) << __func__ << std::hex
+           << " got 0x" << ret
+           << std::dec  << dendl;
   ceph_assert(!outbl || (int)outbl->length() == ret);
   --h->file->num_reading;
   return ret;

From 001b08d0b755a855f19f15b84104c7eb3d367c60 Mon Sep 17 00:00:00 2001
From: Igor Fedotov <igor.fedotov@croit.io>
Date: Fri, 11 Nov 2022 01:06:15 +0300
Subject: [PATCH 11/13] os/bluestore: support main/slow device's alloc unit for
 BlueFS.

This effectively enables 4K allocation units for BlueFS.
They are not turned on by default for the sake of performance, but a main
device that lacks enough free large contiguous extents can still trigger
them through the allocation-unit fallback.

Signed-off-by: Igor Fedotov <igor.fedotov@croit.io>
---
 src/os/bluestore/BlueFS.cc          | 122 ++++++++++-----
 src/os/bluestore/BlueFS.h           |  11 +-
 src/os/bluestore/BlueStore.cc       |   4 +-
 src/test/objectstore/test_bluefs.cc | 222 +++++++++++++++++++++++++++-
 4 files changed, 313 insertions(+), 46 deletions(-)

diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc
index 529a34fc22972..326cbce5472ce 100644
--- a/src/os/bluestore/BlueFS.cc
+++ b/src/os/bluestore/BlueFS.cc
@@ -377,6 +377,16 @@ void BlueFS::_init_logger()
                     "Average lock duration while compacting bluefs log",
                     "c_lt",
                     PerfCountersBuilder::PRIO_INTERESTING);
+  b.add_u64_counter(l_bluefs_alloc_shared_dev_fallbacks, "alloc_slow_fallback",
+		    "Amount of allocations that required fallback to "
+                    " slow/shared device",
+		     "asdf",
+		    PerfCountersBuilder::PRIO_USEFUL);
+  b.add_u64_counter(l_bluefs_alloc_shared_size_fallbacks, "alloc_slow_size_fallback",
+		    "Amount of allocations that required fallback to shared device's "
+                    "regular unit size",
+		     "assf",
+		    PerfCountersBuilder::PRIO_USEFUL);
   b.add_u64(l_bluefs_read_zeros_candidate, "read_zeros_candidate",
 	    "How many times bluefs read found page with all 0s");
   b.add_u64(l_bluefs_read_zeros_errors, "read_zeros_errors",
@@ -586,6 +596,7 @@ int BlueFS::mkfs(uuid_d osd_uuid, const bluefs_layout_t& layout)
   int r = _allocate(
     vselector->select_prefer_bdev(log_file->vselector_hint),
     cct->_conf->bluefs_max_log_runway,
+    0,
     &log_file->fnode);
   vselector->add_usage(log_file->vselector_hint, log_file->fnode);
   ceph_assert(r == 0);
@@ -1098,12 +1109,17 @@ int BlueFS::_check_allocations(const bluefs_fnode_t& fnode,
     auto id = e.bdev;
     bool fail = false;
     ceph_assert(id < MAX_BDEV);
+    ceph_assert(bdev[id]);
+    // let's use minimal allocation unit we can have
+    auto alloc_unit = bdev[id]->get_block_size();
+
     if (int r = _verify_alloc_granularity(id, e.offset, e.length,
+                                          alloc_unit,
 					  op_name); r < 0) {
       return r;
     }
 
-    apply_for_bitset_range(e.offset, e.length, alloc_size[id], used_blocks[id],
+    apply_for_bitset_range(e.offset, e.length, alloc_unit, used_blocks[id],
       [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) {
 	if (is_alloc == bs.test(pos)) {
 	  fail = true;
@@ -1125,31 +1141,14 @@ int BlueFS::_check_allocations(const bluefs_fnode_t& fnode,
 }
 
 int BlueFS::_verify_alloc_granularity(
-  __u8 id, uint64_t offset, uint64_t length, const char *op)
+  __u8 id, uint64_t offset, uint64_t length, uint64_t alloc_unit, const char *op)
 {
-  if ((offset & (alloc_size[id] - 1)) ||
-      (length & (alloc_size[id] - 1))) {
+  if ((offset & (alloc_unit - 1)) ||
+      (length & (alloc_unit - 1))) {
     derr << __func__ << " " << op << " of " << (int)id
 	 << ":0x" << std::hex << offset << "~" << length << std::dec
 	 << " does not align to alloc_size 0x"
-	 << std::hex << alloc_size[id] << std::dec << dendl;
-    // be helpful
-    auto need = alloc_size[id];
-    while (need && ((offset & (need - 1)) ||
-		    (length & (need - 1)))) {
-      need >>= 1;
-    }
-    if (need) {
-      const char *which;
-      if (id == BDEV_SLOW ||
-	  (id == BDEV_DB && !bdev[BDEV_SLOW])) {
-	which = "bluefs_shared_alloc_size";
-      } else {
-	which = "bluefs_alloc_size";
-      }
-      derr << "work-around by setting " << which << " = " << need
-	   << " for this OSD" << dendl;
-    }
+	 << std::hex << alloc_unit << std::dec << dendl;
     return -EFAULT;
   }
   return 0;
@@ -1186,8 +1185,11 @@ int BlueFS::_replay(bool noop, bool to_stdout)
   if (!noop) {
     if (cct->_conf->bluefs_log_replay_check_allocations) {
       for (size_t i = 0; i < MAX_BDEV; ++i) {
-	if (alloc_size[i] != 0 && bdev[i] != nullptr) {
-	  used_blocks[i].resize(round_up_to(bdev[i]->get_size(), alloc_size[i]) / alloc_size[i]);
+	if (bdev[i] != nullptr) {
+          // let's use minimal allocation unit we can have
+          auto au = bdev[i]->get_block_size();
+          //hmm... on 32TB/4K drive this would take 1GB RAM!!!
+	  used_blocks[i].resize(round_up_to(bdev[i]->get_size(), au) / au);
 	}
       }
       // check initial log layout
@@ -1767,7 +1769,7 @@ int BlueFS::device_migrate_to_existing(
       }
 
       // write entire file
-      auto l = _allocate_without_fallback(dev_target, bl.length(),
+      auto l = _allocate_without_fallback(dev_target, bl.length(), 0,
         &file_ref->fnode);
       if (l < 0) {
 	derr << __func__ << " unable to allocate len 0x" << std::hex
@@ -1907,7 +1909,7 @@ int BlueFS::device_migrate_to_new(
       }
 
       // write entire file
-      auto l = _allocate_without_fallback(dev_target, bl.length(),
+      auto l = _allocate_without_fallback(dev_target, bl.length(), 0,
         &file_ref->fnode);
       if (l < 0) {
 	derr << __func__ << " unable to allocate len 0x" << std::hex
@@ -2551,8 +2553,8 @@ void BlueFS::_rewrite_log_and_layout_sync_LNF_LD(bool allocate_with_fallback,
   dout(20) << __func__ << " compacted_meta_need " << compacted_meta_need << dendl;
 
   int r = allocate_with_fallback ?
-    _allocate(log_dev, compacted_meta_need, &fnode_tail) :
-    _allocate_without_fallback(log_dev, compacted_meta_need, &fnode_tail);
+    _allocate(log_dev, compacted_meta_need, 0, &fnode_tail) :
+    _allocate_without_fallback(log_dev, compacted_meta_need, 0, &fnode_tail);
   ceph_assert(r == 0);
 
 
@@ -2563,8 +2565,8 @@ void BlueFS::_rewrite_log_and_layout_sync_LNF_LD(bool allocate_with_fallback,
 
   bluefs_fnode_t fnode_starter(log_file->fnode.ino, 0, mtime);
   r = allocate_with_fallback ?
-    _allocate(log_dev, starter_need, &fnode_starter) :
-    _allocate_without_fallback(log_dev, starter_need, &fnode_starter);
+    _allocate(log_dev, starter_need, 0, &fnode_starter) :
+    _allocate_without_fallback(log_dev, starter_need, 0, &fnode_starter);
   ceph_assert(r == 0);
 
   // 1.4 Building starter fnode
@@ -2766,6 +2768,7 @@ void BlueFS::_compact_log_async_LD_LNF_D() //also locks FW for new_writer
            << " need 0x" << cct->_conf->bluefs_max_log_runway << std::dec << dendl;
   int r = _allocate(vselector->select_prefer_bdev(log_file->vselector_hint),
 		    cct->_conf->bluefs_max_log_runway,
+                    0,
                     &fnode_tail);
   ceph_assert(r == 0);
 
@@ -2837,6 +2840,7 @@ void BlueFS::_compact_log_async_LD_LNF_D() //also locks FW for new_writer
     // do allocate
     r = _allocate(vselector->select_prefer_bdev(log_file->vselector_hint),
                   compacted_meta_need,
+                  0,
                   &fnode_pre_tail);
     ceph_assert(r == 0);
     // build trailing list of extents in fnode_tail,
@@ -2858,6 +2862,7 @@ void BlueFS::_compact_log_async_LD_LNF_D() //also locks FW for new_writer
   // and now allocate and store at new_log_fnode
   r = _allocate(vselector->select_prefer_bdev(log_file->vselector_hint),
                 starter_need,
+                0,
                 &new_log->fnode);
   ceph_assert(r == 0);
 
@@ -3065,6 +3070,7 @@ int64_t BlueFS::_maybe_extend_log()
     int r = _allocate(
       vselector->select_prefer_bdev(log.writer->file->vselector_hint),
       cct->_conf->bluefs_max_log_runway,
+      0,
       &log.writer->file->fnode);
     ceph_assert(r == 0);
     vselector->add_usage(log.writer->file->vselector_hint, log.writer->file->fnode);
@@ -3366,6 +3372,7 @@ int BlueFS::_flush_range_F(FileWriter *h, uint64_t offset, uint64_t length)
     // in _flush_and_sync_log.
     int r = _allocate(vselector->select_prefer_bdev(h->file->vselector_hint),
 		      offset + length - allocated,
+                      0,
 		      &h->file->fnode);
     if (r < 0) {
       derr << __func__ << " allocated: 0x" << std::hex << allocated
@@ -3727,18 +3734,24 @@ const char* BlueFS::get_device_name(unsigned id)
 }
 
 int BlueFS::_allocate_without_fallback(uint8_t id, uint64_t len,
+                      uint64_t alloc_unit,
 		      bluefs_fnode_t* node)
 {
-  dout(10) << __func__ << " len 0x" << std::hex << len << std::dec
+  dout(10) << __func__ << " len 0x" << std::hex << len
+           << " alloc_unit hint 0x " << alloc_unit
+           << std::dec
            << " from " << (int)id << dendl;
   assert(id < alloc.size());
   if (!alloc[id]) {
     return -ENOENT;
   }
+  if (!alloc_unit) {
+   alloc_unit = alloc_size[id];
+  }
   PExtentVector extents;
   extents.reserve(4);  // 4 should be (more than) enough for most allocations
-  int64_t need = round_up_to(len, alloc_size[id]);
-  int64_t alloc_len = alloc[id]->allocate(need, alloc_size[id], 0, &extents);
+  int64_t need = round_up_to(len, alloc_unit);
+  int64_t alloc_len = alloc[id]->allocate(need, alloc_unit, 0, &extents);
   if (alloc_len < 0 || alloc_len < need) {
     if (alloc_len > 0) {
       alloc[id]->release(extents);
@@ -3754,6 +3767,15 @@ int BlueFS::_allocate_without_fallback(uint8_t id, uint64_t len,
          << ", fragmentation " << alloc[id]->get_fragmentation()
          << ", allocated 0x" << (alloc_len > 0 ? alloc_len : 0)
 	 << std::dec << dendl;
+    if (is_shared_alloc(id) && alloc_unit != shared_alloc->alloc_unit) {
+      // fallback to shared alloc unit is permitted though
+      alloc_unit = shared_alloc->alloc_unit;
+      dout(20) << __func__ << " fallback to bdev "
+	       << (int)id
+               << " with alloc unit 0x" << std::hex << alloc_unit
+               << std::dec << dendl;
+      return _allocate_without_fallback(id, len, alloc_unit, node);
+    }
     alloc[id]->dump();
     return -ENOSPC;
   }
@@ -3768,22 +3790,27 @@ int BlueFS::_allocate_without_fallback(uint8_t id, uint64_t len,
 }
 
 int BlueFS::_allocate(uint8_t id, uint64_t len,
+		      uint64_t alloc_unit,
 		      bluefs_fnode_t* node)
 {
-  dout(10) << __func__ << " len 0x" << std::hex << len << std::dec
-           << " from " << (int)id << dendl;
+  dout(10) << __func__ << " len 0x" << std::hex << len
+           << " alloc unit hint 0x" << alloc_unit
+           << std::dec << " from " << (int)id << dendl;
   ceph_assert(id < alloc.size());
   int64_t alloc_len = 0;
   PExtentVector extents;
   uint64_t hint = 0;
   int64_t need = len;
   if (alloc[id]) {
-    need = round_up_to(len, alloc_size[id]);
+    if (!alloc_unit) {
+      alloc_unit = alloc_size[id];
+    }
+    need = round_up_to(len, alloc_unit);
     if (!node->extents.empty() && node->extents.back().bdev == id) {
       hint = node->extents.back().end();
     }   
     extents.reserve(4);  // 4 should be (more than) enough for most allocations
-    alloc_len = alloc[id]->allocate(need, alloc_size[id], hint, &extents);
+    alloc_len = alloc[id]->allocate(need, alloc_unit, hint, &extents);
   }
   if (alloc_len < 0 || alloc_len < need) {
     if (alloc[id]) {
@@ -3796,20 +3823,32 @@ int BlueFS::_allocate(uint8_t id, uint64_t len,
               << ", allocator type " << alloc[id]->get_type()
               << ", capacity 0x" << alloc[id]->get_capacity()
               << ", block size 0x" << alloc[id]->get_block_size()
-              << ", alloc size 0x" << alloc_size[id]
+              << ", alloc unit 0x" << alloc_unit
               << ", free 0x" << alloc[id]->get_free()
               << ", fragmentation " << alloc[id]->get_fragmentation()
               << ", allocated 0x" << (alloc_len > 0 ? alloc_len : 0)
 	      << std::dec << dendl;
     } else {
-      dout(20) << __func__ << " alloc-id not set on index="<< (int)id << " unable to allocate 0x" << std::hex << need
+      dout(20) << __func__ << " alloc-id not set on index="<< (int)id
+               << " unable to allocate 0x" << std::hex << need
 	       << " on bdev " << (int)id << std::dec << dendl;
     }
-    if (id != BDEV_SLOW) {
+    if (alloc[id] && is_shared_alloc(id) && alloc_unit != shared_alloc->alloc_unit) {
+      alloc_unit = shared_alloc->alloc_unit;
+      dout(20) << __func__ << " fallback to bdev "
+	       << (int)id
+               << " with alloc unit 0x" << std::hex << alloc_unit
+               << std::dec << dendl;
+      logger->inc(l_bluefs_alloc_shared_size_fallbacks);
+      return _allocate(id, len, alloc_unit, node);
+    } else if (id != BDEV_SLOW && alloc[id + 1]) {
       dout(20) << __func__ << " fallback to bdev "
 	       << (int)id + 1
 	       << dendl;
-      return _allocate(id + 1, len, node);
+      if (alloc[id] && is_shared_alloc(id + 1)) {
+        logger->inc(l_bluefs_alloc_shared_dev_fallbacks);
+      }
+      return _allocate(id + 1, len, 0, node); // back to default alloc unit
     } else {
       derr << __func__ << " allocation failed, needed 0x" << std::hex << need
            << dendl;
@@ -3851,6 +3890,7 @@ int BlueFS::preallocate(FileRef f, uint64_t off, uint64_t len)/*_LF*/
     vselector->sub_usage(f->vselector_hint, f->fnode);
     int r = _allocate(vselector->select_prefer_bdev(f->vselector_hint),
       want,
+      0,
       &f->fnode);
     vselector->add_usage(f->vselector_hint, f->fnode);
     if (r < 0)
diff --git a/src/os/bluestore/BlueFS.h b/src/os/bluestore/BlueFS.h
index 458a39cf58b8d..88e47d9f8d97f 100644
--- a/src/os/bluestore/BlueFS.h
+++ b/src/os/bluestore/BlueFS.h
@@ -63,9 +63,10 @@ enum {
   l_bluefs_read_prefetch_bytes,
   l_bluefs_compaction_lat,
   l_bluefs_compaction_lock_lat,
+  l_bluefs_alloc_shared_dev_fallbacks,
+  l_bluefs_alloc_shared_size_fallbacks,
   l_bluefs_read_zeros_candidate,
   l_bluefs_read_zeros_errors,
-
   l_bluefs_last,
 };
 
@@ -94,16 +95,19 @@ class BlueFSVolumeSelector {
 struct bluefs_shared_alloc_context_t {
   bool need_init = false;
   Allocator* a = nullptr;
+  uint64_t alloc_unit = 0; // allocation unit recorded by set(); 0 by default and after reset()
 
   std::atomic<uint64_t> bluefs_used = 0;
 
-  void set(Allocator* _a) {
+  void set(Allocator* _a, uint64_t _au) { // attach allocator _a and remember the allocation unit _au passed with it
     a = _a;
+    alloc_unit = _au;
     need_init = true;
     bluefs_used = 0;
   }
   void reset() {
     a = nullptr;
+    alloc_unit = 0; // NOTE(review): need_init and bluefs_used are left untouched by reset() - confirm intended
   }
 };
 
@@ -421,8 +425,10 @@ class BlueFS {
   }
   const char* get_device_name(unsigned id);
   int _allocate(uint8_t bdev, uint64_t len,
+                uint64_t alloc_unit,
 		bluefs_fnode_t* node);
   int _allocate_without_fallback(uint8_t id, uint64_t len,
+				 uint64_t alloc_unit,
 				 bluefs_fnode_t* node);
 
   /* signal replay log to include h->file in nearest log flush */
@@ -508,6 +514,7 @@ class BlueFS {
     const char* op_name);
   int _verify_alloc_granularity(
     __u8 id, uint64_t offset, uint64_t length,
+    uint64_t alloc_unit,
     const char *op);
   int _replay(bool noop, bool to_stdout = false); ///< replay journal
 
diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc
index 0d5a291761e07..71b85217d221d 100644
--- a/src/os/bluestore/BlueStore.cc
+++ b/src/os/bluestore/BlueStore.cc
@@ -5840,12 +5840,12 @@ int BlueStore::_create_alloc()
       delete alloc;
       return -EINVAL;
     }
-    shared_alloc.set(a);
+    shared_alloc.set(a, alloc_size);
   } else
 #endif
   {
     // BlueFS will share the same allocator
-    shared_alloc.set(alloc);
+    shared_alloc.set(alloc, alloc_size);
   }
 
   return 0;
diff --git a/src/test/objectstore/test_bluefs.cc b/src/test/objectstore/test_bluefs.cc
index 0dda9b0c4224c..f665ca1f82ee7 100644
--- a/src/test/objectstore/test_bluefs.cc
+++ b/src/test/objectstore/test_bluefs.cc
@@ -10,13 +10,14 @@
 #include <random>
 #include <thread>
 #include <stack>
+#include <gtest/gtest.h>
 #include "global/global_init.h"
 #include "common/ceph_argparse.h"
 #include "include/stringify.h"
 #include "include/scope_guard.h"
 #include "common/errno.h"
-#include <gtest/gtest.h>
 
+#include "os/bluestore/Allocator.h"
 #include "os/bluestore/BlueFS.h"
 
 using namespace std;
@@ -1111,6 +1112,225 @@ TEST(BlueFS, truncate_fsync) {
   }
 }
 
+TEST(BlueFS, test_shared_alloc) { // DB device plus shared SLOW device: expects allocations to fall back to the shared device
+  uint64_t size = 1048576 * 128; // 128 MiB backing the SLOW (shared) device
+  TempBdev bdev_slow{size};
+  uint64_t size_db = 1048576 * 8; // 8 MiB backing the DB device
+  TempBdev bdev_db{size_db};
+
+  ConfSaver conf(g_ceph_context->_conf);
+  conf.SetVal("bluefs_shared_alloc_size", "1048576"); // 1 MiB, i.e. larger than the 4 KiB unit of the shared allocator below
+
+  bluefs_shared_alloc_context_t shared_alloc;
+  uint64_t shared_alloc_unit = 4096; // unit the shared allocator is created with and that is recorded in shared_alloc
+  shared_alloc.set( // NOTE(review): nothing in this test deletes the Allocator created here - confirm who owns it
+    Allocator::create(g_ceph_context, g_ceph_context->_conf->bluefs_allocator,
+                      size, shared_alloc_unit, 0, 0, "test shared allocator"),
+    shared_alloc_unit);
+  shared_alloc.a->init_add_free(0, size); // the whole [0, size) range starts out free
+
+  BlueFS fs(g_ceph_context);
+  // DB device is fully utilized (NOTE(review): the 4th argument presumably reserves all but 0x1000 bytes - confirm against add_block_device)
+  ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev_db.path, false, size_db - 0x1000));
+  ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_SLOW, bdev_slow.path, false, 0,
+                                   &shared_alloc));
+  uuid_d fsid;
+  ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false }));
+  ASSERT_EQ(0, fs.mount());
+  ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false }));
+  { // populate: 10 directories x 10 files, each file gets 4096 bytes appended and fsync'ed
+    for (int i=0; i<10; i++) {
+       string dir = "dir.";
+       dir.append(to_string(i));
+       ASSERT_EQ(0, fs.mkdir(dir));
+       for (int j=0; j<10; j++) {
+          string file = "file.";
+	  file.append(to_string(j));
+          BlueFS::FileWriter *h;
+          ASSERT_EQ(0, fs.open_for_write(dir, file, &h, false));
+          ASSERT_NE(nullptr, h);
+          auto sg = make_scope_guard([&fs, h] { fs.close_writer(h); }); // closes the writer when this iteration's scope ends
+          bufferlist bl;
+          std::unique_ptr<char[]> buf = gen_buffer(4096);
+	  bufferptr bp = buffer::claim_char(4096, buf.get());
+	  bl.push_back(bp);
+          h->append(bl.c_str(), bl.length());
+          fs.fsync(h);
+       }
+    }
+  }
+  { // remove every even-numbered directory together with all of its files
+    for (int i=0; i<10; i+=2) {
+       string dir = "dir.";
+       dir.append(to_string(i));
+       for (int j=0; j<10; j++) {
+          string file = "file.";
+	  file.append(to_string(j));
+          fs.unlink(dir, file);
+	  fs.sync_metadata(false);
+       }
+       ASSERT_EQ(0, fs.rmdir(dir));
+       fs.sync_metadata(false);
+    }
+  }
+  fs.compact_log();
+  auto *logger = fs.get_perf_counters();
+  ASSERT_NE(logger->get(l_bluefs_alloc_shared_dev_fallbacks), 0); // at least one device fallback must have been counted
+  auto num_files = logger->get(l_bluefs_num_files);
+  fs.umount(); // remount: the file counter must read the same afterwards
+  fs.mount();
+  ASSERT_EQ(num_files, logger->get(l_bluefs_num_files));
+  fs.umount();
+}
+
+TEST(BlueFS, test_shared_alloc_sparse) { // single shared device with fragmented free space: expects alloc-unit fallbacks
+  uint64_t size = 1048576 * 128 * 2; // 256 MiB device
+  uint64_t main_unit = 4096; // unit the shared allocator is created with
+  uint64_t bluefs_alloc_unit = 1048576; // value written to bluefs_shared_alloc_size below
+  TempBdev bdev_slow{size};
+
+  ConfSaver conf(g_ceph_context->_conf);
+  conf.SetVal("bluefs_shared_alloc_size",
+    stringify(bluefs_alloc_unit).c_str());
+
+  bluefs_shared_alloc_context_t shared_alloc;
+  shared_alloc.set( // NOTE(review): nothing in this test deletes the Allocator created here - confirm who owns it
+    Allocator::create(g_ceph_context, g_ceph_context->_conf->bluefs_allocator,
+                      size, main_unit, 0, 0, "test shared allocator"),
+    main_unit);
+  // prepare sparse free space but let's have a contiguous chunk at
+  // the beginning to fit initial log's fnode into superblock,
+  // we don't have any tricks to deal with sparse allocations
+  // (and hence long fnode) at mkfs
+  shared_alloc.a->init_add_free(bluefs_alloc_unit, 4 * bluefs_alloc_unit); // 4 MiB free starting at offset 1 MiB
+  for(uint64_t i = 5 * bluefs_alloc_unit; i < size; i += 2 * main_unit) { // from 5 MiB on, only every other 4 KiB block is free
+    shared_alloc.a->init_add_free(i, main_unit);
+  }
+
+  BlueFS fs(g_ceph_context);
+  ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev_slow.path, false, 0,
+                                   &shared_alloc));
+  uuid_d fsid;
+  ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false }));
+  ASSERT_EQ(0, fs.mount());
+  ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false }));
+  { // populate: 10 directories x 10 files, each file gets 4096 bytes appended and fsync'ed
+    for (int i=0; i<10; i++) {
+       string dir = "dir.";
+       dir.append(to_string(i));
+       ASSERT_EQ(0, fs.mkdir(dir));
+       for (int j=0; j<10; j++) {
+          string file = "file.";
+	  file.append(to_string(j));
+          BlueFS::FileWriter *h;
+          ASSERT_EQ(0, fs.open_for_write(dir, file, &h, false));
+          ASSERT_NE(nullptr, h);
+          auto sg = make_scope_guard([&fs, h] { fs.close_writer(h); }); // closes the writer when this iteration's scope ends
+          bufferlist bl;
+          std::unique_ptr<char[]> buf = gen_buffer(4096);
+	  bufferptr bp = buffer::claim_char(4096, buf.get());
+	  bl.push_back(bp);
+          h->append(bl.c_str(), bl.length());
+          fs.fsync(h);
+       }
+    }
+  }
+  { // remove every even-numbered directory together with all of its files
+    for (int i=0; i<10; i+=2) {
+       string dir = "dir.";
+       dir.append(to_string(i));
+       for (int j=0; j<10; j++) {
+          string file = "file.";
+	  file.append(to_string(j));
+          fs.unlink(dir, file);
+	  fs.sync_metadata(false);
+       }
+       ASSERT_EQ(0, fs.rmdir(dir));
+       fs.sync_metadata(false);
+    }
+  }
+  fs.compact_log();
+  auto *logger = fs.get_perf_counters();
+  ASSERT_NE(logger->get(l_bluefs_alloc_shared_size_fallbacks), 0); // at least one alloc-unit fallback must have been counted
+  auto num_files = logger->get(l_bluefs_num_files);
+  fs.umount(); // remount: the file counter must read the same afterwards
+
+  fs.mount();
+  ASSERT_EQ(num_files, logger->get(l_bluefs_num_files));
+  fs.umount();
+}
+
+TEST(BlueFS, test_4k_shared_alloc) { // BlueFS unit equals the shared allocator's 4 KiB unit: expects no fallbacks at all
+  uint64_t size = 1048576 * 128 * 2; // 256 MiB device
+  uint64_t main_unit = 4096; // unit the shared allocator is created with
+  uint64_t bluefs_alloc_unit = main_unit; // bluefs_shared_alloc_size is set to the same 4 KiB value
+  TempBdev bdev_slow{size};
+
+  ConfSaver conf(g_ceph_context->_conf);
+  conf.SetVal("bluefs_shared_alloc_size",
+    stringify(bluefs_alloc_unit).c_str());
+
+  bluefs_shared_alloc_context_t shared_alloc;
+  shared_alloc.set( // NOTE(review): nothing in this test deletes the Allocator created here - confirm who owns it
+    Allocator::create(g_ceph_context, g_ceph_context->_conf->bluefs_allocator,
+                      size, main_unit, 0, 0, "test shared allocator"),
+    main_unit);
+  shared_alloc.a->init_add_free(bluefs_alloc_unit, size - bluefs_alloc_unit); // everything after the first 4 KiB is free
+
+  BlueFS fs(g_ceph_context);
+  ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev_slow.path, false, 0,
+                                   &shared_alloc));
+  uuid_d fsid;
+  ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false }));
+  ASSERT_EQ(0, fs.mount());
+  ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false }));
+  { // populate: 10 directories x 10 files, each file gets 4096 bytes appended and fsync'ed
+    for (int i=0; i<10; i++) {
+       string dir = "dir.";
+       dir.append(to_string(i));
+       ASSERT_EQ(0, fs.mkdir(dir));
+       for (int j=0; j<10; j++) {
+          string file = "file.";
+	  file.append(to_string(j));
+          BlueFS::FileWriter *h;
+          ASSERT_EQ(0, fs.open_for_write(dir, file, &h, false));
+          ASSERT_NE(nullptr, h);
+          auto sg = make_scope_guard([&fs, h] { fs.close_writer(h); }); // closes the writer when this iteration's scope ends
+          bufferlist bl;
+          std::unique_ptr<char[]> buf = gen_buffer(4096);
+	  bufferptr bp = buffer::claim_char(4096, buf.get());
+	  bl.push_back(bp);
+          h->append(bl.c_str(), bl.length());
+          fs.fsync(h);
+       }
+    }
+  }
+  { // remove every even-numbered directory together with all of its files
+    for (int i=0; i<10; i+=2) {
+       string dir = "dir.";
+       dir.append(to_string(i));
+       for (int j=0; j<10; j++) {
+          string file = "file.";
+	  file.append(to_string(j));
+          fs.unlink(dir, file);
+	  fs.sync_metadata(false);
+       }
+       ASSERT_EQ(0, fs.rmdir(dir));
+       fs.sync_metadata(false);
+    }
+  }
+  fs.compact_log();
+  auto *logger = fs.get_perf_counters();
+  ASSERT_EQ(logger->get(l_bluefs_alloc_shared_dev_fallbacks), 0); // single device: no device fallback expected
+  ASSERT_EQ(logger->get(l_bluefs_alloc_shared_size_fallbacks), 0); // units already match: no alloc-unit fallback expected
+  auto num_files = logger->get(l_bluefs_num_files);
+  fs.umount(); // remount: the file counter must read the same afterwards
+
+  fs.mount();
+  ASSERT_EQ(num_files, logger->get(l_bluefs_num_files));
+  fs.umount();
+}
+
 int main(int argc, char **argv) {
   auto args = argv_to_vec(argc, argv);
   map<string,string> defaults = {

From 62ae4e4dc68dd7fc97f9f80e4da7699ec3668d2b Mon Sep 17 00:00:00 2001
From: Igor Fedotov <igor.fedotov@croit.io>
Date: Fri, 11 Nov 2022 03:17:51 +0300
Subject: [PATCH 12/13] os/bluestore: get rid of
 BlueFS::allocate_without_fallback.

Signed-off-by: Igor Fedotov <igor.fedotov@croit.io>
---
 src/os/bluestore/BlueFS.cc | 103 +++++++++++--------------------------
 src/os/bluestore/BlueFS.h  |  11 ++--
 2 files changed, 35 insertions(+), 79 deletions(-)

diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc
index 326cbce5472ce..c37d28ce751b9 100644
--- a/src/os/bluestore/BlueFS.cc
+++ b/src/os/bluestore/BlueFS.cc
@@ -1769,8 +1769,8 @@ int BlueFS::device_migrate_to_existing(
       }
 
       // write entire file
-      auto l = _allocate_without_fallback(dev_target, bl.length(), 0,
-        &file_ref->fnode);
+      auto l = _allocate(dev_target, bl.length(), 0,
+        &file_ref->fnode, 0, false);
       if (l < 0) {
 	derr << __func__ << " unable to allocate len 0x" << std::hex
 	     << bl.length() << std::dec << " from " << (int)dev_target
@@ -1909,8 +1909,8 @@ int BlueFS::device_migrate_to_new(
       }
 
       // write entire file
-      auto l = _allocate_without_fallback(dev_target, bl.length(), 0,
-        &file_ref->fnode);
+      auto l = _allocate(dev_target, bl.length(), 0,
+        &file_ref->fnode, 0, false);
       if (l < 0) {
 	derr << __func__ << " unable to allocate len 0x" << std::hex
 	     << bl.length() << std::dec << " from " << (int)dev_target
@@ -2470,7 +2470,7 @@ void BlueFS::_compact_log_sync_LNF_LD()
  * 4. Finalization. Old space release.
  */
 
-void BlueFS::_rewrite_log_and_layout_sync_LNF_LD(bool allocate_with_fallback,
+void BlueFS::_rewrite_log_and_layout_sync_LNF_LD(bool permit_dev_fallback,
 					 int super_dev,
 					 int log_dev,
 					 int log_dev_new,
@@ -2479,8 +2479,8 @@ void BlueFS::_rewrite_log_and_layout_sync_LNF_LD(bool allocate_with_fallback,
 {
   // we substitute log_dev with log_dev_new for new allocations below
   // and permitting fallback allocations prevents such a substitution
-  ceph_assert((allocate_with_fallback && log_dev == log_dev_new) ||
-              !allocate_with_fallback);
+  ceph_assert((permit_dev_fallback && log_dev == log_dev_new) ||
+              !permit_dev_fallback);
 
   dout(10) << __func__ << " super_dev:" << super_dev
                        << " log_dev:" << log_dev
@@ -2552,9 +2552,8 @@ void BlueFS::_rewrite_log_and_layout_sync_LNF_LD(bool allocate_with_fallback,
 
   dout(20) << __func__ << " compacted_meta_need " << compacted_meta_need << dendl;
 
-  int r = allocate_with_fallback ?
-    _allocate(log_dev, compacted_meta_need, 0, &fnode_tail) :
-    _allocate_without_fallback(log_dev, compacted_meta_need, 0, &fnode_tail);
+  int r = _allocate(log_dev, compacted_meta_need, 0, &fnode_tail, 0,
+    permit_dev_fallback);
   ceph_assert(r == 0);
 
 
@@ -2564,9 +2563,8 @@ void BlueFS::_rewrite_log_and_layout_sync_LNF_LD(bool allocate_with_fallback,
   uint64_t starter_need = _make_initial_transaction(starter_seq, fnode_tail, 0, nullptr);
 
   bluefs_fnode_t fnode_starter(log_file->fnode.ino, 0, mtime);
-  r = allocate_with_fallback ?
-    _allocate(log_dev, starter_need, 0, &fnode_starter) :
-    _allocate_without_fallback(log_dev, starter_need, 0, &fnode_starter);
+  r = _allocate(log_dev, starter_need, 0, &fnode_starter, 0,
+    permit_dev_fallback);
   ceph_assert(r == 0);
 
   // 1.4 Building starter fnode
@@ -3733,65 +3731,11 @@ const char* BlueFS::get_device_name(unsigned id)
   return names[id];
 }
 
-int BlueFS::_allocate_without_fallback(uint8_t id, uint64_t len,
-                      uint64_t alloc_unit,
-		      bluefs_fnode_t* node)
-{
-  dout(10) << __func__ << " len 0x" << std::hex << len
-           << " alloc_unit hint 0x " << alloc_unit
-           << std::dec
-           << " from " << (int)id << dendl;
-  assert(id < alloc.size());
-  if (!alloc[id]) {
-    return -ENOENT;
-  }
-  if (!alloc_unit) {
-   alloc_unit = alloc_size[id];
-  }
-  PExtentVector extents;
-  extents.reserve(4);  // 4 should be (more than) enough for most allocations
-  int64_t need = round_up_to(len, alloc_unit);
-  int64_t alloc_len = alloc[id]->allocate(need, alloc_unit, 0, &extents);
-  if (alloc_len < 0 || alloc_len < need) {
-    if (alloc_len > 0) {
-      alloc[id]->release(extents);
-    }
-    derr << __func__ << " unable to allocate 0x" << std::hex << need
-	 << " on bdev " << (int)id
-         << ", allocator name " << alloc[id]->get_name()
-         << ", allocator type " << alloc[id]->get_type()
-         << ", capacity 0x" << alloc[id]->get_capacity()
-         << ", block size 0x" << alloc[id]->get_block_size()
-         << ", alloc size 0x" << alloc_size[id]
-         << ", free 0x" << alloc[id]->get_free()
-         << ", fragmentation " << alloc[id]->get_fragmentation()
-         << ", allocated 0x" << (alloc_len > 0 ? alloc_len : 0)
-	 << std::dec << dendl;
-    if (is_shared_alloc(id) && alloc_unit != shared_alloc->alloc_unit) {
-      // fallback to shared alloc unit is permitted though
-      alloc_unit = shared_alloc->alloc_unit;
-      dout(20) << __func__ << " fallback to bdev "
-	       << (int)id
-               << " with alloc unit 0x" << std::hex << alloc_unit
-               << std::dec << dendl;
-      return _allocate_without_fallback(id, len, alloc_unit, node);
-    }
-    alloc[id]->dump();
-    return -ENOSPC;
-  }
-  if (is_shared_alloc(id)) {
-    shared_alloc->bluefs_used += alloc_len;
-  }
-  for (auto& p : extents) {
-    node->append_extent(bluefs_extent_t(id, p.offset, p.length));
-  }
-
-  return 0;
-}
-
 int BlueFS::_allocate(uint8_t id, uint64_t len,
 		      uint64_t alloc_unit,
-		      bluefs_fnode_t* node)
+		      bluefs_fnode_t* node,
+                      size_t alloc_attempts,
+                      bool permit_dev_fallback)
 {
   dout(10) << __func__ << " len 0x" << std::hex << len
            << " alloc unit hint 0x" << alloc_unit
@@ -3809,6 +3753,7 @@ int BlueFS::_allocate(uint8_t id, uint64_t len,
     if (!node->extents.empty() && node->extents.back().bdev == id) {
       hint = node->extents.back().end();
     }   
+    ++alloc_attempts;
     extents.reserve(4);  // 4 should be (more than) enough for most allocations
     alloc_len = alloc[id]->allocate(need, alloc_unit, hint, &extents);
   }
@@ -3840,15 +3785,25 @@ int BlueFS::_allocate(uint8_t id, uint64_t len,
                << " with alloc unit 0x" << std::hex << alloc_unit
                << std::dec << dendl;
       logger->inc(l_bluefs_alloc_shared_size_fallbacks);
-      return _allocate(id, len, alloc_unit, node);
-    } else if (id != BDEV_SLOW && alloc[id + 1]) {
+      return _allocate(id,
+                       len,
+                       alloc_unit,
+                       node,
+                       alloc_attempts,
+                       permit_dev_fallback);
+    } else if (permit_dev_fallback && id != BDEV_SLOW && alloc[id + 1]) {
       dout(20) << __func__ << " fallback to bdev "
 	       << (int)id + 1
 	       << dendl;
-      if (alloc[id] && is_shared_alloc(id + 1)) {
+      if (alloc_attempts > 0 && is_shared_alloc(id + 1)) {
         logger->inc(l_bluefs_alloc_shared_dev_fallbacks);
       }
-      return _allocate(id + 1, len, 0, node); // back to default alloc unit
+      return _allocate(id + 1,
+                       len,
+                       0, // back to default alloc unit
+                       node,
+                       alloc_attempts,
+                       permit_dev_fallback);
     } else {
       derr << __func__ << " allocation failed, needed 0x" << std::hex << need
            << dendl;
diff --git a/src/os/bluestore/BlueFS.h b/src/os/bluestore/BlueFS.h
index 88e47d9f8d97f..0f3a1729f1994 100644
--- a/src/os/bluestore/BlueFS.h
+++ b/src/os/bluestore/BlueFS.h
@@ -337,6 +337,8 @@ class BlueFS {
     l_bluefs_max_bytes_wal,
     l_bluefs_max_bytes_db,
     l_bluefs_max_bytes_slow,
+    l_bluefs_max_bytes_wal,
+    l_bluefs_max_bytes_db,
   };
 
   // cache
@@ -426,10 +428,9 @@ class BlueFS {
   const char* get_device_name(unsigned id);
   int _allocate(uint8_t bdev, uint64_t len,
                 uint64_t alloc_unit,
-		bluefs_fnode_t* node);
-  int _allocate_without_fallback(uint8_t id, uint64_t len,
-				 uint64_t alloc_unit,
-				 bluefs_fnode_t* node);
+		bluefs_fnode_t* node,
+                size_t alloc_attempts = 0,
+                bool permit_dev_fallback = true);
 
   /* signal replay log to include h->file in nearest log flush */
   int _signal_dirty_to_log_D(FileWriter *h);
@@ -478,7 +479,7 @@ class BlueFS {
   void _compact_log_sync_LNF_LD();
   void _compact_log_async_LD_LNF_D();
 
-  void _rewrite_log_and_layout_sync_LNF_LD(bool allocate_with_fallback,
+  void _rewrite_log_and_layout_sync_LNF_LD(bool permit_dev_fallback,
 				    int super_dev,
 				    int log_dev,
 				    int new_log_dev,

From e52bcc852ce51ab99138420f9069e2f59e1cb706 Mon Sep 17 00:00:00 2001
From: Igor Fedotov <igor.fedotov@croit.io>
Date: Fri, 11 Nov 2022 17:31:19 +0300
Subject: [PATCH 13/13] os/bluestore: introduce a cooldown period for failed
 BlueFS allocations.

When using bluefs_shared_alloc_size one might end up in a long-lasting state
where chunks that large are no longer available and every allocation falls
back to the shared device's min alloc size. The introduced cooldown is
intended to prevent repetitive allocation attempts with
bluefs_shared_alloc_size for a while. The rationale is to eliminate the
performance penalty these failing attempts might cause.

Signed-off-by: Igor Fedotov <igor.fedotov@croit.io>
---
 src/common/options/global.yaml.in | 14 ++++++++++-
 src/os/bluestore/BlueFS.cc        | 42 ++++++++++++++++++++++++++-----
 src/os/bluestore/BlueFS.h         |  1 +
 3 files changed, 50 insertions(+), 7 deletions(-)

diff --git a/src/common/options/global.yaml.in b/src/common/options/global.yaml.in
index bfab547af2200..29bd39c704f63 100644
--- a/src/common/options/global.yaml.in
+++ b/src/common/options/global.yaml.in
@@ -4025,6 +4025,18 @@ options:
   desc: Allocation unit size for primary/shared device
   default: 64_K
   with_legacy: true
+- name: bluefs_failed_shared_alloc_cooldown
+  type: float
+  level: advanced
+  desc: duration (in seconds) until the next attempt to use
+   'bluefs_shared_alloc_size' after facing ENOSPC failure.
+  long_desc: Cooldown period (in seconds) when BlueFS uses shared/slow device
+   allocation size instead of 'bluefs_shared_alloc_size' one after facing
+   recoverable (via fallback to smaller chunk size) ENOSPC failure. Intended
+   primarily to avoid repetitive unsuccessful allocations which might be
+   expensive.
+  default: 600
+  with_legacy: true
 - name: bluefs_max_prefetch
   type: size
   level: advanced
@@ -4181,7 +4193,7 @@ options:
 - name: bluestore_bluefs_alloc_failure_dump_interval
   type: float
   level: advanced
-  desc: How frequently (in seconds) to dump allocator onBlueFS space allocation failure
+  desc: How frequently (in seconds) to dump allocator on BlueFS space allocation failure
   default: 0
   with_legacy: true
 - name: bluestore_spdk_mem
diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc
index c37d28ce751b9..adfe9d0800d2b 100644
--- a/src/os/bluestore/BlueFS.cc
+++ b/src/os/bluestore/BlueFS.cc
@@ -1,6 +1,6 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab
-
+#include <chrono>
 #include "boost/algorithm/string.hpp" 
 #include "bluestore_common.h"
 #include "BlueFS.h"
@@ -28,6 +28,8 @@ using std::set;
 using std::string;
 using std::to_string;
 using std::vector;
+using std::chrono::duration;
+using std::chrono::seconds;
 
 using ceph::bufferlist;
 using ceph::decode;
@@ -3738,17 +3740,37 @@ int BlueFS::_allocate(uint8_t id, uint64_t len,
                       bool permit_dev_fallback)
 {
   dout(10) << __func__ << " len 0x" << std::hex << len
-           << " alloc unit hint 0x" << alloc_unit
-           << std::dec << " from " << (int)id << dendl;
+           << " au 0x" << alloc_unit
+           << std::dec << " from " << (int)id
+           << " cooldown " << cooldown_deadline
+           << dendl;
   ceph_assert(id < alloc.size());
   int64_t alloc_len = 0;
   PExtentVector extents;
   uint64_t hint = 0;
   int64_t need = len;
+  bool shared = is_shared_alloc(id);
+  auto shared_unit = shared_alloc ? shared_alloc->alloc_unit : 0;
+  bool was_cooldown = false;
   if (alloc[id]) {
     if (!alloc_unit) {
       alloc_unit = alloc_size[id];
     }
+    // do not attempt shared_allocator with bluefs alloc unit
+    // when cooling down, fallback to slow dev alloc unit.
+    if (shared && alloc_unit != shared_unit) {
+       if (duration_cast<seconds>(real_clock::now().time_since_epoch()).count() <
+           cooldown_deadline) {
+         logger->inc(l_bluefs_alloc_shared_size_fallbacks);
+         alloc_unit = shared_unit;
+         was_cooldown = true;
+       } else if (cooldown_deadline.fetch_and(0)) {
+         // we might get false cooldown_deadline reset at this point
+         // but that's mostly harmless.
+         dout(1) << __func__ << " shared allocation cooldown period elapsed"
+                 << dendl;
+       }
+    }
     need = round_up_to(len, alloc_unit);
     if (!node->extents.empty() && node->extents.back().bdev == id) {
       hint = node->extents.back().end();
@@ -3762,6 +3784,14 @@ int BlueFS::_allocate(uint8_t id, uint64_t len,
       if (alloc_len > 0) {
         alloc[id]->release(extents);
       }
+      if (!was_cooldown && shared) {
+        auto delay_s = cct->_conf->bluefs_failed_shared_alloc_cooldown;
+        cooldown_deadline = delay_s +
+          duration_cast<seconds>(real_clock::now().time_since_epoch()).count();
+        dout(1) << __func__ << " shared allocation cooldown set for "
+                << delay_s << "s"
+                << dendl;
+      }
       dout(1) << __func__ << " unable to allocate 0x" << std::hex << need
 	      << " on bdev " << (int)id
               << ", allocator name " << alloc[id]->get_name()
@@ -3778,8 +3808,8 @@ int BlueFS::_allocate(uint8_t id, uint64_t len,
                << " unable to allocate 0x" << std::hex << need
 	       << " on bdev " << (int)id << std::dec << dendl;
     }
-    if (alloc[id] && is_shared_alloc(id) && alloc_unit != shared_alloc->alloc_unit) {
-      alloc_unit = shared_alloc->alloc_unit;
+    if (alloc[id] && shared && alloc_unit != shared_unit) {
+      alloc_unit = shared_unit;
       dout(20) << __func__ << " fallback to bdev "
 	       << (int)id
                << " with alloc unit 0x" << std::hex << alloc_unit
@@ -3815,7 +3845,7 @@ int BlueFS::_allocate(uint8_t id, uint64_t len,
       logger->set(max_bytes_pcounters[id], used);
       max_bytes[id] = used;
     }
-    if (is_shared_alloc(id)) {
+    if (shared) {
       shared_alloc->bluefs_used += alloc_len;
     }
   }
diff --git a/src/os/bluestore/BlueFS.h b/src/os/bluestore/BlueFS.h
index 0f3a1729f1994..1b4cef63e4441 100644
--- a/src/os/bluestore/BlueFS.h
+++ b/src/os/bluestore/BlueFS.h
@@ -399,6 +399,7 @@ class BlueFS {
   inline bool is_shared_alloc(unsigned id) const {
     return id == shared_alloc_id;
   }
+  std::atomic<int64_t> cooldown_deadline = 0;
 
   class SocketHook;
   SocketHook* asok_hook = nullptr;