diff --git a/src/common/options/global.yaml.in b/src/common/options/global.yaml.in
index 08d388c4e05f0..0bccbad81beeb 100644
--- a/src/common/options/global.yaml.in
+++ b/src/common/options/global.yaml.in
@@ -4055,6 +4055,18 @@ options:
   desc: Allocation unit size for primary/shared device
   default: 64_K
   with_legacy: true
+- name: bluefs_failed_shared_alloc_cooldown
+  type: float
+  level: advanced
+  desc: duration(in seconds) untill the next attempt to use
+   'bluefs_shared_alloc_size' after facing ENOSPC failure.
+  long_desc: Cooldown period(in seconds) when BlueFS uses shared/slow device
+   allocation size instead of "bluefs_shared_alloc_size' one after facing
+   recoverable (via fallback to smaller chunk size) ENOSPC failure. Intended
+   primarily to avoid repetitive unsuccessful allocations which might be
+   expensive.
+  default: 600
+  with_legacy: true
 - name: bluefs_max_prefetch
   type: size
   level: advanced
@@ -4211,7 +4223,7 @@ options:
 - name: bluestore_bluefs_alloc_failure_dump_interval
   type: float
   level: advanced
-  desc: How frequently (in seconds) to dump allocator onBlueFS space allocation failure
+  desc: How frequently (in seconds) to dump allocator on BlueFS space allocation failure
   default: 0
   with_legacy: true
 - name: bluestore_spdk_mem
diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc
index fc5285d135bbe..274ab0a5c9da2 100644
--- a/src/os/bluestore/BlueFS.cc
+++ b/src/os/bluestore/BlueFS.cc
@@ -1,6 +1,6 @@
 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
 // vim: ts=8 sw=2 smarttab
-
+#include <chrono>
 #include "boost/algorithm/string.hpp" 
 #include "bluestore_common.h"
 #include "BlueFS.h"
@@ -28,6 +28,8 @@ using std::set;
 using std::string;
 using std::to_string;
 using std::vector;
+using std::chrono::duration;
+using std::chrono::seconds;
 
 using ceph::bufferlist;
 using ceph::decode;
@@ -369,6 +371,24 @@ void BlueFS::_init_logger()
 		    "Bytes requested in prefetch read mode",
 		     NULL,
 		    PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
+ b.add_time_avg     (l_bluefs_compaction_lat, "compact_lat",
+                    "Average bluefs log compaction latency",
+                    "c__t",
+                    PerfCountersBuilder::PRIO_INTERESTING);
+ b.add_time_avg     (l_bluefs_compaction_lock_lat, "compact_lock_lat",
+                    "Average lock duration while compacting bluefs log",
+                    "c_lt",
+                    PerfCountersBuilder::PRIO_INTERESTING);
+  b.add_u64_counter(l_bluefs_alloc_shared_dev_fallbacks, "alloc_slow_fallback",
+		    "Amount of allocations that required fallback to "
+                    " slow/shared device",
+		     "asdf",
+		    PerfCountersBuilder::PRIO_USEFUL);
+  b.add_u64_counter(l_bluefs_alloc_shared_size_fallbacks, "alloc_slow_size_fallback",
+		    "Amount of allocations that required fallback to shared device's "
+                    "regular unit size",
+		     "assf",
+		    PerfCountersBuilder::PRIO_USEFUL);
   b.add_u64(l_bluefs_read_zeros_candidate, "read_zeros_candidate",
 	    "How many times bluefs read found page with all 0s");
   b.add_u64(l_bluefs_read_zeros_errors, "read_zeros_errors",
@@ -567,7 +587,7 @@ int BlueFS::mkfs(uuid_d osd_uuid, const bluefs_layout_t& layout)
   _init_logger();
   _init_alloc();
 
-  super.version = 1;
+  super.version = 0;
   super.block_size = bdev[BDEV_DB]->get_block_size();
   super.osd_uuid = osd_uuid;
   super.uuid.generate_random();
@@ -580,6 +600,7 @@ int BlueFS::mkfs(uuid_d osd_uuid, const bluefs_layout_t& layout)
   int r = _allocate(
     vselector->select_prefer_bdev(log_file->vselector_hint),
     cct->_conf->bluefs_max_log_runway,
+    0,
     &log_file->fnode);
   vselector->add_usage(log_file->vselector_hint, log_file->fnode);
   ceph_assert(r == 0);
@@ -990,7 +1011,6 @@ int BlueFS::prepare_new_device(int id, const bluefs_layout_t& layout)
       new_log_dev_next,
       RENAME_DB2SLOW,
       layout);
-    //}
   } else if(id == BDEV_NEWWAL) {
     _rewrite_log_and_layout_sync_LNF_LD(false,
       BDEV_DB,
@@ -1030,6 +1050,7 @@ int BlueFS::fsck()
 
 int BlueFS::_write_super(int dev)
 {
+  ++super.version;
   // build superblock
   bufferlist bl;
   encode(super, bl);
@@ -1092,12 +1113,17 @@ int BlueFS::_check_allocations(const bluefs_fnode_t& fnode,
     auto id = e.bdev;
     bool fail = false;
     ceph_assert(id < MAX_BDEV);
+    ceph_assert(bdev[id]);
+    // let's use minimal allocation unit we can have
+    auto alloc_unit = bdev[id]->get_block_size();
+
     if (int r = _verify_alloc_granularity(id, e.offset, e.length,
+                                          alloc_unit,
 					  op_name); r < 0) {
       return r;
     }
 
-    apply_for_bitset_range(e.offset, e.length, alloc_size[id], used_blocks[id],
+    apply_for_bitset_range(e.offset, e.length, alloc_unit, used_blocks[id],
       [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) {
 	if (is_alloc == bs.test(pos)) {
 	  fail = true;
@@ -1119,31 +1145,14 @@ int BlueFS::_check_allocations(const bluefs_fnode_t& fnode,
 }
 
 int BlueFS::_verify_alloc_granularity(
-  __u8 id, uint64_t offset, uint64_t length, const char *op)
+  __u8 id, uint64_t offset, uint64_t length, uint64_t alloc_unit, const char *op)
 {
-  if ((offset & (alloc_size[id] - 1)) ||
-      (length & (alloc_size[id] - 1))) {
+  if ((offset & (alloc_unit - 1)) ||
+      (length & (alloc_unit - 1))) {
     derr << __func__ << " " << op << " of " << (int)id
 	 << ":0x" << std::hex << offset << "~" << length << std::dec
 	 << " does not align to alloc_size 0x"
-	 << std::hex << alloc_size[id] << std::dec << dendl;
-    // be helpful
-    auto need = alloc_size[id];
-    while (need && ((offset & (need - 1)) ||
-		    (length & (need - 1)))) {
-      need >>= 1;
-    }
-    if (need) {
-      const char *which;
-      if (id == BDEV_SLOW ||
-	  (id == BDEV_DB && !bdev[BDEV_SLOW])) {
-	which = "bluefs_shared_alloc_size";
-      } else {
-	which = "bluefs_alloc_size";
-      }
-      derr << "work-around by setting " << which << " = " << need
-	   << " for this OSD" << dendl;
-    }
+	 << std::hex << alloc_unit << std::dec << dendl;
     return -EFAULT;
   }
   return 0;
@@ -1162,11 +1171,6 @@ int BlueFS::_replay(bool noop, bool to_stdout)
   if (!noop) {
     log_file->vselector_hint =
       vselector->get_hint_for_log();
-  } else {
-    // do not use fnode from superblock in 'noop' mode - log_file's one should
-    // be fine and up-to-date
-    ceph_assert(log_file->fnode.ino == 1);
-    ceph_assert(log_file->fnode.extents.size() != 0);
   }
   dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl;
   if (unlikely(to_stdout)) {
@@ -1185,8 +1189,11 @@ int BlueFS::_replay(bool noop, bool to_stdout)
   if (!noop) {
     if (cct->_conf->bluefs_log_replay_check_allocations) {
       for (size_t i = 0; i < MAX_BDEV; ++i) {
-	if (alloc_size[i] != 0 && bdev[i] != nullptr) {
-	  used_blocks[i].resize(round_up_to(bdev[i]->get_size(), alloc_size[i]) / alloc_size[i]);
+	if (bdev[i] != nullptr) {
+          // let's use minimal allocation unit we can have
+          auto au = bdev[i]->get_block_size();
+          //hmm... on 32TB/4K drive this would take 1GB RAM!!!
+	  used_blocks[i].resize(round_up_to(bdev[i]->get_size(), au) / au);
 	}
       }
       // check initial log layout
@@ -1309,7 +1316,9 @@ int BlueFS::_replay(bool noop, bool to_stdout)
     }
 
     auto p = t.op_bl.cbegin();
+    auto pos0 = pos;
     while (!p.end()) {
+      pos = pos0 + p.get_off();
       __u8 op;
       decode(op, p);
       switch (op) {
@@ -1727,26 +1736,27 @@ int BlueFS::device_migrate_to_existing(
 
   for (auto& [ino, file_ref] : nodes.file_map) {
     //do not copy log
-    if (file_ref->fnode.ino == 1) {
+    if (ino == 1) {
       continue;
     }
     dout(10) << __func__ << " " << ino << " " << file_ref->fnode << dendl;
 
-    auto& fnode_extents = file_ref->fnode.extents;
     vselector->sub_usage(file_ref->vselector_hint, file_ref->fnode);
 
     bool rewrite = std::any_of(
-      fnode_extents.begin(),
-      fnode_extents.end(),
+      file_ref->fnode.extents.begin(),
+      file_ref->fnode.extents.end(),
       [=](auto& ext) {
 	return ext.bdev != dev_target && devs_source.count(ext.bdev);
       });
     if (rewrite) {
       dout(10) << __func__ << "  migrating" << dendl;
-
+      bluefs_fnode_t old_fnode;
+      old_fnode.swap_extents(file_ref->fnode);
+      auto& old_fnode_extents = old_fnode.extents;
       // read entire file
       bufferlist bl;
-      for (auto old_ext : fnode_extents) {
+      for (const auto &old_ext : old_fnode_extents) {
 	buf.resize(old_ext.length);
 	int r = _bdev_read_random(old_ext.bdev,
 	  old_ext.offset,
@@ -1763,8 +1773,8 @@ int BlueFS::device_migrate_to_existing(
       }
 
       // write entire file
-      PExtentVector extents;
-      auto l = _allocate_without_fallback(dev_target, bl.length(), &extents);
+      auto l = _allocate(dev_target, bl.length(), 0,
+        &file_ref->fnode, 0, false);
       if (l < 0) {
 	derr << __func__ << " unable to allocate len 0x" << std::hex
 	     << bl.length() << std::dec << " from " << (int)dev_target
@@ -1773,7 +1783,7 @@ int BlueFS::device_migrate_to_existing(
       }
 
       uint64_t off = 0;
-      for (auto& i : extents) {
+      for (auto& i : file_ref->fnode.extents) {
 	bufferlist cur;
 	uint64_t cur_len = std::min<uint64_t>(i.length, bl.length() - off);
 	ceph_assert(cur_len > 0);
@@ -1784,7 +1794,7 @@ int BlueFS::device_migrate_to_existing(
       }
 
       // release old extents
-      for (auto old_ext : fnode_extents) {
+      for (const auto &old_ext : old_fnode_extents) {
 	PExtentVector to_release;
 	to_release.emplace_back(old_ext.offset, old_ext.length);
 	alloc[old_ext.bdev]->release(to_release);
@@ -1794,12 +1804,11 @@ int BlueFS::device_migrate_to_existing(
       }
 
       // update fnode
-      fnode_extents.clear();
-      for (auto& i : extents) {
-	fnode_extents.emplace_back(dev_target_new, i.offset, i.length);
+      for (auto& i : file_ref->fnode.extents) {
+	i.bdev = dev_target_new;
       }
     } else {
-      for (auto& ext : fnode_extents) {
+      for (auto& ext : file_ref->fnode.extents) {
 	if (dev_target != dev_target_new && ext.bdev == dev_target) {
 	  dout(20) << __func__ << "  " << " ... adjusting extent 0x"
 		   << std::hex << ext.offset << std::dec
@@ -1865,30 +1874,29 @@ int BlueFS::device_migrate_to_new(
   flags |= devs_source.count(BDEV_WAL) ? REMOVE_WAL : 0;
   int dev_target_new = dev_target; //FIXME: remove, makes no sense
 
-  for (auto& p : nodes.file_map) {
+  for (auto& [ino, file_ref] : nodes.file_map) {
     //do not copy log
-    if (p.second->fnode.ino == 1) {
+    if (ino == 1) {
       continue;
     }
-    dout(10) << __func__ << " " << p.first << " " << p.second->fnode << dendl;
+    dout(10) << __func__ << " " << ino << " " << file_ref->fnode << dendl;
 
-    auto& fnode_extents = p.second->fnode.extents;
+    vselector->sub_usage(file_ref->vselector_hint, file_ref->fnode);
 
-    bool rewrite = false;
-    for (auto ext_it = fnode_extents.begin();
-	 ext_it != p.second->fnode.extents.end();
-	 ++ext_it) {
-      if (ext_it->bdev != dev_target && devs_source.count(ext_it->bdev)) {
-	rewrite = true;
-	break;
-      }
-    }
+    bool rewrite = std::any_of(
+      file_ref->fnode.extents.begin(),
+      file_ref->fnode.extents.end(),
+      [=](auto& ext) {
+	return ext.bdev != dev_target && devs_source.count(ext.bdev);
+      });
     if (rewrite) {
       dout(10) << __func__ << "  migrating" << dendl;
-
+      bluefs_fnode_t old_fnode;
+      old_fnode.swap_extents(file_ref->fnode);
+      auto& old_fnode_extents = old_fnode.extents;
       // read entire file
       bufferlist bl;
-      for (auto old_ext : fnode_extents) {
+      for (const auto &old_ext : old_fnode_extents) {
 	buf.resize(old_ext.length);
 	int r = _bdev_read_random(old_ext.bdev,
 	  old_ext.offset,
@@ -1905,8 +1913,8 @@ int BlueFS::device_migrate_to_new(
       }
 
       // write entire file
-      PExtentVector extents;
-      auto l = _allocate_without_fallback(dev_target, bl.length(), &extents);
+      auto l = _allocate(dev_target, bl.length(), 0,
+        &file_ref->fnode, 0, false);
       if (l < 0) {
 	derr << __func__ << " unable to allocate len 0x" << std::hex
 	     << bl.length() << std::dec << " from " << (int)dev_target
@@ -1915,7 +1923,7 @@ int BlueFS::device_migrate_to_new(
       }
 
       uint64_t off = 0;
-      for (auto& i : extents) {
+      for (auto& i : file_ref->fnode.extents) {
 	bufferlist cur;
 	uint64_t cur_len = std::min<uint64_t>(i.length, bl.length() - off);
 	ceph_assert(cur_len > 0);
@@ -1926,7 +1934,7 @@ int BlueFS::device_migrate_to_new(
       }
 
       // release old extents
-      for (auto old_ext : fnode_extents) {
+      for (const auto &old_ext : old_fnode_extents) {
 	PExtentVector to_release;
 	to_release.emplace_back(old_ext.offset, old_ext.length);
 	alloc[old_ext.bdev]->release(to_release);
@@ -1936,9 +1944,8 @@ int BlueFS::device_migrate_to_new(
       }
 
       // update fnode
-      fnode_extents.clear();
-      for (auto& i : extents) {
-	fnode_extents.emplace_back(dev_target_new, i.offset, i.length);
+      for (auto& i : file_ref->fnode.extents) {
+	i.bdev = dev_target_new;
       }
     }
   }
@@ -2133,7 +2140,9 @@ int64_t BlueFS::_read_random(
       buf->pos += r;
     }
   }
-  dout(20) << __func__ << " got " << ret << dendl;
+  dout(20) << __func__ << std::hex
+           << " got 0x" << ret
+           << std::dec  << dendl;
   --h->file->num_reading;
   return ret;
 }
@@ -2258,7 +2267,9 @@ int64_t BlueFS::_read(
     buf->pos += r;
   }
 
-  dout(20) << __func__ << " got " << ret << dendl;
+  dout(20) << __func__ << std::hex
+           << " got 0x" << ret
+           << std::dec  << dendl;
   ceph_assert(!outbl || (int)outbl->length() == ret);
   --h->file->num_reading;
   return ret;
@@ -2286,6 +2297,44 @@ void BlueFS::invalidate_cache(FileRef f, uint64_t offset, uint64_t length)
   }
 }
 
+
+uint64_t BlueFS::_estimate_transaction_size(bluefs_transaction_t* t)
+{
+  uint64_t max_alloc_size = std::max(alloc_size[BDEV_WAL],
+				     std::max(alloc_size[BDEV_DB],
+					      alloc_size[BDEV_SLOW]));
+
+  // conservative estimate for final encoded size
+  return round_up_to(t->op_bl.length() + super.block_size * 2, max_alloc_size);
+}
+
+uint64_t BlueFS::_make_initial_transaction(uint64_t start_seq,
+                                           bluefs_fnode_t& fnode,
+                                           uint64_t expected_final_size,
+                                           bufferlist* out)
+{
+  bluefs_transaction_t t0;
+  t0.seq = start_seq;
+  t0.uuid = super.uuid;
+  t0.op_init();
+  t0.op_file_update_inc(fnode);
+  t0.op_jump(start_seq, expected_final_size); // this is a fixed size op,
+                                              // hence it's valid with fake
+                                              // params for overall txc size
+                                              // estimation
+  if (!out) {
+    return _estimate_transaction_size(&t0);
+  }
+
+  ceph_assert(expected_final_size > 0);
+  out->reserve(expected_final_size);
+  encode(t0, *out);
+  // make sure we're not wrong aboth the size
+  ceph_assert(out->length() <= expected_final_size);
+  _pad_bl(*out, expected_final_size);
+  return expected_final_size;
+}
+
 uint64_t BlueFS::_estimate_log_size_N()
 {
   std::lock_guard nl(nodes.lock);
@@ -2333,77 +2382,51 @@ bool BlueFS::_should_start_compact_log_L_N()
   return true;
 }
 
-void BlueFS::_compact_log_dump_metadata_NF(bluefs_transaction_t *t,
-					int flags)
+void BlueFS::_compact_log_dump_metadata_NF(uint64_t start_seq,
+                                        bluefs_transaction_t *t,
+					int bdev_update_flags,
+                                        uint64_t capture_before_seq)
 {
-  std::lock_guard nl(nodes.lock);
-
-  t->seq = 1;
+  dout(20) << __func__ << dendl;
+  t->seq = start_seq;
   t->uuid = super.uuid;
-  dout(20) << __func__ << " op_init" << dendl;
 
-  t->op_init();
-  for (auto& [ino, file_ref] : nodes.file_map) {
-    if (ino == 1)
-      continue;
-    ceph_assert(ino > 1);
-    std::lock_guard fl(file_ref->lock);
-    for(auto& e : file_ref->fnode.extents) {
-      auto bdev = e.bdev;
-      auto bdev_new = bdev;
-      ceph_assert(!((flags & REMOVE_WAL) && bdev == BDEV_WAL));
-      if ((flags & RENAME_SLOW2DB) && bdev == BDEV_SLOW) {
-	bdev_new = BDEV_DB;
-      }
-      if ((flags & RENAME_DB2SLOW) && bdev == BDEV_DB) {
-	bdev_new = BDEV_SLOW;
-      }
-      if (bdev == BDEV_NEWDB) {
-	// REMOVE_DB xor RENAME_DB
-	ceph_assert(!(flags & REMOVE_DB) != !(flags & RENAME_DB2SLOW));
-	ceph_assert(!(flags & RENAME_SLOW2DB));
-	bdev_new = BDEV_DB;
-      }
-      if (bdev == BDEV_NEWWAL) {
-	ceph_assert(flags & REMOVE_WAL);
-	bdev_new = BDEV_WAL;
-      }
-      e.bdev = bdev_new;
-    }
-    dout(20) << __func__ << " op_file_update " << file_ref->fnode << dendl;
-    t->op_file_update(file_ref->fnode);
-  }
-  for (auto& [path, dir_ref] : nodes.dir_map) {
-    dout(20) << __func__ << " op_dir_create " << path << dendl;
-    t->op_dir_create(path);
-    for (auto& [fname, file_ref] : dir_ref->file_map) {
-      dout(20) << __func__ << " op_dir_link " << path << "/" << fname
-	       << " to " << file_ref->fnode.ino << dendl;
-      t->op_dir_link(path, fname, file_ref->fnode.ino);
-    }
-  }
-}
-/* Streams to t files modified before *capture_before_seq* and all dirs */
-void BlueFS::_compact_log_async_dump_metadata_NF(bluefs_transaction_t *t,
-						 uint64_t capture_before_seq)
-{
   std::lock_guard nl(nodes.lock);
 
-  t->seq = 1;
-  t->uuid = super.uuid;
-  dout(20) << __func__ << " op_init" << dendl;
-
-  t->op_init();
   for (auto& [ino, file_ref] : nodes.file_map) {
     if (ino == 1)
       continue;
     ceph_assert(ino > 1);
     std::lock_guard fl(file_ref->lock);
-    if (file_ref->dirty_seq < capture_before_seq) {
+    if (bdev_update_flags) {
+      for(auto& e : file_ref->fnode.extents) {
+        auto bdev = e.bdev;
+        auto bdev_new = bdev;
+        ceph_assert(!((bdev_update_flags & REMOVE_WAL) && bdev == BDEV_WAL));
+        if ((bdev_update_flags & RENAME_SLOW2DB) && bdev == BDEV_SLOW) {
+	  bdev_new = BDEV_DB;
+        }
+        if ((bdev_update_flags & RENAME_DB2SLOW) && bdev == BDEV_DB) {
+	  bdev_new = BDEV_SLOW;
+        }
+        if (bdev == BDEV_NEWDB) {
+	  // REMOVE_DB xor RENAME_DB
+	  ceph_assert(!(bdev_update_flags & REMOVE_DB) != !(bdev_update_flags & RENAME_DB2SLOW));
+	  ceph_assert(!(bdev_update_flags & RENAME_SLOW2DB));
+	  bdev_new = BDEV_DB;
+        }
+        if (bdev == BDEV_NEWWAL) {
+	  ceph_assert(bdev_update_flags & REMOVE_WAL);
+	  bdev_new = BDEV_WAL;
+        }
+        e.bdev = bdev_new;
+      }
+    }
+    if (capture_before_seq == 0 || file_ref->dirty_seq < capture_before_seq) {
       dout(20) << __func__ << " op_file_update " << file_ref->fnode << dendl;
     } else {
       dout(20) << __func__ << " op_file_update just modified, dirty_seq="
-	       << file_ref->dirty_seq << " " << file_ref->fnode << dendl;
+               << file_ref->dirty_seq << " " << file_ref->fnode << dendl;
     }
     t->op_file_update(file_ref->fnode);
   }
@@ -2436,17 +2459,49 @@ void BlueFS::_compact_log_sync_LNF_LD()
   logger->inc(l_bluefs_log_compactions);
 }
 
-void BlueFS::_rewrite_log_and_layout_sync_LNF_LD(bool allocate_with_fallback,
+/*
+ * SYNC LOG COMPACTION
+ *
+ * 0. Lock the log completely through the whole procedure
+ *
+ * 1. Build new log. It will include log's starter and compacted metadata
+ *    body. Jump op appended to the starter will link the pieces together.
+ *
+ * 2. Write out new log's content
+ *
+ * 3. Write out new superblock. This includes relevant device layout update.
+ *
+ * 4. Finalization. Old space release.
+ */
+
+void BlueFS::_rewrite_log_and_layout_sync_LNF_LD(bool permit_dev_fallback,
 					 int super_dev,
 					 int log_dev,
 					 int log_dev_new,
 					 int flags,
 					 std::optional<bluefs_layout_t> layout)
 {
+  // we substitute log_dev with log_dev_new for new allocations below
+  // and permitting fallback allocations prevents such a substitution
+  ceph_assert((permit_dev_fallback && log_dev == log_dev_new) ||
+              !permit_dev_fallback);
+
+  dout(10) << __func__ << " super_dev:" << super_dev
+                       << " log_dev:" << log_dev
+                       << " log_dev_new:" << log_dev_new
+		       << " flags:" << flags
+		       << " seq:" << log.seq_live
+		       << dendl;
+  utime_t mtime = ceph_clock_now();
+  uint64_t starter_seq = 1;
+
+  // Part 0.
+  // Lock the log totally till the end of the procedure
   std::lock_guard ll(log.lock);
+  auto t0 = mono_clock::now();
 
   File *log_file = log.writer->file.get();
-
+  bluefs_fnode_t fnode_tail;
   // log.t.seq is always set to current live seq
   ceph_assert(log.t.seq == log.seq_live);
   // Capturing entire state. Dump anything that has been stored there.
@@ -2455,53 +2510,145 @@ void BlueFS::_rewrite_log_and_layout_sync_LNF_LD(bool allocate_with_fallback,
   // From now on, no changes to log.t are permitted until we finish rewriting log.
   // Can allow dirty to remain dirty - log.seq_live will not change.
 
-  dout(20) << __func__ << " super_dev:" << super_dev
-                       << " log_dev:" << log_dev
-                       << " log_dev_new:" << log_dev_new
-		       << " flags:" << flags
-		       << dendl;
-  bluefs_transaction_t t;
-  _compact_log_dump_metadata_NF(&t, flags);
+  //
+  // Part 1.
+  // Build new log starter and compacted metadata body
+  // 1.1. Build full compacted meta transaction.
+  //      Encode a bluefs transaction that dumps all of the in-memory fnodes
+  //      and names.
+  //      This might be pretty large and its allocation map can exceed
+  //      superblock size. Hence instead we'll need log starter part which
+  //      goes to superblock and refers that new meta through op_update_inc.
+  // 1.2.  Allocate space for the above transaction
+  //       using its size estimation.
+  // 1.3.  Allocate the space required for the starter part of the new log.
+  //       It should be small enough to fit into superblock.
+  // 1.4   Building new log persistent fnode representation which will
+  //       finally land to disk.
+  //       Depending on input parameters we might need to perform device ids
+  //       rename - runtime and persistent replicas should be different when we
+  //       are in the device migration process.
+  // 1.5   Store starter fnode to run-time superblock, to be written out later.
+  //       It doesn't contain compacted meta to fit relevant alocation map into
+  //       superblock.
+  // 1.6   Proceed building new log persistent fnode representation.
+  //       Will add log tail with compacted meta extents from 1.1.
+  //       Device rename applied as well
+  //
+  // 1.7.  Encode new log fnode starter,
+  //       It will include op_init, new log's op_update_inc
+  //       and jump to the compacted meta transaction beginning.
+  //       Superblock will reference this starter part
+  //
+  // 1.8.  Encode compacted meta transaction,
+  //       extend the transaction with a jump to proper sequence no
+  //
+
+
+  // 1.1 Build full compacted meta transaction
+  bluefs_transaction_t compacted_meta_t;
+  _compact_log_dump_metadata_NF(starter_seq + 1, &compacted_meta_t, flags, 0);
+
+  // 1.2 Allocate the space required for the compacted meta transaction
+  uint64_t compacted_meta_need =
+    _estimate_transaction_size(&compacted_meta_t) +
+      cct->_conf->bluefs_max_log_runway;
+
+  dout(20) << __func__ << " compacted_meta_need " << compacted_meta_need << dendl;
+
+  int r = _allocate(log_dev, compacted_meta_need, 0, &fnode_tail, 0,
+    permit_dev_fallback);
+  ceph_assert(r == 0);
 
-  dout(20) << __func__ << " op_jump_seq " << log.seq_live << dendl;
-  t.op_jump_seq(log.seq_live);
 
-  bufferlist bl;
-  encode(t, bl);
-  _pad_bl(bl);
+  // 1.3 Allocate the space required for the starter part of the new log.
+  // estimate new log fnode size to be referenced from superblock
+  // hence use dummy fnode and jump parameters
+  uint64_t starter_need = _make_initial_transaction(starter_seq, fnode_tail, 0, nullptr);
 
-  uint64_t need = bl.length() + cct->_conf->bluefs_max_log_runway;
-  dout(20) << __func__ << " need " << need << dendl;
+  bluefs_fnode_t fnode_starter(log_file->fnode.ino, 0, mtime);
+  r = _allocate(log_dev, starter_need, 0, &fnode_starter, 0,
+    permit_dev_fallback);
+  ceph_assert(r == 0);
 
-  bluefs_fnode_t old_fnode;
-  int r;
-  vselector->sub_usage(log_file->vselector_hint, log_file->fnode);
-  log_file->fnode.swap_extents(old_fnode);
-  if (allocate_with_fallback) {
-    r = _allocate(log_dev, need, &log_file->fnode);
-    ceph_assert(r == 0);
-  } else {
-    PExtentVector extents;
-    r = _allocate_without_fallback(log_dev,
-			       need,
-			       &extents);
-    ceph_assert(r == 0);
-    for (auto& p : extents) {
-      log_file->fnode.append_extent(
-	bluefs_extent_t(log_dev, p.offset, p.length));
+  // 1.4 Building starter fnode
+  bluefs_fnode_t fnode_persistent(fnode_starter.ino, 0, mtime);
+  for (auto p : fnode_starter.extents) {
+    // rename device if needed - this is possible when fallback allocations
+    // are prohibited only. Which means every extent is targeted to the same
+    // device and we can unconditionally update them.
+    if (log_dev != log_dev_new) {
+      dout(10) << __func__ << " renaming log extents to "
+               << log_dev_new << dendl;
+      p.bdev = log_dev_new;
     }
+    fnode_persistent.append_extent(p);
+  }
+
+  // 1.5 Store starter fnode to run-time superblock, to be written out later
+  super.log_fnode = fnode_persistent;
+
+  // 1.6 Proceed building new log persistent fnode representation
+  // we'll build incremental update starting from this point
+  fnode_persistent.reset_delta();
+  for (auto p : fnode_tail.extents) {
+    // rename device if needed - this is possible when fallback allocations
+    // are prohibited only. Which means every extent is targeted to the same
+    // device and we can unconditionally update them.
+    if (log_dev != log_dev_new) {
+      dout(10) << __func__ << " renaming log extents to "
+               << log_dev_new << dendl;
+      p.bdev = log_dev_new;
+    }
+    fnode_persistent.append_extent(p);
   }
 
-  _close_writer(log.writer);
-
-  // we will write it to super
-  log_file->fnode.reset_delta();
-  log_file->fnode.size = bl.length();
+  // 1.7 Encode new log fnode
+  // This will flush incremental part of fnode_persistent only.
+  bufferlist starter_bl;
+  _make_initial_transaction(starter_seq, fnode_persistent, starter_need, &starter_bl);
 
+  // 1.8 Encode compacted meta transaction
+  dout(20) << __func__ << " op_jump_seq " << log.seq_live << dendl;
+  // hopefully "compact_meta_need" estimation provides enough extra space
+  // for this op, assert below if not
+  compacted_meta_t.op_jump_seq(log.seq_live);
+
+  bufferlist compacted_meta_bl;
+  encode(compacted_meta_t, compacted_meta_bl);
+  _pad_bl(compacted_meta_bl);
+  ceph_assert(compacted_meta_bl.length() <= compacted_meta_need);
+
+  //
+  // Part 2
+  // Write out new log's content
+  // 2.1. Build the full runtime new log's fnode
+  //
+  // 2.2. Write out new log's
+  //
+  // 2.3. Do flush and wait for completion through flush_bdev()
+  //
+  // 2.4. Finalize log update
+  //      Update all sequence numbers
+  //
+
+  // 2.1 Build the full runtime new log's fnode
+  bluefs_fnode_t old_log_fnode;
+  old_log_fnode.swap(fnode_starter);
+  old_log_fnode.clone_extents(fnode_tail);
+  old_log_fnode.reset_delta();
+  log_file->fnode.swap(old_log_fnode);
+
+  // 2.2 Write out new log's content
+  // Get rid off old writer
+  _close_writer(log.writer);
+  // Make new log writer and stage new log's content writing
   log.writer = _create_writer(log_file);
-  log.writer->append(bl);
+  log.writer->append(starter_bl);
+  log.writer->append(compacted_meta_bl);
+
+  // 2.3 Do flush and wait for completion through flush_bdev()
   _flush_special(log.writer);
-  vselector->add_usage(log_file->vselector_hint, log_file->fnode);
 #ifdef HAVE_LIBAIO
   if (!cct->_conf->bluefs_sync_write) {
     list<aio_t> completed_ios;
@@ -2511,74 +2658,86 @@ void BlueFS::_rewrite_log_and_layout_sync_LNF_LD(bool allocate_with_fallback,
   }
 #endif
   _flush_bdev();
+
+  // 2.4 Finalize log update
   ++log.seq_live;
   dirty.seq_live = log.seq_live;
   log.t.seq = log.seq_live;
+  vselector->sub_usage(log_file->vselector_hint, old_log_fnode);
+  vselector->add_usage(log_file->vselector_hint, log_file->fnode);
 
-  super.memorized_layout = layout;
-  super.log_fnode = log_file->fnode;
-  // rename device if needed
-  if (log_dev != log_dev_new) {
-    dout(10) << __func__ << " renaming log extents to " << log_dev_new << dendl;
-    for (auto& p : super.log_fnode.extents) {
-      p.bdev = log_dev_new;
-    }
-  }
-  dout(10) << __func__ << " writing super, log fnode: " << super.log_fnode << dendl;
+  // Part 3.
+  // Write out new superblock to reflect all the changes.
+  //
 
-  ++super.version;
+  super.memorized_layout = layout;
   _write_super(super_dev);
   _flush_bdev();
 
-  dout(10) << __func__ << " release old log extents " << old_fnode.extents << dendl;
-  std::lock_guard dl(dirty.lock);
-  for (auto& r : old_fnode.extents) {
-    dirty.pending_release[r.bdev].insert(r.offset, r.length);
+  // we're mostly done
+  dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl;
+  logger->inc(l_bluefs_log_compactions);
+
+  // Part 4
+  // Finalization. Release old space.
+  //
+  {
+    dout(10) << __func__
+             << " release old log extents " << old_log_fnode.extents
+             << dendl;
+    std::lock_guard dl(dirty.lock);
+    for (auto& r : old_log_fnode.extents) {
+      dirty.pending_release[r.bdev].insert(r.offset, r.length);
+    }
   }
+  logger->tinc(l_bluefs_compaction_lock_lat, mono_clock::now() - t0);
 }
 
 /*
- * 1. Allocate a new extent to continue the log, and then log an event
- * that jumps the log write position to the new extent.  At this point, the
- * old extent(s) won't be written to, and reflect everything to compact.
- * New events will be written to the new region that we'll keep.
+ * ASYNC LOG COMPACTION
  *
- * 2. While still holding the lock, encode a bufferlist that dumps all of the
- * in-memory fnodes and names.  This will become the new beginning of the
- * log.  The last event will jump to the log continuation extent from #1.
- *
- * 3. Queue a write to a new extent for the new beginnging of the log.
+ * 0. Lock the log and forbid its extension. The former covers just
+ *    a part of the below procedure while the latter spans over it
+ *    completely.
+ * 1. Allocate a new extent to continue the log, and then log an event
+ *    that jumps the log write position to the new extent.  At this point, the
+ *    old extent(s) won't be written to, and reflect everything to compact.
+ *    New events will be written to the new region that we'll keep.
+ *    The latter will finally become new log tail on compaction completion.
  *
- * 4. Drop lock and wait
+ * 2. Build new log. It will include log's starter, compacted metadata
+ *    body and the above tail. Jump ops appended to the starter and meta body
+ *    will link the pieces togather. Log's lock is releases in the mid of the
+ *    process to permit parallel access to it.
  *
- * 5. Retake the lock.
+ * 3. Write out new log's content.
  *
- * 6. Update the log_fnode to splice in the new beginning.
+ * 4. Write out new superblock to reflect all the changes.
  *
- * 7. Write the new superblock.
+ * 5. Apply new log fnode, log is locked for a while.
  *
- * 8. Release the old log space.  Clean up.
+ * 6. Finalization. Clean up, old space release and total unlocking.
  */
 
 void BlueFS::_compact_log_async_LD_LNF_D() //also locks FW for new_writer
 {
   dout(10) << __func__ << dendl;
+  utime_t mtime = ceph_clock_now();
+  uint64_t starter_seq = 1;
+  uint64_t old_log_jump_to = 0;
+
+  // Part 0.
+  // Lock the log and forbid its expansion and other compactions
+
   // only one compaction allowed at one time
   bool old_is_comp = std::atomic_exchange(&log_is_compacting, true);
   if (old_is_comp) {
     dout(10) << __func__ << " ongoing" <<dendl;
     return;
   }
-
+  // lock log's run-time structures for a while
   log.lock.lock();
-  File *log_file = log.writer->file.get();
-  FileWriter *new_log_writer = nullptr;
-  FileRef new_log = nullptr;
-  uint64_t new_log_jump_to = 0;
-  uint64_t old_log_jump_to = 0;
-
-  new_log = ceph::make_ref<File>();
-  new_log->fnode.ino = 0;   // we use _flush_special to avoid log of the fnode
+  auto t0 = mono_clock::now();
 
   // Part 1.
   // Prepare current log for jumping into it.
@@ -2592,25 +2751,42 @@ void BlueFS::_compact_log_async_LD_LNF_D() //also locks FW for new_writer
   bool old_forbidden = atomic_exchange(&log_forbidden_to_expand, true);
   ceph_assert(old_forbidden == false);
 
-  vselector->sub_usage(log_file->vselector_hint, log_file->fnode);
+  //
+  // Part 1.
+  // Prepare current log for jumping into it.
+  // 1.1. Allocate extent
+  // 1.2. Save log's fnode extents and add new extents
+  // 1.3. Update op to log
+  // 1.4. Jump op to log
+  // During that, no one else can write to log, otherwise we risk jumping backwards.
+  // We need to sync log, because we are injecting discontinuity, and writer is not prepared for that.
 
-  // 1.1 allocate new log space and jump to it.
+  // 1.1 allocate new log extents and store them at fnode_tail
+  File *log_file = log.writer->file.get();
   old_log_jump_to = log_file->fnode.get_allocated();
+  bluefs_fnode_t fnode_tail;
   uint64_t runway = log_file->fnode.get_allocated() - log.writer->get_effective_write_pos();
   dout(10) << __func__ << " old_log_jump_to 0x" << std::hex << old_log_jump_to
-           << " need 0x" << (old_log_jump_to + cct->_conf->bluefs_max_log_runway) << std::dec << dendl;
+           << " need 0x" << cct->_conf->bluefs_max_log_runway << std::dec << dendl;
   int r = _allocate(vselector->select_prefer_bdev(log_file->vselector_hint),
 		    cct->_conf->bluefs_max_log_runway,
-                    &log_file->fnode);
+                    0,
+                    &fnode_tail);
   ceph_assert(r == 0);
+
+  // 1.2 save log's fnode extents and add new extents
+  bluefs_fnode_t old_log_fnode(log_file->fnode);
+  log_file->fnode.clone_extents(fnode_tail);
   //adjust usage as flush below will need it
+  vselector->sub_usage(log_file->vselector_hint, old_log_fnode);
   vselector->add_usage(log_file->vselector_hint, log_file->fnode);
   dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl;
 
-  // update the log file change and log a jump to the offset where we want to
+  // 1.3 update the log file change and log a jump to the offset where we want to
   // write the new entries
-  log.t.op_file_update(log_file->fnode);
-  // jump to new position should mean next seq
+  log.t.op_file_update_inc(log_file->fnode);
+
+  // 1.4 jump to new position should mean next seq
   log.t.op_jump(log.seq_live + 1, old_log_jump_to);
   uint64_t seq_now = log.seq_live;
   // we need to flush all bdev because we will be streaming all dirty files to log
@@ -2619,143 +2795,203 @@ void BlueFS::_compact_log_async_LD_LNF_D() //also locks FW for new_writer
   _flush_bdev();
   _flush_and_sync_log_jump_D(old_log_jump_to, runway);
 
-  // out of jump section
-
-  // 2. prepare compacted log
-  bluefs_transaction_t t;
-  _compact_log_async_dump_metadata_NF(&t, seq_now);
-
-  // now state is captured to bufferlist
-  // log can be used to write to, ops in log will be continuation of captured state
+  //
+  // Part 2.
+  // Build new log starter and compacted metadata body
+  // 2.1.  Build full compacted meta transaction.
+  //       While still holding the lock, encode a bluefs transaction
+  //       that dumps all of the in-memory fnodes and names.
+  //       This might be pretty large and its allocation map can exceed
+  //       superblock size. Hence instead we'll need log starter part which
+  //       goes to superblock and refers that new meta through op_update_inc.
+  // 2.2.  After releasing the lock allocate space for the above transaction
+  //       using its size estimation.
+  //       Then build tailing list of extents which consists of these
+  //       newly allocated extents followed by ones from Part 1.
+  // 2.3.  Allocate the space required for the starter part of the new log.
+  //       It should be small enough to fit into superblock.
+  //       Effectively we start building new log fnode here.
+  // 2.4.  Store starter fnode to run-time superblock, to be written out later
+  // 2.5.  Finalize new log's fnode building
+  //       This will include log's starter and tailing extents built at 2.2
+  // 2.6.  Encode new log fnode starter,
+  //       It will include op_init, new log's op_update_inc
+  //       and jump to the compacted meta transaction beginning.
+  //       Superblock will reference this starter part
+  // 2.7.  Encode compacted meta transaction,
+  //       extend the transaction with a jump to the log tail from 1.1 before
+  //       encoding.
+  //
+
+  // 2.1 Build full compacted meta transaction
+  bluefs_transaction_t compacted_meta_t;
+  _compact_log_dump_metadata_NF(starter_seq + 1, &compacted_meta_t, 0, seq_now);
+
+  // now state is captured to compacted_meta_t,
+  // current log can be used to write to,
+  //ops in log will be continuation of captured state
+  logger->tinc(l_bluefs_compaction_lock_lat, mono_clock::now() - t0);
   log.lock.unlock();
 
-  uint64_t max_alloc_size = std::max(alloc_size[BDEV_WAL],
-				     std::max(alloc_size[BDEV_DB],
-					      alloc_size[BDEV_SLOW]));
+  // 2.2 Allocate the space required for the compacted meta transaction
+  uint64_t compacted_meta_need = _estimate_transaction_size(&compacted_meta_t);
+  dout(20) << __func__ << " compacted_meta_need " << compacted_meta_need
+           << dendl;
+  {
+    bluefs_fnode_t fnode_pre_tail;
+    // do allocate
+    r = _allocate(vselector->select_prefer_bdev(log_file->vselector_hint),
+                  compacted_meta_need,
+                  0,
+                  &fnode_pre_tail);
+    ceph_assert(r == 0);
+    // build trailing list of extents in fnode_tail,
+    // this will include newly allocated extents for compacted meta
+    // and aux extents allocated at step 1.1
+    fnode_pre_tail.claim_extents(fnode_tail.extents);
+    fnode_tail.swap_extents(fnode_pre_tail);
+  }
 
-  // conservative estimate for final encoded size
-  new_log_jump_to = round_up_to(t.op_bl.length() + super.block_size * 2,
-                                max_alloc_size);
-  //newly constructed log head will jump to what we had before
-  t.op_jump(seq_now, new_log_jump_to);
-
-  // allocate
-  //FIXME: check if we want DB here?
-  r = _allocate(BlueFS::BDEV_DB, new_log_jump_to,
-                    &new_log->fnode);
+  // 2.3 Allocate the space required for the starter part of the new log.
+  // Start building New log fnode
+  FileRef new_log = nullptr;
+  new_log = ceph::make_ref<File>();
+  new_log->fnode.ino = log_file->fnode.ino;
+  new_log->fnode.mtime = mtime;
+  // Estimate the required space
+  uint64_t starter_need =
+    _make_initial_transaction(starter_seq, fnode_tail, 0, nullptr);
+  // and now allocate and store at new_log_fnode
+  r = _allocate(vselector->select_prefer_bdev(log_file->vselector_hint),
+                starter_need,
+                0,
+                &new_log->fnode);
   ceph_assert(r == 0);
 
-  bufferlist bl;
-  encode(t, bl);
-  _pad_bl(bl);
+  // 2.4 Store starter fnode to run-time superblock, to be written out later
+  super.log_fnode = new_log->fnode;
 
-  dout(10) << __func__ << " new_log_jump_to 0x" << std::hex << new_log_jump_to
-	   << std::dec << dendl;
+  // 2.5 Finalize new log's fnode building
+  // start collecting new log fnode updates (to make op_update_inc later)
+  // since this point. This will include compacted meta from 2.2 and aux
+  // extents from 1.1.
+  new_log->fnode.reset_delta();
+  new_log->fnode.claim_extents(fnode_tail.extents);
 
-  new_log_writer = _create_writer(new_log);
+  // 2.6 Encode new log fnode
+  bufferlist starter_bl;
+  _make_initial_transaction(starter_seq, new_log->fnode, starter_need,
+    &starter_bl);
 
-  new_log_writer->append(bl);
-  // 3. flush
+  // 2.7 Encode compacted meta transaction,
+  dout(20) << __func__
+           << " new_log jump seq " << seq_now
+           << std::hex << " offset 0x" << starter_need + compacted_meta_need
+	   << std::dec << dendl;
+  // Extent compacted_meta transaction with a just to new log tail.
+  // Hopefully "compact_meta_need" estimation provides enough extra space
+  // for this new jump, assert below if not
+  compacted_meta_t.op_jump(seq_now, starter_need + compacted_meta_need);
+  // Now do encodeing and padding
+  bufferlist compacted_meta_bl;
+  compacted_meta_bl.reserve(compacted_meta_need);
+  encode(compacted_meta_t, compacted_meta_bl);
+  ceph_assert(compacted_meta_bl.length() <= compacted_meta_need);
+  _pad_bl(compacted_meta_bl, compacted_meta_need);
+
+  //
+  // Part 3.
+  // Write out new log's content
+  // 3.1 Stage new log's content writing
+  // 3.2 Do flush and wait for completion through flush_bdev()
+  //
+
+  // 3.1 Stage new log's content writing
+  // Make new log writer and append bufferlists to write out.
+  FileWriter *new_log_writer = _create_writer(new_log);
+  // And append all new log's bufferlists to write out.
+  new_log_writer->append(starter_bl);
+  new_log_writer->append(compacted_meta_bl);
+
+  // 3.2. flush and wait
   _flush_special(new_log_writer);
+  _flush_bdev(new_log_writer, false); // do not check log.lock is locked
 
-  // 4. wait
-  _flush_bdev(new_log_writer);
-  // 5. update our log fnode
-  // we need to append to new_log the extents that were allocated in step 1.1
-  // we do it by inverse logic - we drop 'old_log_jump_to' bytes and keep rest
-  // todo - maybe improve _allocate so we will give clear set of new allocations
-  uint64_t processed = 0;
-  mempool::bluefs::vector<bluefs_extent_t> old_extents;
-  for (auto& e : log_file->fnode.extents) {
-    if (processed + e.length <= old_log_jump_to) {
-      // drop whole extent
-      dout(10) << __func__ << " remove old log extent " << e << dendl;
-      old_extents.push_back(e);
-    } else {
-      // keep, but how much?
-      if (processed < old_log_jump_to) {
-	ceph_assert(processed + e.length > old_log_jump_to);
-	ceph_assert(old_log_jump_to - processed <= std::numeric_limits<uint32_t>::max());
-	uint32_t cut_at = uint32_t(old_log_jump_to - processed);
-	// need to cut, first half gets dropped
-	bluefs_extent_t retire(e.bdev, e.offset, cut_at);
-	old_extents.push_back(retire);
-	// second half goes to new log
-	bluefs_extent_t keep(e.bdev, e.offset + cut_at, e.length - cut_at);
-	new_log->fnode.append_extent(keep);
-	dout(10) << __func__ << " kept " << keep << " removed " << retire << dendl;
-      } else {
-	// take entire extent
-	ceph_assert(processed >= old_log_jump_to);
-	new_log->fnode.append_extent(e);
-	dout(10) << __func__ << " kept " << e << dendl;
-      }
-    }
-    processed += e.length;
-  }
-  // we will write it to super
-  new_log->fnode.reset_delta();
+  // Part 4.
+  // Write out new superblock to reflect all the changes.
+  //
 
-  // 6. write the super block to reflect the changes
-  dout(10) << __func__ << " writing super" << dendl;
-  new_log->fnode.ino = log_file->fnode.ino;
-  new_log->fnode.size = 0;
-  new_log->fnode.mtime = ceph_clock_now();
-  super.log_fnode = new_log->fnode;
-  ++super.version;
   _write_super(BDEV_DB);
   _flush_bdev();
 
+  // Part 5.
+  // Apply new log fnode
+  //
+
+  // we need to acquire log's lock back at this point
   log.lock.lock();
-  // swapping log_file and new_log
+  // Reconstruct actual log object from the new one.
   vselector->sub_usage(log_file->vselector_hint, log_file->fnode);
-
-  // clear the extents from old log file, they are added to new log
-  log_file->fnode.clear_extents();
-  // swap the log files. New log file is the log file now.
-  new_log->fnode.swap_extents(log_file->fnode);
-
-  log.writer->pos = log.writer->file->fnode.size =
-    log.writer->pos - old_log_jump_to + new_log_jump_to;
-
+  log_file->fnode.size =
+    log.writer->pos - old_log_jump_to + starter_need + compacted_meta_need;
+  log_file->fnode.mtime = std::max(mtime, log_file->fnode.mtime);
+  log_file->fnode.swap_extents(new_log->fnode);
+  // update log's writer
+  log.writer->pos = log.writer->file->fnode.size;
   vselector->add_usage(log_file->vselector_hint, log_file->fnode);
-
+  // and unlock
   log.lock.unlock();
 
+  // we're mostly done
+  dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl;
+  logger->inc(l_bluefs_log_compactions);
+
+  //Part 6.
+  // Finalization
+  // 6.1 Permit log's extension, forbidden at step 0.
+  //
+  // 6.2 Release the new log writer
+  //
+  // 6.3 Release old space
+  //
+  // 6.4. Enable other compactions
+  //
+
+  // 6.1 Permit log's extension, forbidden at step 0.
   old_forbidden = atomic_exchange(&log_forbidden_to_expand, false);
   ceph_assert(old_forbidden == true);
   //to wake up if someone was in need of expanding log
   log_cond.notify_all();
 
-  // 7. release old space
-  dout(10) << __func__ << " release old log extents " << old_extents << dendl;
+  // 6.2 Release the new log writer
+  _close_writer(new_log_writer);
+  new_log_writer = nullptr;
+  new_log = nullptr;
+
+  // 6.3 Release old space
   {
+    dout(10) << __func__
+             << " release old log extents " << old_log_fnode.extents
+             << dendl;
     std::lock_guard dl(dirty.lock);
-    for (auto& r : old_extents) {
+    for (auto& r : old_log_fnode.extents) {
       dirty.pending_release[r.bdev].insert(r.offset, r.length);
     }
   }
 
-  // delete the new log, remove from the dirty files list
-  _close_writer(new_log_writer);
-  new_log_writer = nullptr;
-  new_log = nullptr;
-  log_cond.notify_all();
-
-  dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl;
-  logger->inc(l_bluefs_log_compactions);
-
+  // 6.4. Enable other compactions
   old_is_comp = atomic_exchange(&log_is_compacting, false);
   ceph_assert(old_is_comp);
 }
 
-void BlueFS::_pad_bl(bufferlist& bl)
+void BlueFS::_pad_bl(bufferlist& bl, uint64_t pad_size)
 {
-  uint64_t partial = bl.length() % super.block_size;
+  pad_size = std::max(pad_size, uint64_t(super.block_size));
+  uint64_t partial = bl.length() % pad_size;
   if (partial) {
     dout(10) << __func__ << " padding with 0x" << std::hex
-	     << super.block_size - partial << " zeros" << std::dec << dendl;
-    bl.append_zero(super.block_size - partial);
+	     << pad_size - partial << " zeros" << std::dec << dendl;
+    bl.append_zero(pad_size - partial);
   }
 }
 
@@ -2836,6 +3072,7 @@ int64_t BlueFS::_maybe_extend_log()
     int r = _allocate(
       vselector->select_prefer_bdev(log.writer->file->vselector_hint),
       cct->_conf->bluefs_max_log_runway,
+      0,
       &log.writer->file->fnode);
     ceph_assert(r == 0);
     vselector->add_usage(log.writer->file->vselector_hint, log.writer->file->fnode);
@@ -3132,6 +3369,7 @@ int BlueFS::_flush_range_F(FileWriter *h, uint64_t offset, uint64_t length)
     // in _flush_and_sync_log.
     int r = _allocate(vselector->select_prefer_bdev(h->file->vselector_hint),
 		      offset + length - allocated,
+                      0,
 		      &h->file->fnode);
     if (r < 0) {
       derr << __func__ << " allocated: 0x" << std::hex << allocated
@@ -3440,12 +3678,14 @@ int BlueFS::fsync(FileWriter *h)/*_WF_WD_WLD_WLNF_WNF*/
 }
 
 // be careful - either h->file->lock or log.lock must be taken
-void BlueFS::_flush_bdev(FileWriter *h)
+void BlueFS::_flush_bdev(FileWriter *h, bool check_mutext_locked)
 {
-  if (h->file->fnode.ino > 1) {
-    ceph_assert(ceph_mutex_is_locked(h->lock));
-  } else if (h->file->fnode.ino == 1) {
-    ceph_assert(ceph_mutex_is_locked(log.lock));
+  if (check_mutext_locked) {
+    if (h->file->fnode.ino > 1) {
+      ceph_assert(ceph_mutex_is_locked(h->lock));
+    } else if (h->file->fnode.ino == 1) {
+      ceph_assert(ceph_mutex_is_locked(log.lock));
+    }
   }
   std::array<bool, MAX_BDEV> flush_devs = h->dirty_devs;
   h->dirty_devs.fill(false);
@@ -3490,86 +3730,107 @@ const char* BlueFS::get_device_name(unsigned id)
   return names[id];
 }
 
-int BlueFS::_allocate_without_fallback(uint8_t id, uint64_t len,
-		      PExtentVector* extents)
-{
-  dout(10) << __func__ << " len 0x" << std::hex << len << std::dec
-           << " from " << (int)id << dendl;
-  assert(id < alloc.size());
-  if (!alloc[id]) {
-    return -ENOENT;
-  }
-  extents->reserve(4);  // 4 should be (more than) enough for most allocations
-  int64_t need = round_up_to(len, alloc_size[id]);
-  int64_t alloc_len = alloc[id]->allocate(need, alloc_size[id], 0, extents);
-  if (alloc_len < 0 || alloc_len < need) {
-    if (alloc_len > 0) {
-      alloc[id]->release(*extents);
-    }
-    derr << __func__ << " unable to allocate 0x" << std::hex << need
-	 << " on bdev " << (int)id
-         << ", allocator name " << alloc[id]->get_name()
-         << ", allocator type " << alloc[id]->get_type()
-         << ", capacity 0x" << alloc[id]->get_capacity()
-         << ", block size 0x" << alloc[id]->get_block_size()
-         << ", alloc size 0x" << alloc_size[id]
-         << ", free 0x" << alloc[id]->get_free()
-         << ", fragmentation " << alloc[id]->get_fragmentation()
-         << ", allocated 0x" << (alloc_len > 0 ? alloc_len : 0)
-	 << std::dec << dendl;
-    alloc[id]->dump();
-    return -ENOSPC;
-  }
-  if (is_shared_alloc(id)) {
-    shared_alloc->bluefs_used += alloc_len;
-  }
-
-  return 0;
-}
-
 int BlueFS::_allocate(uint8_t id, uint64_t len,
-		      bluefs_fnode_t* node)
+		      uint64_t alloc_unit,
+		      bluefs_fnode_t* node,
+                      size_t alloc_attempts,
+                      bool permit_dev_fallback)
 {
-  dout(10) << __func__ << " len 0x" << std::hex << len << std::dec
-           << " from " << (int)id << dendl;
+  dout(10) << __func__ << " len 0x" << std::hex << len
+           << " au 0x" << alloc_unit
+           << std::dec << " from " << (int)id
+           << " cooldown " << cooldown_deadline
+           << dendl;
   ceph_assert(id < alloc.size());
   int64_t alloc_len = 0;
   PExtentVector extents;
   uint64_t hint = 0;
   int64_t need = len;
+  bool shared = is_shared_alloc(id);
+  auto shared_unit = shared_alloc ? shared_alloc->alloc_unit : 0;
+  bool was_cooldown = false;
   if (alloc[id]) {
-    need = round_up_to(len, alloc_size[id]);
+    if (!alloc_unit) {
+      alloc_unit = alloc_size[id];
+    }
+    // do not attempt shared_allocator with bluefs alloc unit
+    // when cooling down, fallback to slow dev alloc unit.
+    if (shared && alloc_unit != shared_unit) {
+       if (duration_cast<seconds>(real_clock::now().time_since_epoch()).count() <
+           cooldown_deadline) {
+         logger->inc(l_bluefs_alloc_shared_size_fallbacks);
+         alloc_unit = shared_unit;
+         was_cooldown = true;
+       } else if (cooldown_deadline.fetch_and(0)) {
+         // we might get false cooldown_deadline reset at this point
+         // but that's mostly harmless.
+         dout(1) << __func__ << " shared allocation cooldown period elapsed"
+                 << dendl;
+       }
+    }
+    need = round_up_to(len, alloc_unit);
     if (!node->extents.empty() && node->extents.back().bdev == id) {
       hint = node->extents.back().end();
     }   
+    ++alloc_attempts;
     extents.reserve(4);  // 4 should be (more than) enough for most allocations
-    alloc_len = alloc[id]->allocate(need, alloc_size[id], hint, &extents);
+    alloc_len = alloc[id]->allocate(need, alloc_unit, hint, &extents);
   }
   if (alloc_len < 0 || alloc_len < need) {
     if (alloc[id]) {
       if (alloc_len > 0) {
         alloc[id]->release(extents);
       }
+      if (!was_cooldown && shared) {
+        auto delay_s = cct->_conf->bluefs_failed_shared_alloc_cooldown;
+        cooldown_deadline = delay_s +
+          duration_cast<seconds>(real_clock::now().time_since_epoch()).count();
+        dout(1) << __func__ << " shared allocation cooldown set for "
+                << delay_s << "s"
+                << dendl;
+      }
       dout(1) << __func__ << " unable to allocate 0x" << std::hex << need
 	      << " on bdev " << (int)id
               << ", allocator name " << alloc[id]->get_name()
               << ", allocator type " << alloc[id]->get_type()
               << ", capacity 0x" << alloc[id]->get_capacity()
               << ", block size 0x" << alloc[id]->get_block_size()
-              << ", alloc size 0x" << alloc_size[id]
+              << ", alloc unit 0x" << alloc_unit
               << ", free 0x" << alloc[id]->get_free()
               << ", fragmentation " << alloc[id]->get_fragmentation()
               << ", allocated 0x" << (alloc_len > 0 ? alloc_len : 0)
 	      << std::dec << dendl;
     } else {
-      dout(20) << __func__ << " alloc-id not set on index="<< (int)id << " unable to allocate 0x" << std::hex << need
+      dout(20) << __func__ << " alloc-id not set on index="<< (int)id
+               << " unable to allocate 0x" << std::hex << need
 	       << " on bdev " << (int)id << std::dec << dendl;
     }
-    if (id != BDEV_SLOW) {
+    if (alloc[id] && shared && alloc_unit != shared_unit) {
+      alloc_unit = shared_unit;
+      dout(20) << __func__ << " fallback to bdev "
+	       << (int)id
+               << " with alloc unit 0x" << std::hex << alloc_unit
+               << std::dec << dendl;
+      logger->inc(l_bluefs_alloc_shared_size_fallbacks);
+      return _allocate(id,
+                       len,
+                       alloc_unit,
+                       node,
+                       alloc_attempts,
+                       permit_dev_fallback);
+    } else if (permit_dev_fallback && id != BDEV_SLOW && alloc[id + 1]) {
       dout(20) << __func__ << " fallback to bdev "
 	       << (int)id + 1
 	       << dendl;
-      return _allocate(id + 1, len, node);
+      if (alloc_attempts > 0 && is_shared_alloc(id + 1)) {
+        logger->inc(l_bluefs_alloc_shared_dev_fallbacks);
+      }
+      return _allocate(id + 1,
+                       len,
+                       0, // back to default alloc unit
+                       node,
+                       alloc_attempts,
+                       permit_dev_fallback);
     } else {
       derr << __func__ << " allocation failed, needed 0x" << std::hex << need
            << dendl;
@@ -3581,7 +3842,7 @@ int BlueFS::_allocate(uint8_t id, uint64_t len,
       logger->set(max_bytes_pcounters[id], used);
       max_bytes[id] = used;
     }
-    if (is_shared_alloc(id)) {
+    if (shared) {
       shared_alloc->bluefs_used += alloc_len;
     }
   }
@@ -3611,6 +3872,7 @@ int BlueFS::preallocate(FileRef f, uint64_t off, uint64_t len)/*_LF*/
     vselector->sub_usage(f->vselector_hint, f->fnode);
     int r = _allocate(vselector->select_prefer_bdev(f->vselector_hint),
       want,
+      0,
       &f->fnode);
     vselector->add_usage(f->vselector_hint, f->fnode);
     if (r < 0)
@@ -3650,11 +3912,13 @@ void BlueFS::_maybe_compact_log_LNF_NF_LD_D()
 {
   if (!cct->_conf->bluefs_replay_recovery_disable_compact &&
       _should_start_compact_log_L_N()) {
+    auto t0 = mono_clock::now();
     if (cct->_conf->bluefs_compact_log_sync) {
       _compact_log_sync_LNF_LD();
     } else {
       _compact_log_async_LD_LNF_D();
     }
+    logger->tinc(l_bluefs_compaction_lat, mono_clock::now() - t0);
   }
 }
 
diff --git a/src/os/bluestore/BlueFS.h b/src/os/bluestore/BlueFS.h
index 7132e1e94c929..1b4cef63e4441 100644
--- a/src/os/bluestore/BlueFS.h
+++ b/src/os/bluestore/BlueFS.h
@@ -61,9 +61,12 @@ enum {
   l_bluefs_read_disk_bytes_slow,
   l_bluefs_read_prefetch_count,
   l_bluefs_read_prefetch_bytes,
+  l_bluefs_compaction_lat,
+  l_bluefs_compaction_lock_lat,
+  l_bluefs_alloc_shared_dev_fallbacks,
+  l_bluefs_alloc_shared_size_fallbacks,
   l_bluefs_read_zeros_candidate,
   l_bluefs_read_zeros_errors,
-
   l_bluefs_last,
 };
 
@@ -92,16 +95,19 @@ class BlueFSVolumeSelector {
 struct bluefs_shared_alloc_context_t {
   bool need_init = false;
   Allocator* a = nullptr;
+  uint64_t alloc_unit = 0;
 
   std::atomic<uint64_t> bluefs_used = 0;
 
-  void set(Allocator* _a) {
+  void set(Allocator* _a, uint64_t _au) {
     a = _a;
+    alloc_unit = _au;
     need_init = true;
     bluefs_used = 0;
   }
   void reset() {
     a = nullptr;
+    alloc_unit = 0;
   }
 };
 
@@ -331,6 +337,8 @@ class BlueFS {
     l_bluefs_max_bytes_wal,
     l_bluefs_max_bytes_db,
     l_bluefs_max_bytes_slow,
+    l_bluefs_max_bytes_wal,
+    l_bluefs_max_bytes_db,
   };
 
   // cache
@@ -391,6 +399,7 @@ class BlueFS {
   inline bool is_shared_alloc(unsigned id) const {
     return id == shared_alloc_id;
   }
+  std::atomic<int64_t> cooldown_deadline = 0;
 
   class SocketHook;
   SocketHook* asok_hook = nullptr;
@@ -404,7 +413,8 @@ class BlueFS {
   void _init_alloc();
   void _stop_alloc();
 
-  void _pad_bl(ceph::buffer::list& bl);  ///< pad ceph::buffer::list to block size w/ zeros
+  ///< pad ceph::buffer::list to max(block size, pad_size) w/ zeros
+  void _pad_bl(ceph::buffer::list& bl, uint64_t pad_size = 0);
 
   uint64_t _get_used(unsigned id) const;
   uint64_t _get_total(unsigned id) const;
@@ -418,9 +428,10 @@ class BlueFS {
   }
   const char* get_device_name(unsigned id);
   int _allocate(uint8_t bdev, uint64_t len,
-		bluefs_fnode_t* node);
-  int _allocate_without_fallback(uint8_t id, uint64_t len,
-				 PExtentVector* extents);
+                uint64_t alloc_unit,
+		bluefs_fnode_t* node,
+                size_t alloc_attempts = 0,
+                bool permit_dev_fallback = true);
 
   /* signal replay log to include h->file in nearest log flush */
   int _signal_dirty_to_log_D(FileWriter *h);
@@ -447,6 +458,11 @@ class BlueFS {
 			       int64_t available_runway);
   int _flush_and_sync_log_LD(uint64_t want_seq = 0);
 
+  uint64_t _estimate_transaction_size(bluefs_transaction_t* t);
+  uint64_t _make_initial_transaction(uint64_t start_seq,
+                                     bluefs_fnode_t& fnode,
+                                     uint64_t expected_final_size,
+                                     bufferlist* out);
   uint64_t _estimate_log_size_N();
   bool _should_start_compact_log_L_N();
 
@@ -456,15 +472,15 @@ class BlueFS {
     RENAME_SLOW2DB = 4,
     RENAME_DB2SLOW = 8,
   };
-  void _compact_log_dump_metadata_NF(bluefs_transaction_t *t,
-				 int flags);
-  void _compact_log_async_dump_metadata_NF(bluefs_transaction_t *t,
-					   uint64_t capture_before_seq);
+  void _compact_log_dump_metadata_NF(uint64_t start_seq,
+                                     bluefs_transaction_t *t,
+				     int flags,
+				     uint64_t capture_before_seq);
 
   void _compact_log_sync_LNF_LD();
   void _compact_log_async_LD_LNF_D();
 
-  void _rewrite_log_and_layout_sync_LNF_LD(bool allocate_with_fallback,
+  void _rewrite_log_and_layout_sync_LNF_LD(bool permit_dev_fallback,
 				    int super_dev,
 				    int log_dev,
 				    int new_log_dev,
@@ -473,7 +489,7 @@ class BlueFS {
 
   //void _aio_finish(void *priv);
 
-  void _flush_bdev(FileWriter *h);
+  void _flush_bdev(FileWriter *h, bool check_mutex_locked = true);
   void _flush_bdev();  // this is safe to call without a lock
   void _flush_bdev(std::array<bool, MAX_BDEV>& dirty_bdevs);  // this is safe to call without a lock
 
@@ -500,6 +516,7 @@ class BlueFS {
     const char* op_name);
   int _verify_alloc_granularity(
     __u8 id, uint64_t offset, uint64_t length,
+    uint64_t alloc_unit,
     const char *op);
   int _replay(bool noop, bool to_stdout = false); ///< replay journal
 
diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc
index 02d895dee95a4..21b5a7b6804c6 100644
--- a/src/os/bluestore/BlueStore.cc
+++ b/src/os/bluestore/BlueStore.cc
@@ -5841,12 +5841,12 @@ int BlueStore::_create_alloc()
       delete alloc;
       return -EINVAL;
     }
-    shared_alloc.set(a);
+    shared_alloc.set(a, alloc_size);
   } else
 #endif
   {
     // BlueFS will share the same allocator
-    shared_alloc.set(alloc);
+    shared_alloc.set(alloc, alloc_size);
   }
 
   return 0;
diff --git a/src/os/bluestore/bluefs_types.cc b/src/os/bluestore/bluefs_types.cc
index 3a812cf5f018d..c8d2ede7bed92 100644
--- a/src/os/bluestore/bluefs_types.cc
+++ b/src/os/bluestore/bluefs_types.cc
@@ -167,7 +167,6 @@ bluefs_fnode_delta_t* bluefs_fnode_t::make_delta(bluefs_fnode_delta_t* delta) {
       delta->extents.push_back(*p);
       ++p;
     }
-    reset_delta();
   }
   return delta;
 }
diff --git a/src/os/bluestore/bluefs_types.h b/src/os/bluestore/bluefs_types.h
index b53000188ae77..d5d8ee5a62826 100644
--- a/src/os/bluestore/bluefs_types.h
+++ b/src/os/bluestore/bluefs_types.h
@@ -62,7 +62,7 @@ struct bluefs_fnode_t {
   uint64_t ino;
   uint64_t size;
   utime_t mtime;
-  uint8_t __unused__; // was prefer_bdev
+  uint8_t __unused__ = 0; // was prefer_bdev
   mempool::bluefs::vector<bluefs_extent_t> extents;
 
   // precalculated logical offsets for extents vector entries
@@ -72,7 +72,15 @@ struct bluefs_fnode_t {
   uint64_t allocated;
   uint64_t allocated_commited;
 
-  bluefs_fnode_t() : ino(0), size(0), __unused__(0), allocated(0), allocated_commited(0) {}
+  bluefs_fnode_t() : ino(0), size(0), allocated(0), allocated_commited(0) {}
+  bluefs_fnode_t(uint64_t _ino, uint64_t _size, utime_t _mtime) :
+    ino(_ino), size(_size), mtime(_mtime), allocated(0), allocated_commited(0) {}
+  bluefs_fnode_t(const bluefs_fnode_t& other) :
+    ino(other.ino), size(other.size), mtime(other.mtime),
+    allocated(other.allocated),
+    allocated_commited(other.allocated_commited) {
+    clone_extents(other);
+  }
 
   uint64_t get_allocated() const {
     return allocated;
@@ -111,10 +119,14 @@ struct bluefs_fnode_t {
     denc(v.extents, p);
     DENC_FINISH(p);
   }
-
   void reset_delta() {
     allocated_commited = allocated;
   }
+  void clone_extents(const bluefs_fnode_t& fnode) {
+    for (const auto& p : fnode.extents) {
+      append_extent(p);
+    }
+  }
   void claim_extents(mempool::bluefs::vector<bluefs_extent_t>& extents) {
     for (const auto& p : extents) {
       append_extent(p);
@@ -144,6 +156,12 @@ struct bluefs_fnode_t {
     extents.erase(it);
   }
   
+  void swap(bluefs_fnode_t& other) {
+    std::swap(ino, other.ino);
+    std::swap(size, other.size);
+    std::swap(mtime, other.mtime);
+    swap_extents(other);
+  }
   void swap_extents(bluefs_fnode_t& other) {
     other.extents.swap(extents);
     other.extents_index.swap(extents_index);
@@ -285,9 +303,10 @@ struct bluefs_transaction_t {
   void op_file_update_inc(bluefs_fnode_t& file) {
     using ceph::encode;
     bluefs_fnode_delta_t delta;
-    file.make_delta(&delta); //also resets delta to zero
+    file.make_delta(&delta);
     encode((__u8)OP_FILE_UPDATE_INC, op_bl);
     encode(delta, op_bl);
+    file.reset_delta();
   }
   void op_file_remove(uint64_t ino) {
     using ceph::encode;
diff --git a/src/test/objectstore/test_bluefs.cc b/src/test/objectstore/test_bluefs.cc
index ca0afa49f7fb9..f665ca1f82ee7 100644
--- a/src/test/objectstore/test_bluefs.cc
+++ b/src/test/objectstore/test_bluefs.cc
@@ -10,13 +10,14 @@
 #include <random>
 #include <thread>
 #include <stack>
+#include <gtest/gtest.h>
 #include "global/global_init.h"
 #include "common/ceph_argparse.h"
 #include "include/stringify.h"
 #include "include/scope_guard.h"
 #include "common/errno.h"
-#include <gtest/gtest.h>
 
+#include "os/bluestore/Allocator.h"
 #include "os/bluestore/BlueFS.h"
 
 using namespace std;
@@ -234,7 +235,7 @@ TEST(BlueFS, very_large_write) {
     delete h;
     ASSERT_EQ(0, fs.open_for_read("dir", "bigfile", &h));
     ASSERT_EQ(h->file->fnode.size, total_written);
-    unique_ptr<char> huge_buf(new char[h->file->fnode.size]);
+    auto huge_buf = std::make_unique<char[]>(h->file->fnode.size);
     auto l = h->file->fnode.size;
     int64_t r = fs.read(h, 0, l, NULL, huge_buf.get());
     ASSERT_EQ(r, l);
@@ -1111,6 +1112,225 @@ TEST(BlueFS, truncate_fsync) {
   }
 }
 
+TEST(BlueFS, test_shared_alloc) {
+  uint64_t size = 1048576 * 128;
+  TempBdev bdev_slow{size};
+  uint64_t size_db = 1048576 * 8;
+  TempBdev bdev_db{size_db};
+
+  ConfSaver conf(g_ceph_context->_conf);
+  conf.SetVal("bluefs_shared_alloc_size", "1048576");
+
+  bluefs_shared_alloc_context_t shared_alloc;
+  uint64_t shared_alloc_unit = 4096;
+  shared_alloc.set(
+    Allocator::create(g_ceph_context, g_ceph_context->_conf->bluefs_allocator,
+                      size, shared_alloc_unit, 0, 0, "test shared allocator"),
+    shared_alloc_unit);
+  shared_alloc.a->init_add_free(0, size);
+
+  BlueFS fs(g_ceph_context);
+  // DB device is fully utilized
+  ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev_db.path, false, size_db - 0x1000));
+  ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_SLOW, bdev_slow.path, false, 0,
+                                   &shared_alloc));
+  uuid_d fsid;
+  ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false }));
+  ASSERT_EQ(0, fs.mount());
+  ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false }));
+  {
+    for (int i=0; i<10; i++) {
+       string dir = "dir.";
+       dir.append(to_string(i));
+       ASSERT_EQ(0, fs.mkdir(dir));
+       for (int j=0; j<10; j++) {
+          string file = "file.";
+	  file.append(to_string(j));
+          BlueFS::FileWriter *h;
+          ASSERT_EQ(0, fs.open_for_write(dir, file, &h, false));
+          ASSERT_NE(nullptr, h);
+          auto sg = make_scope_guard([&fs, h] { fs.close_writer(h); });
+          bufferlist bl;
+          std::unique_ptr<char[]> buf = gen_buffer(4096);
+	  bufferptr bp = buffer::claim_char(4096, buf.get());
+	  bl.push_back(bp);
+          h->append(bl.c_str(), bl.length());
+          fs.fsync(h);
+       }
+    }
+  }
+  {
+    for (int i=0; i<10; i+=2) {
+       string dir = "dir.";
+       dir.append(to_string(i));
+       for (int j=0; j<10; j++) {
+          string file = "file.";
+	  file.append(to_string(j));
+          fs.unlink(dir, file);
+	  fs.sync_metadata(false);
+       }
+       ASSERT_EQ(0, fs.rmdir(dir));
+       fs.sync_metadata(false);
+    }
+  }
+  fs.compact_log();
+  auto *logger = fs.get_perf_counters();
+  ASSERT_NE(logger->get(l_bluefs_alloc_shared_dev_fallbacks), 0);
+  auto num_files = logger->get(l_bluefs_num_files);
+  fs.umount();
+  fs.mount();
+  ASSERT_EQ(num_files, logger->get(l_bluefs_num_files));
+  fs.umount();
+}
+
+TEST(BlueFS, test_shared_alloc_sparse) {
+  uint64_t size = 1048576 * 128 * 2;
+  uint64_t main_unit = 4096;
+  uint64_t bluefs_alloc_unit = 1048576;
+  TempBdev bdev_slow{size};
+
+  ConfSaver conf(g_ceph_context->_conf);
+  conf.SetVal("bluefs_shared_alloc_size",
+    stringify(bluefs_alloc_unit).c_str());
+
+  bluefs_shared_alloc_context_t shared_alloc;
+  shared_alloc.set(
+    Allocator::create(g_ceph_context, g_ceph_context->_conf->bluefs_allocator,
+                      size, main_unit, 0, 0, "test shared allocator"),
+    main_unit);
+  // prepare sparse free space but let's have a continuous chunk at
+  // the beginning to fit initial log's fnode into superblock,
+  // we don't have any tricks to deal with sparse allocations
+  // (and hence long fnode) at mkfs
+  shared_alloc.a->init_add_free(bluefs_alloc_unit, 4 * bluefs_alloc_unit);
+  for(uint64_t i = 5 * bluefs_alloc_unit; i < size; i += 2 * main_unit) {
+    shared_alloc.a->init_add_free(i, main_unit);
+  }
+
+  BlueFS fs(g_ceph_context);
+  ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev_slow.path, false, 0,
+                                   &shared_alloc));
+  uuid_d fsid;
+  ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false }));
+  ASSERT_EQ(0, fs.mount());
+  ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false }));
+  {
+    for (int i=0; i<10; i++) {
+       string dir = "dir.";
+       dir.append(to_string(i));
+       ASSERT_EQ(0, fs.mkdir(dir));
+       for (int j=0; j<10; j++) {
+          string file = "file.";
+	  file.append(to_string(j));
+          BlueFS::FileWriter *h;
+          ASSERT_EQ(0, fs.open_for_write(dir, file, &h, false));
+          ASSERT_NE(nullptr, h);
+          auto sg = make_scope_guard([&fs, h] { fs.close_writer(h); });
+          bufferlist bl;
+          std::unique_ptr<char[]> buf = gen_buffer(4096);
+	  bufferptr bp = buffer::claim_char(4096, buf.get());
+	  bl.push_back(bp);
+          h->append(bl.c_str(), bl.length());
+          fs.fsync(h);
+       }
+    }
+  }
+  {
+    for (int i=0; i<10; i+=2) {
+       string dir = "dir.";
+       dir.append(to_string(i));
+       for (int j=0; j<10; j++) {
+          string file = "file.";
+	  file.append(to_string(j));
+          fs.unlink(dir, file);
+	  fs.sync_metadata(false);
+       }
+       ASSERT_EQ(0, fs.rmdir(dir));
+       fs.sync_metadata(false);
+    }
+  }
+  fs.compact_log();
+  auto *logger = fs.get_perf_counters();
+  ASSERT_NE(logger->get(l_bluefs_alloc_shared_size_fallbacks), 0);
+  auto num_files = logger->get(l_bluefs_num_files);
+  fs.umount();
+
+  fs.mount();
+  ASSERT_EQ(num_files, logger->get(l_bluefs_num_files));
+  fs.umount();
+}
+
+TEST(BlueFS, test_4k_shared_alloc) {
+  uint64_t size = 1048576 * 128 * 2;
+  uint64_t main_unit = 4096;
+  uint64_t bluefs_alloc_unit = main_unit;
+  TempBdev bdev_slow{size};
+
+  ConfSaver conf(g_ceph_context->_conf);
+  conf.SetVal("bluefs_shared_alloc_size",
+    stringify(bluefs_alloc_unit).c_str());
+
+  bluefs_shared_alloc_context_t shared_alloc;
+  shared_alloc.set(
+    Allocator::create(g_ceph_context, g_ceph_context->_conf->bluefs_allocator,
+                      size, main_unit, 0, 0, "test shared allocator"),
+    main_unit);
+  shared_alloc.a->init_add_free(bluefs_alloc_unit, size - bluefs_alloc_unit);
+
+  BlueFS fs(g_ceph_context);
+  ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev_slow.path, false, 0,
+                                   &shared_alloc));
+  uuid_d fsid;
+  ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, false, false }));
+  ASSERT_EQ(0, fs.mount());
+  ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, false, false }));
+  {
+    for (int i=0; i<10; i++) {
+       string dir = "dir.";
+       dir.append(to_string(i));
+       ASSERT_EQ(0, fs.mkdir(dir));
+       for (int j=0; j<10; j++) {
+          string file = "file.";
+	  file.append(to_string(j));
+          BlueFS::FileWriter *h;
+          ASSERT_EQ(0, fs.open_for_write(dir, file, &h, false));
+          ASSERT_NE(nullptr, h);
+          auto sg = make_scope_guard([&fs, h] { fs.close_writer(h); });
+          bufferlist bl;
+          std::unique_ptr<char[]> buf = gen_buffer(4096);
+	  bufferptr bp = buffer::claim_char(4096, buf.get());
+	  bl.push_back(bp);
+          h->append(bl.c_str(), bl.length());
+          fs.fsync(h);
+       }
+    }
+  }
+  {
+    for (int i=0; i<10; i+=2) {
+       string dir = "dir.";
+       dir.append(to_string(i));
+       for (int j=0; j<10; j++) {
+          string file = "file.";
+	  file.append(to_string(j));
+          fs.unlink(dir, file);
+	  fs.sync_metadata(false);
+       }
+       ASSERT_EQ(0, fs.rmdir(dir));
+       fs.sync_metadata(false);
+    }
+  }
+  fs.compact_log();
+  auto *logger = fs.get_perf_counters();
+  ASSERT_EQ(logger->get(l_bluefs_alloc_shared_dev_fallbacks), 0);
+  ASSERT_EQ(logger->get(l_bluefs_alloc_shared_size_fallbacks), 0);
+  auto num_files = logger->get(l_bluefs_num_files);
+  fs.umount();
+
+  fs.mount();
+  ASSERT_EQ(num_files, logger->get(l_bluefs_num_files));
+  fs.umount();
+}
+
 int main(int argc, char **argv) {
   auto args = argv_to_vec(argc, argv);
   map<string,string> defaults = {