Merge 'lsa: background reclaim' from Avi Kivity
Add a background fiber that frees memory using spare cycles, so that
allocations don't have to evict cache synchronously. The fiber's shares are
increased the closer we get to running out of memory, so that it steals
cycles from the workload rather than letting allocations stall.
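
A minimal sketch of the idea, under stated assumptions: memory_pressure()
and reclaim_one_step() are hypothetical stand-ins for the tracker's
internals, and the pressure/shares formula is illustrative; only the
Seastar primitives (keep_doing, with_scheduling_group,
scheduling_group::set_shares, memory::stats()) are real APIs.

#include <seastar/core/loop.hh>
#include <seastar/core/memory.hh>
#include <seastar/core/scheduling.hh>
#include <seastar/core/sleep.hh>
#include <seastar/core/with_scheduling_group.hh>
#include <algorithm>

using namespace std::chrono_literals;

// Hypothetical hooks standing in for the LSA tracker's internals.
bool memory_pressure();   // true while free memory is below the reserve goal
void reclaim_one_step();  // evict or compact a small amount of memory

// Pressure grows from 0 (free memory at twice the goal) to 1 (at the goal).
static float pressure(size_t goal) {
    auto free = seastar::memory::stats().free_memory();
    return std::clamp(2.0f - float(free) / float(goal), 0.0f, 1.0f);
}

// Loops forever in its own scheduling group; shares rise with pressure, so
// the reclaimer competes harder for CPU as memory runs low.
seastar::future<> background_reclaim_fiber(seastar::scheduling_group sg, size_t goal) {
    return seastar::with_scheduling_group(sg, [sg, goal] {
        return seastar::keep_doing([sg, goal] {
            if (!memory_pressure()) {
                return seastar::sleep(10ms); // idle: just poll
            }
            sg.set_shares(1.0f + pressure(goal) * 1000.0f);
            reclaim_one_step();
            return seastar::make_ready_future<>();
        });
    });
}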

The last patch is not strictly related but is a good idea.

See backport notes in the first patch. The others were trivial.

Test: unit (dev)

Ref scylladb#2113
Ref scylladb#2106
Ref scylladb#2071
Ref scylladb#2039

Closes scylladb#2129

* github.com:scylladb/scylla-enterprise:
  lsa: Mark compact_segment_locked() as noexcept
  lsa: Avoid excessive eviction if region is not compactible
  logalloc: fix quadratic behaviour of reclaim_from_evictable
  logalloc: reduce minimum lsa reserve in allocating_section to 1
  main: start background reclaim before bootstrap
  Merge 'lsa: background reclaim' from Avi Kivity
  logalloc: background reclaim
denesb committed Mar 7, 2022
2 parents 9a3370b + b674896 commit b022822
Showing 5 changed files with 267 additions and 46 deletions.
2 changes: 1 addition & 1 deletion configure.py
@@ -1383,7 +1383,7 @@ def configure_seastar(build_dir, mode):
         '-DSeastar_CXX_DIALECT=gnu++20',
         '-DSeastar_API_LEVEL=6',
         '-DSeastar_UNUSED_RESULT_ERROR=ON',
-        '-DSeastar_SCHEDULING_GROUPS_COUNT=17',
+        '-DSeastar_SCHEDULING_GROUPS_COUNT=18',
     ]

     if args.stack_guards is not None:
25 changes: 17 additions & 8 deletions main.cc
@@ -581,7 +581,24 @@ int main(int ac, char** av) {
             return seastar::scheduling_group();
         }
     };
+    auto background_reclaim_scheduling_group = make_sched_group("background_reclaim", 50);
     auto maintenance_scheduling_group = make_sched_group("streaming", 200);
+
+    smp::invoke_on_all([&cfg, background_reclaim_scheduling_group] {
+        logalloc::tracker::config st_cfg;
+        st_cfg.defragment_on_idle = cfg->defragment_memory_on_idle();
+        st_cfg.abort_on_lsa_bad_alloc = cfg->abort_on_lsa_bad_alloc();
+        st_cfg.lsa_reclamation_step = cfg->lsa_reclamation_step();
+        st_cfg.background_reclaim_sched_group = background_reclaim_scheduling_group;
+        logalloc::shard_tracker().configure(st_cfg);
+    }).get();
+
+    auto stop_lsa_background_reclaim = defer([&] {
+        smp::invoke_on_all([&] {
+            return logalloc::shard_tracker().stop();
+        }).get();
+    });
+
     uint16_t api_port = cfg->api_port();
     ctx.api_dir = cfg->api_ui_dir();
     ctx.api_doc = cfg->api_doc_dir();
@@ -1327,14 +1344,6 @@ int main(int ac, char** av) {
         }).get();
     }

-    smp::invoke_on_all([&cfg] {
-        logalloc::tracker::config st_cfg;
-        st_cfg.defragment_on_idle = cfg->defragment_memory_on_idle();
-        st_cfg.abort_on_lsa_bad_alloc = cfg->abort_on_lsa_bad_alloc();
-        st_cfg.lsa_reclamation_step = cfg->lsa_reclamation_step();
-        logalloc::shard_tracker().configure(st_cfg);
-    }).get();
-
     seastar::set_abort_on_ebadf(cfg->abort_on_ebadf());
     api::set_server_done(ctx).get();
     supervisor::notify("serving");
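
The main.cc change pairs per-shard configuration with an immediately
deferred stop, so the reclaim fibers are shut down on every shard even if a
later startup step throws. A minimal sketch of that pattern, with a
hypothetical reclaim_service standing in for logalloc::shard_tracker():

#include <seastar/core/future.hh>
#include <seastar/core/smp.hh>
#include <seastar/util/defer.hh>

// Hypothetical per-shard service standing in for logalloc::shard_tracker().
struct reclaim_service {
    void configure() { /* apply per-shard settings */ }
    seastar::future<> stop() { return seastar::make_ready_future<>(); }
};
static thread_local reclaim_service the_reclaimer; // one instance per shard
static reclaim_service& local_reclaimer() { return the_reclaimer; }

// Must run inside a seastar::thread (as main() does), so .get() may block.
// The returned deferred action stops the service on all shards when the
// caller's scope unwinds, at normal shutdown or on a startup exception.
auto start_background_reclaim() {
    seastar::smp::invoke_on_all([] {
        local_reclaimer().configure();
    }).get();
    return seastar::defer([] {
        seastar::smp::invoke_on_all([] {
            return local_reclaimer().stop();
        }).get();
    });
}
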
81 changes: 81 additions & 0 deletions test/boost/logalloc_test.cc
@@ -29,6 +29,7 @@
#include "utils/managed_ref.hh"
#include "utils/managed_bytes.hh"
#include "test/lib/log.hh"
#include "test/lib/random_utils.hh"
#include "log.hh"

[[gnu::unused]]
@@ -1450,4 +1451,84 @@ SEASTAR_THREAD_TEST_CASE(test_decay_reserves) {
     BOOST_REQUIRE_LE(reclaims, expected_reclaims);
 }

+SEASTAR_THREAD_TEST_CASE(background_reclaim) {
+    prime_segment_pool(memory::stats().total_memory(), memory::min_free_memory()).get(); // if previous test cases muddied the pool
+
+    region evictable;
+    std::vector<managed_bytes> evictable_allocs;
+
+    auto& rnd = seastar::testing::local_random_engine;
+
+    auto clean_up = defer([&] {
+        with_allocator(evictable.allocator(), [&] {
+            evictable_allocs.clear();
+        });
+    });
+
+
+    // Fill up memory with allocations
+    size_t lsa_alloc_size = 300;
+
+    while (true) {
+        try {
+            with_allocator(evictable.allocator(), [&] {
+                evictable_allocs.push_back(managed_bytes(managed_bytes::initialized_later(), lsa_alloc_size));
+            });
+        } catch (std::bad_alloc&) {
+            break;
+        }
+    }
+
+    // make the reclaimer work harder
+    std::shuffle(evictable_allocs.begin(), evictable_allocs.end(), rnd);
+
+    evictable.make_evictable([&] () -> memory::reclaiming_result {
+        if (evictable_allocs.empty()) {
+            return memory::reclaiming_result::reclaimed_nothing;
+        }
+        with_allocator(evictable.allocator(), [&] {
+            evictable_allocs.pop_back();
+        });
+        return memory::reclaiming_result::reclaimed_something;
+    });
+
+    // Set up the background reclaimer
+
+    auto background_reclaim_scheduling_group = create_scheduling_group("background_reclaim", 100).get0();
+    auto kill_sched_group = defer([&] {
+        destroy_scheduling_group(background_reclaim_scheduling_group).get();
+    });
+
+    logalloc::tracker::config st_cfg;
+    st_cfg.defragment_on_idle = false;
+    st_cfg.abort_on_lsa_bad_alloc = false;
+    st_cfg.lsa_reclamation_step = 1;
+    st_cfg.background_reclaim_sched_group = background_reclaim_scheduling_group;
+    logalloc::shard_tracker().configure(st_cfg);
+
+    auto stop_lsa_background_reclaim = defer([&] {
+        return logalloc::shard_tracker().stop().get();
+    });
+
+    sleep(500ms).get(); // sleep a little, to give the reclaimer a head start
+
+    std::vector<managed_bytes> std_allocs;
+    size_t std_alloc_size = 1000000; // note that managed_bytes fragments these, even in std
+    for (int i = 0; i < 50; ++i) {
+        auto compacted_pre = logalloc::memory_compacted();
+        fmt::print("compacted {} items {} (pre)\n", compacted_pre, evictable_allocs.size());
+        std_allocs.emplace_back(managed_bytes::initialized_later(), std_alloc_size);
+        auto compacted_post = logalloc::memory_compacted();
+        fmt::print("compacted {} items {} (post)\n", compacted_post, evictable_allocs.size());
+        BOOST_REQUIRE_EQUAL(compacted_pre, compacted_post);
+
+        // Pretend to do some work. Sleeping would be too easy, as the background reclaim group would use
+        // all that time.
+        auto deadline = std::chrono::steady_clock::now() + 100ms;
+        while (std::chrono::steady_clock::now() < deadline) {
+            thread::maybe_yield();
+        }
+    }
+}
+
 #endif