From 48d01bcca76e6be133f615dcca9fe0d5fedb5283 Mon Sep 17 00:00:00 2001 From: Ryan Ofsky Date: Sun, 9 Feb 2025 04:19:30 -0800 Subject: [PATCH] bugfix: Do not lock EventLoop::mutex after EventLoop is done Currently, there are two cases where code may attempt to lock `EventLoop::mutex` after the `EventLoop::loop()` method has finished running. This is not a problem by itself, but is a problem if there is external code which deletes the `EventLoop` object immediately after the method returns, which is the case in Bitcoin Core. Both cases of locking after the loop method returns happen due to uses of the `Unlock()` utility function which unlocks a mutex temporarily, runs a callback function and relocks it. The first case happens in `EventLoop::removeClient` when the `EventLoop::done()` condition is reached and it calls `Unlock()` in order to be able to call `write(post_fd, ...)` without blocking and wake the EventLoop thread. In this case, since `EventLoop` execution is done there is not really any point to using `Unlock()` and relocking, so the code is changed to use a simple `lock.unlock()` call and not try to relock. The second case happens in `EventLoop::post` where `Unlock()` is used in a similar way, and depending on thread scheduling (if the thread is delayed for a long time before relocking) can result in failing to relock `EventLoop::m_mutex` after calling `write()`. In this case, since relocking the mutex is actually necessary for the code that follows, the fix is different: new `addClient`/`removeClient` calls are added to this code, so the `EventLoop::loop()` function will never exit while the `post()` function is waiting. These two changes are being labeled as a bugfix even though there is not technically a bug here in the libmultiprocess code or API. The `EventLoop` object itself was safe before these changes as long as outside code waited for `EventLoop` methods to finish executing before deleting the `EventLoop` object. Originally the multiprocess code added in https://github.com/bitcoin/bitcoin/pull/10102 did this, but later as more features were added for binding and connecting to unix sockets, and unit tests were added which would immediately delete the `EventLoop` object after `EventLoop::loop()` returned, it meant this code could now start failing to relock after unlocking. A previous attempt was made to fix this bug in https://github.com/bitcoin/bitcoin/pull/31815 outside of libmultiprocess code. But it only addressed the first case of a failing `Unlock()` in `EventLoop::removeClient()`, not the second case in `EventLoop::post()`. This first case described above was not observed but is theoretically possible. The second case happens intermittently on macos and was reported https://github.com/chaincodelabs/libmultiprocess/issues/154 --- include/mp/proxy-io.h | 2 +- src/mp/proxy.cpp | 15 +++++++++------ 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/include/mp/proxy-io.h b/include/mp/proxy-io.h index 02d0dee0..4eb27fae 100644 --- a/include/mp/proxy-io.h +++ b/include/mp/proxy-io.h @@ -170,7 +170,7 @@ class EventLoop //! Add/remove remote client reference counts. void addClient(std::unique_lock& lock); - void removeClient(std::unique_lock& lock); + bool removeClient(std::unique_lock& lock); //! Check if loop should exit. bool done(std::unique_lock& lock); diff --git a/src/mp/proxy.cpp b/src/mp/proxy.cpp index f1b51115..194ded5f 100644 --- a/src/mp/proxy.cpp +++ b/src/mp/proxy.cpp @@ -225,6 +225,7 @@ void EventLoop::post(const std::function& fn) return; } std::unique_lock lock(m_mutex); + addClient(lock); m_cv.wait(lock, [this] { return m_post_fn == nullptr; }); m_post_fn = &fn; int post_fd{m_post_fd}; @@ -233,21 +234,23 @@ void EventLoop::post(const std::function& fn) KJ_SYSCALL(write(post_fd, &buffer, 1)); }); m_cv.wait(lock, [this, &fn] { return m_post_fn != &fn; }); + removeClient(lock); } void EventLoop::addClient(std::unique_lock& lock) { m_num_clients += 1; } -void EventLoop::removeClient(std::unique_lock& lock) +bool EventLoop::removeClient(std::unique_lock& lock) { m_num_clients -= 1; if (done(lock)) { m_cv.notify_all(); int post_fd{m_post_fd}; - Unlock(lock, [&] { - char buffer = 0; - KJ_SYSCALL(write(post_fd, &buffer, 1)); // NOLINT(bugprone-suspicious-semicolon) - }); + lock.unlock(); + char buffer = 0; + KJ_SYSCALL(write(post_fd, &buffer, 1)); // NOLINT(bugprone-suspicious-semicolon) + return true; } + return false; } void EventLoop::startAsyncThread(std::unique_lock& lock) @@ -263,7 +266,7 @@ void EventLoop::startAsyncThread(std::unique_lock& lock) const std::function fn = std::move(m_async_fns.front()); m_async_fns.pop_front(); Unlock(lock, fn); - removeClient(lock); + if (removeClient(lock)) break; continue; } else if (m_num_clients == 0) { break;