Skip to content

Commit

Permalink
Merge 'Do not yield while traversing the gossiper endpoint state map'…
Browse files Browse the repository at this point in the history
… from Benny Halevy

This series introduces a new gossiper method: get_endpoints that returns a vector of endpoints (by value) based on the endpoint state map.

get_endpoints is used here by gossiper and storage_service for iterations that may preempt
instead of iterating direction over the endpoint state map (`_endpoint_state_map` in gossiper or via `get_endpoint_states()`) so to prevent use-after-free that may potentially happen if the map is rehashed while the function yields causing invalidation of the loop iterators.

\Fixes scylladb#13899

\Closes scylladb#13900

* github.com:scylladb/scylladb:
  storage_service: do not preempt while traversing endpoint_state_map
  gossiper: do not preempt while traversing endpoint_state_map

(cherry picked from commit d2d53fc)
  • Loading branch information
avikivity authored and bhalevy committed Dec 17, 2023
1 parent 0da3453 commit 8048821
Show file tree
Hide file tree
Showing 3 changed files with 19 additions and 11 deletions.
21 changes: 14 additions & 7 deletions gms/gossiper.cc
Original file line number Diff line number Diff line change
Expand Up @@ -702,21 +702,24 @@ future<> gossiper::do_status_check() {

auto now = this->now();

for (auto it = _endpoint_state_map.begin(); it != _endpoint_state_map.end();) {
auto endpoint = it->first;
auto& ep_state = it->second;
it++;

bool is_alive = ep_state.is_alive();
for (const auto& endpoint : get_endpoints()) {
if (endpoint == get_broadcast_address()) {
continue;
}
auto eps = get_endpoint_state_for_endpoint_ptr(endpoint);
if (!eps) {
continue;
}
auto& ep_state = *eps;
bool is_alive = ep_state.is_alive();
auto update_timestamp = ep_state.get_update_timestamp();
// ep_state cannot be used after yielding

// check if this is a fat client. fat clients are removed automatically from
// gossip after FatClientTimeout. Do not remove dead states here.
if (is_gossip_only_member(endpoint)
&& !_just_removed_endpoints.contains(endpoint)
&& ((now - ep_state.get_update_timestamp()) > fat_client_timeout)) {
&& ((now - update_timestamp) > fat_client_timeout)) {
logger.info("FatClient {} has been silent for {}ms, removing from gossip", endpoint, fat_client_timeout.count());
co_await remove_endpoint(endpoint); // will put it in _just_removed_endpoints to respect quarantine delay
co_await evict_from_membership(endpoint); // can get rid of the state immediately
Expand Down Expand Up @@ -1381,6 +1384,10 @@ const std::unordered_map<inet_address, endpoint_state>& gms::gossiper::get_endpo
return _endpoint_state_map;
}

std::vector<inet_address> gossiper::get_endpoints() const {
return boost::copy_range<std::vector<inet_address>>(_endpoint_state_map | boost::adaptors::map_keys);
}

bool gossiper::uses_host_id(inet_address endpoint) const {
return _messaging.knows_version(endpoint) ||
get_application_state_ptr(endpoint, application_state::NET_VERSION);
Expand Down
2 changes: 2 additions & 0 deletions gms/gossiper.hh
Original file line number Diff line number Diff line change
Expand Up @@ -399,6 +399,8 @@ public:

const std::unordered_map<inet_address, endpoint_state>& get_endpoint_states() const noexcept;

std::vector<inet_address> get_endpoints() const;

bool uses_host_id(inet_address endpoint) const;

locator::host_id get_host_id(inet_address endpoint) const;
Expand Down
7 changes: 3 additions & 4 deletions service/storage_service.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1577,12 +1577,11 @@ future<> storage_service::check_for_endpoint_collision(std::unordered_set<gms::i
}
if (_db.local().get_config().consistent_rangemovement()) {
found_bootstrapping_node = false;
for (auto& x : _gossiper.get_endpoint_states()) {
auto state = _gossiper.get_gossip_status(x.second);
for (const auto& addr : _gossiper.get_endpoints()) {
auto state = _gossiper.get_gossip_status(addr);
if (state == sstring(versioned_value::STATUS_UNKNOWN)) {
throw std::runtime_error(format("Node {} has gossip status=UNKNOWN. Try fixing it before adding new node to the cluster.", x.first));
throw std::runtime_error(format("Node {} has gossip status=UNKNOWN. Try fixing it before adding new node to the cluster.", addr));
}
auto addr = x.first;
slogger.debug("Checking bootstrapping/leaving/moving nodes: node={}, status={} (check_for_endpoint_collision)", addr, state);
if (state == sstring(versioned_value::STATUS_BOOTSTRAPPING) ||
state == sstring(versioned_value::STATUS_LEAVING) ||
Expand Down

0 comments on commit 8048821

Please sign in to comment.