Skip to content

Commit

Permalink
storage_service: force_remove_endpoint of changed ip address
Browse files Browse the repository at this point in the history
When a host changes its ip address we should force remove
the previous endpoint state since we want only one endpoint
to refer to this host_id.

If the new node that changed its ip address is decommissioned,
the previous node seems as a normal token owner, just in shutdown
status, but it is not longer in the cluster.

Refs scylladb#14468
Fixes scylladb#13775

Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
  • Loading branch information
bhalevy committed Aug 6, 2023
1 parent a48e7a3 commit 74a03b2
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 10 deletions.
25 changes: 16 additions & 9 deletions service/storage_service.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2870,11 +2870,14 @@ future<> storage_service::handle_state_normal(inet_address endpoint, gms::permit
if (tmptr->is_normal_token_owner(endpoint)) {
slogger.info("Node {} state jump to normal", endpoint);
}
std::unordered_set<inet_address> endpoints_to_remove;
std::unordered_map<inet_address, bool> endpoints_to_remove;

auto do_remove_node = [&] (gms::inet_address node) {
auto do_remove_node = [&] (gms::inet_address node, bool force = false) {
tmptr->remove_endpoint(node);
endpoints_to_remove.insert(node);
auto [it, inserted] = endpoints_to_remove.try_emplace(node, force);
if (!inserted && force) {
it->second = force;
}
};
// Order Matters, TM.updateHostID() should be called before TM.updateNormalToken(), (see CASSANDRA-4300).
if (_gossiper.uses_host_id(endpoint)) {
Expand All @@ -2886,7 +2889,7 @@ future<> storage_service::handle_state_normal(inet_address endpoint, gms::permit
do_remove_node(endpoint);
} else if (_gossiper.compare_endpoint_startup(endpoint, *existing) > 0) {
slogger.warn("Host ID collision for {} between {} and {}; {} is the new owner", host_id, *existing, endpoint, endpoint);
do_remove_node(*existing);
do_remove_node(*existing, true);
slogger.info("Set host_id={} to be owned by node={}, existing={}", host_id, endpoint, *existing);
tmptr->update_host_id(host_id, endpoint);
} else {
Expand Down Expand Up @@ -2968,7 +2971,7 @@ future<> storage_service::handle_state_normal(inet_address endpoint, gms::permit

for (const auto& ep : candidates_for_removal) {
slogger.info("handle_state_normal: endpoints_to_remove endpoint={}", ep);
endpoints_to_remove.insert(ep);
endpoints_to_remove.try_emplace(ep, false);
}

bool is_normal_token_owner = tmptr->is_normal_token_owner(endpoint);
Expand All @@ -2995,8 +2998,8 @@ future<> storage_service::handle_state_normal(inet_address endpoint, gms::permit
co_await replicate_to_all_cores(std::move(tmptr));
tmlock.reset();

for (auto ep : endpoints_to_remove) {
co_await remove_endpoint(ep, ep == endpoint ? pid : gms::null_permit_id);
for (const auto& [ep, force] : endpoints_to_remove) {
co_await remove_endpoint(ep, ep == endpoint ? pid : gms::null_permit_id, force);
}
slogger.debug("handle_state_normal: endpoint={} is_normal_token_owner={} endpoint_to_remove={} owned_tokens={}", endpoint, is_normal_token_owner, endpoints_to_remove.contains(endpoint), owned_tokens);
if (!owned_tokens.empty() && !endpoints_to_remove.count(endpoint)) {
Expand Down Expand Up @@ -3607,8 +3610,12 @@ future<> storage_service::check_for_endpoint_collision(std::unordered_set<gms::i
});
}

future<> storage_service::remove_endpoint(inet_address endpoint, gms::permit_id pid) {
co_await _gossiper.remove_endpoint(endpoint, pid);
future<> storage_service::remove_endpoint(inet_address endpoint, gms::permit_id pid, bool force) {
if (force) {
co_await _gossiper.force_remove_endpoint(endpoint, pid);
} else {
co_await _gossiper.remove_endpoint(endpoint, pid);
}
try {
co_await _sys_ks.local().remove_endpoint(endpoint);
} catch (...) {
Expand Down
2 changes: 1 addition & 1 deletion service/storage_service.hh
Original file line number Diff line number Diff line change
Expand Up @@ -540,7 +540,7 @@ private:
future<> excise(std::unordered_set<token> tokens, inet_address endpoint, long expire_time, gms::permit_id);

/** unlike excise we just need this endpoint gone without going through any notifications **/
future<> remove_endpoint(inet_address endpoint, gms::permit_id pid);
future<> remove_endpoint(inet_address endpoint, gms::permit_id pid, bool force = false);

void add_expire_time_if_found(inet_address endpoint, int64_t expire_time);

Expand Down

0 comments on commit 74a03b2

Please sign in to comment.