rename maintenance mode to decommission #7154
```diff
@@ -694,7 +694,7 @@ public CoordinatorHistoricalManagerRunnable(final int startingLeaderCounter)
       }

       // Find all historical servers, group them by subType and sort by ascending usage
-      Set<String> nodesInMaintenance = params.getCoordinatorDynamicConfig().getHistoricalNodesInMaintenance();
+      Set<String> decommissioned = params.getCoordinatorDynamicConfig().getDecommissionNodes();

       final DruidCluster cluster = new DruidCluster();
       for (ImmutableDruidServer server : servers) {
         if (!loadManagementPeons.containsKey(server.getName())) {
@@ -709,7 +709,7 @@ public CoordinatorHistoricalManagerRunnable(final int startingLeaderCounter)
               new ServerHolder(
                   server,
                   loadManagementPeons.get(server.getName()),
-                  nodesInMaintenance.contains(server.getHost())
+                  decommissioned.contains(server.getHost())
               )
           );
         }
```

Review comment on the `Set<String> decommissioned` line: Suggested "decommissioningNodes"
```diff
@@ -32,18 +32,18 @@ public class ServerHolder implements Comparable<ServerHolder>
   private static final Logger log = new Logger(ServerHolder.class);
   private final ImmutableDruidServer server;
   private final LoadQueuePeon peon;
-  private final boolean inMaintenance;
+  private final boolean isDecommissioned;

   public ServerHolder(ImmutableDruidServer server, LoadQueuePeon peon)
   {
     this(server, peon, false);
   }

-  public ServerHolder(ImmutableDruidServer server, LoadQueuePeon peon, boolean inMaintenance)
+  public ServerHolder(ImmutableDruidServer server, LoadQueuePeon peon, boolean isDecommissioned)
   {
     this.server = server;
     this.peon = peon;
-    this.inMaintenance = inMaintenance;
+    this.isDecommissioned = isDecommissioned;
   }

   public ImmutableDruidServer getServer()
@@ -82,14 +82,14 @@ public double getPercentUsed()
   }

   /**
-   * Historical nodes can be placed in maintenance mode, which instructs Coordinator to move segments from them
-   * according to a specified priority. The mechanism allows to drain segments from nodes which are planned for
+   * Historical nodes can be 'decommissioned', which instructs Coordinator to move segments from them
+   * according to a specified priority. The mechanism allows draining segments from nodes which are planned for
    * replacement.
-   * @return true if the node is in maitenance mode
+   * @return true if the node is being decommissioned
    */
-  public boolean isInMaintenance()
+  public boolean isDecommissioned()
   {
-    return inMaintenance;
+    return isDecommissioned;
   }

   public long getAvailableSize()
```

Review comment on the `isDecommissioned` field: Suggested "isDecommissioning" or "beingDecommissioned"
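The javadoc above describes the mechanism at a high level. As a rough, self-contained sketch of how the two files fit together, the Coordinator flags each holder by checking the server's host against the configured set. The `Server` and `Holder` records below are simplified stand-ins for illustration, not Druid's actual classes:

```java
import java.util.List;
import java.util.Set;

public class DecommissionFlagDemo
{
  // Minimal stand-ins for Druid's ImmutableDruidServer / ServerHolder.
  record Server(String name, String host) {}
  record Holder(Server server, boolean decommissioned) {}

  public static void main(String[] args)
  {
    // In Druid this set comes from the coordinator dynamic config
    // (getDecommissionNodes() in this revision of the patch).
    Set<String> decommissioned = Set.of("historical-2:8083");

    List<Server> servers = List.of(
        new Server("h1", "historical-1:8083"),
        new Server("h2", "historical-2:8083")
    );

    // Mirror of the loop in the first diff: flag each holder whose host
    // appears in the configured set.
    List<Holder> holders = servers.stream()
        .map(s -> new Holder(s, decommissioned.contains(s.host())))
        .toList();

    holders.forEach(h ->
        System.out.println(h.server().name() + " decommissioned=" + h.decommissioned()));
  }
}
```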
```diff
@@ -107,25 +107,25 @@ private void balanceTier(
     }

     /*
-      Take as much segments from maintenance servers as priority allows and find the best location for them on
+      Take as many segments from decommissioned servers as priority allows and find the best location for them on
       available servers. After that, balance segments within available servers pool.
     */
     Map<Boolean, List<ServerHolder>> partitions =
-        servers.stream().collect(Collectors.partitioningBy(ServerHolder::isInMaintenance));
-    final List<ServerHolder> maintenanceServers = partitions.get(true);
+        servers.stream().collect(Collectors.partitioningBy(ServerHolder::isDecommissioned));
+    final List<ServerHolder> decommssionedServers = partitions.get(true);
     final List<ServerHolder> availableServers = partitions.get(false);
     log.info(
-        "Found %d servers in maintenance, %d available servers servers",
-        maintenanceServers.size(),
+        "Found %d decomissioned servers, %d available servers servers",
+        decommssionedServers.size(),
         availableServers.size()
     );

-    if (maintenanceServers.isEmpty()) {
+    if (decommssionedServers.isEmpty()) {
       if (availableServers.size() <= 1) {
         log.info("[%s]: %d available servers servers found. Cannot balance.", tier, availableServers.size());
       }
     } else if (availableServers.isEmpty()) {
-      log.info("[%s]: no available servers servers found during maintenance. Cannot balance.", tier);
+      log.info("[%s]: no available servers servers found during decommissioning. Cannot balance.", tier);
     }

     int numSegments = 0;
```

Review comments on the "Found %d decomissioned servers, %d available servers servers" message:

- Suggested "%d decommissioning servers, %d active servers"
- Changed to 'active'

Review comment on "[%s]: %d available servers servers found. Cannot balance.": "servers" duplicated

Review thread on the `availableServers.isEmpty()` branch:

- Suggested to change the message to something like "no active servers found, segments can't be moved off %d decommissioning servers"
- "no active servers found, segments can't be moved off %d decommissioning servers" leads me to the question: should we really give up decommissioning in this case? Maybe if segments are available on historical nodes from other tiers (or from the same tier, on other decommissioning servers or on full active servers; see below), or actually always, we should continue decommissioning by just dropping segments from the decommissioning servers and not moving them anywhere? Also this condition doesn't cover the situation when there are active historical nodes in the tier, but they are either full, or their load queues have …
- I elaborated on the problems related to the independence of …
- @egor-ryashin, what's your take on this?
- @leventov the initial idea was to keep the replication at the same level.
- Consolidated the 'cannot balance' checks here and changed the message to just log.warn that there is an insufficient number of active servers to do anything, and added a …
- This should be reflected in docs, as well as other corner cases related to decommissioning nodes.
- I'll try to add docs that decommissioning can become stalled if there are no active servers available to move segments to.

egor-ryashin marked this conversation as resolved.
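For readers unfamiliar with `Collectors.partitioningBy`, here is a minimal, hypothetical illustration of the split used above (plain strings stand in for `ServerHolder`). `partitioningBy` always populates both the `true` and `false` keys, which is why the code above can call `partitions.get(true)` and `partitions.get(false)` without null checks:

```java
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

public class PartitioningDemo
{
  public static void main(String[] args)
  {
    List<String> hosts = List.of("hist-1", "hist-2:decom", "hist-3");

    // Split the list in one pass, keyed by a boolean predicate; both keys
    // are always present, even when one partition is empty.
    Map<Boolean, List<String>> partitions = hosts.stream()
        .collect(Collectors.partitioningBy(h -> h.endsWith(":decom")));

    System.out.println("decommissioning: " + partitions.get(true));  // [hist-2:decom]
    System.out.println("active: " + partitions.get(false));          // [hist-1, hist-3]
  }
}
```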
```diff
@@ -139,18 +139,18 @@ private void balanceTier(
     }

     final int maxSegmentsToMove = Math.min(params.getCoordinatorDynamicConfig().getMaxSegmentsToMove(), numSegments);
-    int priority = params.getCoordinatorDynamicConfig().getNodesInMaintenancePriority();
-    int maxMaintenanceSegmentsToMove = (int) Math.ceil(maxSegmentsToMove * priority / 10.0);
-    log.info("Processing %d segments from servers in maintenance mode", maxMaintenanceSegmentsToMove);
-    Pair<Integer, Integer> maintenanceResult =
-        balanceServers(params, maintenanceServers, availableServers, maxMaintenanceSegmentsToMove);
-    int maxGeneralSegmentsToMove = maxSegmentsToMove - maintenanceResult.lhs;
-    log.info("Processing %d segments from servers in general mode", maxGeneralSegmentsToMove);
+    int priority = params.getCoordinatorDynamicConfig().getDecommissionPriority();
+    int maxDecommissionedSegmentsToMove = (int) Math.ceil(maxSegmentsToMove * priority / 10.0);
+    log.info("Processing %d segments from decommissioned servers", maxDecommissionedSegmentsToMove);
+    Pair<Integer, Integer> decommissionedResult =
+        balanceServers(params, decommssionedServers, availableServers, maxDecommissionedSegmentsToMove);
+    int maxGeneralSegmentsToMove = maxSegmentsToMove - decommissionedResult.lhs;
+    log.info("Processing %d segments for balancing", maxGeneralSegmentsToMove);
     Pair<Integer, Integer> generalResult =
         balanceServers(params, availableServers, availableServers, maxGeneralSegmentsToMove);

-    int moved = generalResult.lhs + maintenanceResult.lhs;
-    int unmoved = generalResult.rhs + maintenanceResult.rhs;
+    int moved = generalResult.lhs + decommissionedResult.lhs;
+    int unmoved = generalResult.rhs + decommissionedResult.rhs;
     if (unmoved == maxSegmentsToMove) {
       // Cluster should be alive and constantly adjusting
       log.info("No good moves found in tier [%s]", tier);
```

Review comments on `maxDecommissionedSegmentsToMove`:

- Suggested "maxSegmentsToMoveOffDecommissioningNodes"
- I'd vote for smth ending in "segments".

Review comment on `decommissionedResult`: Suggested "decommissioningResult"
Review thread on `priority`:

- Perhaps better to call it "decommissionVelocity" as @drcrallen suggested here: #6349 (comment), because "priority" is a super overloaded term.
- Makes sense to me, though I think the properties should be consistently named, so maybe I'll either rename the other `decommissioningNodes` or call this one `decommissionVelocity` to match. Anyone have a preference?
- Renamed to `decommissioningNodes` and `decommissioningVelocity`
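As a worked example of the budgeting arithmetic in the second hunk (the concrete numbers are illustrative, and the 0 to 10 scale for `priority` is inferred from the division by 10.0 in the diff):

```java
public class VelocityMath
{
  public static void main(String[] args)
  {
    int maxSegmentsToMove = 100;
    int priority = 7; // i.e. the proposed "decommissioningVelocity", assumed 0-10

    // ceil(100 * 7 / 10.0) = 70 segments are offered to decommissioning
    // servers before any general balancing happens.
    int decommissionBudget = (int) Math.ceil(maxSegmentsToMove * priority / 10.0);

    // Suppose only 50 of those could actually be moved; the leftover budget
    // (100 - 50 = 50) is then spent on balancing the active servers.
    int actuallyMoved = 50;
    int generalBudget = maxSegmentsToMove - actuallyMoved;

    System.out.printf("decommission budget: %d, general budget: %d%n",
        decommissionBudget, generalBudget);
  }
}
```

At `priority = 10` the entire `maxSegmentsToMove` budget is offered to decommissioning servers first; at `0`, none is, which is consistent with the reviewers' reading of it as a velocity rather than a priority.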