Skip to content
Merged
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,6 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hbase.thirdparty.com.google.common.base.Preconditions;


/**
* Class to hold dead servers list and utility querying dead server list.
* Servers are added when they expire or when we find them in filesystem on startup.
Expand All @@ -59,13 +56,6 @@ public class DeadServer {
*/
private final Map<ServerName, Long> deadServers = new HashMap<>();

/**
* Set of dead servers currently being processed by a SCP.
* Added to this list at the start of SCP and removed after it is done
* processing the crash.
*/
private final Set<ServerName> processingServers = new HashSet<>();

/**
* @param serverName server name.
* @return true if this server is on the dead servers list false otherwise
Expand All @@ -74,17 +64,6 @@ public synchronized boolean isDeadServer(final ServerName serverName) {
return deadServers.containsKey(serverName);
}

/**
* Checks if there are currently any dead servers being processed by the
* master. Returns true if at least one region server is currently being
* processed as dead.
*
* @return true if any RS are being processed as dead
*/
synchronized boolean areDeadServersInProgress() {
return !processingServers.isEmpty();
}

public synchronized Set<ServerName> copyServerNames() {
Set<ServerName> clone = new HashSet<>(deadServers.size());
clone.addAll(deadServers.keySet());
Expand All @@ -96,29 +75,6 @@ public synchronized Set<ServerName> copyServerNames() {
*/
synchronized void putIfAbsent(ServerName sn) {
this.deadServers.putIfAbsent(sn, EnvironmentEdgeManager.currentTime());
processing(sn);
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These codes are not only set the server be dead, but also set it be processing.
But the changed codes only set it be dead.
As a result ,when calling ServerManager.expireServer() method, the server is firstly set be dead by moveFromOnlineToDeadServers(), but the SCP is submitted afterwards.
So this patch made a diff from the origin codes when calling ServerManager.areDeadServersInProgress(), because the new codes has a little delay time of submit and execute the SCP procedure.


/**
* Add <code>sn<</code> to set of processing deadservers.
* @see #finish(ServerName)
*/
public synchronized void processing(ServerName sn) {
if (processingServers.add(sn)) {
// Only log on add.
LOG.debug("Processing {}; numProcessing={}", sn, processingServers.size());
}
}

/**
* Complete processing for this dead server.
* @param sn ServerName for the dead server.
* @see #processing(ServerName)
*/
public synchronized void finish(ServerName sn) {
if (processingServers.remove(sn)) {
LOG.debug("Removed {} from processing; numProcessing={}", sn, processingServers.size());
}
}

public synchronized int size() {
Expand Down Expand Up @@ -179,17 +135,12 @@ public synchronized String toString() {
// Display unified set of servers from both maps
Set<ServerName> servers = new HashSet<>();
servers.addAll(deadServers.keySet());
servers.addAll(processingServers);
StringBuilder sb = new StringBuilder();
for (ServerName sn : servers) {
if (sb.length() > 0) {
sb.append(", ");
}
sb.append(sn.toString());
// Star entries that are being processed
if (processingServers.contains(sn)) {
sb.append("*");
}
}
return sb.toString();
}
Expand Down Expand Up @@ -228,9 +179,6 @@ public synchronized Date getTimeOfDeath(final ServerName deadServerName){
* @return true if this server was removed
*/
public synchronized boolean removeDeadServer(final ServerName deadServerName) {
Preconditions.checkState(!processingServers.contains(deadServerName),
"Asked to remove server still in processingServers set " + deadServerName +
" (numProcessing=" + processingServers.size() + ")");
return this.deadServers.remove(deadServerName) != null;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -2440,6 +2440,15 @@ public ClearDeadServersResponse clearDeadServers(RpcController controller,
Set<Address> clearedServers = new HashSet<>();
for (HBaseProtos.ServerName pbServer : request.getServerNameList()) {
ServerName server = ProtobufUtil.toServerName(pbServer);

final boolean deadInProcess = master.getProcedures().stream().anyMatch(
p -> (p instanceof ServerCrashProcedure)
&& ((ServerCrashProcedure) p).getServerName().equals(server));
if (deadInProcess) {
throw new ServiceException(
String.format("Dead server '%s' is not 'dead' in fact...", server));
}

if (!deadServer.removeDeadServer(server)) {
response.addServerName(pbServer);
} else {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.ipc.RemoteWithExtrasException;
import org.apache.hadoop.hbase.master.assignment.RegionStates;
import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure;
import org.apache.hadoop.hbase.monitoring.MonitoredTask;
import org.apache.hadoop.hbase.procedure2.Procedure;
import org.apache.hadoop.hbase.util.Bytes;
Expand Down Expand Up @@ -503,8 +504,8 @@ public DeadServer getDeadServers() {
* Checks if any dead servers are currently in progress.
* @return true if any RS are being processed as dead, false if not
*/
public boolean areDeadServersInProgress() {
return this.deadservers.areDeadServersInProgress();
public boolean areDeadServersInProgress() throws IOException {
return master.getProcedures().stream().anyMatch(p -> p instanceof ServerCrashProcedure);
}

void letRegionServersShutdown() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,6 @@ protected Flow executeFromState(MasterProcedureEnv env, ServerCrashState state)
// This adds server to the DeadServer processing list but not to the DeadServers list.
// Server gets removed from processing list below on procedure successful finish.
if (!notifiedDeadServer) {
services.getServerManager().getDeadServers().processing(serverName);
notifiedDeadServer = true;
}

Expand Down Expand Up @@ -230,7 +229,6 @@ protected Flow executeFromState(MasterProcedureEnv env, ServerCrashState state)
case SERVER_CRASH_FINISH:
LOG.info("removed crashed server {} after splitting done", serverName);
services.getAssignmentManager().getRegionStates().removeServer(serverName);
services.getServerManager().getDeadServers().finish(serverName);
updateProgress(true);
return Flow.NO_MORE_STATE;
default:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,7 @@ public void testRebalanceOnRegionServerNumberChange()
/**
* Wait on crash processing. Balancer won't run if processing a crashed server.
*/
private void waitOnCrashProcessing() {
private void waitOnCrashProcessing() throws IOException {
while (UTIL.getHBaseCluster().getMaster().getServerManager().areDeadServersInProgress()) {
LOG.info("Waiting on processing of crashed server before proceeding...");
Threads.sleep(1000);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;

import java.io.IOException;
import java.util.List;
import java.util.Set;
import org.apache.hadoop.hbase.HBaseClassTestRule;
Expand Down Expand Up @@ -68,22 +69,10 @@ public static void tearDownAfterClass() throws Exception {
@Test public void testIsDead() {
DeadServer ds = new DeadServer();
ds.putIfAbsent(hostname123);
ds.processing(hostname123);
assertTrue(ds.areDeadServersInProgress());
ds.finish(hostname123);
assertFalse(ds.areDeadServersInProgress());

ds.putIfAbsent(hostname1234);
ds.processing(hostname1234);
assertTrue(ds.areDeadServersInProgress());
ds.finish(hostname1234);
assertFalse(ds.areDeadServersInProgress());

ds.putIfAbsent(hostname12345);
ds.processing(hostname12345);
assertTrue(ds.areDeadServersInProgress());
ds.finish(hostname12345);
assertFalse(ds.areDeadServersInProgress());

// Already dead = 127.0.0.1,9090,112321
// Coming back alive = 127.0.0.1,9090,223341
Expand All @@ -104,15 +93,15 @@ public static void tearDownAfterClass() throws Exception {
}

@Test
public void testCrashProcedureReplay() {
public void testCrashProcedureReplay() throws IOException {
HMaster master = TEST_UTIL.getHBaseCluster().getMaster();
final ProcedureExecutor<MasterProcedureEnv> pExecutor = master.getMasterProcedureExecutor();
ServerCrashProcedure proc = new ServerCrashProcedure(
pExecutor.getEnvironment(), hostname123, false, false);

ProcedureTestingUtility.submitAndWait(pExecutor, proc);

assertFalse(master.getServerManager().getDeadServers().areDeadServersInProgress());
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This test case should assert false, because the SCP has been done completely.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

assertTrue(master.getServerManager().areDeadServersInProgress());
}

@Test
Expand Down Expand Up @@ -163,17 +152,14 @@ public void testClearDeadServer(){
d.putIfAbsent(hostname1234);
Assert.assertEquals(2, d.size());

d.finish(hostname123);
d.removeDeadServer(hostname123);
Assert.assertEquals(1, d.size());
d.finish(hostname1234);
d.removeDeadServer(hostname1234);
Assert.assertTrue(d.isEmpty());

d.putIfAbsent(hostname1234);
Assert.assertFalse(d.removeDeadServer(hostname123_2));
Assert.assertEquals(1, d.size());
d.finish(hostname1234);
Assert.assertTrue(d.removeDeadServer(hostname1234));
Assert.assertTrue(d.isEmpty());
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,7 @@ public void testBasicRollingRestart() throws Exception {
}

private void waitForRSShutdownToStartAndFinish(MasterThread activeMaster,
ServerName serverName) throws InterruptedException {
ServerName serverName) throws InterruptedException, IOException {
ServerManager sm = activeMaster.getMaster().getServerManager();
// First wait for it to be in dead list
while (!sm.getDeadServers().isDeadServer(serverName)) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,6 @@ public void test() throws Exception {
// calling 'finish' and then remove it from dead servers so rsServerName
// becomes an 'Unknown Server' even though it is still around.
master.getServerManager().moveFromOnlineToDeadServers(rsServerName);
master.getServerManager().getDeadServers().finish(rsServerName);
master.getServerManager().getDeadServers().removeDeadServer(rsServerName);
master.getAssignmentManager().getRegionStates().removeServer(rsServerName);
// Kill the server. Nothing should happen since an 'Unknown Server' as far
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;

import java.io.IOException;
import java.util.List;

import org.apache.hadoop.fs.Path;
Expand Down Expand Up @@ -101,7 +102,7 @@ public void testStandbyKillRegionServer() throws Exception {
}

private void waitForRSShutdownToStartAndFinish(JVMClusterUtil.MasterThread activeMaster,
ServerName serverName) throws InterruptedException {
ServerName serverName) throws InterruptedException, IOException {
ServerManager sm = activeMaster.getMaster().getServerManager();
// First wait for it to be in dead list
while (!sm.getDeadServers().isDeadServer(serverName)) {
Expand Down