Skip to content

Commit

Permalink
HBASE-22193 Add backoff when region failed open too many times
Browse files Browse the repository at this point in the history
  • Loading branch information
infraio committed Apr 12, 2019
1 parent f227eb7 commit 942f8c4
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 11 deletions.
Expand Up @@ -131,6 +131,10 @@ public class AssignmentManager {
"hbase.assignment.maximum.attempts"; "hbase.assignment.maximum.attempts";
private static final int DEFAULT_ASSIGN_MAX_ATTEMPTS = Integer.MAX_VALUE; private static final int DEFAULT_ASSIGN_MAX_ATTEMPTS = Integer.MAX_VALUE;


public static final String ASSIGN_RETRY_IMMEDIATELY_MAX_ATTEMPTS =
"hbase.assignment.retry.immediately.maximum.attempts";
private static final int DEFAULT_ASSIGN_RETRY_IMMEDIATELY_MAX_ATTEMPTS = 3;

/** Region in Transition metrics threshold time */ /** Region in Transition metrics threshold time */
public static final String METRICS_RIT_STUCK_WARNING_THRESHOLD = public static final String METRICS_RIT_STUCK_WARNING_THRESHOLD =
"hbase.metrics.rit.stuck.warning.threshold"; "hbase.metrics.rit.stuck.warning.threshold";
Expand All @@ -151,6 +155,7 @@ public class AssignmentManager {
private final int assignDispatchWaitQueueMaxSize; private final int assignDispatchWaitQueueMaxSize;
private final int assignDispatchWaitMillis; private final int assignDispatchWaitMillis;
private final int assignMaxAttempts; private final int assignMaxAttempts;
private final int assignRetryImmediatelyMaxAttempts;


private final Object checkIfShouldMoveSystemRegionLock = new Object(); private final Object checkIfShouldMoveSystemRegionLock = new Object();


Expand Down Expand Up @@ -179,6 +184,8 @@ public AssignmentManager(final MasterServices master) {


this.assignMaxAttempts = Math.max(1, conf.getInt(ASSIGN_MAX_ATTEMPTS, this.assignMaxAttempts = Math.max(1, conf.getInt(ASSIGN_MAX_ATTEMPTS,
DEFAULT_ASSIGN_MAX_ATTEMPTS)); DEFAULT_ASSIGN_MAX_ATTEMPTS));
this.assignRetryImmediatelyMaxAttempts = conf.getInt(ASSIGN_RETRY_IMMEDIATELY_MAX_ATTEMPTS,
DEFAULT_ASSIGN_RETRY_IMMEDIATELY_MAX_ATTEMPTS);


int ritChoreInterval = conf.getInt(RIT_CHORE_INTERVAL_MSEC_CONF_KEY, int ritChoreInterval = conf.getInt(RIT_CHORE_INTERVAL_MSEC_CONF_KEY,
DEFAULT_RIT_CHORE_INTERVAL_MSEC); DEFAULT_RIT_CHORE_INTERVAL_MSEC);
Expand Down Expand Up @@ -308,6 +315,10 @@ int getAssignMaxAttempts() {
return assignMaxAttempts; return assignMaxAttempts;
} }


int getAssignRetryImmediatelyMaxAttempts() {
return assignRetryImmediatelyMaxAttempts;
}

public RegionStates getRegionStates() { public RegionStates getRegionStates() {
return regionStates; return regionStates;
} }
Expand Down
Expand Up @@ -226,20 +226,32 @@ private Flow confirmOpened(MasterProcedureEnv env, RegionStateNode regionNode)
return Flow.HAS_MORE_STATE; return Flow.HAS_MORE_STATE;
} }


if (incrementAndCheckMaxAttempts(env, regionNode)) { int retries = env.getAssignmentManager().getRegionStates().addToFailedOpen(regionNode)
.incrementAndGetRetries();
int maxAttempts = env.getAssignmentManager().getAssignMaxAttempts();
LOG.info("Retry={} of max={}; {}; {}", retries, maxAttempts, this, regionNode.toShortString());

if (retries >= maxAttempts) {
env.getAssignmentManager().regionFailedOpen(regionNode, true); env.getAssignmentManager().regionFailedOpen(regionNode, true);
setFailure(getClass().getSimpleName(), new RetriesExhaustedException( setFailure(getClass().getSimpleName(), new RetriesExhaustedException(
"Max attempts " + env.getAssignmentManager().getAssignMaxAttempts() + " exceeded")); "Max attempts " + env.getAssignmentManager().getAssignMaxAttempts() + " exceeded"));
regionNode.unsetProcedure(this); regionNode.unsetProcedure(this);
return Flow.NO_MORE_STATE; return Flow.NO_MORE_STATE;
} }

env.getAssignmentManager().regionFailedOpen(regionNode, false); env.getAssignmentManager().regionFailedOpen(regionNode, false);
// we failed to assign the region, force a new plan // we failed to assign the region, force a new plan
forceNewPlan = true; forceNewPlan = true;
regionNode.setRegionLocation(null); regionNode.setRegionLocation(null);
setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_GET_ASSIGN_CANDIDATE); setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_GET_ASSIGN_CANDIDATE);
// Here we do not throw an exception because we want the region to be online ASAP
return Flow.HAS_MORE_STATE; if (retries > env.getAssignmentManager().getAssignRetryImmediatelyMaxAttempts()) {
// Throw an exception to back off and retry when the region has failed to open too many times
throw new HBaseIOException("Failed to open region");
} else {
// Here we do not throw an exception because we want the region to be online ASAP
return Flow.HAS_MORE_STATE;
}
} }


private void closeRegion(MasterProcedureEnv env, RegionStateNode regionNode) throws IOException { private void closeRegion(MasterProcedureEnv env, RegionStateNode regionNode) throws IOException {
Expand Down Expand Up @@ -400,14 +412,6 @@ void unattachRemoteProc(RegionRemoteProcedureBase proc) {
this.remoteProc = null; this.remoteProc = null;
} }


private boolean incrementAndCheckMaxAttempts(MasterProcedureEnv env, RegionStateNode regionNode) {
int retries = env.getAssignmentManager().getRegionStates().addToFailedOpen(regionNode)
.incrementAndGetRetries();
int max = env.getAssignmentManager().getAssignMaxAttempts();
LOG.info("Retry={} of max={}; {}; {}", retries, max, this, regionNode.toShortString());
return retries >= max;
}

@Override @Override
protected void rollbackState(MasterProcedureEnv env, RegionStateTransitionState state) protected void rollbackState(MasterProcedureEnv env, RegionStateTransitionState state)
throws IOException, InterruptedException { throws IOException, InterruptedException {
Expand Down

0 comments on commit 942f8c4

Please sign in to comment.