Skip to content

Commit

Permalink
Merge pull request #782 from etschannen/release-6.0
Browse files · Browse the repository at this point in the history
consider server health when building subsets of emergency teams
  • Loading branch information
bnamasivayam committed Sep 20, 2018
2 parents e159b92 + 861c8aa commit 8b09c36
Showing 1 changed file with 9 additions and 6 deletions.
15 changes: 9 additions & 6 deletions fdbserver/DataDistribution.actor.cpp
Expand Up @@ -558,6 +558,7 @@ struct DDTeamCollection {
Reference<AsyncVar<bool>> processingUnhealthy;
Future<Void> readyToStart;
Future<Void> checkTeamDelay;
bool addSubsetComplete;

Reference<LocalitySet> storageServerSet;
std::vector<LocalityEntry> forcedEntries, resultEntries;
Expand Down Expand Up @@ -598,7 +599,7 @@ struct DDTeamCollection {
Optional<PromiseStream< std::pair<UID, Optional<StorageServerInterface>> >> const& serverChanges,
Future<Void> readyToStart, Reference<AsyncVar<bool>> zeroHealthyTeams, bool primary,
Reference<AsyncVar<bool>> processingUnhealthy)
:cx(cx), masterId(masterId), lock(lock), output(output), shardsAffectedByTeamFailure(shardsAffectedByTeamFailure), doBuildTeams( true ), teamBuilder( Void() ),
:cx(cx), masterId(masterId), lock(lock), output(output), shardsAffectedByTeamFailure(shardsAffectedByTeamFailure), doBuildTeams(true), teamBuilder( Void() ), addSubsetComplete(false),
configuration(configuration), serverChanges(serverChanges), readyToStart(readyToStart), checkTeamDelay( delay( SERVER_KNOBS->CHECK_TEAM_DELAY, TaskDataDistribution) ),
initialFailureReactionDelay( delayed( readyToStart, SERVER_KNOBS->INITIAL_FAILURE_REACTION_DELAY, TaskDataDistribution ) ), healthyTeamCount( 0 ), storageServerSet(new LocalityMap<UID>()),
initializationDoneActor(logOnCompletion(readyToStart && initialFailureReactionDelay, this)), optimalTeamCount( 0 ), recruitingStream(0), restartRecruiting( SERVER_KNOBS->DEBOUNCE_RECRUITING_DELAY ),
Expand Down Expand Up @@ -649,7 +650,7 @@ struct DDTeamCollection {
while( !self->teamBuilder.isReady() )
Void _ = wait( self->teamBuilder );

if( self->doBuildTeams ) {
if( self->doBuildTeams && self->readyToStart.isReady() ) {
self->doBuildTeams = false;
try {
loop {
Expand Down Expand Up @@ -845,7 +846,7 @@ struct DDTeamCollection {
for(; idx < self->badTeams.size(); idx++ ) {
servers.clear();
for(auto server : self->badTeams[idx]->servers) {
if(server->inDesiredDC) {
if(server->inDesiredDC && !self->server_status.get(server->id).isUnhealthy()) {
servers.push_back(server);
}
}
Expand Down Expand Up @@ -928,7 +929,6 @@ struct DDTeamCollection {
Void _ = wait( yield() );
}

Void _ = wait( addSubsetOfEmergencyTeams(self) );
return Void();
}

Expand Down Expand Up @@ -1167,6 +1167,11 @@ struct DDTeamCollection {
//
// buildTeams will not count teams larger than teamSize against the desired teams.
ACTOR Future<Void> buildTeams( DDTeamCollection* self ) {
if(!self->addSubsetComplete) {
self->addSubsetComplete = true;
Void _ = wait( addSubsetOfEmergencyTeams(self) );
}

state int desiredTeams;
int serverCount = 0;
int uniqueDataCenters = 0;
Expand Down Expand Up @@ -1729,8 +1734,6 @@ ACTOR Future<Void> storageServerTracker(
Promise<Void> errorOut,
Version addedVersion)
{
Void _ = wait( self->readyToStart );

state Future<Void> failureTracker;
state ServerStatus status( false, false, server->lastKnownInterface.locality );
state bool lastIsUnhealthy = false;
Expand Down

0 comments on commit 8b09c36

Please sign in to comment.