diff --git a/documentation/sphinx/source/release-notes.rst b/documentation/sphinx/source/release-notes.rst
index 84316e67c3a..c9776567777 100644
--- a/documentation/sphinx/source/release-notes.rst
+++ b/documentation/sphinx/source/release-notes.rst
@@ -2,7 +2,7 @@
 Release Notes
 #############
 
-6.0.5
+6.0.6
 =====
 
 Features
@@ -44,6 +44,8 @@ Fixes
 * On clusters configured with usable_regions=2, status reported no replicas remaining when the primary DC was still healthy. [6.0.5] `(PR #687) <https://github.com/apple/foundationdb/pull/687>`_
 * Clients could crash when passing in TLS options. [6.0.5] `(PR #649) <https://github.com/apple/foundationdb/pull/649>`_
 * A mismatched TLS certificate and key set could cause the server to crash. [6.0.5] `(PR #689) <https://github.com/apple/foundationdb/pull/689>`_
+* Databases with more than 10TB of data would pause for a few seconds after recovery. [6.0.6] `(PR #705) <https://github.com/apple/foundationdb/pull/705>`_
+* Sometimes a minority of coordinators would fail to converge after a new leader was elected. [6.0.6] `(PR #700) <https://github.com/apple/foundationdb/pull/700>`_
 
 Status
 ------
diff --git a/fdbclient/CoordinationInterface.h b/fdbclient/CoordinationInterface.h
index 887fee36bc5..787c8408079 100644
--- a/fdbclient/CoordinationInterface.h
+++ b/fdbclient/CoordinationInterface.h
@@ -110,22 +110,14 @@ struct LeaderInfo {
 
     // All but the first 7 bits are used to represent process id
     bool equalInternalId(LeaderInfo const& leaderInfo) const {
-        if ( (changeID.first() & mask) == (leaderInfo.changeID.first() & mask) && changeID.second() == leaderInfo.changeID.second() ) {
-            return true;
-        } else {
-            return false;
-        }
+        return ((changeID.first() & mask) == (leaderInfo.changeID.first() & mask)) && changeID.second() == leaderInfo.changeID.second();
    }
 
-    // Change leader only if 
+    // Change leader only if
     // 1. the candidate has better process class fitness and the candidate is not the leader
-    // 2. the leader process class fitness become worse
+    // 2. the leader process class fitness becomes worse
     bool leaderChangeRequired(LeaderInfo const& candidate) const {
-        if ( ((changeID.first() & ~mask) > (candidate.changeID.first() & ~mask) && !equalInternalId(candidate)) || ((changeID.first() & ~mask) < (candidate.changeID.first() & ~mask) && equalInternalId(candidate)) ) {
-            return true;
-        } else {
-            return false;
-        }
+        return ((changeID.first() & ~mask) > (candidate.changeID.first() & ~mask) && !equalInternalId(candidate)) || ((changeID.first() & ~mask) < (candidate.changeID.first() & ~mask) && equalInternalId(candidate));
    }
 
    template <class Ar>
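For context on the two predicates above: per the LeaderInfo comments, 7 bits of changeID.first() rank the candidate's process class fitness (lower is better, which is why the `>` comparison on the `& ~mask` bits means "the candidate is better"), while the remaining bits together with changeID.second() identify the process, and `mask` selects the process-id bits. Below is a minimal standalone sketch of the same two checks; the MiniLeaderInfo struct, its mask value, the bit positions, and the omission of the second half of the UID are assumptions made for illustration only, not the real LeaderInfo layout.

// Minimal sketch of the two predicates; illustrative only, not FoundationDB code.
#include <cassert>
#include <cstdint>

struct MiniLeaderInfo {
    uint64_t first; // assumed layout: [7 fitness bits][57 process-id bits]

    // Assumed mask: clears the 7 fitness bits, keeping only the process-id bits.
    static constexpr uint64_t mask = ~(127ull << 57);

    // All but the 7 fitness bits identify the process
    // (the real struct also compares changeID.second()).
    bool equalInternalId(MiniLeaderInfo const& o) const {
        return (first & mask) == (o.first & mask);
    }

    // Change leader only if
    // 1. the candidate is a different process with better (numerically smaller) fitness, or
    // 2. the current leader re-registers with a worse fitness than it had.
    bool leaderChangeRequired(MiniLeaderInfo const& candidate) const {
        return ((first & ~mask) > (candidate.first & ~mask) && !equalInternalId(candidate)) ||
               ((first & ~mask) < (candidate.first & ~mask) && equalInternalId(candidate));
    }
};

int main() {
    auto make = [](uint64_t fitness, uint64_t id) { return MiniLeaderInfo{ (fitness << 57) | id }; };
    MiniLeaderInfo leader = make(2, 0xA);

    assert( leader.leaderChangeRequired(make(1, 0xB))); // better candidate on another process
    assert(!leader.leaderChangeRequired(make(3, 0xB))); // worse candidate elsewhere: keep the leader
    assert( leader.leaderChangeRequired(make(3, 0xA))); // same process, fitness became worse
    assert(!leader.leaderChangeRequired(make(2, 0xA))); // nothing changed
    return 0;
}

The first assert corresponds to condition 1 in the comment (a strictly better candidate on a different process) and the third to condition 2 (the leader itself reporting worse fitness); equal fitness never forces a change.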
diff --git a/fdbserver/Coordination.actor.cpp b/fdbserver/Coordination.actor.cpp
index 2b6cd16d0a7..7f1b70e11e2 100644
--- a/fdbserver/Coordination.actor.cpp
+++ b/fdbserver/Coordination.actor.cpp
@@ -285,14 +285,14 @@ ACTOR Future<Void> leaderRegister(LeaderElectionRegInterface interf, Key key) {
            }
        }
 
-        if ( currentNominee.present() != nextNominee.present() || (nextNominee.present() && !foundCurrentNominee) || (currentNominee.present() && currentNominee.get().leaderChangeRequired(nextNominee.get())) ) {
+        if ( !nextNominee.present() || !foundCurrentNominee || currentNominee.get().leaderChangeRequired(nextNominee.get()) ) {
            TraceEvent("NominatingLeader").detail("Nominee", nextNominee.present() ? nextNominee.get().changeID : UID())
                .detail("Changed", nextNominee != currentNominee).detail("Key", printable(key));
            for(int i=0; i<notify.size(); i++)
                notify[i].send( nextNominee );
            notify.clear();
            currentNominee = nextNominee;
        }
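Worth noting about the condition change above: across the reachable combinations, the old and new predicates agree except when neither a current nor a next nominee exists; the new predicate also enters the nomination block in that case, so waiters are no longer left without a reply. A small standalone enumeration of both expressions is below; C, N, F, and R are plain booleans standing in for currentNominee.present(), nextNominee.present(), foundCurrentNominee, and leaderChangeRequired, and the domain restrictions in the loops are modeling assumptions, not FDB code.

// Enumerates both predicates from the hunk above over their reachable inputs.
// Assumption: F can only be true when a current nominee exists and candidates were
// found (which also implies a next nominee), and R is only defined when both exist.
#include <cstdio>

int main() {
    for (int C = 0; C <= 1; ++C)
        for (int N = 0; N <= 1; ++N)
            for (int F = 0; F <= (C && N); ++F)
                for (int R = 0; R <= (C && N); ++R) {
                    bool oldCond = (C != N) || (N && !F) || (C && R);
                    bool newCond = !N || !F || R;
                    if (oldCond != newCond)
                        std::printf("C=%d N=%d F=%d R=%d old=%d new=%d\n",
                                    C, N, F, R, (int)oldCond, (int)newCond);
                }
    // Prints only C=0 N=0 F=0 R=0: the nomination block now also runs
    // when no nominee is available at all.
    return 0;
}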
diff --git a/fdbserver/DataDistribution.actor.cpp b/fdbserver/DataDistribution.actor.cpp
--- a/fdbserver/DataDistribution.actor.cpp
+++ b/fdbserver/DataDistribution.actor.cpp
@@ ... @@ ACTOR Future<Void> dataDistribution(
    state PromiseStream<GetMetricsRequest> getShardMetrics;
    state Reference<AsyncVar<bool>> processingUnhealthy( new AsyncVar<bool>(false) );
    state Promise<Void> readyToStart;
+   state Reference<ShardsAffectedByTeamFailure> shardsAffectedByTeamFailure( new ShardsAffectedByTeamFailure );
+
+   state int shard = 0;
+   for(; shard<initData->shards.size() - 1; shard++) {
+       KeyRangeRef keys = KeyRangeRef(initData->shards[shard].key, initData->shards[shard+1].key);
+       shardsAffectedByTeamFailure->defineShard(keys);
+       std::vector<ShardsAffectedByTeamFailure::Team> teams;
+       teams.push_back(ShardsAffectedByTeamFailure::Team(initData->shards[shard].primarySrc, true));
+       if(configuration.usableRegions > 1) {
+           teams.push_back(ShardsAffectedByTeamFailure::Team(initData->shards[shard].remoteSrc, false));
+       }
+       shardsAffectedByTeamFailure->moveShard(keys, teams);
+       if(initData->shards[shard].hasDest) {
+           // This shard is already in flight. Ideally we should use dest in sABTF and generate a dataDistributionRelocator directly in
+           // DataDistributionQueue to track it, but it's easier to just (with low priority) schedule it for movement.
+           output.send( RelocateShard( keys, PRIORITY_RECOVER_MOVE ) );
+       }
+       Void _ = wait( yield(TaskDataDistribution) );
+   }
 
    vector<TeamCollectionInterface> tcis;
    Reference<AsyncVar<bool>> anyZeroHealthyTeams;
@@ -2235,25 +2254,6 @@ ACTOR Future<Void> dataDistribution(
        anyZeroHealthyTeams = zeroHealthyTeams[0];
    }
 
-   Reference<ShardsAffectedByTeamFailure> shardsAffectedByTeamFailure( new ShardsAffectedByTeamFailure );
-   actors.push_back(yieldPromiseStream(output.getFuture(), input));
-
-   for(int s=0; s<initData->shards.size() - 1; s++) {
-       KeyRangeRef keys = KeyRangeRef(initData->shards[s].key, initData->shards[s+1].key);
-       shardsAffectedByTeamFailure->defineShard(keys);
-       std::vector<ShardsAffectedByTeamFailure::Team> teams;
-       teams.push_back(ShardsAffectedByTeamFailure::Team(initData->shards[s].primarySrc, true));
-       if(configuration.usableRegions > 1) {
-           teams.push_back(ShardsAffectedByTeamFailure::Team(initData->shards[s].remoteSrc, false));
-       }
-       shardsAffectedByTeamFailure->moveShard(keys, teams);
-       if(initData->shards[s].hasDest) {
-           // This shard is already in flight. Ideally we should use dest in sABTF and generate a dataDistributionRelocator directly in
-           // DataDistributionQueue to track it, but it's easier to just (with low priority) schedule it for movement.
-           output.send( RelocateShard( keys, PRIORITY_RECOVER_MOVE ) );
-       }
-   }
-
    actors.push_back( pollMoveKeysLock(cx, lock) );
    actors.push_back( reportErrorsExcept( dataDistributionTracker( initData, cx, output, shardsAffectedByTeamFailure, getShardMetrics, getAverageShardBytes.getFuture(), readyToStart, anyZeroHealthyTeams, mi.id() ), "DDTracker", mi.id(), &normalDDQueueErrors() ) );
    actors.push_back( reportErrorsExcept( dataDistributionQueue( cx, output, input.getFuture(), getShardMetrics, processingUnhealthy, tcis, shardsAffectedByTeamFailure, lock, getAverageShardBytes, mi, storageTeamSize, lastLimited, recoveryCommitVersion ), "DDQueue", mi.id(), &normalDDQueueErrors() ) );
@@ -2261,6 +2261,7 @@ ACTOR Future<Void> dataDistribution(
    if (configuration.usableRegions > 1) {
        actors.push_back( reportErrorsExcept( dataDistributionTeamCollection( initData, tcis[1], cx, db, shardsAffectedByTeamFailure, lock, output, mi.id(), configuration, remoteDcIds, Optional<std::vector<Optional<Key>>>(), Optional<PromiseStream< std::pair<UID, Optional<StorageServerInterface>> >>(), readyToStart.getFuture() && remoteRecovered, zeroHealthyTeams[1], false, processingUnhealthy ), "DDTeamCollectionSecondary", mi.id(), &normalDDQueueErrors() ) );
    }
+   actors.push_back(yieldPromiseStream(output.getFuture(), input));
 
    Void _ = wait( waitForAll( actors ) );
    return Void();
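The relocated loop above lines up with the 10TB fix in the release notes: the shard map is now seeded with a yield() after every shard, and the yieldPromiseStream forwarder is registered only after the other actors, so a database with a very large number of shards presumably no longer stalls the run loop right after recovery. Below is a rough plain-C++ sketch of that loop's shape; stub types and std::function callbacks stand in for flow actors, ShardsAffectedByTeamFailure, and yield(), and none of these names are FDB API.

// Plain-C++ sketch of the seeding loop's shape; illustrative only, not flow/FDB code.
#include <cstddef>
#include <functional>
#include <string>
#include <vector>

// Stand-ins for the real initial-data types (hypothetical names).
struct InitialShard {
    std::string key;             // begin key; the next entry's key is this range's end
    std::vector<int> primarySrc; // servers currently holding the primary copy
    std::vector<int> remoteSrc;  // servers holding the remote-region copy
    bool hasDest = false;        // a relocation was already in flight at recovery time
};

struct Team {
    std::vector<int> servers;
    bool primary;
};

// Register each range's source teams, queue ranges that were already moving for a
// low-priority recovery move, and yield after every iteration so rebuilding the map
// for a huge database does not monopolize the thread during recovery.
void seedShardMap(const std::vector<InitialShard>& init,
                  bool twoRegions,
                  const std::function<void(const std::string&, const std::string&, const std::vector<Team>&)>& defineRange,
                  const std::function<void(const std::string&, const std::string&)>& scheduleRecoverMove,
                  const std::function<void()>& yieldToRunLoop) {
    for (std::size_t i = 0; i + 1 < init.size(); ++i) {
        std::vector<Team> teams{ Team{ init[i].primarySrc, true } };
        if (twoRegions)
            teams.push_back(Team{ init[i].remoteSrc, false });
        defineRange(init[i].key, init[i + 1].key, teams);
        if (init[i].hasDest)
            scheduleRecoverMove(init[i].key, init[i + 1].key); // sent as PRIORITY_RECOVER_MOVE in the diff
        yieldToRunLoop(); // in flow: Void _ = wait( yield(TaskDataDistribution) );
    }
}

In the diff, defineShard/moveShard on ShardsAffectedByTeamFailure play the role of defineRange here, and output.send(RelocateShard(keys, PRIORITY_RECOVER_MOVE)) is the low-priority recovery move for ranges that already had a destination.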