Skip to content

Commit

Permalink
Remove bootstrapping retry config (#2301)
Browse files Browse the repository at this point in the history
  • Loading branch information
StephenButtolph committed Nov 17, 2023
1 parent 392b313 commit 4768ed4
Show file tree
Hide file tree
Showing 13 changed files with 139 additions and 166 deletions.
58 changes: 26 additions & 32 deletions chains/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -174,34 +174,32 @@ type ManagerConfig struct {
StakingBLSKey *bls.SecretKey
TracingEnabled bool
// Must not be used unless [TracingEnabled] is true as this may be nil.
Tracer trace.Tracer
Log logging.Logger
LogFactory logging.Factory
VMManager vms.Manager // Manage mappings from vm ID --> vm
BlockAcceptorGroup snow.AcceptorGroup
TxAcceptorGroup snow.AcceptorGroup
VertexAcceptorGroup snow.AcceptorGroup
DB database.Database
MsgCreator message.OutboundMsgBuilder // message creator, shared with network
Router router.Router // Routes incoming messages to the appropriate chain
Net network.Network // Sends consensus messages to other validators
Validators validators.Manager // Validators validating on this chain
NodeID ids.NodeID // The ID of this node
NetworkID uint32 // ID of the network this node is connected to
PartialSyncPrimaryNetwork bool
Server server.Server // Handles HTTP API calls
Keystore keystore.Keystore
AtomicMemory *atomic.Memory
AVAXAssetID ids.ID
XChainID ids.ID // ID of the X-Chain,
CChainID ids.ID // ID of the C-Chain,
CriticalChains set.Set[ids.ID] // Chains that can't exit gracefully
TimeoutManager timeout.Manager // Manages request timeouts when sending messages to other validators
Health health.Registerer
RetryBootstrap bool // Should Bootstrap be retried
RetryBootstrapWarnFrequency int // Max number of times to retry bootstrap before warning the node operator
SubnetConfigs map[ids.ID]subnets.Config // ID -> SubnetConfig
ChainConfigs map[string]ChainConfig // alias -> ChainConfig
Tracer trace.Tracer
Log logging.Logger
LogFactory logging.Factory
VMManager vms.Manager // Manage mappings from vm ID --> vm
BlockAcceptorGroup snow.AcceptorGroup
TxAcceptorGroup snow.AcceptorGroup
VertexAcceptorGroup snow.AcceptorGroup
DB database.Database
MsgCreator message.OutboundMsgBuilder // message creator, shared with network
Router router.Router // Routes incoming messages to the appropriate chain
Net network.Network // Sends consensus messages to other validators
Validators validators.Manager // Validators validating on this chain
NodeID ids.NodeID // The ID of this node
NetworkID uint32 // ID of the network this node is connected to
PartialSyncPrimaryNetwork bool
Server server.Server // Handles HTTP API calls
Keystore keystore.Keystore
AtomicMemory *atomic.Memory
AVAXAssetID ids.ID
XChainID ids.ID // ID of the X-Chain,
CChainID ids.ID // ID of the C-Chain,
CriticalChains set.Set[ids.ID] // Chains that can't exit gracefully
TimeoutManager timeout.Manager // Manages request timeouts when sending messages to other validators
Health health.Registerer
SubnetConfigs map[ids.ID]subnets.Config // ID -> SubnetConfig
ChainConfigs map[string]ChainConfig // alias -> ChainConfig
// ShutdownNodeFunc allows the chain manager to issue a request to shutdown the node
ShutdownNodeFunc func(exitCode int)
MeterVMEnabled bool // Should each VM be wrapped with a MeterVM
Expand Down Expand Up @@ -889,8 +887,6 @@ func (m *manager) createAvalancheChain(
Sender: snowmanMessageSender,
BootstrapTracker: sb,
Timer: h,
RetryBootstrap: m.RetryBootstrap,
RetryBootstrapWarnFrequency: m.RetryBootstrapWarnFrequency,
AncestorsMaxContainersReceived: m.BootstrapAncestorsMaxContainersReceived,
SharedCfg: &common.SharedConfig{},
},
Expand Down Expand Up @@ -1235,8 +1231,6 @@ func (m *manager) createSnowmanChain(
Sender: messageSender,
BootstrapTracker: sb,
Timer: h,
RetryBootstrap: m.RetryBootstrap,
RetryBootstrapWarnFrequency: m.RetryBootstrapWarnFrequency,
AncestorsMaxContainersReceived: m.BootstrapAncestorsMaxContainersReceived,
SharedCfg: &common.SharedConfig{},
}
Expand Down
2 changes: 0 additions & 2 deletions config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -521,8 +521,6 @@ func getStateSyncConfig(v *viper.Viper) (node.StateSyncConfig, error) {

func getBootstrapConfig(v *viper.Viper, networkID uint32) (node.BootstrapConfig, error) {
config := node.BootstrapConfig{
RetryBootstrap: v.GetBool(RetryBootstrapKey),
RetryBootstrapWarnFrequency: v.GetInt(RetryBootstrapWarnFrequencyKey),
BootstrapBeaconConnectionTimeout: v.GetDuration(BootstrapBeaconConnectionTimeoutKey),
BootstrapMaxTimeGetAncestors: v.GetDuration(BootstrapMaxTimeGetAncestorsKey),
BootstrapAncestorsMaxContainersSent: int(v.GetUint(BootstrapAncestorsMaxContainersSentKey)),
Expand Down
2 changes: 0 additions & 2 deletions config/flags.go
Original file line number Diff line number Diff line change
Expand Up @@ -292,8 +292,6 @@ func addNodeFlags(fs *pflag.FlagSet) {
// TODO: combine "BootstrapIPsKey" and "BootstrapIDsKey" into one flag
fs.String(BootstrapIPsKey, "", "Comma separated list of bootstrap peer ips to connect to. Example: 127.0.0.1:9630,127.0.0.1:9631")
fs.String(BootstrapIDsKey, "", "Comma separated list of bootstrap peer ids to connect to. Example: NodeID-JR4dVmy6ffUGAKCBDkyCbeZbyHQBeDsET,NodeID-8CrVPQZ4VSqgL8zTdvL14G8HqAfrBr4z")
fs.Bool(RetryBootstrapKey, true, "Specifies whether bootstrap should be retried")
fs.Int(RetryBootstrapWarnFrequencyKey, 50, "Specifies how many times bootstrap should be retried before warning the operator")
fs.Duration(BootstrapBeaconConnectionTimeoutKey, time.Minute, "Timeout before emitting a warn log when connecting to bootstrapping beacons")
fs.Duration(BootstrapMaxTimeGetAncestorsKey, 50*time.Millisecond, "Max Time to spend fetching a container and its ancestors when responding to a GetAncestors")
fs.Uint(BootstrapAncestorsMaxContainersSentKey, 2000, "Max number of containers in an Ancestors message sent by this node")
Expand Down
2 changes: 0 additions & 2 deletions config/keys.go
Original file line number Diff line number Diff line change
Expand Up @@ -163,8 +163,6 @@ const (
RouterHealthMaxOutstandingRequestsKey = "router-health-max-outstanding-requests"
HealthCheckFreqKey = "health-check-frequency"
HealthCheckAveragerHalflifeKey = "health-check-averager-halflife"
RetryBootstrapKey = "bootstrap-retry-enabled"
RetryBootstrapWarnFrequencyKey = "bootstrap-retry-warn-frequency"
PluginDirKey = "plugin-dir"
BootstrapBeaconConnectionTimeoutKey = "bootstrap-beacon-connection-timeout"
BootstrapMaxTimeGetAncestorsKey = "bootstrap-max-time-get-ancestors"
Expand Down
6 changes: 0 additions & 6 deletions node/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -107,12 +107,6 @@ type StateSyncConfig struct {
}

type BootstrapConfig struct {
// Should Bootstrap be retried
RetryBootstrap bool `json:"retryBootstrap"`

// Max number of times to retry bootstrap before warning the node operator
RetryBootstrapWarnFrequency int `json:"retryBootstrapWarnFrequency"`

// Timeout before emitting a warn log when connecting to bootstrapping beacons
BootstrapBeaconConnectionTimeout time.Duration `json:"bootstrapBeaconConnectionTimeout"`

Expand Down
2 changes: 0 additions & 2 deletions node/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -1009,8 +1009,6 @@ func (n *Node) initChainManager(avaxAssetID ids.ID) error {
CriticalChains: criticalChains,
TimeoutManager: n.timeoutManager,
Health: n.health,
RetryBootstrap: n.Config.RetryBootstrap,
RetryBootstrapWarnFrequency: n.Config.RetryBootstrapWarnFrequency,
ShutdownNodeFunc: n.Shutdown,
MeterVMEnabled: n.Config.MeterVMEnabled,
Metrics: n.MetricsGatherer,
Expand Down
28 changes: 5 additions & 23 deletions snow/engine/common/bootstrapper.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ type Bootstrapper interface {
AcceptedHandler
Haltable
Startup(context.Context) error
Restart(ctx context.Context, reset bool) error
Restart(ctx context.Context) error
}

// It collects mechanisms common to both snowman and avalanche bootstrappers
Expand All @@ -41,9 +41,6 @@ type bootstrapper struct {

minority smbootstrapper.Poll
majority smbootstrapper.Poll

// number of times the bootstrap has been attempted
bootstrapAttempts int
}

func NewCommonBootstrapper(config Config) Bootstrapper {
Expand Down Expand Up @@ -146,7 +143,6 @@ func (b *bootstrapper) Startup(ctx context.Context) error {
MaxOutstandingBroadcastRequests,
)

b.bootstrapAttempts++
if accepted, finalized := b.majority.Result(ctx); finalized {
b.Ctx.Log.Info("bootstrapping skipped",
zap.String("reason", "no provided bootstraps"),
Expand All @@ -158,22 +154,9 @@ func (b *bootstrapper) Startup(ctx context.Context) error {
return b.sendMessagesOrFinish(ctx)
}

func (b *bootstrapper) Restart(ctx context.Context, reset bool) error {
// resets the attempts when we're pulling blocks/vertices we don't want to
// fail the bootstrap at that stage
if reset {
b.Ctx.Log.Debug("Checking for new frontiers")

b.Config.SharedCfg.Restarted = true
b.bootstrapAttempts = 0
}

if b.bootstrapAttempts > 0 && b.bootstrapAttempts%b.RetryBootstrapWarnFrequency == 0 {
b.Ctx.Log.Debug("check internet connection",
zap.Int("numBootstrapAttempts", b.bootstrapAttempts),
)
}

func (b *bootstrapper) Restart(ctx context.Context) error {
b.Ctx.Log.Debug("Checking for new frontiers")
b.Config.SharedCfg.Restarted = true
return b.Startup(ctx)
}

Expand Down Expand Up @@ -207,9 +190,8 @@ func (b *bootstrapper) sendMessagesOrFinish(ctx context.Context) error {
b.Ctx.Log.Debug("restarting bootstrap",
zap.String("reason", "no blocks accepted"),
zap.Int("numBeacons", b.Beacons.Count(b.Ctx.SubnetID)),
zap.Int("numBootstrapAttempts", b.bootstrapAttempts),
)
return b.Restart(ctx, false /*=reset*/)
return b.Startup(ctx)
}

if !b.Config.SharedCfg.Restarted {
Expand Down
6 changes: 0 additions & 6 deletions snow/engine/common/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,6 @@ type Config struct {
BootstrapTracker BootstrapTracker
Timer Timer

// Should Bootstrap be retried
RetryBootstrap bool

// Max number of times to retry bootstrap before warning the node operator
RetryBootstrapWarnFrequency int

// This node will only consider the first [AncestorsMaxContainersReceived]
// containers in an ancestors message it receives.
AncestorsMaxContainersReceived int
Expand Down
20 changes: 17 additions & 3 deletions snow/engine/snowman/bootstrap/bootstrapper.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,20 @@ var (
errUnexpectedTimeout = errors.New("unexpected timeout fired")
)

// bootstrapper repeatedly performs the bootstrapping protocol.
//
// 1. Wait until a sufficient amount of stake is connected.
// 2. Sample a small number of nodes to get the last accepted block ID
// 3. Verify against the full network that the last accepted block ID received
// in step 2 is an accepted block.
// 4. Sync the full ancestry of the last accepted block.
// 5. Execute all the fetched blocks that haven't already been executed.
// 6. Restart the bootstrapping protocol until the number of blocks being
// accepted during a bootstrapping round stops decreasing.
//
// Note: Because of step 6, the bootstrapping protocol will generally be
// performed multiple times.
//
// Invariant: The VM is not guaranteed to be initialized until Start has been
// called, so it must be guaranteed the VM is not used until after Start.
type bootstrapper struct {
Expand Down Expand Up @@ -288,7 +302,7 @@ func (b *bootstrapper) Timeout(ctx context.Context) error {
b.awaitingTimeout = false

if !b.Config.BootstrapTracker.IsBootstrapped() {
return b.Restart(ctx, true)
return b.Restart(ctx)
}
b.fetchETA.Set(0)
return b.OnFinished(ctx, b.Config.SharedCfg.RequestID)
Expand Down Expand Up @@ -592,8 +606,8 @@ func (b *bootstrapper) checkFinish(ctx context.Context) error {
// Note that executedBlocks < c*previouslyExecuted ( 0 <= c < 1 ) is enforced
// so that the bootstrapping process will terminate even as new blocks are
// being issued.
if b.Config.RetryBootstrap && executedBlocks > 0 && executedBlocks < previouslyExecuted/2 {
return b.Restart(ctx, true)
if executedBlocks > 0 && executedBlocks < previouslyExecuted/2 {
return b.Restart(ctx)
}

// If there is an additional callback, notify them that this chain has been
Expand Down
48 changes: 28 additions & 20 deletions snow/engine/snowman/bootstrap/bootstrapper_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -356,8 +356,6 @@ func TestBootstrapperUnknownByzantineResponse(t *testing.T) {

require.NoError(bs.Start(context.Background(), 0))

acceptedIDs := []ids.ID{blkID2}

parsedBlk1 := false
vm.GetBlockF = func(_ context.Context, blkID ids.ID) (snowman.Block, error) {
switch blkID {
Expand Down Expand Up @@ -390,31 +388,29 @@ func TestBootstrapperUnknownByzantineResponse(t *testing.T) {
return nil, errUnknownBlock
}

requestID := new(uint32)
var requestID uint32
sender.SendGetAncestorsF = func(_ context.Context, vdr ids.NodeID, reqID uint32, blkID ids.ID) {
require.Equal(peerID, vdr)
require.Equal(blkID1, blkID)
*requestID = reqID
requestID = reqID
}

vm.CantSetState = false
require.NoError(bs.ForceAccepted(context.Background(), acceptedIDs)) // should request blk1

oldReqID := *requestID
require.NoError(bs.Ancestors(context.Background(), peerID, *requestID+1, [][]byte{blkBytes1})) // respond with wrong request ID
require.Equal(oldReqID, *requestID)
require.NoError(bs.ForceAccepted(context.Background(), []ids.ID{blkID2})) // should request blk1

require.NoError(bs.Ancestors(context.Background(), ids.BuildTestNodeID([]byte{1, 2, 3}), *requestID, [][]byte{blkBytes1})) // respond from wrong peer
require.Equal(oldReqID, *requestID)
oldReqID := requestID
require.NoError(bs.Ancestors(context.Background(), peerID, requestID, [][]byte{blkBytes0})) // respond with wrong block
require.NotEqual(oldReqID, requestID)

require.NoError(bs.Ancestors(context.Background(), peerID, *requestID, [][]byte{blkBytes0})) // respond with wrong block
require.NotEqual(oldReqID, *requestID)
require.NoError(bs.Ancestors(context.Background(), peerID, requestID, [][]byte{blkBytes1}))

require.NoError(bs.Ancestors(context.Background(), peerID, *requestID, [][]byte{blkBytes1}))
require.Equal(snow.NormalOp, config.Ctx.State.Get().State)
require.Equal(snow.Bootstrapping, config.Ctx.State.Get().State)
require.Equal(choices.Accepted, blk0.Status())
require.Equal(choices.Accepted, blk1.Status())
require.Equal(choices.Accepted, blk2.Status())

require.NoError(bs.ForceAccepted(context.Background(), []ids.ID{blkID2}))
require.Equal(snow.NormalOp, config.Ctx.State.Get().State)
}

// There are multiple needed blocks and Ancestors returns one at a time
Expand Down Expand Up @@ -554,10 +550,13 @@ func TestBootstrapperPartialFetch(t *testing.T) {
require.NoError(bs.Ancestors(context.Background(), peerID, *requestID, [][]byte{blkBytes1})) // respond with blk1
require.Equal(blkID1, requested)

require.Equal(snow.NormalOp, config.Ctx.State.Get().State)
require.Equal(snow.Bootstrapping, config.Ctx.State.Get().State)
require.Equal(choices.Accepted, blk0.Status())
require.Equal(choices.Accepted, blk1.Status())
require.Equal(choices.Accepted, blk2.Status())

require.NoError(bs.ForceAccepted(context.Background(), acceptedIDs))
require.Equal(snow.NormalOp, config.Ctx.State.Get().State)
}

// There are multiple needed blocks and some validators do not have all the blocks
Expand Down Expand Up @@ -714,7 +713,7 @@ func TestBootstrapperEmptyResponse(t *testing.T) {

require.NoError(bs.Ancestors(context.Background(), requestedVdr, requestID, [][]byte{blkBytes1})) // respond with blk1

require.Equal(snow.NormalOp, config.Ctx.State.Get().State)
require.Equal(snow.Bootstrapping, config.Ctx.State.Get().State)
require.Equal(choices.Accepted, blk0.Status())
require.Equal(choices.Accepted, blk1.Status())
require.Equal(choices.Accepted, blk2.Status())
Expand Down Expand Up @@ -856,10 +855,13 @@ func TestBootstrapperAncestors(t *testing.T) {
require.NoError(bs.Ancestors(context.Background(), peerID, *requestID, [][]byte{blkBytes2, blkBytes1})) // respond with blk2 and blk1
require.Equal(blkID2, requested)

require.Equal(snow.NormalOp, config.Ctx.State.Get().State)
require.Equal(snow.Bootstrapping, config.Ctx.State.Get().State)
require.Equal(choices.Accepted, blk0.Status())
require.Equal(choices.Accepted, blk1.Status())
require.Equal(choices.Accepted, blk2.Status())

require.NoError(bs.ForceAccepted(context.Background(), acceptedIDs))
require.Equal(snow.NormalOp, config.Ctx.State.Get().State)
}

func TestBootstrapperFinalized(t *testing.T) {
Expand Down Expand Up @@ -976,10 +978,13 @@ func TestBootstrapperFinalized(t *testing.T) {

require.NoError(bs.Ancestors(context.Background(), peerID, reqIDBlk2, [][]byte{blkBytes2, blkBytes1}))

require.Equal(snow.NormalOp, config.Ctx.State.Get().State)
require.Equal(snow.Bootstrapping, config.Ctx.State.Get().State)
require.Equal(choices.Accepted, blk0.Status())
require.Equal(choices.Accepted, blk1.Status())
require.Equal(choices.Accepted, blk2.Status())

require.NoError(bs.ForceAccepted(context.Background(), []ids.ID{blkID2}))
require.Equal(snow.NormalOp, config.Ctx.State.Get().State)
}

func TestRestartBootstrapping(t *testing.T) {
Expand Down Expand Up @@ -1156,12 +1161,15 @@ func TestRestartBootstrapping(t *testing.T) {

require.NoError(bs.Ancestors(context.Background(), peerID, blk4RequestID, [][]byte{blkBytes4}))

require.Equal(snow.NormalOp, config.Ctx.State.Get().State)
require.Equal(snow.Bootstrapping, config.Ctx.State.Get().State)
require.Equal(choices.Accepted, blk0.Status())
require.Equal(choices.Accepted, blk1.Status())
require.Equal(choices.Accepted, blk2.Status())
require.Equal(choices.Accepted, blk3.Status())
require.Equal(choices.Accepted, blk4.Status())

require.NoError(bs.ForceAccepted(context.Background(), []ids.ID{blkID4}))
require.Equal(snow.NormalOp, config.Ctx.State.Get().State)
}

func TestBootstrapOldBlockAfterStateSync(t *testing.T) {
Expand Down

0 comments on commit 4768ed4

Please sign in to comment.