From 3e5abdb7e26853b2f2320a847f4bf2d73fe80b62 Mon Sep 17 00:00:00 2001 From: distractedm1nd Date: Thu, 20 Apr 2023 13:48:26 -0700 Subject: [PATCH 01/56] prog --- nodebuilder/share/config.go | 3 +++ nodebuilder/share/constructors.go | 1 + share/availability/discovery/discovery.go | 13 ++++++++++--- share/availability/discovery/set.go | 4 +--- share/getters/shrex_test.go | 2 +- share/p2p/peers/manager_test.go | 4 +++- 6 files changed, 19 insertions(+), 8 deletions(-) diff --git a/nodebuilder/share/config.go b/nodebuilder/share/config.go index 789b54dd75..355f4163d3 100644 --- a/nodebuilder/share/config.go +++ b/nodebuilder/share/config.go @@ -17,6 +17,8 @@ var ( type Config struct { // PeersLimit defines how many peers will be added during discovery. PeersLimit uint + // DialTimeout is the timeout for connecting to peers found via discovery. + DialTimeout time.Duration // DiscoveryInterval is an interval between discovery sessions. DiscoveryInterval time.Duration // AdvertiseInterval is a interval between advertising sessions. @@ -36,6 +38,7 @@ type Config struct { func DefaultConfig() Config { return Config{ PeersLimit: 5, + DialTimeout: time.Second * 15, DiscoveryInterval: time.Second * 30, AdvertiseInterval: time.Second * 30, UseShareExchange: true, diff --git a/nodebuilder/share/constructors.go b/nodebuilder/share/constructors.go index a8d45f6a40..a01edf705c 100644 --- a/nodebuilder/share/constructors.go +++ b/nodebuilder/share/constructors.go @@ -30,6 +30,7 @@ func discovery(cfg Config) func(routing.ContentRouting, host.Host) *disc.Discove h, routingdisc.NewRoutingDiscovery(r), cfg.PeersLimit, + cfg.DialTimeout, cfg.DiscoveryInterval, cfg.AdvertiseInterval, ) diff --git a/share/availability/discovery/discovery.go b/share/availability/discovery/discovery.go index 47733a9281..b32a4b3cbd 100644 --- a/share/availability/discovery/discovery.go +++ b/share/availability/discovery/discovery.go @@ -16,7 +16,6 @@ import ( var log = logging.Logger("share/discovery") const ( - // peerWeight is a weight of discovered peers. // peerWeight is a number that will be assigned to all discovered full nodes, // so ConnManager will not break a connection with them. peerWeight = 1000 @@ -40,6 +39,10 @@ type Discovery struct { connector *backoffConnector // peersLimit is max amount of peers that will be discovered during a discovery session. peersLimit uint + // dialTimeout is the timeout used for dialing peers. + // network.WithDialPeerTimeout is not sufficient here, + // as RoutedHost only uses it for the underlying dial. + dialTimeout time.Duration // discInterval is an interval between discovery sessions. discoveryInterval time.Duration // advertiseInterval is an interval between advertising sessions. 
@@ -57,6 +60,7 @@ func NewDiscovery( h host.Host, d discovery.Discovery, peersLimit uint, + dialTimeout, discInterval, advertiseInterval time.Duration, ) *Discovery { @@ -66,6 +70,7 @@ func NewDiscovery( disc: d, connector: newBackoffConnector(h, defaultBackoffFactory), peersLimit: peersLimit, + dialTimeout: dialTimeout, discoveryInterval: discInterval, advertiseInterval: advertiseInterval, onUpdatedPeers: func(peer.ID, bool) {}, @@ -102,13 +107,15 @@ func (d *Discovery) handlePeerFound(ctx context.Context, topic string, peer peer } err := d.set.TryAdd(peer.ID) if err != nil { - log.Debug(err) + log.Debugw("failed to add peer to set", "peer", peer.ID, "error", err) return } + ctx, cancel := context.WithTimeout(ctx, d.dialTimeout) + defer cancel() err = d.connector.Connect(ctx, peer) if err != nil { - log.Debug(err) + log.Debugw("couldn't connect to peer, removing from set", "peer", peer.ID, "error", err) d.set.Remove(peer.ID) return } diff --git a/share/availability/discovery/set.go b/share/availability/discovery/set.go index a7b330fadd..6425fcee0b 100644 --- a/share/availability/discovery/set.go +++ b/share/availability/discovery/set.go @@ -53,17 +53,15 @@ func (ps *limitedSet) TryAdd(p peer.ID) error { } ps.ps[p] = struct{}{} -LOOP: for { // peer will be pushed to the channel only when somebody is reading from it. // this is done to handle case when Peers() was called on empty set. select { case ps.waitPeer <- p: default: - break LOOP + return nil } } - return nil } func (ps *limitedSet) Remove(id peer.ID) { diff --git a/share/getters/shrex_test.go b/share/getters/shrex_test.go index 7578195639..03b13e4961 100644 --- a/share/getters/shrex_test.go +++ b/share/getters/shrex_test.go @@ -158,7 +158,7 @@ func testManager(ctx context.Context, host host.Host, headerSub libhead.Subscrib } disc := discovery.NewDiscovery(nil, - routingdisc.NewRoutingDiscovery(routinghelpers.Null{}), 0, time.Second, time.Second) + routingdisc.NewRoutingDiscovery(routinghelpers.Null{}), 0, time.Second, time.Second, time.Second) connGater, err := conngater.NewBasicConnectionGater(ds_sync.MutexWrap(datastore.NewMapDatastore())) if err != nil { return nil, err diff --git a/share/p2p/peers/manager_test.go b/share/p2p/peers/manager_test.go index 05c1f5b33e..ca64dfcbc5 100644 --- a/share/p2p/peers/manager_test.go +++ b/share/p2p/peers/manager_test.go @@ -395,6 +395,7 @@ func TestIntegration(t *testing.T) { routingdisc.NewRoutingDiscovery(router1), 10, time.Second, + time.Second, time.Second) // set up full node / receiver node @@ -407,6 +408,7 @@ func TestIntegration(t *testing.T) { 10, time.Second, time.Second, + time.Second, ) err = fnDisc.Start(ctx) require.NoError(t, err) @@ -459,7 +461,7 @@ func testManager(ctx context.Context, headerSub libhead.Subscriber[*header.Exten return nil, err } disc := discovery.NewDiscovery(nil, - routingdisc.NewRoutingDiscovery(routinghelpers.Null{}), 0, time.Second, time.Second) + routingdisc.NewRoutingDiscovery(routinghelpers.Null{}), 0, time.Second, time.Second, time.Second) connGater, err := conngater.NewBasicConnectionGater(sync.MutexWrap(datastore.NewMapDatastore())) if err != nil { return nil, err From e3b85956631be9113a6944e0f909e8b93a6ab5e0 Mon Sep 17 00:00:00 2001 From: Ryan Date: Thu, 20 Apr 2023 20:04:49 -0700 Subject: [PATCH 02/56] only adding connected peers --- nodebuilder/share/config.go | 6 +- share/availability/discovery/backoff.go | 17 ++++- share/availability/discovery/discovery.go | 78 +++++++++++++++-------- share/availability/discovery/set.go | 7 +- 
share/availability/full/testing.go | 4 +- 5 files changed, 75 insertions(+), 37 deletions(-) diff --git a/nodebuilder/share/config.go b/nodebuilder/share/config.go index 355f4163d3..5ff1bbb2f4 100644 --- a/nodebuilder/share/config.go +++ b/nodebuilder/share/config.go @@ -15,7 +15,7 @@ var ( ) type Config struct { - // PeersLimit defines how many peers will be added during discovery. + // PeersLimit defines the soft limit of FNs to connect to via discovery. PeersLimit uint // DialTimeout is the timeout for connecting to peers found via discovery. DialTimeout time.Duration @@ -38,7 +38,7 @@ type Config struct { func DefaultConfig() Config { return Config{ PeersLimit: 5, - DialTimeout: time.Second * 15, + DialTimeout: time.Second * 60, DiscoveryInterval: time.Second * 30, AdvertiseInterval: time.Second * 30, UseShareExchange: true, @@ -50,7 +50,7 @@ func DefaultConfig() Config { // Validate performs basic validation of the config. func (cfg *Config) Validate() error { - if cfg.DiscoveryInterval <= 0 || cfg.AdvertiseInterval <= 0 { + if cfg.DiscoveryInterval <= 0 || cfg.AdvertiseInterval <= 0 || cfg.DialTimeout <= 0 { return fmt.Errorf("nodebuilder/share: %s", ErrNegativeInterval) } if err := cfg.ShrExNDParams.Validate(); err != nil { diff --git a/share/availability/discovery/backoff.go b/share/availability/discovery/backoff.go index e76449f2c6..173e5c25b9 100644 --- a/share/availability/discovery/backoff.go +++ b/share/availability/discovery/backoff.go @@ -2,7 +2,7 @@ package discovery import ( "context" - "fmt" + "errors" "sync" "time" @@ -14,7 +14,10 @@ import ( // gcInterval is a default period after which disconnected peers will be removed from cache const gcInterval = time.Hour -var defaultBackoffFactory = backoff.NewFixedBackoff(time.Hour) +var ( + defaultBackoffFactory = backoff.NewFixedBackoff(time.Hour) + errBackoffNotEnded = errors.New("share/discovery: backoff period has not ended") +) // backoffConnector wraps a libp2p.Host to establish a connection with peers // with adding a delay for the next connection attempt. @@ -48,7 +51,7 @@ func (b *backoffConnector) Connect(ctx context.Context, p peer.AddrInfo) error { cache := b.connectionData(p.ID) if time.Now().Before(cache.nexttry) { b.cacheLk.Unlock() - return fmt.Errorf("share/discovery: backoff period has not ended for peer=%s", p.ID.String()) + return errBackoffNotEnded } cache.nexttry = time.Now().Add(cache.backoff.Delay()) b.cacheLk.Unlock() @@ -67,6 +70,14 @@ func (b *backoffConnector) connectionData(p peer.ID) *backoffData { return cache } +// RemoveBackoff removes peer from the backoffCache. It is called when we cancel the attempt to +// connect to a peer after calling Connect. +func (b *backoffConnector) RemoveBackoff(p peer.ID) { + b.cacheLk.Lock() + defer b.cacheLk.Unlock() + delete(b.cacheData, p) +} + // RestartBackoff resets delay time between attempts and adds a delay for the next connection // attempt to remote peer. It will mostly be called when host receives a notification that remote // peer was disconnected. 
diff --git a/share/availability/discovery/discovery.go b/share/availability/discovery/discovery.go index b32a4b3cbd..dd8f928001 100644 --- a/share/availability/discovery/discovery.go +++ b/share/availability/discovery/discovery.go @@ -2,6 +2,8 @@ package discovery import ( "context" + "errors" + "sync" "time" logging "github.com/ipfs/go-log/v2" @@ -10,6 +12,7 @@ import ( "github.com/libp2p/go-libp2p/core/host" "github.com/libp2p/go-libp2p/core/network" "github.com/libp2p/go-libp2p/core/peer" + "github.com/libp2p/go-libp2p/core/routing" "github.com/libp2p/go-libp2p/p2p/host/eventbus" ) @@ -37,7 +40,7 @@ type Discovery struct { host host.Host disc discovery.Discovery connector *backoffConnector - // peersLimit is max amount of peers that will be discovered during a discovery session. + // peersLimit is the soft limit of peers to add to the set. peersLimit uint // dialTimeout is the timeout used for dialing peers. // network.WithDialPeerTimeout is not sufficient here, @@ -101,28 +104,35 @@ func (d *Discovery) WithOnPeersUpdate(f OnUpdatedPeers) { // handlePeersFound receives peers and tries to establish a connection with them. // Peer will be added to PeerCache if connection succeeds. -func (d *Discovery) handlePeerFound(ctx context.Context, topic string, peer peer.AddrInfo) { +func (d *Discovery) handlePeerFound(ctx context.Context, peer peer.AddrInfo, cancelOngoing context.CancelFunc) { if peer.ID == d.host.ID() || len(peer.Addrs) == 0 || d.set.Contains(peer.ID) { return } - err := d.set.TryAdd(peer.ID) + + ctx, cancel := context.WithTimeout(ctx, d.dialTimeout) + defer cancel() + err := d.connector.Connect(ctx, peer) + // we don't want to add backoff when the context is canceled. if err != nil { - log.Debugw("failed to add peer to set", "peer", peer.ID, "error", err) + if errors.Is(err, context.Canceled) || errors.Is(err, routing.ErrNotFound) { + d.connector.RemoveBackoff(peer.ID) + } return } - ctx, cancel := context.WithTimeout(ctx, d.dialTimeout) - defer cancel() - err = d.connector.Connect(ctx, peer) + if uint(d.set.Size()) >= d.peersLimit { + log.Debugw("peer limit reached", "count", d.set.Size(), "peer", peer.ID) + cancelOngoing() + } + err = d.set.TryAdd(peer.ID) if err != nil { - log.Debugw("couldn't connect to peer, removing from set", "peer", peer.ID, "error", err) - d.set.Remove(peer.ID) + log.Debugw("failed to add peer to set", "peer", peer.ID, "error", err) return } d.onUpdatedPeers(peer.ID, true) log.Debugw("added peer to set", "id", peer.ID) - // add tag to protect peer of being killed by ConnManager + // add tag to protect peer from being killed by ConnManager d.host.ConnManager().TagPeer(peer.ID, topic, peerWeight) } @@ -171,13 +181,12 @@ func (d *Discovery) ensurePeers(ctx context.Context) { connStatus := e.(event.EvtPeerConnectednessChanged) if connStatus.Connectedness == network.NotConnected { if d.set.Contains(connStatus.Peer) { - log.Debugw("removing the peer from the peer set", + log.Debugw("removing peer from the peer set", "peer", connStatus.Peer, "status", connStatus.Connectedness.String()) d.connector.RestartBackoff(connStatus.Peer) d.set.Remove(connStatus.Peer) d.onUpdatedPeers(connStatus.Peer, false) d.host.ConnManager().UntagPeer(connStatus.Peer, topic) - t.Reset(d.discoveryInterval) } } } @@ -190,23 +199,42 @@ func (d *Discovery) ensurePeers(ctx context.Context) { log.Info("Context canceled. 
Finishing peer discovery") return case <-t.C: - if uint(d.set.Size()) == d.peersLimit { - // stop ticker if we have reached the limit - t.Stop() - continue - } - peers, err := d.disc.FindPeers(ctx, topic) - if err != nil { - log.Error(err) - continue - } - for p := range peers { - go d.handlePeerFound(ctx, topic, p) - } + d.findPeers(ctx) } } } +func (d *Discovery) findPeers(ctx context.Context) { + if uint(d.set.Size()) >= d.peersLimit { + log.Debugw("at peer limit, skipping FindPeers", "size", d.set.Size()) + return + } + + peers, err := d.disc.FindPeers(ctx, topic) + if err != nil { + log.Error(err) + return + } + + // we use a wait group to avoid a context leak + var wg sync.WaitGroup + ctx, cancel := context.WithCancel(ctx) + + for p := range peers { + wg.Add(1) + go func(peer peer.AddrInfo) { + defer wg.Done() + d.handlePeerFound(ctx, peer, cancel) + }(p) + } + + // Wait for all goroutines to finish, then cancel the context + go func() { + wg.Wait() + cancel() + }() +} + // Advertise is a utility function that persistently advertises a service through an Advertiser. func (d *Discovery) Advertise(ctx context.Context) { timer := time.NewTimer(d.advertiseInterval) diff --git a/share/availability/discovery/set.go b/share/availability/discovery/set.go index 6425fcee0b..5ee9e5ba8d 100644 --- a/share/availability/discovery/set.go +++ b/share/availability/discovery/set.go @@ -44,15 +44,12 @@ func (ps *limitedSet) Size() int { // This operation will fail if the number of peers in the set is equal to size. func (ps *limitedSet) TryAdd(p peer.ID) error { ps.lk.Lock() - defer ps.lk.Unlock() if _, ok := ps.ps[p]; ok { return errors.New("share: discovery: peer already added") } - if len(ps.ps) >= int(ps.limit) { - return errors.New("share: discovery: peers limit reached") - } - ps.ps[p] = struct{}{} + ps.lk.Unlock() + for { // peer will be pushed to the channel only when somebody is reading from it. // this is done to handle case when Peers() was called on empty set. 
diff --git a/share/availability/full/testing.go b/share/availability/full/testing.go index ca5d0f10c5..8df7277667 100644 --- a/share/availability/full/testing.go +++ b/share/availability/full/testing.go @@ -37,6 +37,8 @@ func Node(dn *availability_test.TestDagNet) *availability_test.TestNode { } func TestAvailability(getter share.Getter) *ShareAvailability { - disc := discovery.NewDiscovery(nil, routing.NewRoutingDiscovery(routinghelpers.Null{}), 0, time.Second, time.Second) + disc := discovery.NewDiscovery(nil, routing.NewRoutingDiscovery( + routinghelpers.Null{}), 0, time.Second, time.Second, time.Second, + ) return NewShareAvailability(nil, getter, disc) } From 1f9a052daf7f56c8fdb1b0378fd318154e7d85c6 Mon Sep 17 00:00:00 2001 From: Wondertan Date: Fri, 21 Apr 2023 14:33:51 +0200 Subject: [PATCH 03/56] improve logic a bit --- share/availability/discovery/discovery.go | 65 ++++++++++++++---------- share/availability/discovery/set.go | 4 +- share/availability/discovery/set_test.go | 18 +++---- 3 files changed, 48 insertions(+), 39 deletions(-) diff --git a/share/availability/discovery/discovery.go b/share/availability/discovery/discovery.go index dd8f928001..2ba9cca7f0 100644 --- a/share/availability/discovery/discovery.go +++ b/share/availability/discovery/discovery.go @@ -3,7 +3,6 @@ package discovery import ( "context" "errors" - "sync" "time" logging "github.com/ipfs/go-log/v2" @@ -14,6 +13,7 @@ import ( "github.com/libp2p/go-libp2p/core/peer" "github.com/libp2p/go-libp2p/core/routing" "github.com/libp2p/go-libp2p/p2p/host/eventbus" + "golang.org/x/sync/errgroup" ) var log = logging.Logger("share/discovery") @@ -104,36 +104,46 @@ func (d *Discovery) WithOnPeersUpdate(f OnUpdatedPeers) { // handlePeersFound receives peers and tries to establish a connection with them. // Peer will be added to PeerCache if connection succeeds. -func (d *Discovery) handlePeerFound(ctx context.Context, peer peer.AddrInfo, cancelOngoing context.CancelFunc) { +func (d *Discovery) handlePeerFound(ctx context.Context, peer peer.AddrInfo, cancelFind context.CancelFunc) { if peer.ID == d.host.ID() || len(peer.Addrs) == 0 || d.set.Contains(peer.ID) { return } ctx, cancel := context.WithTimeout(ctx, d.dialTimeout) defer cancel() + err := d.connector.Connect(ctx, peer) if err != nil { // we don't want to add backoff when the context is canceled. if errors.Is(err, context.Canceled) || errors.Is(err, routing.ErrNotFound) { d.connector.RemoveBackoff(peer.ID) } return } - if uint(d.set.Size()) >= d.peersLimit { - log.Debugw("peer limit reached", "count", d.set.Size(), "peer", peer.ID) - cancelOngoing() - } - err = d.set.TryAdd(peer.ID) + err = d.set.Add(peer.ID) if err != nil { log.Debugw("failed to add peer to set", "peer", peer.ID, "error", err) return } log.Debugw("added peer to set", "id", peer.ID) + + // check the size only after we add + // so that peer set represents the actual number of connections we made + // which can go slightly over peersLimit + if uint(d.set.Size()) >= d.peersLimit { + log.Infow("soft peer limit reached", "count", d.set.Size(), "peer", peer.ID) + cancelFind() + } + + // tag to protect peer from being killed by ConnManager + // NOTE: This does not protect from the remote peer killing the connection.
+ // In the future, we should design a protocol that keeps bidirectional agreement on whether + // connection should be kept or not, similar to mesh link in GossipSub. d.host.ConnManager().TagPeer(peer.ID, topic, peerWeight) + + // and notify our subscribers + d.onUpdatedPeers(peer.ID, true) } // ensurePeers ensures we always have 'peerLimit' connected peers. @@ -210,32 +220,31 @@ func (d *Discovery) findPeers(ctx context.Context) { return } - peers, err := d.disc.FindPeers(ctx, topic) + findCtx, findCancel := context.WithCancel(ctx) + defer findCancel() + + peers, err := d.disc.FindPeers(findCtx, topic) if err != nil { - log.Error(err) + log.Warn(err) return } - // we use a wait group to avoid a context leak - var wg sync.WaitGroup - ctx, cancel := context.WithCancel(ctx) - + // we use errgroup as it obeys the context + wg, findCtx := errgroup.WithContext(ctx) for p := range peers { - wg.Add(1) - go func(peer peer.AddrInfo) { - defer wg.Done() - d.handlePeerFound(ctx, peer, cancel) - }(p) + peer := p + wg.Go(func() error { + // pass the cancel so that we cancel FindPeers when we connected to enough peers + d.handlePeerFound(findCtx, peer, findCancel) + return nil + }) } - - // Wait for all goroutines to finish, then cancel the context - go func() { - wg.Wait() - cancel() - }() + // we expect no errors + _ = wg.Wait() } // Advertise is a utility function that persistently advertises a service through an Advertiser. +// TODO: Start advertising only after the reachability is confirmed by AutoNAT func (d *Discovery) Advertise(ctx context.Context) { timer := time.NewTimer(d.advertiseInterval) defer timer.Stop() diff --git a/share/availability/discovery/set.go b/share/availability/discovery/set.go index 5ee9e5ba8d..e5e121b533 100644 --- a/share/availability/discovery/set.go +++ b/share/availability/discovery/set.go @@ -40,9 +40,9 @@ func (ps *limitedSet) Size() int { return len(ps.ps) } -// TryAdd attempts to add the given peer into the set. +// Add attempts to add the given peer into the set. // This operation will fail if the number of peers in the set is equal to size. 
-func (ps *limitedSet) TryAdd(p peer.ID) error { +func (ps *limitedSet) Add(p peer.ID) error { ps.lk.Lock() if _, ok := ps.ps[p]; ok { return errors.New("share: discovery: peer already added") diff --git a/share/availability/discovery/set_test.go b/share/availability/discovery/set_test.go index b4c6a8fb49..9a145abc41 100644 --- a/share/availability/discovery/set_test.go +++ b/share/availability/discovery/set_test.go @@ -15,7 +15,7 @@ func TestSet_TryAdd(t *testing.T) { require.NoError(t, err) set := newLimitedSet(1) - require.NoError(t, set.TryAdd(h.ID())) + require.NoError(t, set.Add(h.ID())) require.True(t, set.Contains(h.ID())) } @@ -27,8 +27,8 @@ func TestSet_TryAddFails(t *testing.T) { require.NoError(t, err) set := newLimitedSet(1) - require.NoError(t, set.TryAdd(h1.ID())) - require.Error(t, set.TryAdd(h2.ID())) + require.NoError(t, set.Add(h1.ID())) + require.Error(t, set.Add(h2.ID())) } func TestSet_Remove(t *testing.T) { @@ -37,7 +37,7 @@ func TestSet_Remove(t *testing.T) { require.NoError(t, err) set := newLimitedSet(1) - require.NoError(t, set.TryAdd(h.ID())) + require.NoError(t, set.Add(h.ID())) set.Remove(h.ID()) require.False(t, set.Contains(h.ID())) } @@ -50,8 +50,8 @@ func TestSet_Peers(t *testing.T) { require.NoError(t, err) set := newLimitedSet(2) - require.NoError(t, set.TryAdd(h1.ID())) - require.NoError(t, set.TryAdd(h2.ID())) + require.NoError(t, set.Add(h1.ID())) + require.NoError(t, set.Add(h2.ID())) ctx, cancel := context.WithTimeout(context.Background(), time.Second*1) t.Cleanup(cancel) @@ -71,7 +71,7 @@ func TestSet_WaitPeers(t *testing.T) { set := newLimitedSet(2) go func() { time.Sleep(time.Millisecond * 500) - set.TryAdd(h1.ID()) //nolint:errcheck + set.Add(h1.ID()) //nolint:errcheck }() ctx, cancel := context.WithTimeout(context.Background(), time.Second*2) @@ -91,8 +91,8 @@ func TestSet_Size(t *testing.T) { require.NoError(t, err) set := newLimitedSet(2) - require.NoError(t, set.TryAdd(h1.ID())) - require.NoError(t, set.TryAdd(h2.ID())) + require.NoError(t, set.Add(h1.ID())) + require.NoError(t, set.Add(h2.ID())) require.Equal(t, 2, set.Size()) set.Remove(h2.ID()) require.Equal(t, 1, set.Size()) From f058a21107abfad6ad62273600c517e640adff6b Mon Sep 17 00:00:00 2001 From: Wondertan Date: Fri, 21 Apr 2023 15:09:31 +0200 Subject: [PATCH 04/56] add connecting map --- share/availability/discovery/discovery.go | 83 +++++++++++++++-------- 1 file changed, 53 insertions(+), 30 deletions(-) diff --git a/share/availability/discovery/discovery.go b/share/availability/discovery/discovery.go index 2ba9cca7f0..012e593cb2 100644 --- a/share/availability/discovery/discovery.go +++ b/share/availability/discovery/discovery.go @@ -3,6 +3,7 @@ package discovery import ( "context" "errors" + "sync" "time" logging "github.com/ipfs/go-log/v2" @@ -53,6 +54,10 @@ type Discovery struct { // onUpdatedPeers will be called on peer set changes onUpdatedPeers OnUpdatedPeers + connectingLk sync.Mutex + connecting map[peer.ID]context.CancelFunc + + ctx context.Context cancel context.CancelFunc } @@ -77,14 +82,13 @@ func NewDiscovery( discoveryInterval: discInterval, advertiseInterval: advertiseInterval, onUpdatedPeers: func(peer.ID, bool) {}, + connecting: make(map[peer.ID]context.CancelFunc), } } func (d *Discovery) Start(context.Context) error { - ctx, cancel := context.WithCancel(context.Background()) - d.cancel = cancel - - go d.ensurePeers(ctx) + d.ctx, d.cancel = context.WithCancel(context.Background()) + go d.ensurePeers(d.ctx) return nil } @@ -112,38 +116,28 @@ func (d *Discovery) 
handlePeerFound(ctx context.Context, peer peer.AddrInfo, can return } - ctx, cancel := context.WithTimeout(ctx, d.dialTimeout) - defer cancel() - d.connectingLk.Lock() + d.connecting[peer.ID] = cancelFind + d.connectingLk.Unlock() + err := d.connector.Connect(ctx, peer) if err != nil { // we don't want to add backoff when the context is canceled. if errors.Is(err, context.Canceled) || errors.Is(err, routing.ErrNotFound) { d.connector.RemoveBackoff(peer.ID) } - return - } - err = d.set.Add(peer.ID) - if err != nil { - log.Debugw("failed to add peer to set", "peer", peer.ID, "error", err) + d.connectingLk.Lock() + delete(d.connecting, peer.ID) + d.connectingLk.Unlock() return } - log.Debugw("added peer to set", "id", peer.ID) - - // check the size only after we add - // so that peer set represents the actual number of connections we made - // which can go slightly over peersLimit - if uint(d.set.Size()) >= d.peersLimit { - log.Infow("soft peer limit reached", "count", d.set.Size(), "peer", peer.ID) - cancelFind() - } // tag to protect peer from being killed by ConnManager // NOTE: This does not protect from the remote peer killing the connection. // In the future, we should design a protocol that keeps bidirectional agreement on whether // connection should be kept or not, similar to mesh link in GossipSub. d.host.ConnManager().TagPeer(peer.ID, topic, peerWeight) } // ensurePeers ensures we always have 'peerLimit' connected peers. @@ -188,16 +182,45 @@ func (d *Discovery) ensurePeers(ctx context.Context) { } // listen to disconnect event to remove peer from set and reset backoff time // reset timer in order to restart the discovery, once stored peer is disconnected - connStatus := e.(event.EvtPeerConnectednessChanged) - if connStatus.Connectedness == network.NotConnected { - if d.set.Contains(connStatus.Peer) { - log.Debugw("removing peer from the peer set", - "peer", connStatus.Peer, "status", connStatus.Connectedness.String()) - d.connector.RestartBackoff(connStatus.Peer) - d.set.Remove(connStatus.Peer) - d.onUpdatedPeers(connStatus.Peer, false) - d.host.ConnManager().UntagPeer(connStatus.Peer, topic) + evnt := e.(event.EvtPeerConnectednessChanged) + switch evnt.Connectedness { + case network.NotConnected: + if !d.set.Contains(evnt.Peer) { + continue + } + + d.host.ConnManager().UntagPeer(evnt.Peer, topic) + d.connector.RestartBackoff(evnt.Peer) + d.set.Remove(evnt.Peer) + d.onUpdatedPeers(evnt.Peer, false) + log.Debugw("removed peer from the peer set", + "peer", evnt.Peer, "status", evnt.Connectedness.String()) + case network.Connected: + peerID := evnt.Peer + d.connectingLk.Lock() + cancelFind, ok := d.connecting[peerID] + d.connectingLk.Unlock() + if !ok { + continue } + + err = d.set.Add(peerID) + if err != nil { + log.Debugw("failed to add peer to set", "peer", peerID, "error", err) + return + } + log.Debugw("added peer to set", "id", peerID) + + // first do Add and only after check the limit + // so that peer set represents the actual number of connections we made + // which can go slightly over peersLimit + if uint(d.set.Size()) >= d.peersLimit { + log.Infow("soft peer limit reached", "count", d.set.Size(), "peer", peerID) + cancelFind() + } + + // and notify our subscribers + d.onUpdatedPeers(peerID, true) } } } From 48732ff8613d9b98c395509987f205855a676cba Mon Sep 17 00:00:00 2001 From: Wondertan Date: Fri, 21 Apr 2023 15:43:30 +0200 Subject: [PATCH 05/56] cleanup the connecting map --- share/availability/discovery/discovery.go | 17
+++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/share/availability/discovery/discovery.go b/share/availability/discovery/discovery.go index 012e593cb2..a138a88e54 100644 --- a/share/availability/discovery/discovery.go +++ b/share/availability/discovery/discovery.go @@ -36,6 +36,7 @@ var waitF = func(ttl time.Duration) time.Duration { } // Discovery combines advertise and discover services and allows to store discovered nodes. +// TODO: The code here gets horribly hairy, so we should refactor this at some point type Discovery struct { set *limitedSet host host.Host @@ -113,13 +114,17 @@ func (d *Discovery) handlePeerFound(ctx context.Context, peer peer.AddrInfo, can return } - ctx, cancel := context.WithTimeout(ctx, d.dialTimeout) - defer cancel() - d.connectingLk.Lock() + if _, ok := d.connecting[peer.ID]; ok { + d.connectingLk.Unlock() + return + } d.connecting[peer.ID] = cancelFind d.connectingLk.Unlock() + ctx, cancel := context.WithTimeout(ctx, d.dialTimeout) + defer cancel() + err := d.connector.Connect(ctx, peer) if err != nil { // we don't want to add backoff when the context is canceled. @@ -215,10 +220,14 @@ func (d *Discovery) ensurePeers(ctx context.Context) { // so that peer set represents the actual number of connections we made // which can go slightly over peersLimit if uint(d.set.Size()) >= d.peersLimit { - log.Infow("soft peer limit reached", "count", d.set.Size(), "peer", peerID) + log.Infow("soft peer limit reached", "count", d.set.Size()) cancelFind() } + d.connectingLk.Lock() + delete(d.connecting, peerID) + d.connectingLk.Unlock() + // and notify our subscribers d.onUpdatedPeers(peerID, true) } From 3f3c287ba216b6ff1220118dfa26eee8d5b65b5a Mon Sep 17 00:00:00 2001 From: Wondertan Date: Fri, 21 Apr 2023 15:57:24 +0200 Subject: [PATCH 06/56] limit connections --- share/availability/discovery/discovery.go | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/share/availability/discovery/discovery.go b/share/availability/discovery/discovery.go index a138a88e54..3e5e0b1813 100644 --- a/share/availability/discovery/discovery.go +++ b/share/availability/discovery/discovery.go @@ -110,7 +110,10 @@ func (d *Discovery) WithOnPeersUpdate(f OnUpdatedPeers) { // handlePeersFound receives peers and tries to establish a connection with them. // Peer will be added to PeerCache if connection succeeds. 
func (d *Discovery) handlePeerFound(ctx context.Context, peer peer.AddrInfo, cancelFind context.CancelFunc) { - if peer.ID == d.host.ID() || len(peer.Addrs) == 0 || d.set.Contains(peer.ID) { + if peer.ID == d.host.ID() || + len(peer.Addrs) == 0 || + d.set.Contains(peer.ID) || + uint(d.set.Size()) >= d.peersLimit { return } @@ -263,6 +266,8 @@ func (d *Discovery) findPeers(ctx context.Context) { // we use errgroup as it obeys the context wg, findCtx := errgroup.WithContext(ctx) + // limit to minimize chances of overreaching the limit + wg.SetLimit(int(d.peersLimit)) for p := range peers { peer := p wg.Go(func() error { From 0148dd2c1c91946ab73c789b01572dbfbd610394 Mon Sep 17 00:00:00 2001 From: Wondertan Date: Fri, 21 Apr 2023 17:28:28 +0200 Subject: [PATCH 07/56] dialTimeout -> connectTimeout --- nodebuilder/share/config.go | 8 ++++---- nodebuilder/share/constructors.go | 2 +- share/availability/discovery/discovery.go | 12 +++++------- 3 files changed, 10 insertions(+), 12 deletions(-) diff --git a/nodebuilder/share/config.go b/nodebuilder/share/config.go index 5ff1bbb2f4..87c347eaf5 100644 --- a/nodebuilder/share/config.go +++ b/nodebuilder/share/config.go @@ -17,8 +17,8 @@ var ( type Config struct { // PeersLimit defines the soft limit of FNs to connect to via discovery. PeersLimit uint - // DialTimeout is the timeout for connecting to peers found via discovery. - DialTimeout time.Duration + // ConnectTimeout is the timeout for connecting to peers found via discovery. + ConnectTimeout time.Duration // DiscoveryInterval is an interval between discovery sessions. DiscoveryInterval time.Duration // AdvertiseInterval is a interval between advertising sessions. @@ -38,7 +38,7 @@ type Config struct { func DefaultConfig() Config { return Config{ PeersLimit: 5, - DialTimeout: time.Second * 60, + ConnectTimeout: time.Minute * 2, DiscoveryInterval: time.Second * 30, AdvertiseInterval: time.Second * 30, UseShareExchange: true, @@ -50,7 +50,7 @@ func DefaultConfig() Config { // Validate performs basic validation of the config. func (cfg *Config) Validate() error { - if cfg.DiscoveryInterval <= 0 || cfg.AdvertiseInterval <= 0 || cfg.DialTimeout <= 0 { + if cfg.DiscoveryInterval <= 0 || cfg.AdvertiseInterval <= 0 || cfg.ConnectTimeout <= 0 { return fmt.Errorf("nodebuilder/share: %s", ErrNegativeInterval) } if err := cfg.ShrExNDParams.Validate(); err != nil { diff --git a/nodebuilder/share/constructors.go b/nodebuilder/share/constructors.go index a01edf705c..0920ac536b 100644 --- a/nodebuilder/share/constructors.go +++ b/nodebuilder/share/constructors.go @@ -30,7 +30,7 @@ func discovery(cfg Config) func(routing.ContentRouting, host.Host) *disc.Discove h, routingdisc.NewRoutingDiscovery(r), cfg.PeersLimit, - cfg.DialTimeout, + cfg.ConnectTimeout, cfg.DiscoveryInterval, cfg.AdvertiseInterval, ) diff --git a/share/availability/discovery/discovery.go b/share/availability/discovery/discovery.go index 3e5e0b1813..7832714011 100644 --- a/share/availability/discovery/discovery.go +++ b/share/availability/discovery/discovery.go @@ -44,10 +44,8 @@ type Discovery struct { connector *backoffConnector // peersLimit is the soft limit of peers to add to the set. peersLimit uint - // dialTimeout is the timeout used for dialing peers. - // network.WithDialPeerTimeout is not sufficient here, - // as RoutedHost only uses it for the underlying dial. - dialTimeout time.Duration + // connectTimeout is the timeout used for dialing peers and discovering peer addresses. 
+ connectTimeout time.Duration // discInterval is an interval between discovery sessions. @@ -67,7 +65,7 @@ func NewDiscovery( h host.Host, d discovery.Discovery, peersLimit uint, - dialTimeout, + connectTimeout, discInterval, advertiseInterval time.Duration, ) *Discovery { @@ -79,7 +77,7 @@ func NewDiscovery( disc: d, connector: newBackoffConnector(h, defaultBackoffFactory), peersLimit: peersLimit, - dialTimeout: dialTimeout, + connectTimeout: connectTimeout, discoveryInterval: discInterval, advertiseInterval: advertiseInterval, onUpdatedPeers: func(peer.ID, bool) {}, @@ -125,7 +123,7 @@ func (d *Discovery) handlePeerFound(ctx context.Context, peer peer.AddrInfo, can d.connecting[peer.ID] = cancelFind d.connectingLk.Unlock() - ctx, cancel := context.WithTimeout(ctx, d.dialTimeout) + ctx, cancel := context.WithTimeout(ctx, d.connectTimeout) defer cancel() err := d.connector.Connect(ctx, peer) From 23b590e933546034bf68b8ab2fc543a2f899aa52 Mon Sep 17 00:00:00 2001 From: Wondertan Date: Mon, 24 Apr 2023 16:23:55 +0200 Subject: [PATCH 08/56] unlock on error --- share/availability/discovery/set.go | 1 + 1 file changed, 1 insertion(+) diff --git a/share/availability/discovery/set.go b/share/availability/discovery/set.go index e5e121b533..609c97cb34 100644 --- a/share/availability/discovery/set.go +++ b/share/availability/discovery/set.go @@ -45,6 +45,7 @@ func (ps *limitedSet) Add(p peer.ID) error { ps.lk.Lock() if _, ok := ps.ps[p]; ok { + ps.lk.Unlock() return errors.New("share: discovery: peer already added") } ps.ps[p] = struct{}{} From 9943c1790e9dc6d1b6dfcac1dfe57ec799bbe347 Mon Sep 17 00:00:00 2001 From: Wondertan Date: Mon, 24 Apr 2023 16:30:38 +0200 Subject: [PATCH 09/56] make connect timeout a constant --- nodebuilder/share/config.go | 5 +---- nodebuilder/share/constructors.go | 1 - share/availability/discovery/backoff.go | 10 +++++++++- share/availability/discovery/discovery.go | 7 ------- share/availability/discovery/set_test.go | 4 +--- share/availability/full/testing.go | 2 +- share/getters/shrex_test.go | 2 +- share/p2p/peers/manager_test.go | 6 ++---- 8 files changed, 15 insertions(+), 22 deletions(-) diff --git a/nodebuilder/share/config.go b/nodebuilder/share/config.go index 87c347eaf5..ac6c8613d9 100644 --- a/nodebuilder/share/config.go +++ b/nodebuilder/share/config.go @@ -17,8 +17,6 @@ type Config struct { // PeersLimit defines the soft limit of FNs to connect to via discovery. PeersLimit uint - // ConnectTimeout is the timeout for connecting to peers found via discovery. - ConnectTimeout time.Duration // DiscoveryInterval is an interval between discovery sessions. DiscoveryInterval time.Duration // AdvertiseInterval is a interval between advertising sessions. @@ -38,7 +36,6 @@ type Config struct { func DefaultConfig() Config { return Config{ PeersLimit: 5, - ConnectTimeout: time.Minute * 2, DiscoveryInterval: time.Second * 30, AdvertiseInterval: time.Second * 30, UseShareExchange: true, @@ -50,7 +47,7 @@ func DefaultConfig() Config { // Validate performs basic validation of the config.
func (cfg *Config) Validate() error { - if cfg.DiscoveryInterval <= 0 || cfg.AdvertiseInterval <= 0 || cfg.ConnectTimeout <= 0 { + if cfg.DiscoveryInterval <= 0 || cfg.AdvertiseInterval <= 0 { return fmt.Errorf("nodebuilder/share: %s", ErrNegativeInterval) } if err := cfg.ShrExNDParams.Validate(); err != nil { diff --git a/nodebuilder/share/constructors.go b/nodebuilder/share/constructors.go index 0920ac536b..a8d45f6a40 100644 --- a/nodebuilder/share/constructors.go +++ b/nodebuilder/share/constructors.go @@ -30,7 +30,6 @@ func discovery(cfg Config) func(routing.ContentRouting, host.Host) *disc.Discove h, routingdisc.NewRoutingDiscovery(r), cfg.PeersLimit, - cfg.ConnectTimeout, cfg.DiscoveryInterval, cfg.AdvertiseInterval, ) diff --git a/share/availability/discovery/backoff.go b/share/availability/discovery/backoff.go index 173e5c25b9..af9500b30e 100644 --- a/share/availability/discovery/backoff.go +++ b/share/availability/discovery/backoff.go @@ -12,7 +12,11 @@ import ( ) // gcInterval is a default period after which disconnected peers will be removed from cache -const gcInterval = time.Hour +const ( + gcInterval = time.Hour + // connectTimeout is the timeout used for dialing peers and discovering peer addresses. + connectTimeout = time.Minute * 2 +) var ( defaultBackoffFactory = backoff.NewFixedBackoff(time.Hour) @@ -55,6 +59,10 @@ func (b *backoffConnector) Connect(ctx context.Context, p peer.AddrInfo) error { } cache.nexttry = time.Now().Add(cache.backoff.Delay()) b.cacheLk.Unlock() + + ctx, cancel := context.WithTimeout(ctx, connectTimeout) + defer cancel() + return b.h.Connect(ctx, p) } diff --git a/share/availability/discovery/discovery.go b/share/availability/discovery/discovery.go index 7832714011..c42d4c855f 100644 --- a/share/availability/discovery/discovery.go +++ b/share/availability/discovery/discovery.go @@ -44,8 +44,6 @@ type Discovery struct { connector *backoffConnector // peersLimit is the soft limit of peers to add to the set. peersLimit uint - // connectTimeout is the timeout used for dialing peers and discovering peer addresses. - connectTimeout time.Duration // discInterval is an interval between discovery sessions. discoveryInterval time.Duration // advertiseInterval is an interval between advertising sessions. @@ -67,7 +65,6 @@ func NewDiscovery( h host.Host, d discovery.Discovery, peersLimit uint, - connectTimeout, discInterval, advertiseInterval time.Duration, ) *Discovery { @@ -77,7 +74,6 @@ func NewDiscovery( disc: d, connector: newBackoffConnector(h, defaultBackoffFactory), peersLimit: peersLimit, - connectTimeout: connectTimeout, discoveryInterval: discInterval, advertiseInterval: advertiseInterval, onUpdatedPeers: func(peer.ID, bool) {}, @@ -123,9 +119,6 @@ func (d *Discovery) handlePeerFound(ctx context.Context, peer peer.AddrInfo, can d.connecting[peer.ID] = cancelFind d.connectingLk.Unlock() - ctx, cancel := context.WithTimeout(ctx, d.connectTimeout) - defer cancel() - err := d.connector.Connect(ctx, peer) if err != nil { // we don't want to add backoff when the context is canceled. 
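After this patch the connect timeout lives in exactly one place: backoffConnector.Connect derives a bounded context itself, so neither Discovery nor its config threads a dial duration around anymore. Below is a rough, self-contained sketch of that shape; the connector type, the string-keyed backoff map, and the stub dial are simplifications for illustration, not the real libp2p-backed implementation.

package main

import (
	"context"
	"errors"
	"fmt"
	"sync"
	"time"
)

// connectTimeout bounds every connection attempt in one place.
const connectTimeout = time.Minute * 2

var errBackoffNotEnded = errors.New("backoff period has not ended")

type connector struct {
	mu      sync.Mutex
	nextTry map[string]time.Time
	delay   time.Duration // fixed backoff between attempts per address
}

func (c *connector) connect(ctx context.Context, addr string, dial func(context.Context, string) error) error {
	c.mu.Lock()
	if time.Now().Before(c.nextTry[addr]) {
		c.mu.Unlock()
		return errBackoffNotEnded
	}
	c.nextTry[addr] = time.Now().Add(c.delay)
	c.mu.Unlock()

	// the timeout is applied here, so every caller gets the same bound
	ctx, cancel := context.WithTimeout(ctx, connectTimeout)
	defer cancel()
	return dial(ctx, addr)
}

func main() {
	c := &connector{nextTry: make(map[string]time.Time), delay: time.Hour}
	dial := func(context.Context, string) error { return nil } // stub dial
	fmt.Println(c.connect(context.Background(), "peer1", dial)) // <nil>
	fmt.Println(c.connect(context.Background(), "peer1", dial)) // backoff error
}

Centralizing the timeout as a constant also explains the config changes above: once callers can no longer influence the bound, ConnectTimeout has no reason to exist as a user-facing knob.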
diff --git a/share/availability/discovery/set_test.go b/share/availability/discovery/set_test.go index 9a145abc41..18fdbc3daf 100644 --- a/share/availability/discovery/set_test.go +++ b/share/availability/discovery/set_test.go @@ -23,12 +23,10 @@ func TestSet_TryAddFails(t *testing.T) { m := mocknet.New() h1, err := m.GenPeer() require.NoError(t, err) - h2, err := m.GenPeer() - require.NoError(t, err) set := newLimitedSet(1) require.NoError(t, set.Add(h1.ID())) - require.Error(t, set.Add(h2.ID())) + require.Error(t, set.Add(h1.ID())) } func TestSet_Remove(t *testing.T) { diff --git a/share/availability/full/testing.go b/share/availability/full/testing.go index 8df7277667..bb25462f46 100644 --- a/share/availability/full/testing.go +++ b/share/availability/full/testing.go @@ -38,7 +38,7 @@ func Node(dn *availability_test.TestDagNet) *availability_test.TestNode { func TestAvailability(getter share.Getter) *ShareAvailability { disc := discovery.NewDiscovery(nil, routing.NewRoutingDiscovery( - routinghelpers.Null{}), 0, time.Second, time.Second, time.Second, + routinghelpers.Null{}), 0, time.Second, time.Second, ) return NewShareAvailability(nil, getter, disc) } diff --git a/share/getters/shrex_test.go b/share/getters/shrex_test.go index 03b13e4961..7578195639 100644 --- a/share/getters/shrex_test.go +++ b/share/getters/shrex_test.go @@ -158,7 +158,7 @@ func testManager(ctx context.Context, host host.Host, headerSub libhead.Subscrib } disc := discovery.NewDiscovery(nil, - routingdisc.NewRoutingDiscovery(routinghelpers.Null{}), 0, time.Second, time.Second, time.Second) + routingdisc.NewRoutingDiscovery(routinghelpers.Null{}), 0, time.Second, time.Second) connGater, err := conngater.NewBasicConnectionGater(ds_sync.MutexWrap(datastore.NewMapDatastore())) if err != nil { return nil, err diff --git a/share/p2p/peers/manager_test.go b/share/p2p/peers/manager_test.go index ca64dfcbc5..71c5fef9d6 100644 --- a/share/p2p/peers/manager_test.go +++ b/share/p2p/peers/manager_test.go @@ -365,7 +365,7 @@ func TestIntegration(t *testing.T) { t.Run("get peer from discovery", func(t *testing.T) { nw, err := mocknet.FullMeshConnected(3) require.NoError(t, err) - ctx, cancel := context.WithTimeout(context.Background(), time.Second*5) + ctx, cancel := context.WithTimeout(context.Background(), time.Second*10) t.Cleanup(cancel) // set up bootstrapper @@ -395,7 +395,6 @@ func TestIntegration(t *testing.T) { routingdisc.NewRoutingDiscovery(router1), 10, time.Second, - time.Second, time.Second) // set up full node / receiver node @@ -408,7 +407,6 @@ func TestIntegration(t *testing.T) { 10, time.Second, time.Second, - time.Second, ) err = fnDisc.Start(ctx) require.NoError(t, err) @@ -461,7 +459,7 @@ func testManager(ctx context.Context, headerSub libhead.Subscriber[*header.Exten return nil, err } disc := discovery.NewDiscovery(nil, - routingdisc.NewRoutingDiscovery(routinghelpers.Null{}), 0, time.Second, time.Second, time.Second) + routingdisc.NewRoutingDiscovery(routinghelpers.Null{}), 0, time.Second, time.Second) connGater, err := conngater.NewBasicConnectionGater(sync.MutexWrap(datastore.NewMapDatastore())) if err != nil { return nil, err From e58c3d5638b8323ee2bc1e327e36d341fe9bb6ba Mon Sep 17 00:00:00 2001 From: Wondertan Date: Wed, 26 Apr 2023 14:48:16 +0200 Subject: [PATCH 10/56] discovery parameters --- nodebuilder/share/constructors.go | 9 ++-- share/availability/discovery/discovery.go | 63 ++++++++++++----------- share/availability/discovery/set.go | 4 ++ share/availability/full/testing.go | 6 ++- 
share/getters/shrex_test.go | 7 ++- share/p2p/peers/manager_test.go | 24 ++++++--- 6 files changed, 70 insertions(+), 43 deletions(-) diff --git a/nodebuilder/share/constructors.go b/nodebuilder/share/constructors.go index a8d45f6a40..69ecb3fddb 100644 --- a/nodebuilder/share/constructors.go +++ b/nodebuilder/share/constructors.go @@ -29,9 +29,12 @@ func discovery(cfg Config) func(routing.ContentRouting, host.Host) *disc.Discove return disc.NewDiscovery( h, routingdisc.NewRoutingDiscovery(r), - cfg.PeersLimit, - cfg.DiscoveryInterval, - cfg.AdvertiseInterval, + // TODO(@Wondertan): inline into Config during next breaking release + disc.Parameters{ + PeersLimit: cfg.PeersLimit, + DiscoveryInterval: cfg.DiscoveryInterval, + AdvertiseInterval: cfg.AdvertiseInterval, + }, ) } } diff --git a/share/availability/discovery/discovery.go b/share/availability/discovery/discovery.go index c42d4c855f..695b62e460 100644 --- a/share/availability/discovery/discovery.go +++ b/share/availability/discovery/discovery.go @@ -35,20 +35,25 @@ var waitF = func(ttl time.Duration) time.Duration { return 7 * ttl / 8 } +type Parameters struct { + // PeersLimit defines the soft limit of FNs to connect to via discovery. + PeersLimit uint + // DiscoveryInterval is an interval between discovery sessions. + DiscoveryInterval time.Duration + // AdvertiseInterval is an interval between advertising sessions. + // NOTE: only full and bridge can advertise themselves. + AdvertiseInterval time.Duration +} + // Discovery combines advertise and discover services and allows to store discovered nodes. // TODO: The code here gets horribly hairy, so we should refactor this at some point type Discovery struct { - set *limitedSet - host host.Host - disc discovery.Discovery - connector *backoffConnector - // peersLimit is the soft limit of peers to add to the set. - peersLimit uint - // discInterval is an interval between discovery sessions. - discoveryInterval time.Duration - // advertiseInterval is an interval between advertising sessions. - advertiseInterval time.Duration - // onUpdatedPeers will be called on peer set changes + params Parameters + + set *limitedSet + host host.Host + disc discovery.Discovery + connector *backoffConnector onUpdatedPeers OnUpdatedPeers connectingLk sync.Mutex @@ -69,20 +69,16 @@ type OnUpdatedPeers func(peerID peer.ID, isAdded bool) func NewDiscovery( h host.Host, d discovery.Discovery, - peersLimit uint, - discInterval, - advertiseInterval time.Duration, + params Parameters, ) *Discovery { return &Discovery{ - set: newLimitedSet(peersLimit), - host: h, - disc: d, - connector: newBackoffConnector(h, defaultBackoffFactory), - peersLimit: peersLimit, - discoveryInterval: discInterval, - advertiseInterval: advertiseInterval, - onUpdatedPeers: func(peer.ID, bool) {}, - connecting: make(map[peer.ID]context.CancelFunc), + params: params, + set: newLimitedSet(params.PeersLimit), + host: h, + disc: d, + connector: newBackoffConnector(h, defaultBackoffFactory), + onUpdatedPeers: func(peer.ID, bool) {}, + connecting: make(map[peer.ID]context.CancelFunc), } } @@ -108,7 +108,7 @@ func (d *Discovery) handlePeerFound(ctx context.Context, peer peer.AddrInfo, can if peer.ID == d.host.ID() || len(peer.Addrs) == 0 || d.set.Contains(peer.ID) || - uint(d.set.Size()) >= d.peersLimit { + uint(d.set.Size()) >= d.set.Limit() { return } @@ -144,7 +144,7 @@ func (d *Discovery) handlePeerFound(ctx context.Context, peer peer.AddrInfo, can // It starts peer discovery every 30 seconds until peer cache reaches peersLimit.
// Discovery is restarted if any previously connected peers disconnect. func (d *Discovery) ensurePeers(ctx context.Context) { - if d.peersLimit == 0 { + if d.params.PeersLimit == 0 { log.Warn("peers limit is set to 0. Skipping discovery...") return } @@ -158,7 +159,7 @@ func (d *Discovery) ensurePeers(ctx context.Context) { } go d.connector.GC(ctx) - t := time.NewTicker(d.discoveryInterval) + t := time.NewTicker(d.params.DiscoveryInterval) defer func() { t.Stop() if err = sub.Close(); err != nil { @@ -213,7 +214,7 @@ func (d *Discovery) ensurePeers(ctx context.Context) { // first do Add and only after check the limit // so that peer set represents the actual number of connections we made // which can go slightly over peersLimit - if uint(d.set.Size()) >= d.peersLimit { + if uint(d.set.Size()) >= d.set.Limit() { log.Infow("soft peer limit reached", "count", d.set.Size()) cancelFind() } @@ -241,7 +242,7 @@ func (d *Discovery) ensurePeers(ctx context.Context) { } func (d *Discovery) findPeers(ctx context.Context) { - if uint(d.set.Size()) >= d.peersLimit { + if uint(d.set.Size()) >= d.set.Limit() { log.Debugw("at peer limit, skipping FindPeers", "size", d.set.Size()) return } @@ -258,7 +259,7 @@ func (d *Discovery) findPeers(ctx context.Context) { // we use errgroup as it obeys the context wg, findCtx := errgroup.WithContext(ctx) // limit to minimize chances of overreaching the limit - wg.SetLimit(int(d.peersLimit)) + wg.SetLimit(int(d.set.Limit())) for p := range peers { peer := p wg.Go(func() error { @@ -274,7 +275,7 @@ func (d *Discovery) findPeers(ctx context.Context) { // Advertise is a utility function that persistently advertises a service through an Advertiser. // TODO: Start advertising only after the reachability is confirmed by AutoNAT func (d *Discovery) Advertise(ctx context.Context) { - timer := time.NewTimer(d.advertiseInterval) + timer := time.NewTimer(d.params.AdvertiseInterval) defer timer.Stop() for { ttl, err := d.disc.Advertise(ctx, topic) @@ -286,7 +287,7 @@ func (d *Discovery) Advertise(ctx context.Context) { select { case <-timer.C: - timer.Reset(d.advertiseInterval) + timer.Reset(d.params.AdvertiseInterval) continue case <-ctx.Done(): return diff --git a/share/availability/discovery/set.go b/share/availability/discovery/set.go index 609c97cb34..704b068224 100644 --- a/share/availability/discovery/set.go +++ b/share/availability/discovery/set.go @@ -34,6 +34,10 @@ func (ps *limitedSet) Contains(p peer.ID) bool { return ok } +func (ps *limitedSet) Limit() uint { + return ps.limit +} + func (ps *limitedSet) Size() int { ps.lk.RLock() defer ps.lk.RUnlock() diff --git a/share/availability/full/testing.go b/share/availability/full/testing.go index bb25462f46..e1f62f3c62 100644 --- a/share/availability/full/testing.go +++ b/share/availability/full/testing.go @@ -38,7 +38,11 @@ func Node(dn *availability_test.TestDagNet) *availability_test.TestNode { func TestAvailability(getter share.Getter) *ShareAvailability { disc := discovery.NewDiscovery(nil, routing.NewRoutingDiscovery( - routinghelpers.Null{}), 0, time.Second, time.Second, + routinghelpers.Null{}), discovery.Parameters{ + PeersLimit: 10, + DiscoveryInterval: time.Second, + AdvertiseInterval: time.Second, + }, ) return NewShareAvailability(nil, getter, disc) } diff --git a/share/getters/shrex_test.go b/share/getters/shrex_test.go index 7578195639..65192ded8e 100644 --- a/share/getters/shrex_test.go +++ b/share/getters/shrex_test.go @@ -158,7 +158,12 @@ func testManager(ctx context.Context, host host.Host, headerSub 
libhead.Subscrib } disc := discovery.NewDiscovery(nil, - routingdisc.NewRoutingDiscovery(routinghelpers.Null{}), 0, time.Second, time.Second) + routingdisc.NewRoutingDiscovery(routinghelpers.Null{}), discovery.Parameters{ + PeersLimit: 10, + DiscoveryInterval: time.Second, + AdvertiseInterval: time.Second, + }, + ) connGater, err := conngater.NewBasicConnectionGater(ds_sync.MutexWrap(datastore.NewMapDatastore())) if err != nil { return nil, err diff --git a/share/p2p/peers/manager_test.go b/share/p2p/peers/manager_test.go index 71c5fef9d6..eecf2f3ac4 100644 --- a/share/p2p/peers/manager_test.go +++ b/share/p2p/peers/manager_test.go @@ -393,9 +393,12 @@ func TestIntegration(t *testing.T) { bnDisc := discovery.NewDiscovery( nw.Hosts()[0], routingdisc.NewRoutingDiscovery(router1), - 10, - time.Second, - time.Second) + discovery.Parameters{ + PeersLimit: 0, + DiscoveryInterval: time.Second, + AdvertiseInterval: time.Second, + }, + ) // set up full node / receiver node fnHost := nw.Hosts()[0] @@ -404,9 +407,11 @@ func TestIntegration(t *testing.T) { fnDisc := discovery.NewDiscovery( nw.Hosts()[1], routingdisc.NewRoutingDiscovery(router2), - 10, - time.Second, - time.Second, + discovery.Parameters{ + PeersLimit: 10, + DiscoveryInterval: time.Second, + AdvertiseInterval: time.Second, + }, ) err = fnDisc.Start(ctx) require.NoError(t, err) @@ -458,8 +463,13 @@ func testManager(ctx context.Context, headerSub libhead.Subscriber[*header.Exten if err != nil { return nil, err } + disc := discovery.NewDiscovery(nil, - routingdisc.NewRoutingDiscovery(routinghelpers.Null{}), 0, time.Second, time.Second) + routingdisc.NewRoutingDiscovery(routinghelpers.Null{}), discovery.Parameters{ + PeersLimit: 0, + DiscoveryInterval: time.Second, + AdvertiseInterval: time.Second, + }) connGater, err := conngater.NewBasicConnectionGater(sync.MutexWrap(datastore.NewMapDatastore())) if err != nil { return nil, err From c717917cec71952437090f952c20efc16ae49c15 Mon Sep 17 00:00:00 2001 From: Wondertan Date: Wed, 26 Apr 2023 15:23:41 +0200 Subject: [PATCH 11/56] actually break the config, refine defaults and allow disabling discovery and advertisements --- nodebuilder/share/config.go | 25 +++--------------- nodebuilder/share/constructors.go | 7 +---- nodebuilder/tests/p2p_test.go | 10 ++++---- share/availability/discovery/discovery.go | 31 ++++++++++++++++++----- share/availability/discovery/set.go | 6 ++--- 5 files changed, 38 insertions(+), 41 deletions(-) diff --git a/nodebuilder/share/config.go b/nodebuilder/share/config.go index ac6c8613d9..179035e21d 100644 --- a/nodebuilder/share/config.go +++ b/nodebuilder/share/config.go @@ -1,27 +1,13 @@ package share import ( - "errors" - "fmt" - "time" - + disc "github.com/celestiaorg/celestia-node/share/availability/discovery" "github.com/celestiaorg/celestia-node/share/p2p/peers" "github.com/celestiaorg/celestia-node/share/p2p/shrexeds" "github.com/celestiaorg/celestia-node/share/p2p/shrexnd" ) -var ( - ErrNegativeInterval = errors.New("interval must be positive") -) - type Config struct { - // PeersLimit defines the soft limit of FNs to connect to via discovery. - PeersLimit uint - // DiscoveryInterval is an interval between discovery sessions. - DiscoveryInterval time.Duration - // AdvertiseInterval is a interval between advertising sessions. - // NOTE: only full and bridge can advertise themselves. - AdvertiseInterval time.Duration // UseShareExchange is a flag toggling the usage of shrex protocols for blocksync. 
// NOTE: This config variable only has an effect on full and bridge nodes. UseShareExchange bool @@ -31,25 +17,22 @@ type Config struct { ShrExNDParams *shrexnd.Parameters // PeerManagerParams sets peer-manager configuration parameters PeerManagerParams peers.Parameters + // Discovery sets peer discovery configuration parameters. + Discovery disc.Parameters } func DefaultConfig() Config { return Config{ - PeersLimit: 5, - DiscoveryInterval: time.Second * 30, - AdvertiseInterval: time.Second * 30, UseShareExchange: true, ShrExEDSParams: shrexeds.DefaultParameters(), ShrExNDParams: shrexnd.DefaultParameters(), PeerManagerParams: peers.DefaultParameters(), + Discovery: disc.DefaultParameters(), } } // Validate performs basic validation of the config. func (cfg *Config) Validate() error { - if cfg.DiscoveryInterval <= 0 || cfg.AdvertiseInterval <= 0 { - return fmt.Errorf("nodebuilder/share: %s", ErrNegativeInterval) - } if err := cfg.ShrExNDParams.Validate(); err != nil { return err } diff --git a/nodebuilder/share/constructors.go b/nodebuilder/share/constructors.go index 69ecb3fddb..59189243a5 100644 --- a/nodebuilder/share/constructors.go +++ b/nodebuilder/share/constructors.go @@ -29,12 +29,7 @@ func discovery(cfg Config) func(routing.ContentRouting, host.Host) *disc.Discove return disc.NewDiscovery( h, routingdisc.NewRoutingDiscovery(r), - // TODO(@Wondertan): inline into Config during next breaking release - disc.Parameters{ - PeersLimit: cfg.PeersLimit, - DiscoveryInterval: cfg.DiscoveryInterval, - AdvertiseInterval: cfg.AdvertiseInterval, - }, + cfg.Discovery, ) } } diff --git a/nodebuilder/tests/p2p_test.go b/nodebuilder/tests/p2p_test.go index 95a33b11e8..497b3b76a5 100644 --- a/nodebuilder/tests/p2p_test.go +++ b/nodebuilder/tests/p2p_test.go @@ -171,7 +171,7 @@ func TestRestartNodeDiscovery(t *testing.T) { const fullNodes = 2 setTimeInterval(cfg, defaultTimeInterval) - cfg.Share.PeersLimit = fullNodes + cfg.Share.Discovery.PeersLimit = fullNodes bridge := sw.NewNodeWithConfig(node.Bridge, cfg) ctx, cancel := context.WithTimeout(context.Background(), swamp.DefaultTestTimeout) @@ -184,7 +184,7 @@ func TestRestartNodeDiscovery(t *testing.T) { nodes := make([]*nodebuilder.Node, fullNodes) cfg = nodebuilder.DefaultConfig(node.Full) setTimeInterval(cfg, defaultTimeInterval) - cfg.Share.PeersLimit = fullNodes + cfg.Share.Discovery.PeersLimit = fullNodes nodesConfig := nodebuilder.WithBootstrappers([]peer.AddrInfo{*bridgeAddr}) for index := 0; index < fullNodes; index++ { nodes[index] = sw.NewNodeWithConfig(node.Full, cfg, nodesConfig) @@ -201,7 +201,7 @@ func TestRestartNodeDiscovery(t *testing.T) { // create one more node with disabled discovery cfg = nodebuilder.DefaultConfig(node.Full) setTimeInterval(cfg, defaultTimeInterval) - cfg.Share.PeersLimit = 0 + cfg.Share.Discovery.PeersLimit = 0 node := sw.NewNodeWithConfig(node.Full, cfg, nodesConfig) connectSub, err := nodes[0].Host.EventBus().Subscribe(&event.EvtPeerConnectednessChanged{}) require.NoError(t, err) @@ -215,6 +215,6 @@ func TestRestartNodeDiscovery(t *testing.T) { func setTimeInterval(cfg *nodebuilder.Config, interval time.Duration) { cfg.P2P.RoutingTableRefreshPeriod = interval - cfg.Share.DiscoveryInterval = interval - cfg.Share.AdvertiseInterval = interval + cfg.Share.Discovery.DiscoveryInterval = interval + cfg.Share.Discovery.AdvertiseInterval = interval } diff --git a/share/availability/discovery/discovery.go b/share/availability/discovery/discovery.go index 695b62e460..c7dcb5a69f 100644 --- 
a/share/availability/discovery/discovery.go +++ b/share/availability/discovery/discovery.go @@ -37,14 +37,29 @@ var waitF = func(ttl time.Duration) time.Duration { type Parameters struct { // PeersLimit defines the soft limit of FNs to connect to via discovery. - PeersLimit uint + // Set -1 to disable. + PeersLimit int // DiscoveryInterval is an interval between discovery sessions. + // Set -1 to disable. DiscoveryInterval time.Duration // AdvertiseInterval is a interval between advertising sessions. + // Set -1 to disable. // NOTE: only full and bridge can advertise themselves. AdvertiseInterval time.Duration } +func DefaultParameters() Parameters { + return Parameters{ + PeersLimit: 5, + DiscoveryInterval: time.Minute, + AdvertiseInterval: time.Hour * 8, + } +} + +func (p *Parameters) Validate() error { + return nil +} + // Discovery combines advertise and discover services and allows to store discovered nodes. // TODO: The code here gets horribly hairy, so we should refactor this at some point type Discovery struct { @@ -108,7 +123,7 @@ func (d *Discovery) handlePeerFound(ctx context.Context, peer peer.AddrInfo, can if peer.ID == d.host.ID() || len(peer.Addrs) == 0 || d.set.Contains(peer.ID) || - uint(d.set.Size()) >= d.set.Limit() { + d.set.Size() >= d.set.Limit() { return } @@ -144,7 +159,7 @@ func (d *Discovery) handlePeerFound(ctx context.Context, peer peer.AddrInfo, can // It starts peer discovery every 30 seconds until peer cache reaches peersLimit. // Discovery is restarted if any previously connected peers disconnect. func (d *Discovery) ensurePeers(ctx context.Context) { - if d.params.PeersLimit == 0 { + if d.params.PeersLimit == 0 || d.params.DiscoveryInterval == -1 { log.Warn("peers limit is set to 0. Skipping discovery...") return } @@ -214,7 +229,7 @@ func (d *Discovery) ensurePeers(ctx context.Context) { // first do Add and only after check the limit // so that peer set represents the actual number of connections we made // which can go slightly over peersLimit - if uint(d.set.Size()) >= d.set.Limit() { + if d.set.Size() >= d.set.Limit() { log.Infow("soft peer limit reached", "count", d.set.Size()) cancelFind() } @@ -242,7 +257,7 @@ func (d *Discovery) ensurePeers(ctx context.Context) { } func (d *Discovery) findPeers(ctx context.Context) { - if uint(d.set.Size()) >= d.set.Limit() { + if d.set.Size() >= d.set.Limit() { log.Debugw("at peer limit, skipping FindPeers", "size", d.set.Size()) return } @@ -259,7 +274,7 @@ func (d *Discovery) findPeers(ctx context.Context) { // we use errgroup as it obeys the context wg, findCtx := errgroup.WithContext(ctx) // limit to minimize chances of overreaching the limit - wg.SetLimit(int(d.set.Limit())) + wg.SetLimit(d.set.Limit()) for p := range peers { peer := p wg.Go(func() error { @@ -275,6 +290,10 @@ func (d *Discovery) findPeers(ctx context.Context) { // Advertise is a utility function that persistently advertises a service through an Advertiser. 
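The Parameters type introduced by this commit is the new public knob surface for discovery. A hedged sketch of how an embedder might fill it in, assuming only the fields and DefaultParameters shown above; the helper name and concrete values are illustrative, not from the codebase:

import (
	"time"

	disc "github.com/celestiaorg/celestia-node/share/availability/discovery"
)

// lightDiscoveryParams is a hypothetical helper: start from the defaults and
// override selectively, using -1 to switch advertising off entirely.
func lightDiscoveryParams() disc.Parameters {
	p := disc.DefaultParameters() // PeersLimit: 5, DiscoveryInterval: time.Minute, AdvertiseInterval: 8h
	p.DiscoveryInterval = time.Second * 30
	p.AdvertiseInterval = -1 // per the comments above, only full and bridge nodes advertise
	return p
}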
// TODO: Start advertising only after the reachability is confirmed by AutoNAT func (d *Discovery) Advertise(ctx context.Context) { + if d.params.AdvertiseInterval == -1 { + return + } + timer := time.NewTimer(d.params.AdvertiseInterval) defer timer.Stop() for { diff --git a/share/availability/discovery/set.go b/share/availability/discovery/set.go index 704b068224..57acd54fcd 100644 --- a/share/availability/discovery/set.go +++ b/share/availability/discovery/set.go @@ -14,12 +14,12 @@ type limitedSet struct { lk sync.RWMutex ps map[peer.ID]struct{} - limit uint + limit int waitPeer chan peer.ID } // newLimitedSet constructs a set with the maximum peers amount. -func newLimitedSet(limit uint) *limitedSet { +func newLimitedSet(limit int) *limitedSet { ps := new(limitedSet) ps.ps = make(map[peer.ID]struct{}) ps.limit = limit @@ -34,7 +34,7 @@ func (ps *limitedSet) Contains(p peer.ID) bool { return ok } -func (ps *limitedSet) Limit() uint { +func (ps *limitedSet) Limit() int { return ps.limit } From ab034707f721ce9de5e533fe7e66b03a2559ce22 Mon Sep 17 00:00:00 2001 From: Ryan Date: Thu, 27 Apr 2023 08:39:55 +0200 Subject: [PATCH 12/56] fixing log and peer limit deactivation comment --- share/availability/discovery/discovery.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/share/availability/discovery/discovery.go b/share/availability/discovery/discovery.go index c7dcb5a69f..e3819af24a 100644 --- a/share/availability/discovery/discovery.go +++ b/share/availability/discovery/discovery.go @@ -37,7 +37,7 @@ var waitF = func(ttl time.Duration) time.Duration { type Parameters struct { // PeersLimit defines the soft limit of FNs to connect to via discovery. - // Set -1 to disable. + // Set 0 to disable. PeersLimit int // DiscoveryInterval is an interval between discovery sessions. // Set -1 to disable. @@ -160,7 +160,7 @@ func (d *Discovery) handlePeerFound(ctx context.Context, peer peer.AddrInfo, can // Discovery is restarted if any previously connected peers disconnect. func (d *Discovery) ensurePeers(ctx context.Context) { if d.params.PeersLimit == 0 || d.params.DiscoveryInterval == -1 { - log.Warn("peers limit is set to 0. Skipping discovery...") + log.Warn("peers limit is set to 0 and/or discovery interval is set to -1. 
Skipping discovery...") return } // subscribe on EventBus in order to catch disconnected peers and restart From 7cf72879666965939a2d44ce93c461aab696410e Mon Sep 17 00:00:00 2001 From: Ryan Date: Thu, 27 Apr 2023 08:41:11 +0200 Subject: [PATCH 13/56] removing storage of ctx on Discovery --- share/availability/discovery/discovery.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/share/availability/discovery/discovery.go b/share/availability/discovery/discovery.go index e3819af24a..c603593469 100644 --- a/share/availability/discovery/discovery.go +++ b/share/availability/discovery/discovery.go @@ -74,7 +74,6 @@ type Discovery struct { connectingLk sync.Mutex connecting map[peer.ID]context.CancelFunc - ctx context.Context cancel context.CancelFunc } @@ -98,8 +97,9 @@ func NewDiscovery( } func (d *Discovery) Start(context.Context) error { - d.ctx, d.cancel = context.WithCancel(context.Background()) - go d.ensurePeers(d.ctx) + ctx, cancel := context.WithCancel(context.Background()) + d.cancel = cancel + go d.ensurePeers(ctx) return nil } From 86a8a06e92d815e7d786762497e0a81172bad4b4 Mon Sep 17 00:00:00 2001 From: Ryan Date: Thu, 27 Apr 2023 08:48:07 +0200 Subject: [PATCH 14/56] notify subscribers directly after adding --- share/availability/discovery/discovery.go | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/share/availability/discovery/discovery.go b/share/availability/discovery/discovery.go index c603593469..6b0b6c18a4 100644 --- a/share/availability/discovery/discovery.go +++ b/share/availability/discovery/discovery.go @@ -225,6 +225,8 @@ func (d *Discovery) ensurePeers(ctx context.Context) { return } log.Debugw("added peer to set", "id", peerID) + // and notify our subscribers + d.onUpdatedPeers(peerID, true) // first do Add and only after check the limit // so that peer set represents the actual number of connections we made @@ -237,9 +239,6 @@ func (d *Discovery) ensurePeers(ctx context.Context) { d.connectingLk.Lock() delete(d.connecting, peerID) d.connectingLk.Unlock() - - // and notify our subscribers - d.onUpdatedPeers(peerID, true) } } } From 14adb1c9466e88b999f922c1ca481744fb6920fd Mon Sep 17 00:00:00 2001 From: Ryan Date: Thu, 27 Apr 2023 09:25:31 +0200 Subject: [PATCH 15/56] restarting FindPeers until limit is reached --- share/availability/discovery/discovery.go | 44 ++++++++++++++--------- 1 file changed, 27 insertions(+), 17 deletions(-) diff --git a/share/availability/discovery/discovery.go b/share/availability/discovery/discovery.go index 6b0b6c18a4..29a32f73b6 100644 --- a/share/availability/discovery/discovery.go +++ b/share/availability/discovery/discovery.go @@ -261,27 +261,37 @@ func (d *Discovery) findPeers(ctx context.Context) { return } - findCtx, findCancel := context.WithCancel(ctx) - defer findCancel() - - peers, err := d.disc.FindPeers(findCtx, topic) - if err != nil { - log.Warn(err) - return - } - // we use errgroup as it obeys the context - wg, findCtx := errgroup.WithContext(ctx) + wg, wgCtx := errgroup.WithContext(ctx) // limit to minimize chances of overreaching the limit wg.SetLimit(d.set.Limit()) - for p := range peers { - peer := p - wg.Go(func() error { - // pass the cancel so that we cancel FindPeers when we connected to enough peers - d.handlePeerFound(findCtx, peer, findCancel) - return nil - }) + + for d.set.Size() < d.set.Limit() { + log.Debugw("finding peers", "remaining", d.set.Limit()-d.set.Size()) + findCtx, findCancel := context.WithCancel(wgCtx) + defer findCancel() + + peers, err := 
d.disc.FindPeers(findCtx, topic) + if err != nil { + log.Warn(err) + return + } + + for p := range peers { + peer := p + wg.Go(func() error { + // pass the cancel so that we cancel FindPeers when we connected to enough peers + d.handlePeerFound(findCtx, peer, findCancel) + return nil + }) + + // break the loop if we have found enough peers + if d.set.Size() >= d.set.Limit() { + break + } + } } + // we expect no errors _ = wg.Wait() } From 8b8941f6373109f924f982c3f5974f65c83143ea Mon Sep 17 00:00:00 2001 From: Ryan Date: Thu, 27 Apr 2023 09:52:51 +0200 Subject: [PATCH 16/56] respecting ctx in findPeers loop --- share/availability/discovery/discovery.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/share/availability/discovery/discovery.go b/share/availability/discovery/discovery.go index 29a32f73b6..27367addb5 100644 --- a/share/availability/discovery/discovery.go +++ b/share/availability/discovery/discovery.go @@ -266,7 +266,7 @@ func (d *Discovery) findPeers(ctx context.Context) { // limit to minimize chances of overreaching the limit wg.SetLimit(d.set.Limit()) - for d.set.Size() < d.set.Limit() { + for d.set.Size() < d.set.Limit() && wgCtx.Err() == nil { log.Debugw("finding peers", "remaining", d.set.Limit()-d.set.Size()) findCtx, findCancel := context.WithCancel(wgCtx) defer findCancel() From c40f68e7a34fcface6b7ed15d0b4183200f7141f Mon Sep 17 00:00:00 2001 From: Wondertan Date: Wed, 26 Apr 2023 16:11:16 +0200 Subject: [PATCH 17/56] start discovery asap --- share/availability/discovery/discovery.go | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/share/availability/discovery/discovery.go b/share/availability/discovery/discovery.go index 27367addb5..b791e8c9e1 100644 --- a/share/availability/discovery/discovery.go +++ b/share/availability/discovery/discovery.go @@ -172,15 +172,7 @@ func (d *Discovery) ensurePeers(ctx context.Context) { log.Error(err) return } - go d.connector.GC(ctx) - - t := time.NewTicker(d.params.DiscoveryInterval) - defer func() { - t.Stop() - if err = sub.Close(); err != nil { - log.Error(err) - } - }() + defer sub.Close() // starting to listen to subscriptions async will help us to avoid any blocking // in the case when we will not have the needed amount of FNs and will be blocked in `FindPeers`. @@ -243,23 +235,28 @@ func (d *Discovery) ensurePeers(ctx context.Context) { } } }() + go d.connector.GC(ctx) + t := time.NewTicker(d.params.DiscoveryInterval) + defer t.Stop() for { + d.findPeers(ctx) + + t.Reset(d.params.DiscoveryInterval) select { + case <-t.C: case <-ctx.Done(): - log.Info("Context canceled. 
Finishing peer discovery") return - case <-t.C: - d.findPeers(ctx) } } } func (d *Discovery) findPeers(ctx context.Context) { if d.set.Size() >= d.set.Limit() { - log.Debugw("at peer limit, skipping FindPeers", "size", d.set.Size()) + log.Debugw("reached soft peer limit, skipping discovery", "size", d.set.Size()) return } + log.Infow("below soft peer limit, discovering peers", "amount", d.set.Limit()) // we use errgroup as it obeys the context wg, wgCtx := errgroup.WithContext(ctx) From 491762198e2705cdffcb4ef99e18617bcaad6cf3 Mon Sep 17 00:00:00 2001 From: Wondertan Date: Thu, 27 Apr 2023 10:44:33 +0200 Subject: [PATCH 18/56] case for already connected peer --- share/availability/discovery/discovery.go | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/share/availability/discovery/discovery.go b/share/availability/discovery/discovery.go index b791e8c9e1..2e3ccec319 100644 --- a/share/availability/discovery/discovery.go +++ b/share/availability/discovery/discovery.go @@ -127,6 +127,23 @@ func (d *Discovery) handlePeerFound(ctx context.Context, peer peer.AddrInfo, can return } + if d.host.Network().Connectedness(peer.ID) == network.Connected { + err := d.set.Add(peer.ID) + if err != nil { + return + } + log.Debugw("added peer to set", "id", peer.ID) + if d.set.Size() >= d.set.Limit() { + log.Infow("soft peer limit reached", "count", d.set.Size()) + cancelFind() + } + d.host.ConnManager().TagPeer(peer.ID, topic, peerWeight) + + // and notify our subscribers + d.onUpdatedPeers(peer.ID, true) + return + } + d.connectingLk.Lock() if _, ok := d.connecting[peer.ID]; ok { d.connectingLk.Unlock() @@ -180,11 +197,9 @@ func (d *Discovery) ensurePeers(ctx context.Context) { for { select { case <-ctx.Done(): - log.Debug("Context canceled. Finish listening for connectedness events.") return case e, ok := <-sub.Out(): if !ok { - log.Debug("Subscription for connectedness events is closed.") return } // listen to disconnect event to remove peer from set and reset backoff time @@ -319,6 +334,7 @@ func (d *Discovery) Advertise(ctx context.Context) { } } + log.Debugf("advertised") select { case <-timer.C: timer.Reset(waitF(ttl)) From ca3a4be1c43d5f0606acaa5f7372d9129b4e3e7c Mon Sep 17 00:00:00 2001 From: Wondertan Date: Thu, 27 Apr 2023 17:56:03 +0200 Subject: [PATCH 19/56] protect rather than tag --- share/availability/discovery/discovery.go | 12 +- .../availability/discovery/discovery_test.go | 140 ++++++++++++++++++ 2 files changed, 145 insertions(+), 7 deletions(-) create mode 100644 share/availability/discovery/discovery_test.go diff --git a/share/availability/discovery/discovery.go b/share/availability/discovery/discovery.go index 2e3ccec319..c1b6174eb9 100644 --- a/share/availability/discovery/discovery.go +++ b/share/availability/discovery/discovery.go @@ -20,10 +20,7 @@ import ( var log = logging.Logger("share/discovery") const ( - // peerWeight is a number that will be assigned to all discovered full nodes, - // so ConnManager will not break a connection with them. 
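The switch this commit makes from ConnManager tagging to protection is easy to illustrate against the standard libp2p connmgr interface; a hedged sketch, where the tag string and weight are illustrative:

import (
	"github.com/libp2p/go-libp2p/core/host"
	"github.com/libp2p/go-libp2p/core/peer"
)

func keepPeer(h host.Host, p peer.ID) {
	// TagPeer only raises the peer's score; under connection pressure the
	// ConnManager may still prune it in favor of higher-scored peers.
	h.ConnManager().TagPeer(p, "full", 1000)

	// Protect exempts the peer from pruning altogether until Unprotect is
	// called with the same tag, which is the guarantee discovery wants here.
	h.ConnManager().Protect(p, "full")
}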
- peerWeight = 1000 - topic = "full" + topic = "full" // eventbusBufSize is the size of the buffered channel to handle // events in libp2p @@ -137,7 +134,7 @@ func (d *Discovery) handlePeerFound(ctx context.Context, peer peer.AddrInfo, can log.Infow("soft peer limit reached", "count", d.set.Size()) cancelFind() } - d.host.ConnManager().TagPeer(peer.ID, topic, peerWeight) + d.host.ConnManager().Protect(peer.ID, topic) // and notify our subscribers d.onUpdatedPeers(peer.ID, true) @@ -169,7 +166,7 @@ func (d *Discovery) handlePeerFound(ctx context.Context, peer peer.AddrInfo, can // NOTE: This is does not protect from remote killing the connection. // In the future, we should design a protocol that keeps bidirectional agreement on whether // connection should be kept or not, similar to mesh link in GossipSub. - d.host.ConnManager().TagPeer(peer.ID, topic, peerWeight) + d.host.ConnManager().Protect(peer.ID, topic) } // ensurePeers ensures we always have 'peerLimit' connected peers. @@ -229,7 +226,7 @@ func (d *Discovery) ensurePeers(ctx context.Context) { err = d.set.Add(peerID) if err != nil { log.Debugw("failed to add peer to set", "peer", peerID, "error", err) - return + continue } log.Debugw("added peer to set", "id", peerID) // and notify our subscribers @@ -258,6 +255,7 @@ func (d *Discovery) ensurePeers(ctx context.Context) { d.findPeers(ctx) t.Reset(d.params.DiscoveryInterval) + log.Debugw("restarted") select { case <-t.C: case <-ctx.Done(): diff --git a/share/availability/discovery/discovery_test.go b/share/availability/discovery/discovery_test.go new file mode 100644 index 0000000000..77dbcd9ecc --- /dev/null +++ b/share/availability/discovery/discovery_test.go @@ -0,0 +1,140 @@ +package discovery + +import ( + "context" + "testing" + "time" + + "github.com/celestiaorg/celestia-node/logs" + logging "github.com/ipfs/go-log/v2" + dht "github.com/libp2p/go-libp2p-kad-dht" + "github.com/libp2p/go-libp2p/core/discovery" + "github.com/libp2p/go-libp2p/core/event" + "github.com/libp2p/go-libp2p/core/host" + "github.com/libp2p/go-libp2p/core/peer" + "github.com/libp2p/go-libp2p/p2p/discovery/routing" + basic "github.com/libp2p/go-libp2p/p2p/host/basic" + "github.com/libp2p/go-libp2p/p2p/net/connmgr" + swarm "github.com/libp2p/go-libp2p/p2p/net/swarm/testing" + "github.com/stretchr/testify/require" +) + +func TestDiscovery(t *testing.T) { + logging.SetLogLevel("share/discovery", "debug") + // logging.SetDebugLogging() + logs.SetDebugLogging() + + const fulls = 120 + ctx, cancel := context.WithTimeout(context.Background(), time.Second*30) + t.Cleanup(cancel) + + tn := newTestnet(ctx, t) + + peerA := tn.discovery(Parameters{ + PeersLimit: fulls, + DiscoveryInterval: time.Millisecond*100, + AdvertiseInterval: -1, + }) + + for range make([]int, fulls) { + tn.discovery(Parameters{ + PeersLimit: 0, + DiscoveryInterval: time.Hour, + AdvertiseInterval: time.Millisecond*100, // should only happen once + }) + } + + + // go func() { + // for { + // time.Sleep(time.Second) + // t.Log(len(peerA.host.Network().Peers())) + // } + // }() + + var peers []peer.ID + for len(peers) != fulls { + peers, _ = peerA.Peers(ctx) + if ctx.Err() != nil { + t.Fatal("did not discover peers in time") + } + } +} + +type testnet struct { + ctx context.Context + T *testing.T + + bootstrapper host.Host +} + +func newTestnet(ctx context.Context, t *testing.T) *testnet { + swrm := swarm.GenSwarm(t, swarm.OptDisableTCP) + hst, err := basic.NewHost(swrm, &basic.HostOpts{}) + require.NoError(t, err) + hst.Start() + + sub, err := 
hst.EventBus().Subscribe(&event.EvtPeerConnectednessChanged{}) + require.NoError(t, err) + + go func() { + for { + evt := (<-sub.Out()).(event.EvtPeerConnectednessChanged) + log.Debugw("evt", "peer", evt.Peer) + } + }() + + dht, err := dht.New(ctx, hst, dht.DisableAutoRefresh(), dht.ProtocolPrefix("/test"), dht.Mode(dht.ModeServer), dht.BootstrapPeers(), dht.BucketSize(3)) + require.NoError(t, err) + t.Cleanup(func() { + err := dht.Close() + require.NoError(t, err) + }) + + return &testnet{ctx: ctx, T: t, bootstrapper: hst} +} + +func (t *testnet) discovery(params Parameters) *Discovery { + hst, routingDisc := t.peer() + disc := NewDiscovery(hst, routingDisc, params) + err := disc.Start(t.ctx) + require.NoError(t.T, err) + t.T.Cleanup(func() { + err := disc.Stop(t.ctx) + require.NoError(t.T, err) + }) + + go disc.Advertise(t.ctx) + return disc +} + +func (t *testnet) peer() (host.Host, discovery.Discovery) { + swrm := swarm.GenSwarm(t.T, swarm.OptDisableTCP) + cm, err := connmgr.NewConnManager(0, 1, connmgr.WithGracePeriod(time.Millisecond*10)) + require.NoError(t.T, err) + + newHost, err := basic.NewHost(swrm, &basic.HostOpts{ + ConnManager: cm, + }) + require.NoError(t.T, err) + newHost.Start() + + err = newHost.Connect(t.ctx, peer.AddrInfo{ + ID: t.bootstrapper.ID(), + Addrs: t.bootstrapper.Addrs(), + }) + require.NoError(t.T, err) + + dht, err := dht.New(t.ctx, newHost, dht.DisableAutoRefresh(), dht.ProtocolPrefix("/test"), dht.Mode(dht.ModeServer), dht.BucketSize(3)) + require.NoError(t.T, err) + t.T.Cleanup(func() { + err := dht.Close() + require.NoError(t.T, err) + }) + + <-dht.RefreshRoutingTable() + + t.T.Log(len(newHost.Network().Peers())) + + return newHost, routing.NewRoutingDiscovery(dht) +} From f7c413a17b3596e62e421f3a842e80a73936a4df Mon Sep 17 00:00:00 2001 From: Wondertan Date: Thu, 27 Apr 2023 17:56:30 +0200 Subject: [PATCH 20/56] first working iteration for tests --- .../availability/discovery/discovery_test.go | 82 +++++++------------ 1 file changed, 30 insertions(+), 52 deletions(-) diff --git a/share/availability/discovery/discovery_test.go b/share/availability/discovery/discovery_test.go index 77dbcd9ecc..c0476a284e 100644 --- a/share/availability/discovery/discovery_test.go +++ b/share/availability/discovery/discovery_test.go @@ -5,26 +5,18 @@ import ( "testing" "time" - "github.com/celestiaorg/celestia-node/logs" - logging "github.com/ipfs/go-log/v2" dht "github.com/libp2p/go-libp2p-kad-dht" "github.com/libp2p/go-libp2p/core/discovery" - "github.com/libp2p/go-libp2p/core/event" "github.com/libp2p/go-libp2p/core/host" "github.com/libp2p/go-libp2p/core/peer" "github.com/libp2p/go-libp2p/p2p/discovery/routing" - basic "github.com/libp2p/go-libp2p/p2p/host/basic" - "github.com/libp2p/go-libp2p/p2p/net/connmgr" - swarm "github.com/libp2p/go-libp2p/p2p/net/swarm/testing" + mocknet "github.com/libp2p/go-libp2p/p2p/net/mock" "github.com/stretchr/testify/require" ) func TestDiscovery(t *testing.T) { - logging.SetLogLevel("share/discovery", "debug") - // logging.SetDebugLogging() - logs.SetDebugLogging() + const fulls = 10 - const fulls = 120 ctx, cancel := context.WithTimeout(context.Background(), time.Second*30) t.Cleanup(cancel) @@ -32,7 +24,7 @@ func TestDiscovery(t *testing.T) { peerA := tn.discovery(Parameters{ PeersLimit: fulls, - DiscoveryInterval: time.Millisecond*100, + DiscoveryInterval: time.Millisecond * 100, AdvertiseInterval: -1, }) @@ -40,18 +32,10 @@ func TestDiscovery(t *testing.T) { tn.discovery(Parameters{ PeersLimit: 0, DiscoveryInterval: time.Hour, - 
AdvertiseInterval: time.Millisecond*100, // should only happen once + AdvertiseInterval: time.Millisecond * 100, // should only happen once }) } - - // go func() { - // for { - // time.Sleep(time.Second) - // t.Log(len(peerA.host.Network().Peers())) - // } - // }() - var peers []peer.ID for len(peers) != fulls { peers, _ = peerA.Peers(ctx) @@ -64,34 +48,34 @@ func TestDiscovery(t *testing.T) { type testnet struct { ctx context.Context T *testing.T + net mocknet.Mocknet - bootstrapper host.Host + bootstrapper peer.ID } func newTestnet(ctx context.Context, t *testing.T) *testnet { - swrm := swarm.GenSwarm(t, swarm.OptDisableTCP) - hst, err := basic.NewHost(swrm, &basic.HostOpts{}) - require.NoError(t, err) - hst.Start() + net := mocknet.New() + t.Cleanup(func() { + err := net.Close() + require.NoError(t, err) + }) - sub, err := hst.EventBus().Subscribe(&event.EvtPeerConnectednessChanged{}) + hst, err := net.GenPeer() require.NoError(t, err) - go func() { - for { - evt := (<-sub.Out()).(event.EvtPeerConnectednessChanged) - log.Debugw("evt", "peer", evt.Peer) - } - }() - - dht, err := dht.New(ctx, hst, dht.DisableAutoRefresh(), dht.ProtocolPrefix("/test"), dht.Mode(dht.ModeServer), dht.BootstrapPeers(), dht.BucketSize(3)) + dht, err := dht.New(ctx, hst, + dht.Mode(dht.ModeServer), + dht.BootstrapPeers(), + dht.ProtocolPrefix("/test"), + dht.BucketSize(1), + ) require.NoError(t, err) t.Cleanup(func() { err := dht.Close() require.NoError(t, err) }) - return &testnet{ctx: ctx, T: t, bootstrapper: hst} + return &testnet{ctx: ctx, T: t, net: net, bootstrapper: hst.ID()} } func (t *testnet) discovery(params Parameters) *Discovery { @@ -109,32 +93,26 @@ func (t *testnet) discovery(params Parameters) *Discovery { } func (t *testnet) peer() (host.Host, discovery.Discovery) { - swrm := swarm.GenSwarm(t.T, swarm.OptDisableTCP) - cm, err := connmgr.NewConnManager(0, 1, connmgr.WithGracePeriod(time.Millisecond*10)) + hst, err := t.net.GenPeer() require.NoError(t.T, err) - newHost, err := basic.NewHost(swrm, &basic.HostOpts{ - ConnManager: cm, - }) + err = t.net.LinkAll() require.NoError(t.T, err) - newHost.Start() - err = newHost.Connect(t.ctx, peer.AddrInfo{ - ID: t.bootstrapper.ID(), - Addrs: t.bootstrapper.Addrs(), - }) - require.NoError(t.T, err) - - dht, err := dht.New(t.ctx, newHost, dht.DisableAutoRefresh(), dht.ProtocolPrefix("/test"), dht.Mode(dht.ModeServer), dht.BucketSize(3)) + dht, err := dht.New(t.ctx, hst, + dht.Mode(dht.ModeServer), + dht.BootstrapPeers(peer.AddrInfo{ID: t.bootstrapper}), + dht.ProtocolPrefix("/test"), + dht.BucketSize(1), + ) require.NoError(t.T, err) t.T.Cleanup(func() { err := dht.Close() require.NoError(t.T, err) }) - <-dht.RefreshRoutingTable() - - t.T.Log(len(newHost.Network().Peers())) + err = dht.Bootstrap(t.ctx) + require.NoError(t.T, err) - return newHost, routing.NewRoutingDiscovery(dht) + return hst, routing.NewRoutingDiscovery(dht) } From c5e0d0038baf0f1168aae01afffe5deabb8eb809 Mon Sep 17 00:00:00 2001 From: Wondertan Date: Thu, 27 Apr 2023 22:52:05 +0200 Subject: [PATCH 21/56] set and backoff audit + remove retrying for FindPeers --- share/availability/discovery/backoff.go | 49 +++++++++------- share/availability/discovery/backoff_test.go | 2 +- share/availability/discovery/discovery.go | 59 +++++++------------- share/availability/discovery/set.go | 16 ++---- share/availability/discovery/set_test.go | 24 +++----- 5 files changed, 62 insertions(+), 88 deletions(-) diff --git a/share/availability/discovery/backoff.go b/share/availability/discovery/backoff.go 
index af9500b30e..2a5edab196 100644 --- a/share/availability/discovery/backoff.go +++ b/share/availability/discovery/backoff.go @@ -13,7 +13,7 @@ import ( // gcInterval is a default period after which disconnected peers will be removed from cache const ( - gcInterval = time.Hour + gcInterval = time.Minute // connectTimeout is the timeout used for dialing peers and discovering peer addresses. connectTimeout = time.Minute * 2 ) @@ -30,7 +30,7 @@ type backoffConnector struct { backoff backoff.BackoffFactory cacheLk sync.Mutex - cacheData map[peer.ID]*backoffData + cacheData map[peer.ID]backoffData } // backoffData stores time when next connection attempt with the remote peer. @@ -43,35 +43,48 @@ func newBackoffConnector(h host.Host, factory backoff.BackoffFactory) *backoffCo return &backoffConnector{ h: h, backoff: factory, - cacheData: make(map[peer.ID]*backoffData), + cacheData: make(map[peer.ID]backoffData), } } // Connect puts peer to the backoffCache and tries to establish a connection with it. func (b *backoffConnector) Connect(ctx context.Context, p peer.AddrInfo) error { - // we should lock the mutex before calling connectionData and not inside because otherwise it could + // we should lock the mutex before calling backoffData and not inside because otherwise it could // be modified from another goroutine as it returns a pointer - b.cacheLk.Lock() - cache := b.connectionData(p.ID) + cache := b.backoffData(p.ID) if time.Now().Before(cache.nexttry) { - b.cacheLk.Unlock() return errBackoffNotEnded } - cache.nexttry = time.Now().Add(cache.backoff.Delay()) - b.cacheLk.Unlock() ctx, cancel := context.WithTimeout(ctx, connectTimeout) defer cancel() - return b.h.Connect(ctx, p) + err := b.h.Connect(ctx, p) + // we don't want to add backoff when the context is canceled. + if !errors.Is(err, context.Canceled) { + b.Backoff(p.ID) + } + return err } -// connectionData returns backoffData from the map if it was stored, otherwise it will instantiate +func (b *backoffConnector) Backoff(p peer.ID) { + data := b.backoffData(p) + data.nexttry = time.Now().Add(data.backoff.Delay()) + + b.cacheLk.Lock() + b.cacheData[p] = data + b.cacheLk.Unlock() +} + +// backoffData returns backoffData from the map if it was stored, otherwise it will instantiate // a new one. -func (b *backoffConnector) connectionData(p peer.ID) *backoffData { +func (b *backoffConnector) backoffData(p peer.ID) backoffData { + b.cacheLk.Lock() + defer b.cacheLk.Unlock() + cache, ok := b.cacheData[p] if !ok { - cache = &backoffData{} + cache = backoffData{} cache.backoff = b.backoff() b.cacheData[p] = cache } @@ -86,15 +99,13 @@ func (b *backoffConnector) RemoveBackoff(p peer.ID) { delete(b.cacheData, p) } -// RestartBackoff resets delay time between attempts and adds a delay for the next connection +// ResetBackoff resets delay time between attempts and adds a delay for the next connection // attempt to remote peer. It will mostly be called when host receives a notification that remote // peer was disconnected. 
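The cache now stores backoffData by value, so backoffData hands back a copy and every update becomes an explicit read-modify-write under the lock. A hedged, generic sketch of that pattern; all names here are illustrative, not from the codebase:

import (
	"sync"
	"time"
)

type entry struct{ nextTry time.Time }

type valueCache struct {
	mu sync.Mutex
	m  map[string]entry
}

func newValueCache() *valueCache {
	return &valueCache{m: make(map[string]entry)}
}

func (c *valueCache) bump(key string, delay time.Duration) {
	c.mu.Lock()
	defer c.mu.Unlock()
	e := c.m[key] // a copy; the zero value if absent
	e.nextTry = time.Now().Add(delay)
	c.m[key] = e // write the modified copy back while still holding the lock
}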
-func (b *backoffConnector) RestartBackoff(p peer.ID) { - b.cacheLk.Lock() - defer b.cacheLk.Unlock() - cache := b.connectionData(p) +func (b *backoffConnector) ResetBackoff(p peer.ID) { + cache := b.backoffData(p) cache.backoff.Reset() - cache.nexttry = time.Now().Add(cache.backoff.Delay()) + b.Backoff(p) } func (b *backoffConnector) GC(ctx context.Context) { diff --git a/share/availability/discovery/backoff_test.go b/share/availability/discovery/backoff_test.go index 95e84fbb8c..61a639cde3 100644 --- a/share/availability/discovery/backoff_test.go +++ b/share/availability/discovery/backoff_test.go @@ -42,6 +42,6 @@ func TestBackoff_ResetBackoffPeriod(t *testing.T) { info := host.InfoFromHost(m.Hosts()[1]) require.NoError(t, b.Connect(ctx, *info)) nexttry := b.cacheData[info.ID].nexttry - b.RestartBackoff(info.ID) + b.ResetBackoff(info.ID) require.True(t, b.cacheData[info.ID].nexttry.After(nexttry)) } diff --git a/share/availability/discovery/discovery.go b/share/availability/discovery/discovery.go index c1b6174eb9..1ac9fe685a 100644 --- a/share/availability/discovery/discovery.go +++ b/share/availability/discovery/discovery.go @@ -2,7 +2,6 @@ package discovery import ( "context" - "errors" "sync" "time" @@ -12,7 +11,6 @@ import ( "github.com/libp2p/go-libp2p/core/host" "github.com/libp2p/go-libp2p/core/network" "github.com/libp2p/go-libp2p/core/peer" - "github.com/libp2p/go-libp2p/core/routing" "github.com/libp2p/go-libp2p/p2p/host/eventbus" "golang.org/x/sync/errgroup" ) @@ -125,16 +123,14 @@ func (d *Discovery) handlePeerFound(ctx context.Context, peer peer.AddrInfo, can } if d.host.Network().Connectedness(peer.ID) == network.Connected { - err := d.set.Add(peer.ID) - if err != nil { - return - } + d.set.Add(peer.ID) log.Debugw("added peer to set", "id", peer.ID) if d.set.Size() >= d.set.Limit() { log.Infow("soft peer limit reached", "count", d.set.Size()) cancelFind() } d.host.ConnManager().Protect(peer.ID, topic) + d.connector.Backoff(peer.ID) // and notify our subscribers d.onUpdatedPeers(peer.ID, true) @@ -151,11 +147,6 @@ func (d *Discovery) handlePeerFound(ctx context.Context, peer peer.AddrInfo, can err := d.connector.Connect(ctx, peer) if err != nil { - // we don't want to add backoff when the context is canceled. 
- if errors.Is(err, context.Canceled) || errors.Is(err, routing.ErrNotFound) { - d.connector.RemoveBackoff(peer.ID) - } - d.connectingLk.Lock() delete(d.connecting, peer.ID) d.connectingLk.Unlock() @@ -209,7 +200,7 @@ func (d *Discovery) ensurePeers(ctx context.Context) { } d.host.ConnManager().UntagPeer(evnt.Peer, topic) - d.connector.RestartBackoff(evnt.Peer) + d.connector.ResetBackoff(evnt.Peer) d.set.Remove(evnt.Peer) d.onUpdatedPeers(evnt.Peer, false) log.Debugw("removed peer from the peer set", @@ -223,12 +214,8 @@ func (d *Discovery) ensurePeers(ctx context.Context) { continue } - err = d.set.Add(peerID) - if err != nil { - log.Debugw("failed to add peer to set", "peer", peerID, "error", err) - continue - } log.Debugw("added peer to set", "id", peerID) + d.set.Add(peerID) // and notify our subscribers d.onUpdatedPeers(peerID, true) @@ -255,7 +242,6 @@ func (d *Discovery) ensurePeers(ctx context.Context) { d.findPeers(ctx) t.Reset(d.params.DiscoveryInterval) - log.Debugw("restarted") select { case <-t.C: case <-ctx.Done(): @@ -276,30 +262,23 @@ func (d *Discovery) findPeers(ctx context.Context) { // limit to minimize chances of overreaching the limit wg.SetLimit(d.set.Limit()) - for d.set.Size() < d.set.Limit() && wgCtx.Err() == nil { - log.Debugw("finding peers", "remaining", d.set.Limit()-d.set.Size()) - findCtx, findCancel := context.WithCancel(wgCtx) - defer findCancel() + log.Debugw("finding peers", "remaining", d.set.Limit()-d.set.Size()) + findCtx, findCancel := context.WithCancel(wgCtx) + defer findCancel() - peers, err := d.disc.FindPeers(findCtx, topic) - if err != nil { - log.Warn(err) - return - } + peers, err := d.disc.FindPeers(findCtx, topic) + if err != nil { + log.Warn(err) + return + } - for p := range peers { - peer := p - wg.Go(func() error { - // pass the cancel so that we cancel FindPeers when we connected to enough peers - d.handlePeerFound(findCtx, peer, findCancel) - return nil - }) - - // break the loop if we have found enough peers - if d.set.Size() >= d.set.Limit() { - break - } - } + for p := range peers { + peer := p + wg.Go(func() error { + // pass the cancel so that we cancel FindPeers when we connected to enough peers + d.handlePeerFound(findCtx, peer, findCancel) + return nil + }) } // we expect no errors diff --git a/share/availability/discovery/set.go b/share/availability/discovery/set.go index 57acd54fcd..dd4a0489e5 100644 --- a/share/availability/discovery/set.go +++ b/share/availability/discovery/set.go @@ -2,7 +2,6 @@ package discovery import ( "context" - "errors" "sync" "github.com/libp2p/go-libp2p/core/peer" @@ -45,13 +44,8 @@ func (ps *limitedSet) Size() int { } // Add attempts to add the given peer into the set. -// This operation will fail if the number of peers in the set is equal to size. -func (ps *limitedSet) Add(p peer.ID) error { +func (ps *limitedSet) Add(p peer.ID) { ps.lk.Lock() - if _, ok := ps.ps[p]; ok { - ps.lk.Unlock() - return errors.New("share: discovery: peer already added") - } ps.ps[p] = struct{}{} ps.lk.Unlock() @@ -61,7 +55,7 @@ func (ps *limitedSet) Add(p peer.ID) error { select { case ps.waitPeer <- p: default: - return nil + return } } } @@ -76,16 +70,16 @@ func (ps *limitedSet) Remove(id peer.ID) { // Peers returns all discovered peers from the set. 
func (ps *limitedSet) Peers(ctx context.Context) ([]peer.ID, error) { - ps.lk.Lock() + ps.lk.RLock() if len(ps.ps) > 0 { out := make([]peer.ID, 0, len(ps.ps)) for p := range ps.ps { out = append(out, p) } - ps.lk.Unlock() + ps.lk.RUnlock() return out, nil } - ps.lk.Unlock() + ps.lk.RUnlock() // block until a new peer will be discovered select { diff --git a/share/availability/discovery/set_test.go b/share/availability/discovery/set_test.go index 18fdbc3daf..1507509732 100644 --- a/share/availability/discovery/set_test.go +++ b/share/availability/discovery/set_test.go @@ -15,27 +15,17 @@ func TestSet_TryAdd(t *testing.T) { require.NoError(t, err) set := newLimitedSet(1) - require.NoError(t, set.Add(h.ID())) + set.Add(h.ID()) require.True(t, set.Contains(h.ID())) } -func TestSet_TryAddFails(t *testing.T) { - m := mocknet.New() - h1, err := m.GenPeer() - require.NoError(t, err) - - set := newLimitedSet(1) - require.NoError(t, set.Add(h1.ID())) - require.Error(t, set.Add(h1.ID())) -} - func TestSet_Remove(t *testing.T) { m := mocknet.New() h, err := m.GenPeer() require.NoError(t, err) set := newLimitedSet(1) - require.NoError(t, set.Add(h.ID())) + set.Add(h.ID()) set.Remove(h.ID()) require.False(t, set.Contains(h.ID())) } @@ -48,8 +38,8 @@ func TestSet_Peers(t *testing.T) { require.NoError(t, err) set := newLimitedSet(2) - require.NoError(t, set.Add(h1.ID())) - require.NoError(t, set.Add(h2.ID())) + set.Add(h1.ID()) + set.Add(h2.ID()) ctx, cancel := context.WithTimeout(context.Background(), time.Second*1) t.Cleanup(cancel) @@ -69,7 +59,7 @@ func TestSet_WaitPeers(t *testing.T) { set := newLimitedSet(2) go func() { time.Sleep(time.Millisecond * 500) - set.Add(h1.ID()) //nolint:errcheck + set.Add(h1.ID()) }() ctx, cancel := context.WithTimeout(context.Background(), time.Second*2) @@ -89,8 +79,8 @@ func TestSet_Size(t *testing.T) { require.NoError(t, err) set := newLimitedSet(2) - require.NoError(t, set.Add(h1.ID())) - require.NoError(t, set.Add(h2.ID())) + set.Add(h1.ID()) + set.Add(h2.ID()) require.Equal(t, 2, set.Size()) set.Remove(h2.ID()) require.Equal(t, 1, set.Size()) From c382f16d625a0311a3effb8a99083af4b58af1c1 Mon Sep 17 00:00:00 2001 From: Wondertan Date: Thu, 27 Apr 2023 23:46:53 +0200 Subject: [PATCH 22/56] the least flaky version of the test; 2/100 fails --- .../availability/discovery/discovery_test.go | 70 +++++++++++-------- 1 file changed, 42 insertions(+), 28 deletions(-) diff --git a/share/availability/discovery/discovery_test.go b/share/availability/discovery/discovery_test.go index c0476a284e..c55867b21e 100644 --- a/share/availability/discovery/discovery_test.go +++ b/share/availability/discovery/discovery_test.go @@ -7,42 +7,67 @@ import ( dht "github.com/libp2p/go-libp2p-kad-dht" "github.com/libp2p/go-libp2p/core/discovery" + "github.com/libp2p/go-libp2p/core/event" "github.com/libp2p/go-libp2p/core/host" "github.com/libp2p/go-libp2p/core/peer" "github.com/libp2p/go-libp2p/p2p/discovery/routing" mocknet "github.com/libp2p/go-libp2p/p2p/net/mock" + "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) func TestDiscovery(t *testing.T) { - const fulls = 10 + const nodes = 50 // higher number brings higher coverage - ctx, cancel := context.WithTimeout(context.Background(), time.Second*30) + ctx, cancel := context.WithTimeout(context.Background(), time.Second*10) t.Cleanup(cancel) tn := newTestnet(ctx, t) peerA := tn.discovery(Parameters{ - PeersLimit: fulls, + PeersLimit: nodes, DiscoveryInterval: time.Millisecond * 100, AdvertiseInterval: -1, }) + sub, 
_ := peerA.host.EventBus().Subscribe(&event.EvtPeerConnectednessChanged{}) - for range make([]int, fulls) { - tn.discovery(Parameters{ + discs := make([]*Discovery, nodes) + for i := range discs { + // start new node + discs[i] = tn.discovery(Parameters{ PeersLimit: 0, - DiscoveryInterval: time.Hour, - AdvertiseInterval: time.Millisecond * 100, // should only happen once + DiscoveryInterval: -1, + AdvertiseInterval: time.Millisecond * 100, }) + + // and check that we discover/connect to it + select { + case <-sub.Out(): + case <-ctx.Done(): + t.Fatal("did not discover peer in time") + } } - var peers []peer.ID - for len(peers) != fulls { - peers, _ = peerA.Peers(ctx) - if ctx.Err() != nil { - t.Fatal("did not discover peers in time") + assert.Equal(t, peerA.set.Size(), nodes) + + // immediately cut peer from bootstrapper sp it cannot rediscover peers + // helps with flakes + // TODO: Check why backoff does not help + err := peerA.host.Network().ClosePeer(tn.bootstrapper) + require.NoError(t, err) + + for _, disc := range peerA.host.Network().Peers() { + err := peerA.host.Network().ClosePeer(disc) + require.NoError(t, err) + + select { + case <-sub.Out(): + case <-ctx.Done(): + t.Fatal("did not disconnected peer in time") } } + + assert.Less(t, peerA.set.Size(), nodes) } type testnet struct { @@ -55,25 +80,15 @@ type testnet struct { func newTestnet(ctx context.Context, t *testing.T) *testnet { net := mocknet.New() - t.Cleanup(func() { - err := net.Close() - require.NoError(t, err) - }) - hst, err := net.GenPeer() require.NoError(t, err) - dht, err := dht.New(ctx, hst, + _, err = dht.New(ctx, hst, dht.Mode(dht.ModeServer), dht.BootstrapPeers(), dht.ProtocolPrefix("/test"), - dht.BucketSize(1), ) require.NoError(t, err) - t.Cleanup(func() { - err := dht.Close() - require.NoError(t, err) - }) return &testnet{ctx: ctx, T: t, net: net, bootstrapper: hst.ID()} } @@ -99,17 +114,16 @@ func (t *testnet) peer() (host.Host, discovery.Discovery) { err = t.net.LinkAll() require.NoError(t.T, err) + _, err = t.net.ConnectPeers(hst.ID(), t.bootstrapper) + require.NoError(t.T, err) + dht, err := dht.New(t.ctx, hst, dht.Mode(dht.ModeServer), - dht.BootstrapPeers(peer.AddrInfo{ID: t.bootstrapper}), dht.ProtocolPrefix("/test"), + // needed to reduce connections to peers on DHT level dht.BucketSize(1), ) require.NoError(t.T, err) - t.T.Cleanup(func() { - err := dht.Close() - require.NoError(t.T, err) - }) err = dht.Bootstrap(t.ctx) require.NoError(t.T, err) From d96ade79ce8e4d1fc64ff6289d2662cccbacd23c Mon Sep 17 00:00:00 2001 From: Vlad Date: Fri, 28 Apr 2023 10:33:41 +0300 Subject: [PATCH 23/56] - stop wait group when enough peers found - rework errorgroup loop to respect findCtx and stop when it is canceled - use parent context instead of findCtx for handlePeer to not cancel connection when it is half done - allow wg pool be cancelable from another routine --- share/availability/discovery/discovery.go | 42 ++++++++++++++++------- 1 file changed, 29 insertions(+), 13 deletions(-) diff --git a/share/availability/discovery/discovery.go b/share/availability/discovery/discovery.go index 1ac9fe685a..7e3ba3fb1e 100644 --- a/share/availability/discovery/discovery.go +++ b/share/availability/discovery/discovery.go @@ -258,12 +258,13 @@ func (d *Discovery) findPeers(ctx context.Context) { log.Infow("below soft peer limit, discovering peers", "amount", d.set.Limit()) // we use errgroup as it obeys the context - wg, wgCtx := errgroup.WithContext(ctx) + wg := errgroup.Group{} // limit to minimize chances of overreaching 
the limit wg.SetLimit(d.set.Limit()) + defer wg.Wait() //nolint:errcheck - log.Debugw("finding peers", "remaining", d.set.Limit()-d.set.Size()) - findCtx, findCancel := context.WithCancel(wgCtx) + // stop discovery when we are done + findCtx, findCancel := context.WithCancel(ctx) defer findCancel() peers, err := d.disc.FindPeers(findCtx, topic) @@ -272,17 +273,32 @@ func (d *Discovery) findPeers(ctx context.Context) { return } - for p := range peers { - peer := p - wg.Go(func() error { - // pass the cancel so that we cancel FindPeers when we connected to enough peers - d.handlePeerFound(findCtx, peer, findCancel) - return nil - }) - } + var amount int + for { + select { + case <-findCtx.Done(): + log.Debugw("found enough peers", "amount", d.set.Size()) + return + case p, ok := <-peers: + if !ok { + log.Debugw("discovery channel closed", "find_is_canceled", findCtx.Err() != nil) + return + } - // we expect no errors - _ = wg.Wait() + amount++ + peer := p + log.Debugw("found peer", "found_amount", amount) + wg.Go(func() error { + if findCtx.Err() != nil { + log.Debug("find has been canceled, skip peer") + return nil + } + // pass the cancel so that we cancel FindPeers when we connected to enough peers + d.handlePeerFound(ctx, peer, findCancel) + return nil + }) + } + } } // Advertise is a utility function that persistently advertises a service through an Advertiser. From 2005330f6f10bac156ab57378bccee295ee6656e Mon Sep 17 00:00:00 2001 From: Vlad Date: Fri, 28 Apr 2023 10:35:03 +0300 Subject: [PATCH 24/56] log Error if Discovery is unable to find new peers for long time --- share/availability/discovery/discovery.go | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/share/availability/discovery/discovery.go b/share/availability/discovery/discovery.go index 7e3ba3fb1e..99d41548e1 100644 --- a/share/availability/discovery/discovery.go +++ b/share/availability/discovery/discovery.go @@ -23,6 +23,8 @@ const ( // eventbusBufSize is the size of the buffered channel to handle // events in libp2p eventbusBufSize = 32 + + findPeersStuckWarnDelay = time.Minute ) // waitF calculates time to restart announcing. 
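The reworked findPeers above leans on errgroup's SetLimit to bound concurrent connection attempts while a separate context carries cancellation. A hedged, self-contained sketch of that shape; the channel of candidate strings stands in for the real FindPeers stream:

import (
	"context"
	"fmt"

	"golang.org/x/sync/errgroup"
)

func connectAll(ctx context.Context, candidates <-chan string, limit int) {
	var wg errgroup.Group
	wg.SetLimit(limit) // must be > 0; Go blocks once `limit` goroutines are in flight
	for c := range candidates {
		c := c
		wg.Go(func() error {
			if ctx.Err() != nil {
				return nil // canceled elsewhere: skip quietly, don't fail the group
			}
			fmt.Println("dialing", c) // stand-in for the real connect call
			return nil
		})
	}
	_ = wg.Wait() // the workers above never return errors
}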
@@ -273,12 +275,18 @@ func (d *Discovery) findPeers(ctx context.Context) { return } + ticker := time.NewTicker(findPeersStuckWarnDelay) + defer ticker.Stop() var amount int for { + ticker.Reset(findPeersStuckWarnDelay) select { case <-findCtx.Done(): log.Debugw("found enough peers", "amount", d.set.Size()) return + case <-ticker.C: + log.Error("wasn't able to find new peers for long time") + continue case p, ok := <-peers: if !ok { log.Debugw("discovery channel closed", "find_is_canceled", findCtx.Err() != nil) return From 37a159ea4992b25c6ecd98dda2bb8a38ac31e254 Mon Sep 17 00:00:00 2001 From: Vlad Date: Fri, 28 Apr 2023 11:27:26 +0300 Subject: [PATCH 25/56] rerun findPeers if previous run stopped too early --- share/availability/discovery/discovery.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/share/availability/discovery/discovery.go b/share/availability/discovery/discovery.go index 99d41548e1..6a7e270080 100644 --- a/share/availability/discovery/discovery.go +++ b/share/availability/discovery/discovery.go @@ -242,6 +242,10 @@ func (d *Discovery) ensurePeers(ctx context.Context) { defer t.Stop() for { d.findPeers(ctx) + if d.set.Size() < d.set.Limit() { + // rerun discovery if the amount of peers didn't reach the limit + continue + } t.Reset(d.params.DiscoveryInterval) select { From 82eb331fec85b05d0548521b47ef153b50c45215 Mon Sep 17 00:00:00 2001 From: Vlad Date: Fri, 28 Apr 2023 11:29:38 +0300 Subject: [PATCH 26/56] - do not connect to already connected peers - send update notification asap --- share/availability/discovery/discovery.go | 13 +++++++++++-- share/availability/discovery/set.go | 7 +++++-- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/share/availability/discovery/discovery.go b/share/availability/discovery/discovery.go index 6a7e270080..b3a62a4b44 100644 --- a/share/availability/discovery/discovery.go +++ b/share/availability/discovery/discovery.go @@ -125,8 +125,14 @@ func (d *Discovery) handlePeerFound(ctx context.Context, peer peer.AddrInfo, can } if d.host.Network().Connectedness(peer.ID) == network.Connected { - d.set.Add(peer.ID) - log.Debugw("added peer to set", "id", peer.ID) + if !d.set.Add(peer.ID) { + log.Debug("skip handle: peer is already in discovery set") + return + } + + // and notify our subscribers + d.onUpdatedPeers(peer.ID, true) + log.Debug("added peer to set") if d.set.Size() >= d.set.Limit() { log.Infow("soft peer limit reached", "count", d.set.Size()) cancelFind() } @@ -216,6 +222,9 @@ func (d *Discovery) ensurePeers(ctx context.Context) { continue } + if !d.set.Add(peerID) { + continue + } log.Debugw("added peer to set", "id", peerID) d.set.Add(peerID) // and notify our subscribers diff --git a/share/availability/discovery/set.go b/share/availability/discovery/set.go index dd4a0489e5..8adb9a7b60 100644 --- a/share/availability/discovery/set.go +++ b/share/availability/discovery/set.go @@ -44,8 +44,11 @@ func (ps *limitedSet) Size() int { } // Add attempts to add the given peer into the set. 
-func (ps *limitedSet) Add(p peer.ID) { +func (ps *limitedSet) Add(p peer.ID) (added bool) { ps.lk.Lock() + if _, ok := ps.ps[p]; ok { + return false + } ps.ps[p] = struct{}{} ps.lk.Unlock() @@ -55,7 +58,7 @@ func (ps *limitedSet) Add(p peer.ID) { select { case ps.waitPeer <- p: default: - return + return true } } } From 36deab82ee43de257aa9aa6d8c94c2f7d44f444e Mon Sep 17 00:00:00 2001 From: Vlad Date: Fri, 28 Apr 2023 11:30:00 +0300 Subject: [PATCH 27/56] improve logs --- share/availability/discovery/discovery.go | 31 +++++++++++++++-------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/share/availability/discovery/discovery.go b/share/availability/discovery/discovery.go index b3a62a4b44..0801f3faa5 100644 --- a/share/availability/discovery/discovery.go +++ b/share/availability/discovery/discovery.go @@ -117,10 +117,19 @@ func (d *Discovery) WithOnPeersUpdate(f OnUpdatedPeers) { // handlePeersFound receives peers and tries to establish a connection with them. // Peer will be added to PeerCache if connection succeeds. func (d *Discovery) handlePeerFound(ctx context.Context, peer peer.AddrInfo, cancelFind context.CancelFunc) { - if peer.ID == d.host.ID() || - len(peer.Addrs) == 0 || - d.set.Contains(peer.ID) || - d.set.Size() >= d.set.Limit() { + log := log.With("peer", peer.ID) + switch { + case peer.ID == d.host.ID(): + log.Debug("skip handle: self discovery") + return + case d.set.Contains(peer.ID): + log.Debug("skip handle: peer is already in discovery set") + return + case len(peer.Addrs) == 0: + log.Debug("skip handle: empty address list") + return + case d.set.Size() >= d.set.Limit(): + log.Debug("skip handle: enough peers found") return } @@ -139,15 +148,13 @@ func (d *Discovery) handlePeerFound(ctx context.Context, peer peer.AddrInfo, can } d.host.ConnManager().Protect(peer.ID, topic) d.connector.Backoff(peer.ID) - - // and notify our subscribers - d.onUpdatedPeers(peer.ID, true) return } d.connectingLk.Lock() if _, ok := d.connecting[peer.ID]; ok { d.connectingLk.Unlock() + log.Debug("skip handle: connecting to the peer in another routine") return } d.connecting[peer.ID] = cancelFind @@ -158,6 +165,7 @@ func (d *Discovery) handlePeerFound(ctx context.Context, peer peer.AddrInfo, can d.connectingLk.Lock() delete(d.connecting, peer.ID) d.connectingLk.Unlock() + log.Debugw("unable to connect", "err", err) return } @@ -166,6 +174,7 @@ func (d *Discovery) handlePeerFound(ctx context.Context, peer peer.AddrInfo, can // In the future, we should design a protocol that keeps bidirectional agreement on whether // connection should be kept or not, similar to mesh link in GossipSub. d.host.ConnManager().Protect(peer.ID, topic) + log.Debug("started connecting to the peer") } // ensurePeers ensures we always have 'peerLimit' connected peers. 
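The set.go hunk above makes Add report whether the peer was newly inserted, letting callers deduplicate and notify in one step. A hedged, generic sketch of the same pattern; note it defers the unlock so the early "already present" return cannot exit while the mutex is still held:

import "sync"

type dedupSet struct {
	mu sync.Mutex
	m  map[string]struct{}
}

func newDedupSet() *dedupSet {
	return &dedupSet{m: make(map[string]struct{})}
}

// Add reports whether v was newly inserted.
func (s *dedupSet) Add(v string) (added bool) {
	s.mu.Lock()
	defer s.mu.Unlock()
	if _, ok := s.m[v]; ok {
		return false // duplicate: the caller skips its notification
	}
	s.m[v] = struct{}{}
	return true
}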
@@ -226,7 +235,6 @@ func (d *Discovery) ensurePeers(ctx context.Context) { continue } log.Debugw("added peer to set", "id", peerID) - d.set.Add(peerID) // and notify our subscribers d.onUpdatedPeers(peerID, true) @@ -256,6 +264,7 @@ func (d *Discovery) ensurePeers(ctx context.Context) { continue } + //TODO: drain the channel, because it could be filled already or use timer with proper drain t.Reset(d.params.DiscoveryInterval) select { case <-t.C: @@ -270,7 +279,7 @@ func (d *Discovery) findPeers(ctx context.Context) { log.Debugw("reached soft peer limit, skipping discovery", "size", d.set.Size()) return } - log.Infow("below soft peer limit, discovering peers", "amount", d.set.Limit()) + log.Infow("below soft peer limit, discovering peers", "remaining", d.set.Limit()-d.set.Size()) // we use errgroup as it obeys the context wg := errgroup.Group{} @@ -284,7 +293,7 @@ func (d *Discovery) findPeers(ctx context.Context) { peers, err := d.disc.FindPeers(findCtx, topic) if err != nil { - log.Warn(err) + log.Error("unable to start discovery", "err", err) return } @@ -298,7 +307,7 @@ func (d *Discovery) findPeers(ctx context.Context) { log.Debugw("found enough peers", "amount", d.set.Size()) return case <-ticker.C: - log.Error("wasn't able to find new peers for long time") + log.Warn("wasn't able to find new peers for long time") continue case p, ok := <-peers: if !ok { From d83f2ef0769a3fff4fe361fbe86051ace4fee122 Mon Sep 17 00:00:00 2001 From: Wondertan Date: Thu, 27 Apr 2023 23:54:35 +0200 Subject: [PATCH 28/56] increase time --- share/availability/discovery/discovery_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/share/availability/discovery/discovery_test.go b/share/availability/discovery/discovery_test.go index c55867b21e..79e4bda5f9 100644 --- a/share/availability/discovery/discovery_test.go +++ b/share/availability/discovery/discovery_test.go @@ -17,9 +17,9 @@ import ( ) func TestDiscovery(t *testing.T) { - const nodes = 50 // higher number brings higher coverage + const nodes = 30 // higher number brings higher coverage - ctx, cancel := context.WithTimeout(context.Background(), time.Second*10) + ctx, cancel := context.WithTimeout(context.Background(), time.Second*30) t.Cleanup(cancel) tn := newTestnet(ctx, t) From c1540736681db44d4058ce804bbb95d2fcf71dde Mon Sep 17 00:00:00 2001 From: Wondertan Date: Fri, 28 Apr 2023 11:22:35 +0200 Subject: [PATCH 29/56] another attempt to deflake the test --- .../availability/discovery/discovery_test.go | 29 ++++++++++++------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/share/availability/discovery/discovery_test.go b/share/availability/discovery/discovery_test.go index 79e4bda5f9..bb2ca13b69 100644 --- a/share/availability/discovery/discovery_test.go +++ b/share/availability/discovery/discovery_test.go @@ -7,7 +7,6 @@ import ( dht "github.com/libp2p/go-libp2p-kad-dht" "github.com/libp2p/go-libp2p/core/discovery" - "github.com/libp2p/go-libp2p/core/event" "github.com/libp2p/go-libp2p/core/host" "github.com/libp2p/go-libp2p/core/peer" "github.com/libp2p/go-libp2p/p2p/discovery/routing" @@ -29,26 +28,34 @@ func TestDiscovery(t *testing.T) { DiscoveryInterval: time.Millisecond * 100, AdvertiseInterval: -1, }) - sub, _ := peerA.host.EventBus().Subscribe(&event.EvtPeerConnectednessChanged{}) + + type peerUpdate struct { + peerID peer.ID + isAdded bool + } + updateCh := make(chan peerUpdate) + peerA.WithOnPeersUpdate(func(peerID peer.ID, isAdded bool) { + updateCh <- peerUpdate{peerID: peerID, isAdded: isAdded} 
+ }) discs := make([]*Discovery, nodes) for i := range discs { - // start new node discs[i] = tn.discovery(Parameters{ PeersLimit: 0, DiscoveryInterval: -1, AdvertiseInterval: time.Millisecond * 100, }) - // and check that we discover/connect to it select { - case <-sub.Out(): + case res := <-updateCh: + require.Equal(t, discs[i].host.ID(), res.peerID) + require.True(t, res.isAdded) case <-ctx.Done(): t.Fatal("did not discover peer in time") } } - assert.Equal(t, peerA.set.Size(), nodes) + assert.Equal(t, nodes, peerA.set.Size()) // immediately cut peer from bootstrapper sp it cannot rediscover peers // helps with flakes @@ -56,14 +63,16 @@ func TestDiscovery(t *testing.T) { err := peerA.host.Network().ClosePeer(tn.bootstrapper) require.NoError(t, err) - for _, disc := range peerA.host.Network().Peers() { - err := peerA.host.Network().ClosePeer(disc) + for _, peerID := range peerA.host.Network().Peers() { + err := peerA.host.Network().ClosePeer(peerID) require.NoError(t, err) select { - case <-sub.Out(): + case res := <-updateCh: + require.Equal(t, peerID, res.peerID) + require.False(t, res.isAdded) case <-ctx.Done(): - t.Fatal("did not disconnected peer in time") + t.Fatal("did not discover peer in time") } } From cf20647fd67333af1a89e89fd95d77602ef448bd Mon Sep 17 00:00:00 2001 From: Wondertan Date: Fri, 28 Apr 2023 11:22:50 +0200 Subject: [PATCH 30/56] various fixes --- share/availability/discovery/discovery.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/share/availability/discovery/discovery.go b/share/availability/discovery/discovery.go index 0801f3faa5..bcd5f9c58f 100644 --- a/share/availability/discovery/discovery.go +++ b/share/availability/discovery/discovery.go @@ -116,7 +116,7 @@ func (d *Discovery) WithOnPeersUpdate(f OnUpdatedPeers) { // handlePeersFound receives peers and tries to establish a connection with them. // Peer will be added to PeerCache if connection succeeds. 
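handlePeerFound, changed just below, receives both the parent context and the find-cancel: dials run under the parent so they survive the end of the search, while reaching the peer limit cancels only the producer. A hedged, generic sketch of that context split; the int channel stands in for the discovery stream:

import "context"

func consume(ctx context.Context, found <-chan int, want int) {
	findCtx, cancelFind := context.WithCancel(ctx)
	defer cancelFind()

	seen := 0
	for {
		select {
		case <-findCtx.Done():
			return // enough results, or the parent ctx ended
		case <-found:
			seen++
			if seen >= want {
				cancelFind() // stop the producer side only; in-flight work keeps ctx
			}
		}
	}
}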
-func (d *Discovery) handlePeerFound(ctx context.Context, peer peer.AddrInfo, cancelFind context.CancelFunc) { +func (d *Discovery) handlePeerFound(ctx context.Context, cancelFind context.CancelFunc, peer peer.AddrInfo) { log := log.With("peer", peer.ID) switch { case peer.ID == d.host.ID(): @@ -264,7 +264,6 @@ func (d *Discovery) ensurePeers(ctx context.Context) { continue } - //TODO: drain the channel, because it could be filled already or use timer with proper drain t.Reset(d.params.DiscoveryInterval) select { case <-t.C: @@ -282,7 +281,7 @@ func (d *Discovery) findPeers(ctx context.Context) { log.Infow("below soft peer limit, discovering peers", "remaining", d.set.Limit()-d.set.Size()) // we use errgroup as it obeys the context - wg := errgroup.Group{} + var wg errgroup.Group // limit to minimize chances of overreaching the limit wg.SetLimit(d.set.Limit()) defer wg.Wait() //nolint:errcheck @@ -324,7 +323,8 @@ func (d *Discovery) findPeers(ctx context.Context) { return nil } // pass the cancel so that we cancel FindPeers when we connected to enough peers - d.handlePeerFound(ctx, peer, findCancel) + // we don't pass findCtx so that we don't cancel outgoing connections + d.handlePeerFound(ctx, findCancel, peer) return nil }) } From d1399794b184c489d0eec38fb4be4785a8285e6b Mon Sep 17 00:00:00 2001 From: Wondertan Date: Fri, 28 Apr 2023 11:39:26 +0200 Subject: [PATCH 31/56] update backoff logic to deflake test --- share/availability/discovery/backoff.go | 15 +++++++++++---- share/availability/discovery/discovery.go | 3 +++ share/availability/discovery/discovery_test.go | 9 ++------- 3 files changed, 16 insertions(+), 11 deletions(-) diff --git a/share/availability/discovery/backoff.go b/share/availability/discovery/backoff.go index 2a5edab196..7c4f44d830 100644 --- a/share/availability/discovery/backoff.go +++ b/share/availability/discovery/backoff.go @@ -49,10 +49,7 @@ func newBackoffConnector(h host.Host, factory backoff.BackoffFactory) *backoffCo // Connect puts peer to the backoffCache and tries to establish a connection with it. func (b *backoffConnector) Connect(ctx context.Context, p peer.AddrInfo) error { - // we should lock the mutex before calling backoffData and not inside because otherwise it could - // be modified from another goroutine as it returns a pointer - cache := b.backoffData(p.ID) - if time.Now().Before(cache.nexttry) { + if b.HasBackoff(p.ID) { return errBackoffNotEnded } @@ -76,6 +73,16 @@ func (b *backoffConnector) Backoff(p peer.ID) { b.cacheLk.Unlock() } +func (b *backoffConnector) HasBackoff(p peer.ID) bool { + b.cacheLk.Lock() + cache, ok := b.cacheData[p] + b.cacheLk.Unlock() + if ok && time.Now().Before(cache.nexttry) { + return true + } + return false +} + // backoffData returns backoffData from the map if it was stored, otherwise it will instantiate // a new one. 
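With HasBackoff added above, callers can cheaply skip candidates that are still cooling down before attempting a dial. A hedged, in-package sketch of the resulting lifecycle; tryDial is a hypothetical caller, not part of the patch:

import (
	"context"

	"github.com/libp2p/go-libp2p/core/peer"
)

func tryDial(ctx context.Context, b *backoffConnector, info peer.AddrInfo) error {
	if b.HasBackoff(info.ID) {
		return errBackoffNotEnded // still cooling down: skip this candidate
	}
	// Connect records the next allowed attempt itself (unless ctx was
	// canceled), so a failed dial needs no extra bookkeeping here; on a
	// later disconnect the owner calls b.ResetBackoff(info.ID).
	return b.Connect(ctx, info)
}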
func (b *backoffConnector) backoffData(p peer.ID) backoffData { diff --git a/share/availability/discovery/discovery.go b/share/availability/discovery/discovery.go index bcd5f9c58f..d5ae7ee608 100644 --- a/share/availability/discovery/discovery.go +++ b/share/availability/discovery/discovery.go @@ -131,6 +131,9 @@ func (d *Discovery) handlePeerFound(ctx context.Context, cancelFind context.Canc case d.set.Size() >= d.set.Limit(): log.Debug("skip handle: enough peers found") return + case d.connector.HasBackoff(peer.ID): + log.Debug("skip handle: backoff") + return } if d.host.Network().Connectedness(peer.ID) == network.Connected { diff --git a/share/availability/discovery/discovery_test.go b/share/availability/discovery/discovery_test.go index bb2ca13b69..bb2d50d608 100644 --- a/share/availability/discovery/discovery_test.go +++ b/share/availability/discovery/discovery_test.go @@ -57,13 +57,8 @@ func TestDiscovery(t *testing.T) { assert.Equal(t, nodes, peerA.set.Size()) - // immediately cut peer from bootstrapper sp it cannot rediscover peers - // helps with flakes - // TODO: Check why backoff does not help - err := peerA.host.Network().ClosePeer(tn.bootstrapper) - require.NoError(t, err) - - for _, peerID := range peerA.host.Network().Peers() { + for _, disc := range discs { + peerID := disc.host.ID() err := peerA.host.Network().ClosePeer(peerID) require.NoError(t, err) From 72548fb6d89ab782f7ad7a9a683f344c9ad69f53 Mon Sep 17 00:00:00 2001 From: Wondertan Date: Fri, 28 Apr 2023 12:25:27 +0200 Subject: [PATCH 32/56] more fixes --- share/availability/discovery/discovery.go | 51 +++++++++---------- .../availability/discovery/discovery_test.go | 4 +- 2 files changed, 25 insertions(+), 30 deletions(-) diff --git a/share/availability/discovery/discovery.go b/share/availability/discovery/discovery.go index d5ae7ee608..ef523d3009 100644 --- a/share/availability/discovery/discovery.go +++ b/share/availability/discovery/discovery.go @@ -122,9 +122,6 @@ func (d *Discovery) handlePeerFound(ctx context.Context, cancelFind context.Canc case peer.ID == d.host.ID(): log.Debug("skip handle: self discovery") return - case d.set.Contains(peer.ID): - log.Debug("skip handle: peer is already in discovery set") - return case len(peer.Addrs) == 0: log.Debug("skip handle: empty address list") return @@ -136,40 +133,41 @@ func (d *Discovery) handlePeerFound(ctx context.Context, cancelFind context.Canc return } - if d.host.Network().Connectedness(peer.ID) == network.Connected { + switch d.host.Network().Connectedness(peer.ID) { + case network.Connected: if !d.set.Add(peer.ID) { log.Debug("skip handle: peer is already in discovery set") return } - - // and notify our subscribers + // notify our subscribers d.onUpdatedPeers(peer.ID, true) log.Debug("added peer to set") + // check if we should cancel discovery if d.set.Size() >= d.set.Limit() { log.Infow("soft peer limit reached", "count", d.set.Size()) cancelFind() } - d.host.ConnManager().Protect(peer.ID, topic) + // we still have to backoff the connected peer d.connector.Backoff(peer.ID) - return - } - - d.connectingLk.Lock() - if _, ok := d.connecting[peer.ID]; ok { - d.connectingLk.Unlock() - log.Debug("skip handle: connecting to the peer in another routine") - return - } - d.connecting[peer.ID] = cancelFind - d.connectingLk.Unlock() - - err := d.connector.Connect(ctx, peer) - if err != nil { + case network.NotConnected: d.connectingLk.Lock() - delete(d.connecting, peer.ID) + if _, ok := d.connecting[peer.ID]; ok { + d.connectingLk.Unlock() + log.Debug("skip 
handle: connecting to the peer in another routine") + return + } + d.connecting[peer.ID] = cancelFind d.connectingLk.Unlock() - log.Debugw("unable to connect", "err", err) - return + + err := d.connector.Connect(ctx, peer) + if err != nil { + d.connectingLk.Lock() + delete(d.connecting, peer.ID) + d.connectingLk.Unlock() + log.Debugw("unable to connect", "err", err) + return + } + log.Debug("started connecting to the peer") } // tag to protect peer from being killed by ConnManager @@ -177,7 +175,6 @@ func (d *Discovery) handlePeerFound(ctx context.Context, cancelFind context.Canc // In the future, we should design a protocol that keeps bidirectional agreement on whether // connection should be kept or not, similar to mesh link in GossipSub. d.host.ConnManager().Protect(peer.ID, topic) - log.Debug("started connecting to the peer") } // ensurePeers ensures we always have 'peerLimit' connected peers. @@ -210,8 +207,6 @@ func (d *Discovery) ensurePeers(ctx context.Context) { if !ok { return } - // listen to disconnect event to remove peer from set and reset backoff time - // reset timer in order to restart the discovery, once stored peer is disconnected evnt := e.(event.EvtPeerConnectednessChanged) switch evnt.Connectedness { case network.NotConnected: @@ -219,7 +214,7 @@ func (d *Discovery) ensurePeers(ctx context.Context) { continue } - d.host.ConnManager().UntagPeer(evnt.Peer, topic) + d.host.ConnManager().Unprotect(evnt.Peer, topic) d.connector.ResetBackoff(evnt.Peer) d.set.Remove(evnt.Peer) d.onUpdatedPeers(evnt.Peer, false) diff --git a/share/availability/discovery/discovery_test.go b/share/availability/discovery/discovery_test.go index bb2d50d608..cf20ff6c53 100644 --- a/share/availability/discovery/discovery_test.go +++ b/share/availability/discovery/discovery_test.go @@ -67,11 +67,11 @@ func TestDiscovery(t *testing.T) { require.Equal(t, peerID, res.peerID) require.False(t, res.isAdded) case <-ctx.Done(): - t.Fatal("did not discover peer in time") + t.Fatal("did not disconnect from peer in time") } } - assert.Less(t, peerA.set.Size(), nodes) + assert.Equal(t, 0, peerA.set.Size()) } type testnet struct { From 2869a4547d851039ee34621744f658e7c0957577 Mon Sep 17 00:00:00 2001 From: Wondertan Date: Fri, 28 Apr 2023 12:37:37 +0200 Subject: [PATCH 33/56] decrease backoff timeout --- share/availability/discovery/backoff.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/share/availability/discovery/backoff.go b/share/availability/discovery/backoff.go index 7c4f44d830..e923216ec1 100644 --- a/share/availability/discovery/backoff.go +++ b/share/availability/discovery/backoff.go @@ -19,7 +19,7 @@ const ( ) var ( - defaultBackoffFactory = backoff.NewFixedBackoff(time.Hour) + defaultBackoffFactory = backoff.NewFixedBackoff(time.Minute * 10) errBackoffNotEnded = errors.New("share/discovery: backoff period has not ended") ) From c723d5d3e6c16236312761ce9ca0263cbd65dedf Mon Sep 17 00:00:00 2001 From: Vlad Date: Fri, 28 Apr 2023 14:39:39 +0300 Subject: [PATCH 34/56] drain ticker and timer channel before reset --- share/availability/discovery/discovery.go | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/share/availability/discovery/discovery.go b/share/availability/discovery/discovery.go index ef523d3009..bd7e6a976a 100644 --- a/share/availability/discovery/discovery.go +++ b/share/availability/discovery/discovery.go @@ -263,6 +263,8 @@ func (d *Discovery) ensurePeers(ctx context.Context) { } t.Reset(d.params.DiscoveryInterval) + // drain all previous ticks 
from channel + drainChannel(t.C) select { case <-t.C: case <-ctx.Done(): @@ -299,6 +301,8 @@ func (d *Discovery) findPeers(ctx context.Context) { var amount int for { ticker.Reset(findPeersStuckWarnDelay) + // drain all previous ticks from channel + drainChannel(ticker.C) select { case <-findCtx.Done(): log.Debugw("found enough peers", "amount", d.set.Size()) @@ -348,6 +352,9 @@ func (d *Discovery) Advertise(ctx context.Context) { select { case <-timer.C: + if !timer.Stop() { + <-timer.C + } timer.Reset(d.params.AdvertiseInterval) continue case <-ctx.Done(): @@ -358,6 +365,9 @@ func (d *Discovery) Advertise(ctx context.Context) { log.Debugf("advertised") select { case <-timer.C: + if !timer.Stop() { + <-timer.C + } timer.Reset(waitF(ttl)) case <-ctx.Done(): return @@ -370,3 +380,13 @@ func (d *Discovery) Advertise(ctx context.Context) { func (d *Discovery) Peers(ctx context.Context) ([]peer.ID, error) { return d.set.Peers(ctx) } + +func drainChannel(c <-chan time.Time) { + for { + select { + case <-c: + default: + return + } + } +} From 439580b7aa0c0a40109385d2d0399fc4b852b32a Mon Sep 17 00:00:00 2001 From: Wondertan Date: Sun, 30 Apr 2023 14:37:24 +0200 Subject: [PATCH 35/56] unflake the test by removing unnecessary stop of the timer --- share/availability/discovery/discovery.go | 6 ------ 1 file changed, 6 deletions(-) diff --git a/share/availability/discovery/discovery.go b/share/availability/discovery/discovery.go index bd7e6a976a..6e427449ce 100644 --- a/share/availability/discovery/discovery.go +++ b/share/availability/discovery/discovery.go @@ -352,9 +352,6 @@ func (d *Discovery) Advertise(ctx context.Context) { select { case <-timer.C: - if !timer.Stop() { - <-timer.C - } timer.Reset(d.params.AdvertiseInterval) continue case <-ctx.Done(): @@ -365,9 +362,6 @@ func (d *Discovery) Advertise(ctx context.Context) { log.Debugf("advertised") select { case <-timer.C: - if !timer.Stop() { - <-timer.C - } timer.Reset(waitF(ttl)) case <-ctx.Done(): return From a4d2b27ec32add9a694ef7ccbf88558924f4f9e2 Mon Sep 17 00:00:00 2001 From: Wondertan Date: Tue, 2 May 2023 11:04:16 +0200 Subject: [PATCH 36/56] log as found only valid peers --- share/availability/discovery/discovery.go | 27 +++++++++++++---------- 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/share/availability/discovery/discovery.go b/share/availability/discovery/discovery.go index 6e427449ce..a91b9ea9bf 100644 --- a/share/availability/discovery/discovery.go +++ b/share/availability/discovery/discovery.go @@ -3,6 +3,7 @@ package discovery import ( "context" "sync" + "sync/atomic" "time" logging "github.com/ipfs/go-log/v2" @@ -116,28 +117,28 @@ func (d *Discovery) WithOnPeersUpdate(f OnUpdatedPeers) { // handlePeersFound receives peers and tries to establish a connection with them. // Peer will be added to PeerCache if connection succeeds. 
-func (d *Discovery) handlePeerFound(ctx context.Context, cancelFind context.CancelFunc, peer peer.AddrInfo) { +func (d *Discovery) handlePeerFound(ctx context.Context, cancelFind context.CancelFunc, peer peer.AddrInfo) bool { log := log.With("peer", peer.ID) switch { case peer.ID == d.host.ID(): log.Debug("skip handle: self discovery") - return + return false case len(peer.Addrs) == 0: log.Debug("skip handle: empty address list") - return + return false case d.set.Size() >= d.set.Limit(): log.Debug("skip handle: enough peers found") - return + return false case d.connector.HasBackoff(peer.ID): log.Debug("skip handle: backoff") - return + return false } switch d.host.Network().Connectedness(peer.ID) { case network.Connected: if !d.set.Add(peer.ID) { log.Debug("skip handle: peer is already in discovery set") - return + return false } // notify our subscribers d.onUpdatedPeers(peer.ID, true) @@ -154,7 +155,7 @@ func (d *Discovery) handlePeerFound(ctx context.Context, cancelFind context.Canc if _, ok := d.connecting[peer.ID]; ok { d.connectingLk.Unlock() log.Debug("skip handle: connecting to the peer in another routine") - return + return false } d.connecting[peer.ID] = cancelFind d.connectingLk.Unlock() @@ -165,7 +166,7 @@ func (d *Discovery) handlePeerFound(ctx context.Context, cancelFind context.Canc delete(d.connecting, peer.ID) d.connectingLk.Unlock() log.Debugw("unable to connect", "err", err) - return + return false } log.Debug("started connecting to the peer") } @@ -175,6 +176,7 @@ func (d *Discovery) handlePeerFound(ctx context.Context, cancelFind context.Canc // In the future, we should design a protocol that keeps bidirectional agreement on whether // connection should be kept or not, similar to mesh link in GossipSub. d.host.ConnManager().Protect(peer.ID, topic) + return true } // ensurePeers ensures we always have 'peerLimit' connected peers. 
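[Reviewer aside] The hunks around this point change handlePeerFound to report whether a found peer was actually handled, so findPeers can count successes instead of raw FindPeers results. Below is a minimal, self-contained sketch of that counting pattern, not code from this series; names like wantPeers and handle are illustrative stand-ins:

package main

import (
	"context"
	"fmt"
	"sync/atomic"

	"golang.org/x/sync/errgroup"
)

// handle stands in for Discovery.handlePeerFound; here it "handles" even IDs only.
func handle(id int) bool { return id%2 == 0 }

func main() {
	const wantPeers = 3

	// found mimics the channel returned by discovery FindPeers.
	found := make(chan int)
	go func() {
		defer close(found)
		for id := 0; id < 10; id++ {
			found <- id
		}
	}()

	findCtx, findCancel := context.WithCancel(context.Background())
	defer findCancel()

	var wg errgroup.Group
	wg.SetLimit(wantPeers) // bound concurrent connection attempts

	var amount atomic.Int32
	for id := range found {
		id := id
		wg.Go(func() error {
			if findCtx.Err() != nil {
				return nil // already found enough peers, skip
			}
			if !handle(id) {
				return nil // count only peers that were actually handled
			}
			if amount.Add(1) >= wantPeers {
				findCancel() // enough peers: stop the find loop
			}
			return nil
		})
	}
	_ = wg.Wait()
	fmt.Println("handled peers:", amount.Load())
}

As in the patches, the count can overshoot slightly, since workers race to Add before the cancellation propagates; this is why the set treats peersLimit as a soft limit.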
@@ -298,7 +300,7 @@ func (d *Discovery) findPeers(ctx context.Context) {
 	ticker := time.NewTicker(findPeersStuckWarnDelay)
 	defer ticker.Stop()
 
-	var amount int
+	var amount atomic.Int32
 	for {
 		ticker.Reset(findPeersStuckWarnDelay)
 		// drain all previous ticks from channel
@@ -316,9 +318,7 @@ func (d *Discovery) findPeers(ctx context.Context) {
 			return
 		}
 
-		amount++
 		peer := p
-		log.Debugw("found peer", "found_amount", amount)
 		wg.Go(func() error {
 			if findCtx.Err() != nil {
 				log.Debug("find has been canceled, skip peer")
@@ -326,7 +326,10 @@ func (d *Discovery) findPeers(ctx context.Context) {
 			}
 			// pass the cancel so that we cancel FindPeers when we connected to enough peers
 			// we don't pass findCtx so that we don't cancel outgoing connections
-			d.handlePeerFound(ctx, findCancel, peer)
+			if d.handlePeerFound(ctx, findCancel, peer) {
+				amount.Add(1)
+				log.Debugw("found peer", "peer", peer.ID, "found_amount", amount.Load())
+			}
 			return nil
 		})
 	}

From a0b0c1cc9bea6030065c29b5e495e24ed67785fd Mon Sep 17 00:00:00 2001
From: Vlad
Date: Tue, 2 May 2023 12:38:44 +0300
Subject: [PATCH 37/56] add metrics for discovery

---
 nodebuilder/settings.go                   |   1 +
 nodebuilder/share/opts.go                 |  11 +-
 share/availability/discovery/discovery.go |  19 ++-
 share/availability/discovery/metrics.go   | 178 ++++++++++++++++++++++
 4 files changed, 204 insertions(+), 5 deletions(-)
 create mode 100644 share/availability/discovery/metrics.go

diff --git a/nodebuilder/settings.go b/nodebuilder/settings.go
index 0ef19d8fba..18cb55dc85 100644
--- a/nodebuilder/settings.go
+++ b/nodebuilder/settings.go
@@ -70,6 +70,7 @@ func WithMetrics(metricOpts []otlpmetrichttp.Option, nodeType node.Type) fx.Opti
 		fx.Invoke(fraud.WithMetrics),
 		fx.Invoke(node.WithMetrics),
 		fx.Invoke(modheader.WithMetrics),
+		fx.Invoke(share.WithDiscoveryMetrics),
 	)
 
 	var opts fx.Option
diff --git a/nodebuilder/share/opts.go b/nodebuilder/share/opts.go
index 93a9f4c4d5..03d5659c95 100644
--- a/nodebuilder/share/opts.go
+++ b/nodebuilder/share/opts.go
@@ -1,11 +1,18 @@
 package share
 
 import (
+	disc "github.com/celestiaorg/celestia-node/share/availability/discovery"
 	"github.com/celestiaorg/celestia-node/share/p2p/peers"
 )
 
-// WithPeerManagerMetrics is a utility function that is expected to be
-// "invoked" by the fx lifecycle.
+// WithPeerManagerMetrics is a utility function that turns on peer manager metrics.
+// It is expected to be "invoked" by the fx lifecycle.
 func WithPeerManagerMetrics(m *peers.Manager) error {
 	return m.WithMetrics()
 }
+
+// WithDiscoveryMetrics is a utility function that turns on discovery metrics.
+// It is expected to be "invoked" by the fx lifecycle.
+func WithDiscoveryMetrics(d *disc.Discovery) error {
+	return d.WithMetrics()
+}
diff --git a/share/availability/discovery/discovery.go b/share/availability/discovery/discovery.go
index bd7e6a976a..b538e58c03 100644
--- a/share/availability/discovery/discovery.go
+++ b/share/availability/discovery/discovery.go
@@ -71,10 +71,11 @@ type Discovery struct {
 	connectingLk sync.Mutex
 	connecting   map[peer.ID]context.CancelFunc
 
-	cancel context.CancelFunc
+	metrics *metrics
+	cancel  context.CancelFunc
 }
 
 type OnUpdatedPeers func(peerID peer.ID, isAdded bool)
 
 // NewDiscovery constructs a new discovery.
 func NewDiscovery(
@@ -120,15 +121,19 @@ func (d *Discovery) handlePeerFound(ctx context.Context, cancelFind context.Canc
 	log := log.With("peer", peer.ID)
 	switch {
 	case peer.ID == d.host.ID():
+		d.metrics.observeHandlePeer(handlePeerSkipSelf)
 		log.Debug("skip handle: self discovery")
 		return
 	case len(peer.Addrs) == 0:
+		d.metrics.observeHandlePeer(handlePeerEmptyAddrs)
 		log.Debug("skip handle: empty address list")
 		return
 	case d.set.Size() >= d.set.Limit():
+		d.metrics.observeHandlePeer(handlePeerEnoughPeers)
 		log.Debug("skip handle: enough peers found")
 		return
 	case d.connector.HasBackoff(peer.ID):
+		d.metrics.observeHandlePeer(handlePeerBackoff)
 		log.Debug("skip handle: backoff")
 		return
 	}
@@ -136,11 +141,13 @@ func (d *Discovery) handlePeerFound(ctx context.Context, cancelFind context.Canc
 	switch d.host.Network().Connectedness(peer.ID) {
 	case network.Connected:
 		if !d.set.Add(peer.ID) {
+			d.metrics.observeHandlePeer(handlePeerInSet)
 			log.Debug("skip handle: peer is already in discovery set")
 			return
 		}
 		// notify our subscribers
 		d.onUpdatedPeers(peer.ID, true)
+		d.metrics.observeHandlePeer(handlePeerConnected)
 		log.Debug("added peer to set")
 		// check if we should cancel discovery
 		if d.set.Size() >= d.set.Limit() {
@@ -153,6 +160,7 @@ func (d *Discovery) handlePeerFound(ctx context.Context, cancelFind context.Canc
 		d.connectingLk.Lock()
 		if _, ok := d.connecting[peer.ID]; ok {
 			d.connectingLk.Unlock()
+			d.metrics.observeHandlePeer(handlePeerConnInProgress)
 			log.Debug("skip handle: connecting to the peer in another routine")
 			return
 		}
@@ -164,9 +172,11 @@ func (d *Discovery) handlePeerFound(ctx context.Context, cancelFind context.Canc
 			d.connectingLk.Lock()
 			delete(d.connecting, peer.ID)
 			d.connectingLk.Unlock()
+			d.metrics.observeHandlePeer(handlePeerConnErr)
 			log.Debugw("unable to connect", "err", err)
 			return
 		}
+		d.metrics.observeHandlePeer(handlePeerConnect)
 		log.Debug("started connecting to the peer")
 	}
 
@@ -258,7 +268,7 @@ func (d *Discovery) ensurePeers(ctx context.Context) {
 	for {
 		d.findPeers(ctx)
 		if d.set.Size() < d.set.Limit() {
-			// rerun discovery is amount of peers didn't reach the limit
+			// rerun discovery if amount of peers didn't reach the limit
 			continue
 		}
 
@@ -305,6 +315,7 @@ func (d *Discovery) findPeers(ctx context.Context) {
 		drainChannel(ticker.C)
 		select {
 		case <-findCtx.Done():
+			d.metrics.observeFindPeers(ctx.Err() != nil, findCtx.Err() != nil)
 			log.Debugw("found enough peers", "amount", d.set.Size())
 			return
 		case <-ticker.C:
@@ -312,6 +323,7 @@ func (d *Discovery) findPeers(ctx context.Context) {
 			continue
 		case p, ok := <-peers:
 			if !ok {
+				d.metrics.observeFindPeers(ctx.Err() != nil, findCtx.Err() != nil)
 				log.Debugw("discovery channel closed", "find_is_canceled", findCtx.Err() != nil)
 				return
 			}
@@ -344,6 +356,7 @@ func (d *Discovery) Advertise(ctx context.Context) {
 	defer timer.Stop()
 	for {
 		ttl, err := d.disc.Advertise(ctx, topic)
+		d.metrics.observeAdvertise(err)
 		if err != nil {
 			log.Debugf("Error advertising %s: %s", topic, err.Error())
 			if ctx.Err() != nil {
diff --git a/share/availability/discovery/metrics.go b/share/availability/discovery/metrics.go
new file mode 100644
index 0000000000..3b3597ac77
--- /dev/null
+++ b/share/availability/discovery/metrics.go
@@ -0,0 +1,178 @@
+package discovery
+
+import (
+	"context"
+	"fmt"
+	"time"
+
+	"github.com/libp2p/go-libp2p/core/peer"
+	"go.opentelemetry.io/otel/attribute"
+	"go.opentelemetry.io/otel/metric/global"
+	"go.opentelemetry.io/otel/metric/instrument"
+	"go.opentelemetry.io/otel/metric/instrument/asyncint64"
+	"go.opentelemetry.io/otel/metric/instrument/syncint64"
+)
+
+const (
+	observeTimeout = 100 * time.Millisecond
+
+	findPeersCanceledReasonKey                                = "cancel_reason"
+	findPeersCanceledByEnoughPeers      findPeersCancelReason = "enough_peers"
+	findPeersCanceledByDiscoveryStopped findPeersCancelReason = "discovery_stopped"
+	findPeersCanceledByExternalCancel   findPeersCancelReason = "external_cancel"
+
+	handlePeerResultKey                       = "result"
+	handlePeerSkipSelf       handlePeerResult = "skip_self"
+	handlePeerEmptyAddrs     handlePeerResult = "skip_empty_addresses"
+	handlePeerEnoughPeers    handlePeerResult = "skip_enough_peers"
+	handlePeerBackoff        handlePeerResult = "skip_backoff"
+	handlePeerConnected      handlePeerResult = "connected"
+	handlePeerInSet          handlePeerResult = "in_set"
+	handlePeerConnect        handlePeerResult = "connect"
+	handlePeerConnInProgress handlePeerResult = "conn_in_progress"
+	handlePeerConnErr        handlePeerResult = "conn_err"
+
+	advertiseErrKey = "is_err"
+)
+
+var (
+	meter = global.MeterProvider().Meter("share_discovery")
+)
+
+type findPeersCancelReason string
+
+type handlePeerResult string
+
+type metrics struct {
+	peersAmount      asyncint64.Gauge
+	findPeersResult  syncint64.Counter // attributes: cancel_reason[enough_peers,discovery_stopped,external_cancel]
+	handlePeerResult syncint64.Counter // attributes: result
+	advertise        syncint64.Counter // attributes: is_err[yes/no]
+	peerAdded        syncint64.Counter
+	peerRemoved      syncint64.Counter
+}
+
+// WithMetrics turns on metric collection in discovery.
+func (d *Discovery) WithMetrics() error {
+	metrics, err := initMetrics(d)
+	if err != nil {
+		return fmt.Errorf("discovery: init metrics: %w", err)
+	}
+	d.metrics = metrics
+	d.WithOnPeersUpdate(metrics.observeOnPeersUpdate)
+	return nil
+}
+
+func initMetrics(d *Discovery) (*metrics, error) {
+	peersAmount, err := meter.AsyncInt64().Gauge("discovery_amount_of_peers",
+		instrument.WithDescription("amount of peers in discovery set"))
+	if err != nil {
+		return nil, err
+	}
+
+	findPeersResult, err := meter.SyncInt64().Counter("discovery_find_peers_result",
+		instrument.WithDescription("result of find peers run"))
+	if err != nil {
+		return nil, err
+	}
+
+	handlePeerResult, err := meter.SyncInt64().Counter("discovery_handler_peer_result",
+		instrument.WithDescription("result of handling found peer"))
+	if err != nil {
+		return nil, err
+	}
+
+	advertise, err := meter.SyncInt64().Counter("discovery_advertise_event",
+		instrument.WithDescription("advertise events counter"))
+	if err != nil {
+		return nil, err
+	}
+
+	peerAdded, err := meter.SyncInt64().Counter("discovery_add_peer",
+		instrument.WithDescription("add peer to discovery set counter"))
+	if err != nil {
+		return nil, err
+	}
+
+	peerRemoved, err := meter.SyncInt64().Counter("discovery_remove_peer",
+		instrument.WithDescription("remove peer from discovery set counter"))
+	if err != nil {
+		return nil, err
+	}
+
+	metrics := &metrics{
+		peersAmount:      peersAmount,
+		findPeersResult:  findPeersResult,
+		handlePeerResult: handlePeerResult,
+		advertise:        advertise,
+		peerAdded:        peerAdded,
+		peerRemoved:      peerRemoved,
+	}
+
+	err = meter.RegisterCallback(
+		[]instrument.Asynchronous{
+			peersAmount,
+		},
+		func(ctx context.Context) {
+			peersAmount.Observe(ctx, int64(d.set.Size()))
+		},
+	)
+	if err != nil {
+		return nil, fmt.Errorf("registering metrics callback: %w", err)
+	}
+	return metrics, nil
+}
+
+func (m *metrics) observeFindPeers(isGlobalCancel, isEnoughPeers bool) {
+	if m == nil {
+		return
+	}
+	ctx, cancel := context.WithTimeout(context.Background(),
observeTimeout) + defer cancel() + + reason := findPeersCanceledByDiscoveryStopped + switch { + case isGlobalCancel: + reason = findPeersCanceledByExternalCancel + case isEnoughPeers: + reason = findPeersCanceledByEnoughPeers + } + m.findPeersResult.Add(ctx, 1, + attribute.String(findPeersCanceledReasonKey, string(reason))) +} + +func (m *metrics) observeHandlePeer(result handlePeerResult) { + if m == nil { + return + } + ctx, cancel := context.WithTimeout(context.Background(), observeTimeout) + defer cancel() + + m.handlePeerResult.Add(ctx, 1, + attribute.String(handlePeerResultKey, string(result))) +} + +func (m *metrics) observeAdvertise(err error) { + if m == nil { + return + } + ctx, cancel := context.WithTimeout(context.Background(), observeTimeout) + defer cancel() + + m.advertise.Add(ctx, 1, + attribute.Bool(advertiseErrKey, err != nil)) +} + +func (m *metrics) observeOnPeersUpdate(_ peer.ID, isAdded bool) { + if m == nil { + return + } + ctx, cancel := context.WithTimeout(context.Background(), observeTimeout) + defer cancel() + + if isAdded { + m.peerAdded.Add(ctx, 1) + return + } + m.peerRemoved.Add(ctx, 1) +} From dd8abcf0cb33aab5c4283b11782bec7fefcb0bf0 Mon Sep 17 00:00:00 2001 From: Vlad Date: Tue, 2 May 2023 13:08:59 +0300 Subject: [PATCH 38/56] fix shadow read for add peers to set --- share/availability/discovery/discovery.go | 89 +++++++++++------------ 1 file changed, 43 insertions(+), 46 deletions(-) diff --git a/share/availability/discovery/discovery.go b/share/availability/discovery/discovery.go index a91b9ea9bf..f036d1fcfd 100644 --- a/share/availability/discovery/discovery.go +++ b/share/availability/discovery/discovery.go @@ -134,32 +134,23 @@ func (d *Discovery) handlePeerFound(ctx context.Context, cancelFind context.Canc return false } + d.connectingLk.Lock() + if _, ok := d.connecting[peer.ID]; ok { + d.connectingLk.Unlock() + log.Debug("skip handle: connecting to the peer in another routine") + return false + } + d.connecting[peer.ID] = cancelFind + d.connectingLk.Unlock() + switch d.host.Network().Connectedness(peer.ID) { case network.Connected: - if !d.set.Add(peer.ID) { - log.Debug("skip handle: peer is already in discovery set") - return false - } - // notify our subscribers - d.onUpdatedPeers(peer.ID, true) - log.Debug("added peer to set") - // check if we should cancel discovery - if d.set.Size() >= d.set.Limit() { - log.Infow("soft peer limit reached", "count", d.set.Size()) - cancelFind() + if added := d.addPeerToSet(peer.ID); !added { + return added } // we still have to backoff the connected peer d.connector.Backoff(peer.ID) case network.NotConnected: - d.connectingLk.Lock() - if _, ok := d.connecting[peer.ID]; ok { - d.connectingLk.Unlock() - log.Debug("skip handle: connecting to the peer in another routine") - return false - } - d.connecting[peer.ID] = cancelFind - d.connectingLk.Unlock() - err := d.connector.Connect(ctx, peer) if err != nil { d.connectingLk.Lock() @@ -223,32 +214,7 @@ func (d *Discovery) ensurePeers(ctx context.Context) { log.Debugw("removed peer from the peer set", "peer", evnt.Peer, "status", evnt.Connectedness.String()) case network.Connected: - peerID := evnt.Peer - d.connectingLk.Lock() - cancelFind, ok := d.connecting[peerID] - d.connectingLk.Unlock() - if !ok { - continue - } - - if !d.set.Add(peerID) { - continue - } - log.Debugw("added peer to set", "id", peerID) - // and notify our subscribers - d.onUpdatedPeers(peerID, true) - - // first do Add and only after check the limit - // so that peer set represents the actual 
number of connections we made - // which can go slightly over peersLimit - if d.set.Size() >= d.set.Limit() { - log.Infow("soft peer limit reached", "count", d.set.Size()) - cancelFind() - } - - d.connectingLk.Lock() - delete(d.connecting, peerID) - d.connectingLk.Unlock() + d.addPeerToSet(evnt.Peer) } } } @@ -275,6 +241,37 @@ func (d *Discovery) ensurePeers(ctx context.Context) { } } +func (d *Discovery) addPeerToSet(peerID peer.ID) bool { + d.connectingLk.Lock() + cancelFind, ok := d.connecting[peerID] + d.connectingLk.Unlock() + if !ok { + return false + } + + if !d.set.Add(peerID) { + log.Debug("skip: peer is already in discovery set") + return false + } + + // and notify our subscribers + d.onUpdatedPeers(peerID, true) + log.Debugw("added peer to set", "id", peerID) + + // first do Add and only after check the limit + // so that peer set represents the actual number of connections we made + // which can go slightly over peersLimit + if d.set.Size() >= d.set.Limit() { + log.Infow("soft peer limit reached", "count", d.set.Size()) + cancelFind() + } + + d.connectingLk.Lock() + delete(d.connecting, peerID) + d.connectingLk.Unlock() + return true +} + func (d *Discovery) findPeers(ctx context.Context) { if d.set.Size() >= d.set.Limit() { log.Debugw("reached soft peer limit, skipping discovery", "size", d.set.Size()) From 32050ebbe31a4472aefbb8aa89f6e56adf2233b1 Mon Sep 17 00:00:00 2001 From: Wondertan Date: Tue, 2 May 2023 13:03:26 +0200 Subject: [PATCH 39/56] another backoff cleanup --- share/availability/discovery/backoff.go | 48 +++++--------------- share/availability/discovery/backoff_test.go | 2 +- share/availability/discovery/discovery.go | 2 +- 3 files changed, 14 insertions(+), 38 deletions(-) diff --git a/share/availability/discovery/backoff.go b/share/availability/discovery/backoff.go index e923216ec1..8ce00d07b8 100644 --- a/share/availability/discovery/backoff.go +++ b/share/availability/discovery/backoff.go @@ -64,15 +64,23 @@ func (b *backoffConnector) Connect(ctx context.Context, p peer.AddrInfo) error { return err } +// Backoff adds or extends backoff delay for the peer. func (b *backoffConnector) Backoff(p peer.ID) { - data := b.backoffData(p) - data.nexttry = time.Now().Add(data.backoff.Delay()) - b.cacheLk.Lock() + defer b.cacheLk.Unlock() + + data, ok := b.cacheData[p] + if !ok { + data = backoffData{} + data.backoff = b.backoff() + b.cacheData[p] = data + } + + data.nexttry = time.Now().Add(data.backoff.Delay()) b.cacheData[p] = data - b.cacheLk.Unlock() } +// HasBackoff checks if peer is in backoff. func (b *backoffConnector) HasBackoff(p peer.ID) bool { b.cacheLk.Lock() cache, ok := b.cacheData[p] @@ -83,38 +91,6 @@ func (b *backoffConnector) HasBackoff(p peer.ID) bool { return false } -// backoffData returns backoffData from the map if it was stored, otherwise it will instantiate -// a new one. -func (b *backoffConnector) backoffData(p peer.ID) backoffData { - b.cacheLk.Lock() - defer b.cacheLk.Unlock() - - cache, ok := b.cacheData[p] - if !ok { - cache = backoffData{} - cache.backoff = b.backoff() - b.cacheData[p] = cache - } - return cache -} - -// RemoveBackoff removes peer from the backoffCache. It is called when we cancel the attempt to -// connect to a peer after calling Connect. -func (b *backoffConnector) RemoveBackoff(p peer.ID) { - b.cacheLk.Lock() - defer b.cacheLk.Unlock() - delete(b.cacheData, p) -} - -// ResetBackoff resets delay time between attempts and adds a delay for the next connection -// attempt to remote peer. 
It will mostly be called when host receives a notification that remote -// peer was disconnected. -func (b *backoffConnector) ResetBackoff(p peer.ID) { - cache := b.backoffData(p) - cache.backoff.Reset() - b.Backoff(p) -} - func (b *backoffConnector) GC(ctx context.Context) { ticker := time.NewTicker(gcInterval) for { diff --git a/share/availability/discovery/backoff_test.go b/share/availability/discovery/backoff_test.go index 61a639cde3..24814ed199 100644 --- a/share/availability/discovery/backoff_test.go +++ b/share/availability/discovery/backoff_test.go @@ -42,6 +42,6 @@ func TestBackoff_ResetBackoffPeriod(t *testing.T) { info := host.InfoFromHost(m.Hosts()[1]) require.NoError(t, b.Connect(ctx, *info)) nexttry := b.cacheData[info.ID].nexttry - b.ResetBackoff(info.ID) + b.Backoff(info.ID) require.True(t, b.cacheData[info.ID].nexttry.After(nexttry)) } diff --git a/share/availability/discovery/discovery.go b/share/availability/discovery/discovery.go index f036d1fcfd..3e0b28e3ee 100644 --- a/share/availability/discovery/discovery.go +++ b/share/availability/discovery/discovery.go @@ -208,7 +208,7 @@ func (d *Discovery) ensurePeers(ctx context.Context) { } d.host.ConnManager().Unprotect(evnt.Peer, topic) - d.connector.ResetBackoff(evnt.Peer) + d.connector.Backoff(evnt.Peer) d.set.Remove(evnt.Peer) d.onUpdatedPeers(evnt.Peer, false) log.Debugw("removed peer from the peer set", From de865a912e0e883440b6fd06e2765e9ba80a96f4 Mon Sep 17 00:00:00 2001 From: Wondertan Date: Tue, 2 May 2023 13:35:12 +0200 Subject: [PATCH 40/56] small fix --- share/availability/discovery/discovery.go | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/share/availability/discovery/discovery.go b/share/availability/discovery/discovery.go index 3e0b28e3ee..5286cd1668 100644 --- a/share/availability/discovery/discovery.go +++ b/share/availability/discovery/discovery.go @@ -248,15 +248,20 @@ func (d *Discovery) addPeerToSet(peerID peer.ID) bool { if !ok { return false } + defer func() { + d.connectingLk.Lock() + delete(d.connecting, peerID) + d.connectingLk.Unlock() + }() if !d.set.Add(peerID) { - log.Debug("skip: peer is already in discovery set") + log.Debugw("peer is already in discovery set", "peer", peerID) return false } // and notify our subscribers d.onUpdatedPeers(peerID, true) - log.Debugw("added peer to set", "id", peerID) + log.Debugw("added peer to set", "peer", peerID) // first do Add and only after check the limit // so that peer set represents the actual number of connections we made @@ -266,9 +271,6 @@ func (d *Discovery) addPeerToSet(peerID peer.ID) bool { cancelFind() } - d.connectingLk.Lock() - delete(d.connecting, peerID) - d.connectingLk.Unlock() return true } From 0b56b7993f3d55775f0409490023ea74a2774c3d Mon Sep 17 00:00:00 2001 From: Vlad Date: Tue, 2 May 2023 14:37:59 +0300 Subject: [PATCH 41/56] add delay for findpeers retry --- share/availability/discovery/discovery.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/share/availability/discovery/discovery.go b/share/availability/discovery/discovery.go index 5286cd1668..c59f84b934 100644 --- a/share/availability/discovery/discovery.go +++ b/share/availability/discovery/discovery.go @@ -227,10 +227,11 @@ func (d *Discovery) ensurePeers(ctx context.Context) { d.findPeers(ctx) if d.set.Size() < d.set.Limit() { // rerun discovery is amount of peers didn't reach the limit - continue + t.Reset(time.Second) + } else { + t.Reset(d.params.DiscoveryInterval) } - t.Reset(d.params.DiscoveryInterval) // drain 
all previous ticks from channel
 		drainChannel(t.C)
 		select {
 		case <-t.C:

From 07961475f6eece0da8d998924f52664019a6cc4f Mon Sep 17 00:00:00 2001
From: Vlad
Date: Tue, 2 May 2023 14:43:44 +0300
Subject: [PATCH 42/56] make findPeers fast retry delay a var to modify it in
 tests

---
 share/availability/discovery/discovery.go | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/share/availability/discovery/discovery.go b/share/availability/discovery/discovery.go
index c59f84b934..4e502c65d2 100644
--- a/share/availability/discovery/discovery.go
+++ b/share/availability/discovery/discovery.go
@@ -18,6 +18,10 @@ import (
 
 var log = logging.Logger("share/discovery")
 
+// if findPeers doesn't find enough peers, it will retry after the findPeersFastRetryDelay
+// delay
+var findPeersFastRetryDelay = time.Second
+
 const (
 	topic = "full"
 
@@ -227,7 +231,7 @@ func (d *Discovery) ensurePeers(ctx context.Context) {
 		d.findPeers(ctx)
 		if d.set.Size() < d.set.Limit() {
 			// rerun discovery if amount of peers didn't reach the limit
-			t.Reset(time.Second)
+			t.Reset(findPeersFastRetryDelay)
 		} else {
 			t.Reset(d.params.DiscoveryInterval)
 		}

From 8eed66824d67cfd7802e011bec4db147b7401cf2 Mon Sep 17 00:00:00 2001
From: Wondertan
Date: Tue, 2 May 2023 13:43:52 +0200
Subject: [PATCH 43/56] fix comment

---
 share/availability/discovery/discovery.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/share/availability/discovery/discovery.go b/share/availability/discovery/discovery.go
index 4e502c65d2..386a700f53 100644
--- a/share/availability/discovery/discovery.go
+++ b/share/availability/discovery/discovery.go
@@ -286,7 +286,7 @@ func (d *Discovery) findPeers(ctx context.Context) {
 	}
 	log.Infow("below soft peer limit, discovering peers", "remaining", d.set.Limit()-d.set.Size())
 
-	// we use errgroup as it obeys the context
+	// we use errgroup as it provides limits
 	var wg errgroup.Group
 	// limit to minimize chances of overreaching the limit
 	wg.SetLimit(d.set.Limit())

From 38d97e9712f88bfd4f96030754560fdc1908028d Mon Sep 17 00:00:00 2001
From: Wondertan
Date: Tue, 2 May 2023 14:20:55 +0200
Subject: [PATCH 44/56] notify when we actually need the discovery, instead of
 periodic checking and rebrand the discovery interval

---
 nodebuilder/tests/p2p_test.go                 |  2 +-
 share/availability/discovery/discovery.go     | 74 +++++++++++--------
 .../availability/discovery/discovery_test.go  | 12 +--
 share/availability/full/testing.go            |  6 +-
 share/getters/shrex_test.go                   |  6 +-
 share/p2p/peers/manager_test.go               | 18 ++---
 6 files changed, 67 insertions(+), 51 deletions(-)

diff --git a/nodebuilder/tests/p2p_test.go b/nodebuilder/tests/p2p_test.go
index 497b3b76a5..c4d81a9939 100644
--- a/nodebuilder/tests/p2p_test.go
+++ b/nodebuilder/tests/p2p_test.go
@@ -215,6 +215,6 @@ func TestRestartNodeDiscovery(t *testing.T) {
 
 func setTimeInterval(cfg *nodebuilder.Config, interval time.Duration) {
 	cfg.P2P.RoutingTableRefreshPeriod = interval
-	cfg.Share.Discovery.DiscoveryInterval = interval
+	cfg.Share.Discovery.DiscoveryRetryTimeout = interval
 	cfg.Share.Discovery.AdvertiseInterval = interval
 }
diff --git a/share/availability/discovery/discovery.go b/share/availability/discovery/discovery.go
index 386a700f53..86f690c4da 100644
--- a/share/availability/discovery/discovery.go
+++ b/share/availability/discovery/discovery.go
@@ -18,10 +18,6 @@ 
 
 var log = logging.Logger("share/discovery")
 
-// if findPeers doesn't find enough peers, it will retry after the findPeersFastRetryDelay
-// delay
-var findPeersFastRetryDelay = time.Second
-
 const (
 	topic = "full"
 
@@ -41,9 +37,10 @@ type Parameters struct {
 	// PeersLimit defines the soft limit of FNs to connect to via discovery.
 	// Set 0 to disable.
 	PeersLimit int
-	// DiscoveryInterval is an interval between discovery sessions.
+	// DiscoveryRetryTimeout is an interval between discovery attempts
+	// when we discover fewer than PeersLimit peers.
 	// Set -1 to disable.
-	DiscoveryInterval time.Duration
+	DiscoveryRetryTimeout time.Duration
 	// AdvertiseInterval is a interval between advertising sessions.
 	// Set -1 to disable.
 	// NOTE: only full and bridge can advertise themselves.
@@ -52,9 +49,9 @@ type Parameters struct {
 
 func DefaultParameters() Parameters {
 	return Parameters{
-		PeersLimit:        5,
-		DiscoveryInterval: time.Minute,
-		AdvertiseInterval: time.Hour * 8,
+		PeersLimit:            5,
+		DiscoveryRetryTimeout: time.Second * 1,
+		AdvertiseInterval:     time.Hour * 8,
 	}
 }
 
@@ -76,6 +73,8 @@ type Discovery struct {
 	connectingLk sync.Mutex
 	connecting   map[peer.ID]context.CancelFunc
 
+	triggerDisq chan struct{}
+
 	cancel context.CancelFunc
 }
 
@@ -95,6 +94,7 @@ func NewDiscovery(
 		connector:      newBackoffConnector(h, defaultBackoffFactory),
 		onUpdatedPeers: func(peer.ID, bool) {},
 		connecting:     make(map[peer.ID]context.CancelFunc),
+		triggerDisq:    make(chan struct{}, 1),
 	}
 }
 
@@ -119,6 +119,13 @@ func (d *Discovery) WithOnPeersUpdate(f OnUpdatedPeers) {
 	}
 }
 
+func (d *Discovery) triggerDiscovery() {
+	select {
+	case d.triggerDisq <- struct{}{}:
+	default:
+	}
+}
+
 // handlePeersFound receives peers and tries to establish a connection with them.
 // Peer will be added to PeerCache if connection succeeds.
 func (d *Discovery) handlePeerFound(ctx context.Context, cancelFind context.CancelFunc, peer peer.AddrInfo) bool {
@@ -178,8 +185,8 @@ func (d *Discovery) handlePeerFound(ctx context.Context, cancelFind context.Canc
 // It starts peer discovery every 30 seconds until peer cache reaches peersLimit.
 // Discovery is restarted if any previously connected peers disconnect.
 func (d *Discovery) ensurePeers(ctx context.Context) {
-	if d.params.PeersLimit == 0 || d.params.DiscoveryInterval == -1 {
-		log.Warn("peers limit is set to 0 and/or discovery interval is set to -1. Skipping discovery...")
+	if d.params.PeersLimit == 0 {
+		log.Warn("peers limit is set to 0. 
Skipping discovery...") return } // subscribe on EventBus in order to catch disconnected peers and restart @@ -217,6 +224,10 @@ func (d *Discovery) ensurePeers(ctx context.Context) { d.onUpdatedPeers(evnt.Peer, false) log.Debugw("removed peer from the peer set", "peer", evnt.Peer, "status", evnt.Connectedness.String()) + + if d.set.Size() < d.set.Limit() { + d.triggerDiscovery() + } case network.Connected: d.addPeerToSet(evnt.Peer) } @@ -225,21 +236,24 @@ func (d *Discovery) ensurePeers(ctx context.Context) { }() go d.connector.GC(ctx) - t := time.NewTicker(d.params.DiscoveryInterval) + t := time.NewTicker(d.params.DiscoveryRetryTimeout) defer t.Stop() for { - d.findPeers(ctx) - if d.set.Size() < d.set.Limit() { - // rerun discovery is amount of peers didn't reach the limit - t.Reset(findPeersFastRetryDelay) - } else { - t.Reset(d.params.DiscoveryInterval) - } - // drain all previous ticks from channel drainChannel(t.C) select { case <-t.C: + found := d.findPeers(ctx) + if !found { + // rerun discovery if amount of peers didn't reach the limit + continue + } + case <-ctx.Done(): + return + } + + select { + case <-d.triggerDisq: case <-ctx.Done(): return } @@ -279,12 +293,14 @@ func (d *Discovery) addPeerToSet(peerID peer.ID) bool { return true } -func (d *Discovery) findPeers(ctx context.Context) { - if d.set.Size() >= d.set.Limit() { - log.Debugw("reached soft peer limit, skipping discovery", "size", d.set.Size()) - return +func (d *Discovery) findPeers(ctx context.Context) bool { + size := d.set.Size() + want := d.set.Limit() - size + if want == 0 { + log.Debugw("reached soft peer limit, skipping discovery", "size", size) + return true } - log.Infow("below soft peer limit, discovering peers", "remaining", d.set.Limit()-d.set.Size()) + log.Infow("below soft peer limit, discovering peers", "remaining", want) // we use errgroup as it provide limits var wg errgroup.Group @@ -299,7 +315,7 @@ func (d *Discovery) findPeers(ctx context.Context) { peers, err := d.disc.FindPeers(findCtx, topic) if err != nil { log.Error("unable to start discovery", "err", err) - return + return false } ticker := time.NewTicker(findPeersStuckWarnDelay) @@ -311,15 +327,15 @@ func (d *Discovery) findPeers(ctx context.Context) { drainChannel(ticker.C) select { case <-findCtx.Done(): - log.Debugw("found enough peers", "amount", d.set.Size()) - return + log.Debugw("found wanted peers", "wanted", want) + return int(amount.Load()) == want case <-ticker.C: log.Warn("wasn't able to find new peers for long time") continue case p, ok := <-peers: if !ok { log.Debugw("discovery channel closed", "find_is_canceled", findCtx.Err() != nil) - return + return int(amount.Load()) == want } peer := p diff --git a/share/availability/discovery/discovery_test.go b/share/availability/discovery/discovery_test.go index cf20ff6c53..44164f361b 100644 --- a/share/availability/discovery/discovery_test.go +++ b/share/availability/discovery/discovery_test.go @@ -24,9 +24,9 @@ func TestDiscovery(t *testing.T) { tn := newTestnet(ctx, t) peerA := tn.discovery(Parameters{ - PeersLimit: nodes, - DiscoveryInterval: time.Millisecond * 100, - AdvertiseInterval: -1, + PeersLimit: nodes, + DiscoveryRetryTimeout: time.Millisecond * 100, + AdvertiseInterval: -1, }) type peerUpdate struct { @@ -41,9 +41,9 @@ func TestDiscovery(t *testing.T) { discs := make([]*Discovery, nodes) for i := range discs { discs[i] = tn.discovery(Parameters{ - PeersLimit: 0, - DiscoveryInterval: -1, - AdvertiseInterval: time.Millisecond * 100, + PeersLimit: 0, + DiscoveryRetryTimeout: 
-1, + AdvertiseInterval: time.Millisecond * 100, }) select { diff --git a/share/availability/full/testing.go b/share/availability/full/testing.go index e1f62f3c62..7a6882b0b7 100644 --- a/share/availability/full/testing.go +++ b/share/availability/full/testing.go @@ -39,9 +39,9 @@ func Node(dn *availability_test.TestDagNet) *availability_test.TestNode { func TestAvailability(getter share.Getter) *ShareAvailability { disc := discovery.NewDiscovery(nil, routing.NewRoutingDiscovery( routinghelpers.Null{}), discovery.Parameters{ - PeersLimit: 10, - DiscoveryInterval: time.Second, - AdvertiseInterval: time.Second, + PeersLimit: 10, + DiscoveryRetryTimeout: time.Second, + AdvertiseInterval: time.Second, }, ) return NewShareAvailability(nil, getter, disc) diff --git a/share/getters/shrex_test.go b/share/getters/shrex_test.go index 65192ded8e..f35164d744 100644 --- a/share/getters/shrex_test.go +++ b/share/getters/shrex_test.go @@ -159,9 +159,9 @@ func testManager(ctx context.Context, host host.Host, headerSub libhead.Subscrib disc := discovery.NewDiscovery(nil, routingdisc.NewRoutingDiscovery(routinghelpers.Null{}), discovery.Parameters{ - PeersLimit: 10, - DiscoveryInterval: time.Second, - AdvertiseInterval: time.Second, + PeersLimit: 10, + DiscoveryRetryTimeout: time.Second, + AdvertiseInterval: time.Second, }, ) connGater, err := conngater.NewBasicConnectionGater(ds_sync.MutexWrap(datastore.NewMapDatastore())) diff --git a/share/p2p/peers/manager_test.go b/share/p2p/peers/manager_test.go index eecf2f3ac4..a0abce2e79 100644 --- a/share/p2p/peers/manager_test.go +++ b/share/p2p/peers/manager_test.go @@ -394,9 +394,9 @@ func TestIntegration(t *testing.T) { nw.Hosts()[0], routingdisc.NewRoutingDiscovery(router1), discovery.Parameters{ - PeersLimit: 0, - DiscoveryInterval: time.Second, - AdvertiseInterval: time.Second, + PeersLimit: 0, + DiscoveryRetryTimeout: time.Second, + AdvertiseInterval: time.Second, }, ) @@ -408,9 +408,9 @@ func TestIntegration(t *testing.T) { nw.Hosts()[1], routingdisc.NewRoutingDiscovery(router2), discovery.Parameters{ - PeersLimit: 10, - DiscoveryInterval: time.Second, - AdvertiseInterval: time.Second, + PeersLimit: 10, + DiscoveryRetryTimeout: time.Second, + AdvertiseInterval: time.Second, }, ) err = fnDisc.Start(ctx) @@ -466,9 +466,9 @@ func testManager(ctx context.Context, headerSub libhead.Subscriber[*header.Exten disc := discovery.NewDiscovery(nil, routingdisc.NewRoutingDiscovery(routinghelpers.Null{}), discovery.Parameters{ - PeersLimit: 0, - DiscoveryInterval: time.Second, - AdvertiseInterval: time.Second, + PeersLimit: 0, + DiscoveryRetryTimeout: time.Second, + AdvertiseInterval: time.Second, }) connGater, err := conngater.NewBasicConnectionGater(sync.MutexWrap(datastore.NewMapDatastore())) if err != nil { From c8fb086913865382ebbb6f375930f6be341eb612 Mon Sep 17 00:00:00 2001 From: Wondertan Date: Tue, 2 May 2023 15:08:02 +0200 Subject: [PATCH 45/56] fix comment --- share/availability/discovery/backoff.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/share/availability/discovery/backoff.go b/share/availability/discovery/backoff.go index 8ce00d07b8..982b23767c 100644 --- a/share/availability/discovery/backoff.go +++ b/share/availability/discovery/backoff.go @@ -11,8 +11,8 @@ import ( "github.com/libp2p/go-libp2p/p2p/discovery/backoff" ) -// gcInterval is a default period after which disconnected peers will be removed from cache const ( + // gcInterval is a default period after which disconnected peers will be removed from cache gcInterval = 
time.Minute // connectTimeout is the timeout used for dialing peers and discovering peer addresses. connectTimeout = time.Minute * 2 From 6cb5cdf24a258ed7c6446590311838fa77799a85 Mon Sep 17 00:00:00 2001 From: Wondertan Date: Tue, 2 May 2023 16:21:25 +0200 Subject: [PATCH 46/56] simplify the code further --- share/availability/discovery/backoff.go | 1 + share/availability/discovery/discovery.go | 102 +++++++--------------- 2 files changed, 32 insertions(+), 71 deletions(-) diff --git a/share/availability/discovery/backoff.go b/share/availability/discovery/backoff.go index 982b23767c..1f36425d10 100644 --- a/share/availability/discovery/backoff.go +++ b/share/availability/discovery/backoff.go @@ -91,6 +91,7 @@ func (b *backoffConnector) HasBackoff(p peer.ID) bool { return false } +// GC is a perpetual GCing loop. func (b *backoffConnector) GC(ctx context.Context) { ticker := time.NewTicker(gcInterval) for { diff --git a/share/availability/discovery/discovery.go b/share/availability/discovery/discovery.go index 86f690c4da..7127218685 100644 --- a/share/availability/discovery/discovery.go +++ b/share/availability/discovery/discovery.go @@ -2,7 +2,6 @@ package discovery import ( "context" - "sync" "sync/atomic" "time" @@ -70,9 +69,6 @@ type Discovery struct { connector *backoffConnector onUpdatedPeers OnUpdatedPeers - connectingLk sync.Mutex - connecting map[peer.ID]context.CancelFunc - triggerDisq chan struct{} cancel context.CancelFunc @@ -93,8 +89,7 @@ func NewDiscovery( disc: d, connector: newBackoffConnector(h, defaultBackoffFactory), onUpdatedPeers: func(peer.ID, bool) {}, - connecting: make(map[peer.ID]context.CancelFunc), - triggerDisq: make(chan struct{}, 1), + triggerDisq: make(chan struct{}), } } @@ -128,7 +123,7 @@ func (d *Discovery) triggerDiscovery() { // handlePeersFound receives peers and tries to establish a connection with them. // Peer will be added to PeerCache if connection succeeds. -func (d *Discovery) handlePeerFound(ctx context.Context, cancelFind context.CancelFunc, peer peer.AddrInfo) bool { +func (d *Discovery) handlePeerFound(ctx context.Context, peer peer.AddrInfo) bool { log := log.With("peer", peer.ID) switch { case peer.ID == d.host.ID(): @@ -145,34 +140,27 @@ func (d *Discovery) handlePeerFound(ctx context.Context, cancelFind context.Canc return false } - d.connectingLk.Lock() - if _, ok := d.connecting[peer.ID]; ok { - d.connectingLk.Unlock() - log.Debug("skip handle: connecting to the peer in another routine") - return false - } - d.connecting[peer.ID] = cancelFind - d.connectingLk.Unlock() - switch d.host.Network().Connectedness(peer.ID) { case network.Connected: - if added := d.addPeerToSet(peer.ID); !added { - return added - } - // we still have to backoff the connected peer - d.connector.Backoff(peer.ID) + d.connector.Backoff(peer.ID) // we still have to backoff the connected peer case network.NotConnected: err := d.connector.Connect(ctx, peer) if err != nil { - d.connectingLk.Lock() - delete(d.connecting, peer.ID) - d.connectingLk.Unlock() log.Debugw("unable to connect", "err", err) return false } - log.Debug("started connecting to the peer") + default: + panic("unknown connectedness") } + if !d.set.Add(peer.ID) { + log.Debugw("peer is already in discovery set", "peer", peer.ID) + return false + } + + d.onUpdatedPeers(peer.ID, true) + log.Debugw("added peer to set", "peer", peer.ID) + // tag to protect peer from being killed by ConnManager // NOTE: This is does not protect from remote killing the connection. 
// In the future, we should design a protocol that keeps bidirectional agreement on whether @@ -211,9 +199,8 @@ func (d *Discovery) ensurePeers(ctx context.Context) { if !ok { return } - evnt := e.(event.EvtPeerConnectednessChanged) - switch evnt.Connectedness { - case network.NotConnected: + + if evnt := e.(event.EvtPeerConnectednessChanged); evnt.Connectedness == network.NotConnected { if !d.set.Contains(evnt.Peer) { continue } @@ -228,8 +215,6 @@ func (d *Discovery) ensurePeers(ctx context.Context) { if d.set.Size() < d.set.Limit() { d.triggerDiscovery() } - case network.Connected: - d.addPeerToSet(evnt.Peer) } } } @@ -260,39 +245,6 @@ func (d *Discovery) ensurePeers(ctx context.Context) { } } -func (d *Discovery) addPeerToSet(peerID peer.ID) bool { - d.connectingLk.Lock() - cancelFind, ok := d.connecting[peerID] - d.connectingLk.Unlock() - if !ok { - return false - } - defer func() { - d.connectingLk.Lock() - delete(d.connecting, peerID) - d.connectingLk.Unlock() - }() - - if !d.set.Add(peerID) { - log.Debugw("peer is already in discovery set", "peer", peerID) - return false - } - - // and notify our subscribers - d.onUpdatedPeers(peerID, true) - log.Debugw("added peer to set", "peer", peerID) - - // first do Add and only after check the limit - // so that peer set represents the actual number of connections we made - // which can go slightly over peersLimit - if d.set.Size() >= d.set.Limit() { - log.Infow("soft peer limit reached", "count", d.set.Size()) - cancelFind() - } - - return true -} - func (d *Discovery) findPeers(ctx context.Context) bool { size := d.set.Size() want := d.set.Limit() - size @@ -300,7 +252,7 @@ func (d *Discovery) findPeers(ctx context.Context) bool { log.Debugw("reached soft peer limit, skipping discovery", "size", size) return true } - log.Infow("below soft peer limit, discovering peers", "remaining", want) + log.Infow("discovering peers", "want", want) // we use errgroup as it provide limits var wg errgroup.Group @@ -327,8 +279,7 @@ func (d *Discovery) findPeers(ctx context.Context) bool { drainChannel(ticker.C) select { case <-findCtx.Done(): - log.Debugw("found wanted peers", "wanted", want) - return int(amount.Load()) == want + return true case <-ticker.C: log.Warn("wasn't able to find new peers for long time") continue @@ -344,12 +295,21 @@ func (d *Discovery) findPeers(ctx context.Context) bool { log.Debug("find has been canceled, skip peer") return nil } - // pass the cancel so that we cancel FindPeers when we connected to enough peers - // we don't pass findCtx so that we don't cancel outgoing connections - if d.handlePeerFound(ctx, findCancel, peer) { - amount.Add(1) - log.Debugw("found peer", "peer", peer.ID, "found_amount", amount.Load()) + + // we don't pass findCtx so that we don't cancel in progress connections + // that are likely to be valuable + if !d.handlePeerFound(ctx, peer) { + return nil } + + amount := amount.Add(1) + log.Debugw("found peer", "peer", peer.ID, "found_amount", amount) + if int(amount) < want { + return nil + } + + log.Infow("discovered wanted peers", "amount", amount) + findCancel() return nil }) } From 0ebf3af51a0ea7e5599bd676b9f229ff2a9ec93f Mon Sep 17 00:00:00 2001 From: Wondertan Date: Tue, 2 May 2023 16:41:46 +0200 Subject: [PATCH 47/56] rework tests to use real swarm. 
Mocknet has some bug regarding connection closage that cause deadlock and flakes --- share/availability/discovery/discovery.go | 2 +- .../availability/discovery/discovery_test.go | 26 +++++++++---------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/share/availability/discovery/discovery.go b/share/availability/discovery/discovery.go index 7127218685..882d330022 100644 --- a/share/availability/discovery/discovery.go +++ b/share/availability/discovery/discovery.go @@ -157,7 +157,6 @@ func (d *Discovery) handlePeerFound(ctx context.Context, peer peer.AddrInfo) boo log.Debugw("peer is already in discovery set", "peer", peer.ID) return false } - d.onUpdatedPeers(peer.ID, true) log.Debugw("added peer to set", "peer", peer.ID) @@ -197,6 +196,7 @@ func (d *Discovery) ensurePeers(ctx context.Context) { return case e, ok := <-sub.Out(): if !ok { + log.Error("connection subscription was closed unexpectedly") return } diff --git a/share/availability/discovery/discovery_test.go b/share/availability/discovery/discovery_test.go index 44164f361b..011b4449a8 100644 --- a/share/availability/discovery/discovery_test.go +++ b/share/availability/discovery/discovery_test.go @@ -10,13 +10,14 @@ import ( "github.com/libp2p/go-libp2p/core/host" "github.com/libp2p/go-libp2p/core/peer" "github.com/libp2p/go-libp2p/p2p/discovery/routing" - mocknet "github.com/libp2p/go-libp2p/p2p/net/mock" + basic "github.com/libp2p/go-libp2p/p2p/host/basic" + swarmt "github.com/libp2p/go-libp2p/p2p/net/swarm/testing" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) func TestDiscovery(t *testing.T) { - const nodes = 30 // higher number brings higher coverage + const nodes = 10 // higher number brings higher coverage ctx, cancel := context.WithTimeout(context.Background(), time.Second*30) t.Cleanup(cancel) @@ -26,7 +27,7 @@ func TestDiscovery(t *testing.T) { peerA := tn.discovery(Parameters{ PeersLimit: nodes, DiscoveryRetryTimeout: time.Millisecond * 100, - AdvertiseInterval: -1, + AdvertiseInterval: -1, // we don't want to be found but only find }) type peerUpdate struct { @@ -77,15 +78,15 @@ func TestDiscovery(t *testing.T) { type testnet struct { ctx context.Context T *testing.T - net mocknet.Mocknet - bootstrapper peer.ID + bootstrapper peer.AddrInfo } func newTestnet(ctx context.Context, t *testing.T) *testnet { - net := mocknet.New() - hst, err := net.GenPeer() + swarm := swarmt.GenSwarm(t, swarmt.OptDisableTCP) + hst, err := basic.NewHost(swarm, &basic.HostOpts{}) require.NoError(t, err) + hst.Start() _, err = dht.New(ctx, hst, dht.Mode(dht.ModeServer), @@ -94,7 +95,7 @@ func newTestnet(ctx context.Context, t *testing.T) *testnet { ) require.NoError(t, err) - return &testnet{ctx: ctx, T: t, net: net, bootstrapper: hst.ID()} + return &testnet{ctx: ctx, T: t, bootstrapper: *host.InfoFromHost(hst)} } func (t *testnet) discovery(params Parameters) *Discovery { @@ -112,13 +113,12 @@ func (t *testnet) discovery(params Parameters) *Discovery { } func (t *testnet) peer() (host.Host, discovery.Discovery) { - hst, err := t.net.GenPeer() + swarm := swarmt.GenSwarm(t.T, swarmt.OptDisableTCP) + hst, err := basic.NewHost(swarm, &basic.HostOpts{}) require.NoError(t.T, err) + hst.Start() - err = t.net.LinkAll() - require.NoError(t.T, err) - - _, err = t.net.ConnectPeers(hst.ID(), t.bootstrapper) + err = hst.Connect(t.ctx, t.bootstrapper) require.NoError(t.T, err) dht, err := dht.New(t.ctx, hst, From 3ba19e9800d64074823edc8dc86e88d021032cb9 Mon Sep 17 00:00:00 2001 From: Wondertan Date: Tue, 2 May 
2023 16:59:29 +0200
Subject: [PATCH 48/56] ensurePeers should depend on global state rather than
 local

---
 share/availability/discovery/discovery.go | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/share/availability/discovery/discovery.go b/share/availability/discovery/discovery.go
index 882d330022..b83a065f1f 100644
--- a/share/availability/discovery/discovery.go
+++ b/share/availability/discovery/discovery.go
@@ -2,7 +2,6 @@ package discovery
 
 import (
 	"context"
-	"sync/atomic"
 	"time"
 
 	logging "github.com/ipfs/go-log/v2"
@@ -271,7 +270,6 @@ func (d *Discovery) findPeers(ctx context.Context) bool {
 	ticker := time.NewTicker(findPeersStuckWarnDelay)
 	defer ticker.Stop()
 
-	var amount atomic.Int32
 	for {
 		ticker.Reset(findPeersStuckWarnDelay)
 		// drain all previous ticks from channel
@@ -286,7 +284,7 @@ func (d *Discovery) findPeers(ctx context.Context) bool {
 		case p, ok := <-peers:
 			if !ok {
 				log.Debugw("discovery channel closed", "find_is_canceled", findCtx.Err() != nil)
-				return int(amount.Load()) == want
+				return d.set.Size() == d.set.Limit()
 			}
 
 			peer := p
@@ -302,13 +300,13 @@ func (d *Discovery) findPeers(ctx context.Context) bool {
 				return nil
 			}
 
-			amount := amount.Add(1)
-			log.Debugw("found peer", "peer", peer.ID, "found_amount", amount)
-			if int(amount) < want {
+			size := d.set.Size()
+			log.Debugw("found peer", "peer", peer.ID, "found_amount", size)
+			if size < d.set.Limit() {
 				return nil
 			}
 
-			log.Infow("discovered wanted peers", "amount", amount)
+			log.Infow("discovered wanted peers", "amount", size)
 			findCancel()
 			return nil
 		})

From d8b85d614b04dbdd0bb3259e30f000ada7b77a73 Mon Sep 17 00:00:00 2001
From: Wondertan
Date: Tue, 2 May 2023 17:08:40 +0200
Subject: [PATCH 49/56] int back to uint

---
 share/availability/discovery/discovery.go      |  6 +++---
 share/availability/discovery/discovery_test.go |  4 ++--
 share/availability/discovery/set.go            | 10 +++++-----
 share/availability/discovery/set_test.go       |  4 ++--
 4 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/share/availability/discovery/discovery.go b/share/availability/discovery/discovery.go
index b83a065f1f..d5b5deecc8 100644
--- a/share/availability/discovery/discovery.go
+++ b/share/availability/discovery/discovery.go
@@ -34,7 +34,7 @@ type Parameters struct {
 	// PeersLimit defines the soft limit of FNs to connect to via discovery.
 	// Set 0 to disable.
-	PeersLimit int
+	PeersLimit uint
 	// DiscoveryRetryTimeout is an interval between discovery attempts
 	// when we discover fewer than PeersLimit peers.
 	// Set -1 to disable. 
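[Reviewer aside] One caution on the int-to-uint move in the findPeers hunks below: unsigned subtraction does not go negative, it wraps. A tiny sketch with illustrative values:

package main

import "fmt"

func main() {
	var limit, size uint = 5, 6 // the set may go slightly over the soft limit

	fmt.Println(limit - size) // wraps to 18446744073709551615 on 64-bit, not -1

	// Comparisons remain safe, so guards should compare rather than subtract:
	if size >= limit {
		fmt.Println("reached soft peer limit")
	}
}

This is one reason the second hunk below relaxes == to >=: once the set can overshoot the soft limit, only an inequality check stays correct.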
@@ -256,7 +256,7 @@ func (d *Discovery) findPeers(ctx context.Context) bool { // we use errgroup as it provide limits var wg errgroup.Group // limit to minimize chances of overreaching the limit - wg.SetLimit(d.set.Limit()) + wg.SetLimit(int(d.set.Limit())) defer wg.Wait() //nolint:errcheck // stop discovery when we are done @@ -284,7 +284,7 @@ func (d *Discovery) findPeers(ctx context.Context) bool { case p, ok := <-peers: if !ok { log.Debugw("discovery channel closed", "find_is_canceled", findCtx.Err() != nil) - return d.set.Size() == d.set.Limit() + return d.set.Size() >= d.set.Limit() } peer := p diff --git a/share/availability/discovery/discovery_test.go b/share/availability/discovery/discovery_test.go index 011b4449a8..8de39433ab 100644 --- a/share/availability/discovery/discovery_test.go +++ b/share/availability/discovery/discovery_test.go @@ -56,7 +56,7 @@ func TestDiscovery(t *testing.T) { } } - assert.Equal(t, nodes, peerA.set.Size()) + assert.EqualValues(t, nodes, peerA.set.Size()) for _, disc := range discs { peerID := disc.host.ID() @@ -72,7 +72,7 @@ func TestDiscovery(t *testing.T) { } } - assert.Equal(t, 0, peerA.set.Size()) + assert.EqualValues(t, 0, peerA.set.Size()) } type testnet struct { diff --git a/share/availability/discovery/set.go b/share/availability/discovery/set.go index 8adb9a7b60..37b3851bdf 100644 --- a/share/availability/discovery/set.go +++ b/share/availability/discovery/set.go @@ -13,12 +13,12 @@ type limitedSet struct { lk sync.RWMutex ps map[peer.ID]struct{} - limit int + limit uint waitPeer chan peer.ID } // newLimitedSet constructs a set with the maximum peers amount. -func newLimitedSet(limit int) *limitedSet { +func newLimitedSet(limit uint) *limitedSet { ps := new(limitedSet) ps.ps = make(map[peer.ID]struct{}) ps.limit = limit @@ -33,14 +33,14 @@ func (ps *limitedSet) Contains(p peer.ID) bool { return ok } -func (ps *limitedSet) Limit() int { +func (ps *limitedSet) Limit() uint { return ps.limit } -func (ps *limitedSet) Size() int { +func (ps *limitedSet) Size() uint { ps.lk.RLock() defer ps.lk.RUnlock() - return len(ps.ps) + return uint(len(ps.ps)) } // Add attempts to add the given peer into the set. 
diff --git a/share/availability/discovery/set_test.go b/share/availability/discovery/set_test.go index 1507509732..d5113a2291 100644 --- a/share/availability/discovery/set_test.go +++ b/share/availability/discovery/set_test.go @@ -81,7 +81,7 @@ func TestSet_Size(t *testing.T) { set := newLimitedSet(2) set.Add(h1.ID()) set.Add(h2.ID()) - require.Equal(t, 2, set.Size()) + require.EqualValues(t, 2, set.Size()) set.Remove(h2.ID()) - require.Equal(t, 1, set.Size()) + require.EqualValues(t, 1, set.Size()) } From 8e2034227e6815ab00db7c44c70d6e07b67b3cac Mon Sep 17 00:00:00 2001 From: Vlad Date: Wed, 3 May 2023 08:22:54 +0300 Subject: [PATCH 50/56] apply review suggestions --- nodebuilder/tests/p2p_test.go | 1 - share/availability/discovery/backoff.go | 5 +--- share/availability/discovery/discovery.go | 26 ++++++++++--------- .../availability/discovery/discovery_test.go | 4 +-- share/availability/full/testing.go | 12 ++++----- share/getters/shrex_test.go | 5 ++-- share/p2p/peers/manager_test.go | 15 +++++------ 7 files changed, 30 insertions(+), 38 deletions(-) diff --git a/nodebuilder/tests/p2p_test.go b/nodebuilder/tests/p2p_test.go index c4d81a9939..39c3c985a2 100644 --- a/nodebuilder/tests/p2p_test.go +++ b/nodebuilder/tests/p2p_test.go @@ -215,6 +215,5 @@ func TestRestartNodeDiscovery(t *testing.T) { func setTimeInterval(cfg *nodebuilder.Config, interval time.Duration) { cfg.P2P.RoutingTableRefreshPeriod = interval - cfg.Share.Discovery.DiscoveryRetryTimeout = interval cfg.Share.Discovery.AdvertiseInterval = interval } diff --git a/share/availability/discovery/backoff.go b/share/availability/discovery/backoff.go index 1f36425d10..5cd20e1497 100644 --- a/share/availability/discovery/backoff.go +++ b/share/availability/discovery/backoff.go @@ -85,10 +85,7 @@ func (b *backoffConnector) HasBackoff(p peer.ID) bool { b.cacheLk.Lock() cache, ok := b.cacheData[p] b.cacheLk.Unlock() - if ok && time.Now().Before(cache.nexttry) { - return true - } - return false + return ok && time.Now().Before(cache.nexttry) } // GC is a perpetual GCing loop. diff --git a/share/availability/discovery/discovery.go b/share/availability/discovery/discovery.go index d5b5deecc8..6b6bc03fb3 100644 --- a/share/availability/discovery/discovery.go +++ b/share/availability/discovery/discovery.go @@ -23,6 +23,8 @@ const ( // events in libp2p eventbusBufSize = 32 + // findPeersStuckWarnDelay is the duration after which findPeers will log an error message to + // notify that it is stuck. findPeersStuckWarnDelay = time.Minute ) @@ -35,20 +37,20 @@ type Parameters struct { // PeersLimit defines the soft limit of FNs to connect to via discovery. // Set 0 to disable. PeersLimit uint - // DiscoveryRetryTimeout is an interval between discovery attempts - // when we discovered lower than PeersLimit peers. - // Set -1 to disable. - DiscoveryRetryTimeout time.Duration // AdvertiseInterval is a interval between advertising sessions. // Set -1 to disable. // NOTE: only full and bridge can advertise themselves. AdvertiseInterval time.Duration + // discoveryRetryTimeout is an interval between discovery attempts + // when we discovered lower than PeersLimit peers. + // Set -1 to disable. 
+ discoveryRetryTimeout time.Duration } func DefaultParameters() Parameters { return Parameters{ PeersLimit: 5, - DiscoveryRetryTimeout: time.Second * 1, + discoveryRetryTimeout: time.Second, AdvertiseInterval: time.Hour * 8, } } @@ -68,7 +70,7 @@ type Discovery struct { connector *backoffConnector onUpdatedPeers OnUpdatedPeers - triggerDisq chan struct{} + triggerDisc chan struct{} cancel context.CancelFunc } @@ -88,7 +90,7 @@ func NewDiscovery( disc: d, connector: newBackoffConnector(h, defaultBackoffFactory), onUpdatedPeers: func(peer.ID, bool) {}, - triggerDisq: make(chan struct{}), + triggerDisc: make(chan struct{}), } } @@ -115,7 +117,7 @@ func (d *Discovery) WithOnPeersUpdate(f OnUpdatedPeers) { func (d *Discovery) triggerDiscovery() { select { - case d.triggerDisq <- struct{}{}: + case d.triggerDisc <- struct{}{}: default: } } @@ -153,11 +155,11 @@ func (d *Discovery) handlePeerFound(ctx context.Context, peer peer.AddrInfo) boo } if !d.set.Add(peer.ID) { - log.Debugw("peer is already in discovery set", "peer", peer.ID) + log.Debug("peer is already in discovery set") return false } d.onUpdatedPeers(peer.ID, true) - log.Debugw("added peer to set", "peer", peer.ID) + log.Debug("added peer to set") // tag to protect peer from being killed by ConnManager // NOTE: This is does not protect from remote killing the connection. @@ -220,7 +222,7 @@ func (d *Discovery) ensurePeers(ctx context.Context) { }() go d.connector.GC(ctx) - t := time.NewTicker(d.params.DiscoveryRetryTimeout) + t := time.NewTicker(d.params.discoveryRetryTimeout) defer t.Stop() for { // drain all previous ticks from channel @@ -237,7 +239,7 @@ func (d *Discovery) ensurePeers(ctx context.Context) { } select { - case <-d.triggerDisq: + case <-d.triggerDisc: case <-ctx.Done(): return } diff --git a/share/availability/discovery/discovery_test.go b/share/availability/discovery/discovery_test.go index 8de39433ab..fd88a98586 100644 --- a/share/availability/discovery/discovery_test.go +++ b/share/availability/discovery/discovery_test.go @@ -26,7 +26,7 @@ func TestDiscovery(t *testing.T) { peerA := tn.discovery(Parameters{ PeersLimit: nodes, - DiscoveryRetryTimeout: time.Millisecond * 100, + discoveryRetryTimeout: time.Millisecond * 100, AdvertiseInterval: -1, // we don't want to be found but only find }) @@ -43,7 +43,7 @@ func TestDiscovery(t *testing.T) { for i := range discs { discs[i] = tn.discovery(Parameters{ PeersLimit: 0, - DiscoveryRetryTimeout: -1, + discoveryRetryTimeout: -1, AdvertiseInterval: time.Millisecond * 100, }) diff --git a/share/availability/full/testing.go b/share/availability/full/testing.go index 7a6882b0b7..df4561a8eb 100644 --- a/share/availability/full/testing.go +++ b/share/availability/full/testing.go @@ -37,12 +37,10 @@ func Node(dn *availability_test.TestDagNet) *availability_test.TestNode { } func TestAvailability(getter share.Getter) *ShareAvailability { - disc := discovery.NewDiscovery(nil, routing.NewRoutingDiscovery( - routinghelpers.Null{}), discovery.Parameters{ - PeersLimit: 10, - DiscoveryRetryTimeout: time.Second, - AdvertiseInterval: time.Second, - }, - ) + params := discovery.DefaultParameters() + params.AdvertiseInterval = time.Second + params.PeersLimit = 10 + disc := discovery.NewDiscovery(nil, + routing.NewRoutingDiscovery(routinghelpers.Null{}), params) return NewShareAvailability(nil, getter, disc) } diff --git a/share/getters/shrex_test.go b/share/getters/shrex_test.go index f35164d744..b93a40d488 100644 --- a/share/getters/shrex_test.go +++ b/share/getters/shrex_test.go @@ 
-159,9 +159,8 @@ func testManager(ctx context.Context, host host.Host, headerSub libhead.Subscrib disc := discovery.NewDiscovery(nil, routingdisc.NewRoutingDiscovery(routinghelpers.Null{}), discovery.Parameters{ - PeersLimit: 10, - DiscoveryRetryTimeout: time.Second, - AdvertiseInterval: time.Second, + PeersLimit: 10, + AdvertiseInterval: time.Second, }, ) connGater, err := conngater.NewBasicConnectionGater(ds_sync.MutexWrap(datastore.NewMapDatastore())) diff --git a/share/p2p/peers/manager_test.go b/share/p2p/peers/manager_test.go index a0abce2e79..804dd4b673 100644 --- a/share/p2p/peers/manager_test.go +++ b/share/p2p/peers/manager_test.go @@ -394,9 +394,8 @@ func TestIntegration(t *testing.T) { nw.Hosts()[0], routingdisc.NewRoutingDiscovery(router1), discovery.Parameters{ - PeersLimit: 0, - DiscoveryRetryTimeout: time.Second, - AdvertiseInterval: time.Second, + PeersLimit: 0, + AdvertiseInterval: time.Second, }, ) @@ -408,9 +407,8 @@ func TestIntegration(t *testing.T) { nw.Hosts()[1], routingdisc.NewRoutingDiscovery(router2), discovery.Parameters{ - PeersLimit: 10, - DiscoveryRetryTimeout: time.Second, - AdvertiseInterval: time.Second, + PeersLimit: 10, + AdvertiseInterval: time.Second, }, ) err = fnDisc.Start(ctx) @@ -466,9 +464,8 @@ func testManager(ctx context.Context, headerSub libhead.Subscriber[*header.Exten disc := discovery.NewDiscovery(nil, routingdisc.NewRoutingDiscovery(routinghelpers.Null{}), discovery.Parameters{ - PeersLimit: 0, - DiscoveryRetryTimeout: time.Second, - AdvertiseInterval: time.Second, + PeersLimit: 0, + AdvertiseInterval: time.Second, }) connGater, err := conngater.NewBasicConnectionGater(sync.MutexWrap(datastore.NewMapDatastore())) if err != nil { From 0b56f6756015c4beb8dd2148f97b505411f0be74 Mon Sep 17 00:00:00 2001 From: Vlad Date: Wed, 3 May 2023 08:34:20 +0300 Subject: [PATCH 51/56] rebase --- share/availability/discovery/discovery.go | 2 +- share/availability/discovery/metrics.go | 18 ++++++++---------- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/share/availability/discovery/discovery.go b/share/availability/discovery/discovery.go index 4879b85d8e..3cd6d4e80b 100644 --- a/share/availability/discovery/discovery.go +++ b/share/availability/discovery/discovery.go @@ -166,7 +166,7 @@ func (d *Discovery) handlePeerFound(ctx context.Context, peer peer.AddrInfo) boo return false } d.onUpdatedPeers(peer.ID, true) - d.metrics.observeHandlePeer(handlePeerConnect) + d.metrics.observeHandlePeer(handlePeerConnected) log.Debug("added peer to set") // tag to protect peer from being killed by ConnManager diff --git a/share/availability/discovery/metrics.go b/share/availability/discovery/metrics.go index 3b3597ac77..1e3b1ff67e 100644 --- a/share/availability/discovery/metrics.go +++ b/share/availability/discovery/metrics.go @@ -21,16 +21,14 @@ const ( findPeersCanceledByDiscoveryStopped findPeersCancelReason = "discovery_stopped" findPeersCanceledByExternalCancel findPeersCancelReason = "external_cancel" - handlePeerResultKey = "result" - handlePeerSkipSelf handlePeerResult = "skip_self" - handlePeerEmptyAddrs handlePeerResult = "skip_empty_addresses" - handlePeerEnoughPeers handlePeerResult = "skip_enough_peers" - handlePeerBackoff handlePeerResult = "skip_backoff" - handlePeerConnected handlePeerResult = "connected" - handlePeerInSet handlePeerResult = "in_set" - handlePeerConnect handlePeerResult = "connect" - handlePeerConnInProgress handlePeerResult = "conn_in_progress" - handlePeerConnErr handlePeerResult = "conn_err" + handlePeerResultKey = 
"result" + handlePeerSkipSelf handlePeerResult = "skip_self" + handlePeerEmptyAddrs handlePeerResult = "skip_empty_addresses" + handlePeerEnoughPeers handlePeerResult = "skip_enough_peers" + handlePeerBackoff handlePeerResult = "skip_backoff" + handlePeerConnected handlePeerResult = "connected" + handlePeerConnErr handlePeerResult = "conn_err" + handlePeerInSet handlePeerResult = "in_set" advertiseErrKey = "is_err" ) From 1512b1024913c3c989bc7f750f90642f99fe4c68 Mon Sep 17 00:00:00 2001 From: Vlad Date: Wed, 3 May 2023 12:49:56 +0300 Subject: [PATCH 52/56] rebase --- share/availability/discovery/discovery.go | 12 ++++++++++ share/availability/discovery/metrics.go | 28 +++++++++++------------ 2 files changed, 26 insertions(+), 14 deletions(-) diff --git a/share/availability/discovery/discovery.go b/share/availability/discovery/discovery.go index 0a2deb7858..1e94e5a5a9 100644 --- a/share/availability/discovery/discovery.go +++ b/share/availability/discovery/discovery.go @@ -85,6 +85,8 @@ type Discovery struct { triggerDisc chan struct{} + metrics *metrics + cancel context.CancelFunc } @@ -158,6 +160,7 @@ func (d *Discovery) Advertise(ctx context.Context) { defer timer.Stop() for { ttl, err := d.disc.Advertise(ctx, rendezvousPoint) + d.metrics.observeAdvertise(err) if err != nil { log.Debugf("Error advertising %s: %s", rendezvousPoint, err.Error()) if ctx.Err() != nil { @@ -283,12 +286,14 @@ func (d *Discovery) discover(ctx context.Context) bool { drainChannel(ticker.C) select { case <-findCtx.Done(): + d.metrics.observeFindPeers(ctx.Err() != nil, findCtx.Err() != nil) return true case <-ticker.C: log.Warn("wasn't able to find new peers for long time") continue case p, ok := <-peers: if !ok { + d.metrics.observeFindPeers(ctx.Err() != nil, findCtx.Err() != nil) log.Debugw("discovery channel closed", "find_is_canceled", findCtx.Err() != nil) return d.set.Size() >= d.set.Limit() } @@ -326,15 +331,19 @@ func (d *Discovery) handleDiscoveredPeer(ctx context.Context, peer peer.AddrInfo logger := log.With("peer", peer.ID) switch { case peer.ID == d.host.ID(): + d.metrics.observeHandlePeer(handlePeerSkipSelf) logger.Debug("skip handle: self discovery") return false case len(peer.Addrs) == 0: + d.metrics.observeHandlePeer(handlePeerEmptyAddrs) logger.Debug("skip handle: empty address list") return false case d.set.Size() >= d.set.Limit(): + d.metrics.observeHandlePeer(handlePeerEnoughPeers) logger.Debug("skip handle: enough peers found") return false case d.connector.HasBackoff(peer.ID): + d.metrics.observeHandlePeer(handlePeerBackoff) logger.Debug("skip handle: backoff") return false } @@ -345,6 +354,7 @@ func (d *Discovery) handleDiscoveredPeer(ctx context.Context, peer peer.AddrInfo case network.NotConnected: err := d.connector.Connect(ctx, peer) if err != nil { + d.metrics.observeHandlePeer(handlePeerConnErr) logger.Debugw("unable to connect", "err", err) return false } @@ -353,10 +363,12 @@ func (d *Discovery) handleDiscoveredPeer(ctx context.Context, peer peer.AddrInfo } if !d.set.Add(peer.ID) { + d.metrics.observeHandlePeer(handlePeerInSet) logger.Debug("peer is already in discovery set") return false } d.onUpdatedPeers(peer.ID, true) + d.metrics.observeHandlePeer(handlePeerConnected) logger.Debug("added peer to set") // tag to protect peer from being killed by ConnManager diff --git a/share/availability/discovery/metrics.go b/share/availability/discovery/metrics.go index 1e3b1ff67e..254a29b0c7 100644 --- a/share/availability/discovery/metrics.go +++ b/share/availability/discovery/metrics.go @@ 
-16,10 +16,10 @@ import ( const ( observeTimeout = 100 * time.Millisecond - findPeersCanceledReasonKey = "cancel_reason" - findPeersCanceledByEnoughPeers findPeersCancelReason = "enough_peers" - findPeersCanceledByDiscoveryStopped findPeersCancelReason = "discovery_stopped" - findPeersCanceledByExternalCancel findPeersCancelReason = "external_cancel" + discoveryStopReasonKey = "stop_reason" + discoveryCanceledByEnoughPeers discoveryStopReason = "enough_peers" + discoveryCanceledByNoAvailablePeers discoveryStopReason = "no_more_available_peers" + discoveryCanceledByExternalCancel discoveryStopReason = "external_cancel" handlePeerResultKey = "result" handlePeerSkipSelf handlePeerResult = "skip_self" @@ -37,13 +37,13 @@ var ( meter = global.MeterProvider().Meter("share_discovery") ) -type findPeersCancelReason string +type discoveryStopReason string type handlePeerResult string type metrics struct { peersAmount asyncint64.Gauge - findPeersResult syncint64.Counter // attributes: stop_reason[enough_peers,discovery_stopped,external_cancel] + discoveryResult syncint64.Counter // attributes: stop_reason[enough_peers,discovery_stopped,external_cancel] handlePeerResult syncint64.Counter // attributes: result advertise syncint64.Counter //attributes: is_err[yes/no] peerAdded syncint64.Counter @@ -74,7 +74,7 @@ func initMetrics(d *Discovery) (*metrics, error) { return nil, err } - handlePeerResult, err := meter.SyncInt64().Counter("discovery_handler_peer_result", + handlePeerResultCounter, err := meter.SyncInt64().Counter("discovery_handler_peer_result", instrument.WithDescription("result handling found peer")) if err != nil { return nil, err @@ -100,8 +100,8 @@ func initMetrics(d *Discovery) (*metrics, error) { metrics := &metrics{ peersAmount: peersAmount, - findPeersResult: findPeersResult, - handlePeerResult: handlePeerResult, + discoveryResult: findPeersResult, + handlePeerResult: handlePeerResultCounter, advertise: advertise, peerAdded: peerAdded, peerRemoved: peerRemoved, @@ -128,15 +128,15 @@ func (m *metrics) observeFindPeers(isGlobalCancel, isEnoughPeers bool) { ctx, cancel := context.WithTimeout(context.Background(), observeTimeout) defer cancel() - reason := findPeersCanceledByDiscoveryStopped + reason := discoveryCanceledByNoAvailablePeers switch { case isGlobalCancel: - reason = findPeersCanceledByExternalCancel + reason = discoveryCanceledByExternalCancel case isEnoughPeers: - reason = findPeersCanceledByEnoughPeers + reason = discoveryCanceledByEnoughPeers } - m.findPeersResult.Add(ctx, 1, - attribute.String(findPeersCanceledReasonKey, string(reason))) + m.discoveryResult.Add(ctx, 1, + attribute.String(discoveryStopReasonKey, string(reason))) } func (m *metrics) observeHandlePeer(result handlePeerResult) { From d498cf317138fb276e5ad4cd98a85d2c018b023a Mon Sep 17 00:00:00 2001 From: Vlad Date: Wed, 3 May 2023 12:56:49 +0300 Subject: [PATCH 53/56] rebase --- share/availability/discovery/discovery.go | 7 +++--- share/availability/discovery/metrics.go | 30 ++++++++--------------- 2 files changed, 14 insertions(+), 23 deletions(-) diff --git a/share/availability/discovery/discovery.go b/share/availability/discovery/discovery.go index 1e94e5a5a9..f0ee509f0b 100644 --- a/share/availability/discovery/discovery.go +++ b/share/availability/discovery/discovery.go @@ -286,16 +286,17 @@ func (d *Discovery) discover(ctx context.Context) bool { drainChannel(ticker.C) select { case <-findCtx.Done(): - d.metrics.observeFindPeers(ctx.Err() != nil, findCtx.Err() != nil) + d.metrics.observeFindPeers(true, 
d.set.Size() >= d.set.Limit()) return true case <-ticker.C: log.Warn("wasn't able to find new peers for long time") continue case p, ok := <-peers: if !ok { - d.metrics.observeFindPeers(ctx.Err() != nil, findCtx.Err() != nil) + isEnoughPeers := d.set.Size() >= d.set.Limit() + d.metrics.observeFindPeers(ctx.Err() != nil, isEnoughPeers) log.Debugw("discovery channel closed", "find_is_canceled", findCtx.Err() != nil) - return d.set.Size() >= d.set.Limit() + return isEnoughPeers } peer := p diff --git a/share/availability/discovery/metrics.go b/share/availability/discovery/metrics.go index 254a29b0c7..8f5b03efa2 100644 --- a/share/availability/discovery/metrics.go +++ b/share/availability/discovery/metrics.go @@ -16,10 +16,8 @@ import ( const ( observeTimeout = 100 * time.Millisecond - discoveryStopReasonKey = "stop_reason" - discoveryCanceledByEnoughPeers discoveryStopReason = "enough_peers" - discoveryCanceledByNoAvailablePeers discoveryStopReason = "no_more_available_peers" - discoveryCanceledByExternalCancel discoveryStopReason = "external_cancel" + discoveryEnoughPeersKey = "enough_peers" + discoveryFindCanceledKey = "is_canceled" handlePeerResultKey = "result" handlePeerSkipSelf handlePeerResult = "skip_self" @@ -30,22 +28,20 @@ const ( handlePeerConnErr handlePeerResult = "conn_err" handlePeerInSet handlePeerResult = "in_set" - advertiseErrKey = "is_err" + advertiseFailedKey = "failed" ) var ( meter = global.MeterProvider().Meter("share_discovery") ) -type discoveryStopReason string - type handlePeerResult string type metrics struct { peersAmount asyncint64.Gauge - discoveryResult syncint64.Counter // attributes: stop_reason[enough_peers,discovery_stopped,external_cancel] - handlePeerResult syncint64.Counter // attributes: result - advertise syncint64.Counter //attributes: is_err[yes/no] + discoveryResult syncint64.Counter // attributes: enough_peers[bool],is_canceled[bool] + handlePeerResult syncint64.Counter // attributes: result[string] + advertise syncint64.Counter // attributes: failed[bool] peerAdded syncint64.Counter peerRemoved syncint64.Counter } @@ -121,22 +117,16 @@ func initMetrics(d *Discovery) (*metrics, error) { return metrics, nil } -func (m *metrics) observeFindPeers(isGlobalCancel, isEnoughPeers bool) { +func (m *metrics) observeFindPeers(canceled, isEnoughPeers bool) { if m == nil { return } ctx, cancel := context.WithTimeout(context.Background(), observeTimeout) defer cancel() - reason := discoveryCanceledByNoAvailablePeers - switch { - case isGlobalCancel: - reason = discoveryCanceledByExternalCancel - case isEnoughPeers: - reason = discoveryCanceledByEnoughPeers - } m.discoveryResult.Add(ctx, 1, - attribute.String(discoveryStopReasonKey, string(reason))) + attribute.Bool(discoveryFindCanceledKey, canceled), + attribute.Bool(discoveryEnoughPeersKey, isEnoughPeers)) } func (m *metrics) observeHandlePeer(result handlePeerResult) { @@ -158,7 +148,7 @@ func (m *metrics) observeAdvertise(err error) { defer cancel() m.advertise.Add(ctx, 1, - attribute.Bool(advertiseErrKey, err != nil)) + attribute.Bool(advertiseFailedKey, err != nil)) } func (m *metrics) observeOnPeersUpdate(_ peer.ID, isAdded bool) { From 6cb5cf9f994c6a4c9dad9efa592633a959facf07 Mon Sep 17 00:00:00 2001 From: Vlad Date: Wed, 3 May 2023 12:58:30 +0300 Subject: [PATCH 54/56] rebase --- nodebuilder/share/opts.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nodebuilder/share/opts.go b/nodebuilder/share/opts.go index 027911215e..cba949882b 100644 --- a/nodebuilder/share/opts.go +++
b/nodebuilder/share/opts.go @@ -1,8 +1,8 @@ package share import ( - "github.com/celestiaorg/celestia-node/share/getters" disc "github.com/celestiaorg/celestia-node/share/availability/discovery" + "github.com/celestiaorg/celestia-node/share/getters" "github.com/celestiaorg/celestia-node/share/p2p/peers" "github.com/celestiaorg/celestia-node/share/p2p/shrexeds" "github.com/celestiaorg/celestia-node/share/p2p/shrexnd" From 6bfcb8a7872da66ad12dfb6613858c1f8570487c Mon Sep 17 00:00:00 2001 From: Vlad Date: Wed, 3 May 2023 16:28:22 +0300 Subject: [PATCH 55/56] - apply review suggestions - add stuck counter --- share/availability/discovery/discovery.go | 3 ++- share/availability/discovery/metrics.go | 22 ++++++++++++++++++++-- share/p2p/metrics.go | 3 ++- 3 files changed, 24 insertions(+), 4 deletions(-) diff --git a/share/availability/discovery/discovery.go b/share/availability/discovery/discovery.go index f0ee509f0b..2656c93fc2 100644 --- a/share/availability/discovery/discovery.go +++ b/share/availability/discovery/discovery.go @@ -286,9 +286,10 @@ func (d *Discovery) discover(ctx context.Context) bool { drainChannel(ticker.C) select { case <-findCtx.Done(): - d.metrics.observeFindPeers(true, d.set.Size() >= d.set.Limit()) + d.metrics.observeFindPeers(true, true) return true case <-ticker.C: + d.metrics.observeDiscoveryStuck() log.Warn("wasn't able to find new peers for long time") continue case p, ok := <-peers: diff --git a/share/availability/discovery/metrics.go b/share/availability/discovery/metrics.go index 8f5b03efa2..4c8d1c6e55 100644 --- a/share/availability/discovery/metrics.go +++ b/share/availability/discovery/metrics.go @@ -40,6 +40,7 @@ type handlePeerResult string type metrics struct { peersAmount asyncint64.Gauge discoveryResult syncint64.Counter // attributes: enough_peers[bool],is_canceled[bool] + discoveryStuck syncint64.Counter handlePeerResult syncint64.Counter // attributes: result[string] advertise syncint64.Counter // attributes: failed[bool] peerAdded syncint64.Counter @@ -64,12 +65,18 @@ func initMetrics(d *Discovery) (*metrics, error) { return nil, err } - findPeersResult, err := meter.SyncInt64().Counter("discovery_find_peers_result", + discoveryResult, err := meter.SyncInt64().Counter("discovery_find_peers_result", instrument.WithDescription("result of find peers run")) if err != nil { return nil, err } + discoveryStuck, err := meter.SyncInt64().Counter("discovery_lookup_is_stuck", + instrument.WithDescription("indicates discovery wasn't able to find peers for more than 1 min")) + if err != nil { + return nil, err + } + handlePeerResultCounter, err := meter.SyncInt64().Counter("discovery_handler_peer_result", instrument.WithDescription("result handling found peer")) if err != nil { @@ -96,7 +103,8 @@ func initMetrics(d *Discovery) (*metrics, error) { metrics := &metrics{ peersAmount: peersAmount, - discoveryResult: findPeersResult, + discoveryResult: discoveryResult, + discoveryStuck: discoveryStuck, handlePeerResult: handlePeerResultCounter, advertise: advertise, peerAdded: peerAdded, @@ -164,3 +172,13 @@ func (m *metrics) observeOnPeersUpdate(_ peer.ID, isAdded bool) { } m.peerRemoved.Add(ctx, 1) } + +func (m *metrics) observeDiscoveryStuck() { + if m == nil { + return + } + ctx, cancel := context.WithTimeout(context.Background(), observeTimeout) + defer cancel() + + m.discoveryStuck.Add(ctx, 1) +} diff --git a/share/p2p/metrics.go b/share/p2p/metrics.go index 4b94d8c8d5..8f3e813c8f 100644 --- a/share/p2p/metrics.go +++ b/share/p2p/metrics.go @@ -30,7 +30,8 @@ 
type Metrics struct { totalRequestCounter syncint64.Counter } -// ObserveRequests increments the total number of requests sent with the given status as an attribute. +// ObserveRequests increments the total number of requests sent with the given status as an +// attribute. func (m *Metrics) ObserveRequests(count int64, status status) { if m == nil { return From 894ee4efa0701b85038f52ec25306867d81b3a8b Mon Sep 17 00:00:00 2001 From: Vlad Date: Tue, 9 May 2023 12:51:59 +0300 Subject: [PATCH 56/56] use user context for metrics observation --- share/availability/discovery/discovery.go | 22 +++++++-------- share/availability/discovery/metrics.go | 34 +++++++++++------------ 2 files changed, 28 insertions(+), 28 deletions(-) diff --git a/share/availability/discovery/discovery.go b/share/availability/discovery/discovery.go index 2656c93fc2..2dc6fb2d73 100644 --- a/share/availability/discovery/discovery.go +++ b/share/availability/discovery/discovery.go @@ -160,7 +160,7 @@ func (d *Discovery) Advertise(ctx context.Context) { defer timer.Stop() for { ttl, err := d.disc.Advertise(ctx, rendezvousPoint) - d.metrics.observeAdvertise(err) + d.metrics.observeAdvertise(ctx, err) if err != nil { log.Debugf("Error advertising %s: %s", rendezvousPoint, err.Error()) if ctx.Err() != nil { @@ -286,16 +286,16 @@ func (d *Discovery) discover(ctx context.Context) bool { drainChannel(ticker.C) select { case <-findCtx.Done(): - d.metrics.observeFindPeers(true, true) + d.metrics.observeFindPeers(ctx, true, true) return true case <-ticker.C: - d.metrics.observeDiscoveryStuck() + d.metrics.observeDiscoveryStuck(ctx) log.Warn("wasn't able to find new peers for long time") continue case p, ok := <-peers: if !ok { isEnoughPeers := d.set.Size() >= d.set.Limit() - d.metrics.observeFindPeers(ctx.Err() != nil, isEnoughPeers) + d.metrics.observeFindPeers(ctx, ctx.Err() != nil, isEnoughPeers) log.Debugw("discovery channel closed", "find_is_canceled", findCtx.Err() != nil) return isEnoughPeers } @@ -333,19 +333,19 @@ func (d *Discovery) handleDiscoveredPeer(ctx context.Context, peer peer.AddrInfo logger := log.With("peer", peer.ID) switch { case peer.ID == d.host.ID(): - d.metrics.observeHandlePeer(handlePeerSkipSelf) + d.metrics.observeHandlePeer(ctx, handlePeerSkipSelf) logger.Debug("skip handle: self discovery") return false case len(peer.Addrs) == 0: - d.metrics.observeHandlePeer(handlePeerEmptyAddrs) + d.metrics.observeHandlePeer(ctx, handlePeerEmptyAddrs) logger.Debug("skip handle: empty address list") return false case d.set.Size() >= d.set.Limit(): - d.metrics.observeHandlePeer(handlePeerEnoughPeers) + d.metrics.observeHandlePeer(ctx, handlePeerEnoughPeers) logger.Debug("skip handle: enough peers found") return false case d.connector.HasBackoff(peer.ID): - d.metrics.observeHandlePeer(handlePeerBackoff) + d.metrics.observeHandlePeer(ctx, handlePeerBackoff) logger.Debug("skip handle: backoff") return false } @@ -356,7 +356,7 @@ func (d *Discovery) handleDiscoveredPeer(ctx context.Context, peer peer.AddrInfo case network.NotConnected: err := d.connector.Connect(ctx, peer) if err != nil { - d.metrics.observeHandlePeer(handlePeerConnErr) + d.metrics.observeHandlePeer(ctx, handlePeerConnErr) logger.Debugw("unable to connect", "err", err) return false } @@ -365,12 +365,12 @@ func (d *Discovery) handleDiscoveredPeer(ctx context.Context, peer peer.AddrInfo } if !d.set.Add(peer.ID) { - d.metrics.observeHandlePeer(handlePeerInSet) + d.metrics.observeHandlePeer(ctx, handlePeerInSet) logger.Debug("peer is already in discovery set") 
return false } d.onUpdatedPeers(peer.ID, true) - d.metrics.observeHandlePeer(handlePeerConnected) + d.metrics.observeHandlePeer(ctx, handlePeerConnected) logger.Debug("added peer to set") // tag to protect peer from being killed by ConnManager diff --git a/share/availability/discovery/metrics.go b/share/availability/discovery/metrics.go index 4c8d1c6e55..9edd364185 100644 --- a/share/availability/discovery/metrics.go +++ b/share/availability/discovery/metrics.go @@ -3,7 +3,6 @@ package discovery import ( "context" "fmt" - "time" "github.com/libp2p/go-libp2p/core/peer" "go.opentelemetry.io/otel/attribute" @@ -14,8 +13,6 @@ import ( const ( - observeTimeout = 100 * time.Millisecond - discoveryEnoughPeersKey = "enough_peers" discoveryFindCanceledKey = "is_canceled" @@ -125,35 +122,38 @@ func initMetrics(d *Discovery) (*metrics, error) { return metrics, nil } -func (m *metrics) observeFindPeers(canceled, isEnoughPeers bool) { +func (m *metrics) observeFindPeers(ctx context.Context, canceled, isEnoughPeers bool) { if m == nil { return } - ctx, cancel := context.WithTimeout(context.Background(), observeTimeout) - defer cancel() + if ctx.Err() != nil { + ctx = context.Background() + } m.discoveryResult.Add(ctx, 1, attribute.Bool(discoveryFindCanceledKey, canceled), attribute.Bool(discoveryEnoughPeersKey, isEnoughPeers)) } -func (m *metrics) observeHandlePeer(result handlePeerResult) { +func (m *metrics) observeHandlePeer(ctx context.Context, result handlePeerResult) { if m == nil { return } - ctx, cancel := context.WithTimeout(context.Background(), observeTimeout) - defer cancel() + if ctx.Err() != nil { + ctx = context.Background() + } m.handlePeerResult.Add(ctx, 1, attribute.String(handlePeerResultKey, string(result))) } -func (m *metrics) observeAdvertise(err error) { +func (m *metrics) observeAdvertise(ctx context.Context, err error) { if m == nil { return } - ctx, cancel := context.WithTimeout(context.Background(), observeTimeout) - defer cancel() + if ctx.Err() != nil { + ctx = context.Background() + } m.advertise.Add(ctx, 1, attribute.Bool(advertiseFailedKey, err != nil)) @@ -163,8 +163,7 @@ func (m *metrics) observeOnPeersUpdate(_ peer.ID, isAdded bool) { if m == nil { return } - ctx, cancel := context.WithTimeout(context.Background(), observeTimeout) - defer cancel() + ctx := context.Background() if isAdded { m.peerAdded.Add(ctx, 1) @@ -173,12 +172,13 @@ func (m *metrics) observeOnPeersUpdate(_ peer.ID, isAdded bool) { m.peerRemoved.Add(ctx, 1) } -func (m *metrics) observeDiscoveryStuck() { +func (m *metrics) observeDiscoveryStuck(ctx context.Context) { if m == nil { return } - ctx, cancel := context.WithTimeout(context.Background(), observeTimeout) - defer cancel() + if ctx.Err() != nil { + ctx = context.Background() + } m.discoveryStuck.Add(ctx, 1) }
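The pattern PATCH 55 and PATCH 56 settle on is worth spelling out: the metrics wrapper is nil-safe (a Discovery constructed without telemetry carries a nil *metrics, so every observe call degrades to a no-op), and each observe method records against the caller's context but falls back to context.Background() once that context is canceled, so observations made during shutdown still reach the backend. Below is a minimal, self-contained Go sketch of that pattern, assuming a toy counter in place of the OpenTelemetry instruments used in the real code; the counter interface and printCounter stub are illustrative stand-ins, not celestia-node or OpenTelemetry API.

package main

import (
	"context"
	"fmt"
)

// counter stands in for an OpenTelemetry syncint64.Counter (illustrative stub).
type counter interface {
	Add(ctx context.Context, incr int64)
}

// printCounter is a toy counter that just prints observations.
type printCounter struct{ name string }

func (p printCounter) Add(ctx context.Context, incr int64) {
	fmt.Printf("%s += %d (ctx err: %v)\n", p.name, incr, ctx.Err())
}

// metrics is nil-safe: a Discovery built without telemetry carries a nil
// *metrics, and every observe method becomes a no-op.
type metrics struct {
	discoveryStuck counter
}

// observeDiscoveryStuck mirrors the final patch: reuse the caller's context
// while it is alive, but fall back to context.Background() once it has been
// canceled, so terminal observations are not silently dropped.
func (m *metrics) observeDiscoveryStuck(ctx context.Context) {
	if m == nil {
		return // telemetry disabled
	}
	if ctx.Err() != nil {
		ctx = context.Background()
	}
	m.discoveryStuck.Add(ctx, 1)
}

func main() {
	var disabled *metrics
	disabled.observeDiscoveryStuck(context.Background()) // safe no-op on a nil receiver

	enabled := &metrics{discoveryStuck: printCounter{name: "discovery_lookup_is_stuck"}}
	ctx, cancel := context.WithCancel(context.Background())
	cancel()                           // simulate the node shutting down mid-observation
	enabled.observeDiscoveryStuck(ctx) // still recorded, against Background
}

Compared to the observeTimeout approach the earlier patches used, this keeps the hot path free of per-observation timers while still guaranteeing the SDK a usable context.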