p2p: attempt to fill full outbound connection slots with peers that support tx relay #28538

Draft: wants to merge 3 commits into master
5 changes: 3 additions & 2 deletions src/net.cpp
@@ -2347,12 +2347,13 @@ void CConnman::StartExtraBlockRelayPeers()
}

// Return the number of peers we have over our outbound connection limit
// Returns 0 if we are at the limit, and a negative number if below.
// Exclude peers that are marked for disconnect, or are going to be
// disconnected soon (e.g. ADDR_FETCH and FEELER)
// Also exclude peers that haven't finished initial connection handshake yet
// (so that we don't decide we're over our desired connection limit, and then
// evict some peer that has finished the handshake)
int CConnman::GetExtraFullOutboundCount() const
int CConnman::GetFullOutboundDelta() const
{
int full_outbound_peers = 0;
{
@@ -2363,7 +2364,7 @@ int CConnman::GetExtraFullOutboundCount() const
}
}
}
return std::max(full_outbound_peers - m_max_outbound_full_relay, 0);
return full_outbound_peers - m_max_outbound_full_relay;
}

int CConnman::GetExtraBlockRelayCount() const
7 changes: 4 additions & 3 deletions src/net.h
@@ -1124,7 +1124,7 @@ class CConnman
void PushMessage(CNode* pnode, CSerializedNetMsg&& msg) EXCLUSIVE_LOCKS_REQUIRED(!m_total_bytes_sent_mutex);

using NodeFn = std::function<void(CNode*)>;
void ForEachNode(const NodeFn& func)
void ForEachFullyConnectedNode(const NodeFn& func)
{
LOCK(m_nodes_mutex);
for (auto&& node : m_nodes) {
@@ -1133,7 +1133,7 @@
}
};

void ForEachNode(const NodeFn& func) const
void ForEachFullyConnectedNode(const NodeFn& func) const
{
LOCK(m_nodes_mutex);
for (auto&& node : m_nodes) {
@@ -1173,7 +1173,8 @@ class CConnman
// return a value less than (num_outbound_connections - num_outbound_slots)
// in cases where some outbound connections are not yet fully connected, or
// not yet fully disconnected.
int GetExtraFullOutboundCount() const;
// Returns 0 if we are at the target, and a negative number if below.
int GetFullOutboundDelta() const;
// Count the number of block-relay-only peers we have over our limit.
int GetExtraBlockRelayCount() const;

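For orientation, a minimal sketch of how a caller can consume the new signed delta (hypothetical, mirroring the net_processing.cpp changes below rather than quoting them):

    // Interpreting GetFullOutboundDelta() at a call site:
    int delta{m_connman.GetFullOutboundDelta()};
    if (delta > 0) {
        // Over target: evict an extra outbound-full-relay peer.
    } else if (delta == 0) {
        // Exactly at target: consider replacing a peer that does not
        // participate in tx relay.
    }
    // delta < 0: below target; the connection opener will add peers.
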
39 changes: 30 additions & 9 deletions src/net_processing.cpp
@@ -2020,10 +2020,10 @@ void PeerManagerImpl::NewPoWValidBlock(const CBlockIndex *pindex, const std::sha
m_most_recent_block_txs = std::move(most_recent_block_txs);
}

m_connman.ForEachNode([this, pindex, &lazy_ser, &hashBlock](CNode* pnode) EXCLUSIVE_LOCKS_REQUIRED(::cs_main) {
m_connman.ForEachFullyConnectedNode([this, pindex, &lazy_ser, &hashBlock](CNode* pnode) EXCLUSIVE_LOCKS_REQUIRED(::cs_main) {
AssertLockHeld(::cs_main);

if (pnode->GetCommonVersion() < INVALID_CB_NO_BAN_VERSION || pnode->fDisconnect)
if (pnode->GetCommonVersion() < INVALID_CB_NO_BAN_VERSION)
return;
ProcessBlockAvailability(pnode->GetId());
CNodeState &state = *State(pnode->GetId());
@@ -5146,8 +5146,8 @@ void PeerManagerImpl::EvictExtraOutboundPeers(std::chrono::seconds now)
if (m_connman.GetExtraBlockRelayCount() > 0) {
std::pair<NodeId, std::chrono::seconds> youngest_peer{-1, 0}, next_youngest_peer{-1, 0};

m_connman.ForEachNode([&](CNode* pnode) {
if (!pnode->IsBlockOnlyConn() || pnode->fDisconnect) return;
m_connman.ForEachFullyConnectedNode([&](CNode* pnode) {
if (!pnode->IsBlockOnlyConn()) return;
if (pnode->GetId() > youngest_peer.first) {
next_youngest_peer = youngest_peer;
youngest_peer.first = pnode->GetId();
@@ -5183,7 +5183,8 @@ void PeerManagerImpl::EvictExtraOutboundPeers(std::chrono::seconds now)
}

// Check whether we have too many outbound-full-relay peers
if (m_connman.GetExtraFullOutboundCount() > 0) {
int full_outbound_delta{m_connman.GetFullOutboundDelta()};
if (full_outbound_delta > 0) {
// If we have more outbound-full-relay peers than we target, disconnect one.
// Pick the outbound-full-relay peer that least recently announced
// us a new block, with ties broken by choosing the more recent
@@ -5193,12 +5194,11 @@ void PeerManagerImpl::EvictExtraOutboundPeers(std::chrono::seconds now)
NodeId worst_peer = -1;
int64_t oldest_block_announcement = std::numeric_limits<int64_t>::max();

m_connman.ForEachNode([&](CNode* pnode) EXCLUSIVE_LOCKS_REQUIRED(::cs_main, m_connman.GetNodesMutex()) {
m_connman.ForEachFullyConnectedNode([&](CNode* pnode) EXCLUSIVE_LOCKS_REQUIRED(::cs_main, m_connman.GetNodesMutex()) {
AssertLockHeld(::cs_main);

// Only consider outbound-full-relay peers that are not already
// marked for disconnection
if (!pnode->IsFullOutboundConn() || pnode->fDisconnect) return;
// Only consider outbound-full-relay peers
if (!pnode->IsFullOutboundConn()) return;
CNodeState *state = State(pnode->GetId());
if (state == nullptr) return; // shouldn't be possible, but just in case
Member (on commit ad48667):

might update this too, to "if (!Assume(state)) return;", and drop the comment if you feel like it :)

Contributor Author:

I think I'd prefer not to do it here; for my taste it's not related closely enough to the core of this PR.

// Don't evict our protected peers
@@ -5241,6 +5241,27 @@ void PeerManagerImpl::EvictExtraOutboundPeers(std::chrono::seconds now)
}
}
}
// If we are at the max full outbound limit but not above, check if any peers
// don't support tx relay (they might be in -blocksonly mode) - if so,
// evict the newest of them (highest Node Id) so we can fill the slot with a tx-relaying outbound peer.
// This is not done if we are in -blocksonly mode ourselves or still in IBD,
// because then we don't care if our peer supports tx relay.
else if (full_outbound_delta == 0 && !m_opts.ignore_incoming_txs && !m_chainman.IsInitialBlockDownload()) {
Member (on commit 1393bf6):

not super clear why we won't do this if full_outbound_delta > 0?
Maybe a comment explaining why this could be possible in the first place could help (probably above the first if block).
Even now it's hard to tell if it's possible to be over the limit most of the time, let alone if we add some new complexity for being over the limit more frequently (for some security reason or whatever).

Contributor Author (mzumsande):

Hmm, I think the main reason was to avoid interaction with the existing extra-full-outbound eviction.
Which raises the question whether there should be any interaction?
Should nodes that are protected from that (either because m_protect is true or because they are the only ones for their network) also be made exempt from this new disconnection due to not supporting tx relay?
I'm not really sure about that: it would be unfortunate if the non-tx-relaying outbound peer were the only one with the best chain, but how realistic would such a scenario be?

sr-gi (Member), Jan 5, 2024:

> not super clear why we won't do this if full_outbound_delta > 0?
>
> Which raises the question whether there should be any interaction?

I think this would create a weird interaction. The current approach of this PR is trying to make the 8 full-outbound connections relay transactions, given we already have blocks-only connections, so we should not waste full-outbound slots for this purpose. However, our full-outbound connection protection criteria is based on whether they have provided valid blocks with enough accumulated work and it has nothing to do with how good they are at providing transactions (there is also the recently added network diversity criteria, more about that later).

This raises the question of what we value the most, having 8 full-outbound peers relaying transactions or our protected peers (and under what criteria). If the former is valued the most, we could merge this under full_outbound_delta > 0 but pick a node not taking the protection criteria into account (which raises the question of how useful the protection is to start with). If the latter is prioritized, then we exclude the protected peers first and then run the eviction over the remaining. This slightly changes the goal of the PR, given we may not achieve 8 transaction-relaying outbound peers. Finally, there are the additional protected peers for network diversity. Depending on how this is treated we may end up with an even smaller number of full-outbounds relaying transactions (8 - 4 - n where n represents the number of networks we support).

mzumsande (Contributor Author), Jan 5, 2024:

Yes, I agree - I think that there are colliding goals. My thinking is as follows:

  1. The m_protect protection attempts to prevent pathological situations where we wouldn't have enough peers with the honest best chain (that shouldn't happen normally even without the protection, but would be very serious if they did)
  2. The network-specific protection aims to be connected to all networks we support, to help the interconnection, and also to reduce the risk of eclipse attacks. Ideally, that means relaying both blocks and transactions from one network to another.
  3. The added code attempts to make sure we have as many full-outbound peers actually participating in tx relay as possible, mostly for selfish reasons (e.g. privacy, fee estimation)

These goals conflict. I'm pretty confident that 3) should take precedence over 2), because the network-specific protection isn't really about this particular peer, we just want any connection to that network, and would be happy to exchange a current peer that doesn't support tx-relay with a tx-relaying peer from that same network.

I'm kind of torn between 1) and 3). It would be easy to exempt non-tx-relaying m_protect peers from disconnection, but that would mean that we could end up with up to 4 of these peers permanently.
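
A minimal sketch of what that exemption could look like (hypothetical, not code from this PR; it assumes cs_main is held so State() can be consulted, mirroring the block-announcement eviction above):

    // Variant of the new eviction loop that also skips m_protect peers.
    std::optional<NodeId> node_to_evict;
    m_connman.ForEachFullyConnectedNode([&](CNode* node) EXCLUSIVE_LOCKS_REQUIRED(::cs_main) {
        AssertLockHeld(::cs_main);
        if (!node->IsFullOutboundConn() || node->m_relays_txs) return;
        CNodeState* state = State(node->GetId());
        // Exempt peers protected by the stale-tip eviction logic.
        if (state == nullptr || state->m_chain_sync.m_protect) return;
        if (!node_to_evict.has_value() || *node_to_evict < node->GetId()) {
            node_to_evict = node->GetId();
        }
    });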

Contributor:

> It would be easy to exempt non-tx-relaying m_protect peers from disconnection, but that would mean that we could end up with up to 4 of these peers permanently.

commit message where m_protect was introduced says "we pick 4 of our outbound peers and do not subject them to this logic, to be more conservative. We don't wish to permit temporary network issues (or an attacker) to excessively disrupt network topology."

Wondering how someone can disrupt network topology with this, and if it's still relevant.

std::optional<NodeId> node_to_evict;
m_connman.ForEachFullyConnectedNode([&node_to_evict](CNode* node) {
if (!node->IsFullOutboundConn() || node->m_relays_txs) return;
if (!node_to_evict.has_value() || *node_to_evict < node->GetId()) {
node_to_evict = node->GetId();
}
});
if (node_to_evict.has_value()) {
LogPrint(BCLog::NET, "disconnecting full outbound peer not participating in tx relay: peer=%d\n", *node_to_evict);
m_connman.ForNode(*node_to_evict, [](CNode* node) {
node->fDisconnect = true;
return true;
});
}
}
}

void PeerManagerImpl::CheckForStaleTipAndEvictPeers()
4 changes: 2 additions & 2 deletions src/test/fuzz/connman.cpp
@@ -79,7 +79,7 @@ FUZZ_TARGET(connman, .init = initialize_connman)
connman.DisconnectNode(random_subnet);
},
[&] {
connman.ForEachNode([](auto) {});
connman.ForEachFullyConnectedNode([](auto) {});
},
[&] {
(void)connman.ForNode(fuzzed_data_provider.ConsumeIntegral<NodeId>(), [&](auto) { return fuzzed_data_provider.ConsumeBool(); });
@@ -123,7 +123,7 @@ FUZZ_TARGET(connman, .init = initialize_connman)
});
}
(void)connman.GetAddedNodeInfo(fuzzed_data_provider.ConsumeBool());
(void)connman.GetExtraFullOutboundCount();
(void)connman.GetFullOutboundDelta();
(void)connman.GetLocalServices();
(void)connman.GetMaxOutboundTarget();
(void)connman.GetMaxOutboundTimeframe();
29 changes: 29 additions & 0 deletions test/functional/p2p_blocksonly.py
@@ -22,6 +22,7 @@ def run_test(self):
self.miniwallet = MiniWallet(self.nodes[0])

self.blocksonly_mode_tests()
self.blocksonly_peer_tests()
self.blocks_relay_conn_tests()

def blocksonly_mode_tests(self):
@@ -78,6 +79,34 @@ def blocksonly_mode_tests(self):
self.nodes[0].disconnect_p2ps()
self.generate(self.nodes[0], 1)

# Tests with an outbound peer that sets txrelay to false (which peers do when they are in blocksonly mode)
def blocksonly_peer_tests(self):
self.restart_node(0, ["-noblocksonly"]) # disables blocks only mode
self.log.info("Check that we sustain full-relay outbound connections to blocksonly peers when not at the full outbound limit")
blocksonly_peer = self.nodes[0].add_outbound_p2p_connection(P2PInterface(), p2p_idx=1, txrelay=False)
# Trigger CheckForStaleTipAndEvictPeers, which is scheduled every 45s (EXTRA_PEER_CHECK_INTERVAL)
self.nodes[0].mockscheduler(46)
blocksonly_peer.sync_with_ping()
blocksonly_peer.peer_disconnect()

self.log.info("Check that we evict full-relay outbound connections to blocksonly peers at the full outbound limit")
Member (on commit 68769ff):

You could expand this test by adding some irrelevant peers (e.g. ADDRFETCH), but no big deal.

Contributor Author:

I tried it, but it turned out to be annoying because in the test we simulate being at the full outbound limit by artificially running with a lower -maxconnections. If we now also add an unrelated peer like ADDRFETCH, it would make the test fail because we don't have enough slots in semOutbound.

# Simulate the situation of being at the max limit of full-outbound connections by setting -maxconnections to a low value
self.restart_node(0, ["-noblocksonly", "-maxconnections=2"])
# Have one blocksonly peer that is not being disconnected
blocksonly_peer1 = self.nodes[0].add_outbound_p2p_connection(P2PInterface(), p2p_idx=1, txrelay=False)
blocksonly_peer2 = self.nodes[0].add_outbound_p2p_connection(P2PInterface(), p2p_idx=2, txrelay=False)
self.nodes[0].mockscheduler(46)
Contributor:

68769ff: nit if you retouch.

Suggested change:
-        self.nodes[0].mockscheduler(46)
+        with self.nodes[0].assert_debug_log(expected_msgs=["disconnecting full outbound peer not participating in tx relay: peer=1"]):
+            self.nodes[0].mockscheduler(46)
+        blocksonly_peer2.wait_for_disconnect()

blocksonly_peer2.wait_for_disconnect()
blocksonly_peer1.sync_with_ping()
blocksonly_peer1.peer_disconnect()

self.log.info("Check that we don't evict full-relay outbound connections to blocksonly peers if we are blocksonly ourselves")
self.restart_node(0, ["-blocksonly", "-maxconnections=1"])
Member:

Not sure why anyone would do maxconnections=1 in practice, but if their firewall allows them to find only one blocksonly node (and they are not in blocksonly mode themselves), the behavior might be confusing: they will be frequently disconnected from this peer.

Perhaps this could be logged if maxconnections=1.

Contributor Author:

I agree that it makes no sense; if you want just one connection, you should pick a node you trust and use -connect. Using automatic connections for that means eclipsing yourself - maybe we should warn about that in general?
That being said, this PR does log disconnection of -blocksonly peers, although only in BCLog::NET. Do you suggest making it unconditional based on -maxconnections?

Member:

The idea was to log if -maxconnections is set to 1, i.e. at startup, when the argument is set.
I don't insist though.

Contributor:

IMO users should get an error if they set -maxconnections to less than 8, or at least a "this is unsafe, are you SURE?!" prompt. But that's probably getting off-topic here :)

Contributor Author:

I'd agree with adding this warning, but yeah, this PR is not the best place for it.
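
A hypothetical sketch of such a startup warning (not part of this PR; it assumes an ArgsManager named args is in scope at init time):

    // Warn when -maxconnections is too low to reach the full-relay
    // outbound target (MAX_OUTBOUND_FULL_RELAY_CONNECTIONS, currently 8).
    const int64_t max_connections{args.GetIntArg("-maxconnections", DEFAULT_MAX_PEER_CONNECTIONS)};
    if (max_connections < MAX_OUTBOUND_FULL_RELAY_CONNECTIONS) {
        LogPrintf("Warning: -maxconnections=%d is below the target of %d full-relay "
                  "outbound connections; such peers may be evicted repeatedly.\n",
                  max_connections, MAX_OUTBOUND_FULL_RELAY_CONNECTIONS);
    }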

blocksonly_peer = self.nodes[0].add_outbound_p2p_connection(P2PInterface(), p2p_idx=1, txrelay=False)
self.nodes[0].mockscheduler(46)
blocksonly_peer.sync_with_ping()
blocksonly_peer.peer_disconnect()

def blocks_relay_conn_tests(self):
self.log.info('Tests with node in normal mode with block-relay-only connections')
self.restart_node(0, ["-noblocksonly"]) # disables blocks only mode
9 changes: 5 additions & 4 deletions test/functional/test_framework/p2p.py
@@ -351,12 +351,12 @@ def __init__(self, support_addrv2=False, wtxidrelay=True):
# If the peer supports wtxid-relay
self.wtxidrelay = wtxidrelay

def peer_connect_send_version(self, services):
def peer_connect_send_version(self, services, txrelay):
# Send a version msg
vt = msg_version()
vt.nVersion = P2P_VERSION
vt.strSubVer = P2P_SUBVERSION
vt.relay = P2P_VERSION_RELAY
vt.relay = P2P_VERSION_RELAY if txrelay else 0
vt.nServices = services
vt.addrTo.ip = self.dstaddr
vt.addrTo.port = self.dstport
@@ -368,13 +368,14 @@ def peer_connect(self, *, services=P2P_SERVICES, send_version, **kwargs):
create_conn = super().peer_connect(**kwargs)

if send_version:
self.peer_connect_send_version(services)
self.peer_connect_send_version(services, txrelay=True)

return create_conn

def peer_accept_connection(self, *args, services=P2P_SERVICES, **kwargs):
txrelay = kwargs.pop('txrelay', True)
create_conn = super().peer_accept_connection(*args, **kwargs)
self.peer_connect_send_version(services)
self.peer_connect_send_version(services, txrelay)

return create_conn
