Skip to content

Commit

Permalink
neigh: Install neighbor entries only on devices where routes exist
Browse files Browse the repository at this point in the history
This PR fixes the issue that neighbor entries can be installed on devices
where no route to the destination host exists.

`netlink.RouteGetWithOptions` (equivalent to `ip route get`) with oif
returns a route even if the destination is unreachable from the specified interface.
The neighbor entries can be installed on all devices because of this.

`netlink.RouteGetWithOptions` with `FIBMatch` option (equivalent to `fibmatch` flag)
returns full fib lookup matched route. It returns `No route to host`
if the destination is non-routable from the specified device.
This PR adds `FIBMatch` option to avoid installing the unnecessary entries.

Note:

- With IPv6, it returns `Network is unreachable` if the destination
  is unreachable with or without fibmatch.
- With `FIBMatch` option, `netlink.RouteGetWithOptions` returns `MultiPath` field
  if there are multiple paths to the dest. It returns `GW` field without `FIBMatch`.
  See examples below.

```
// FIBMatch: false
netlink.Route{ GW: 8.8.8.250, MultiPath: [] }
netlink.Route{ GW: 9.9.9.250, MultiPath: [] }

// FIBMatch: true
netlink.Route{ GW: <nil>, MultiPath: [{Ifindex: 1218 Weight: 1 Gw: 9.9.9.250 Flags: []},
                                      {Ifindex: 1220 Weight: 1 Gw: 8.8.8.250 Flags: []}]}
```

`ip route get` examples

```
$ ip route
10.0.0.0/24 dev veth1 proto kernel scope link src 10.0.0.1
10.0.1.0/24 dev veth3 proto kernel scope link src 10.0.1.1

$ ping -I veth1 10.0.0.1
PING 10.0.0.1 (10.0.0.1) from 10.0.0.1 veth1: 56(84) bytes of data.
64 bytes from 10.0.0.1: icmp_seq=1 ttl=64 time=0.066 ms
64 bytes from 10.0.0.1: icmp_seq=2 ttl=64 time=0.056 ms
64 bytes from 10.0.0.1: icmp_seq=3 ttl=64 time=0.057 ms

$ ping -I veth3 10.0.0.1
PING 10.0.0.1 (10.0.0.1) from 10.0.1.1 veth3: 56(84) bytes of data.
From 10.0.1.1 icmp_seq=1 Destination Host Unreachable
From 10.0.1.1 icmp_seq=2 Destination Host Unreachable
From 10.0.1.1 icmp_seq=3 Destination Host Unreachable

$ ip route get 10.0.0.1 oif veth1
local 10.0.0.1 dev lo table local src 10.0.0.1 uid 1000
    cache <local>

// `ip route get` returns a route even if the destination is unreachable
// from the specified interface
$ ip route get 10.0.0.1 oif veth3
10.0.0.1 dev veth3 src 10.0.1.1 uid 1000
    cache

// With fibmatch flag, it returns full fib lookup matched route
$ ip route get fibmatch 10.0.0.1 oif veth1
local 10.0.0.1 dev veth1 proto kernel scope host src 10.0.0.1

$ ip route get fibmatch 10.0.0.1 oif veth3
RTNETLINK answers: No route to host
```

Fixes: cilium#28660

Signed-off-by: Yusuke Suzuki <ysuzuki4112@gmail.com>
  • Loading branch information
ysksuzuki authored and squeed committed Nov 10, 2023
1 parent 554fb6f commit 2058ed6
Show file tree
Hide file tree
Showing 2 changed files with 202 additions and 37 deletions.
23 changes: 21 additions & 2 deletions pkg/datapath/linux/node.go
Expand Up @@ -545,8 +545,8 @@ func (n *linuxNodeHandler) NodeUpdate(oldNode, newNode nodeTypes.Node) error {

func getNextHopIP(nodeIP net.IP, link netlink.Link) (nextHopIP net.IP, err error) {
// Figure out whether nodeIP is directly reachable (i.e. in the same L2)
routes, err := netlink.RouteGetWithOptions(nodeIP, &netlink.RouteGetOptions{Oif: link.Attrs().Name})
if err != nil {
routes, err := netlink.RouteGetWithOptions(nodeIP, &netlink.RouteGetOptions{Oif: link.Attrs().Name, FIBMatch: true})
if err != nil && !errors.Is(err, unix.EHOSTUNREACH) && !errors.Is(err, unix.ENETUNREACH) {
return nil, fmt.Errorf("failed to retrieve route for remote node IP: %w", err)
}
if len(routes) == 0 {
Expand All @@ -563,6 +563,25 @@ func getNextHopIP(nodeIP net.IP, link netlink.Link) (nextHopIP net.IP, err error
copy(nextHopIP, route.Gw.To16())
break
}

// Select a gw for the specified link if there are multi paths to the nodeIP
// For example, the nextHop to the nodeIP 9.9.9.9 from eth0 is 10.0.1.2,
// from eth1 is 10.0.2.2 as shown bellow.
//
// 9.9.9.9 proto bird metric 32
// nexthop via 10.0.1.2 dev eth0 weight 1
// nexthop via 10.0.2.2 dev eth1 weight 1
//
// NOTE: We currently don't handle multiple next hops, so only one next hop
// per device can be used.
if route.MultiPath != nil {
for _, mp := range route.MultiPath {
if mp.LinkIndex == link.Attrs().Index {
copy(nextHopIP, mp.Gw.To16())
break
}
}
}
}
return nextHopIP, nil
}
Expand Down
216 changes: 181 additions & 35 deletions pkg/datapath/linux/node_linux_test.go
Expand Up @@ -2136,22 +2136,33 @@ func (s *linuxPrivilegedIPv6OnlyTestSuite) TestArpPingHandlingForMultiDevice(c *
c.Assert(err, check.IsNil)
defer func() { sysctl.Write(mcastNumIPv6, mcastNumOld) }()

// 1. Test whether another node in the same L2 subnet can be arpinged.
// 1. Test whether another node with multiple paths can be arpinged.
// Each node has two devices and the other node in the different netns
// is reachable via either pair.
// Neighbor entries are not installed on devices where no route exists
//
// +--------------+ +--------------+
// | host netns | | netns1 |
// | | | nodev1 |
// | | | fe80::1/128 |
// | veth0+-----+veth1 |
// | | | | | |
// | f00d::249/96 | | f00d::250/96 |
// | | | |
// | veth2+-----+veth3 |
// | | | | | |
// | f00a::249/96 | | f00a::250/96 |
// +--------------+ +--------------+
// +--------------+ +-------------------+
// | host netns | | netns1 |
// | | | nodev1 |
// | | | fc00:c111::1/128 |
// | veth0+-----+veth1 |
// | | | | | |
// | f00a::249/96 | | f00a::250/96 |
// | | | |
// | veth2+-----+veth3 |
// | | | | | |
// | f00b::249/96 | | f00b::250/96 |
// | | | |
// | f00c::249/96 | | |
// | | | | |
// | veth4 | | |
// +-+------------+ +-------------------+
// |
// +-+--------------------------------------+
// | veth5 other netns |
// | | |
// | f00c::250/96 |
// +----------------------------------------+

// Setup
vethPair01 := &netlink.Veth{
Expand All @@ -2165,10 +2176,10 @@ func (s *linuxPrivilegedIPv6OnlyTestSuite) TestArpPingHandlingForMultiDevice(c *
c.Assert(err, check.IsNil)
veth1, err := netlink.LinkByName("veth1")
c.Assert(err, check.IsNil)
_, ipnet, _ := net.ParseCIDR("f00d::/96")
v1IP0 := net.ParseIP("f00d::249")
v1IP1 := net.ParseIP("f00d::250")
v1IPG := net.ParseIP("f00d::251")
_, ipnet, _ := net.ParseCIDR("f00a::/96")
v1IP0 := net.ParseIP("f00a::249")
v1IP1 := net.ParseIP("f00a::250")
v1IPG := net.ParseIP("f00a::251")
ipnet.IP = v1IP0
addr := &netlink.Addr{IPNet: ipnet}
err = netlink.AddrAdd(veth0, addr)
Expand All @@ -2181,14 +2192,14 @@ func (s *linuxPrivilegedIPv6OnlyTestSuite) TestArpPingHandlingForMultiDevice(c *
defer netns0.Close()
err = netlink.LinkSetNsFd(veth1, int(netns0.Fd()))
c.Assert(err, check.IsNil)
node1Addr, err := netlink.ParseAddr("fc00:c111::1/128")
c.Assert(err, check.IsNil)
netns0.Do(func(ns.NetNS) error {
lo, err := netlink.LinkByName("lo")
c.Assert(err, check.IsNil)
err = netlink.LinkSetUp(lo)
c.Assert(err, check.IsNil)
addr, err := netlink.ParseAddr("fe80::1/128")
c.Assert(err, check.IsNil)
err = netlink.AddrAdd(lo, addr)
err = netlink.AddrAdd(lo, node1Addr)
c.Assert(err, check.IsNil)

veth1, err := netlink.LinkByName("veth1")
Expand Down Expand Up @@ -2217,10 +2228,10 @@ func (s *linuxPrivilegedIPv6OnlyTestSuite) TestArpPingHandlingForMultiDevice(c *
c.Assert(err, check.IsNil)
veth3, err := netlink.LinkByName("veth3")
c.Assert(err, check.IsNil)
_, ipnet, _ = net.ParseCIDR("f00a::/96")
v2IP0 := net.ParseIP("f00a::249")
v2IP1 := net.ParseIP("f00a::250")
v2IPG := net.ParseIP("f00a::251")
_, ipnet, _ = net.ParseCIDR("f00b::/96")
v2IP0 := net.ParseIP("f00b::249")
v2IP1 := net.ParseIP("f00b::250")
v2IPG := net.ParseIP("f00b::251")
ipnet.IP = v2IP0
addr = &netlink.Addr{IPNet: ipnet}
err = netlink.AddrAdd(veth2, addr)
Expand Down Expand Up @@ -2248,7 +2259,7 @@ func (s *linuxPrivilegedIPv6OnlyTestSuite) TestArpPingHandlingForMultiDevice(c *
c.Assert(err, check.IsNil)

r := &netlink.Route{
Dst: netlink.NewIPNet(net.ParseIP("fe80::1")),
Dst: netlink.NewIPNet(node1Addr.IP),
MultiPath: []*netlink.NexthopInfo{
{
LinkIndex: veth0.Attrs().Index,
Expand All @@ -2264,6 +2275,47 @@ func (s *linuxPrivilegedIPv6OnlyTestSuite) TestArpPingHandlingForMultiDevice(c *
c.Assert(err, check.IsNil)
defer netlink.RouteDel(r)

// Setup another veth pair that doesn't have a route to node
vethPair45 := &netlink.Veth{
LinkAttrs: netlink.LinkAttrs{Name: "veth4"},
PeerName: "veth5",
}
err = netlink.LinkAdd(vethPair45)
c.Assert(err, check.IsNil)
defer netlink.LinkDel(vethPair45)
veth4, err := netlink.LinkByName("veth4")
c.Assert(err, check.IsNil)
veth5, err := netlink.LinkByName("veth5")
c.Assert(err, check.IsNil)
_, ipnet, _ = net.ParseCIDR("f00c::/96")
v3IP0 := net.ParseIP("f00c::249")
v3IP1 := net.ParseIP("f00c::250")
ipnet.IP = v3IP0
addr = &netlink.Addr{IPNet: ipnet}
err = netlink.AddrAdd(veth4, addr)
c.Assert(err, check.IsNil)
err = netlink.LinkSetUp(veth4)
c.Assert(err, check.IsNil)

netns1, err := netns.ReplaceNetNSWithName("test-arping-netns1")
c.Assert(err, check.IsNil)
defer netns1.Close()

err = netlink.LinkSetNsFd(veth5, int(netns1.Fd()))
c.Assert(err, check.IsNil)
err = netns1.Do(func(ns.NetNS) error {
veth5, err := netlink.LinkByName("veth5")
c.Assert(err, check.IsNil)
ipnet.IP = v3IP1
addr = &netlink.Addr{IPNet: ipnet}
err = netlink.AddrAdd(veth5, addr)
c.Assert(err, check.IsNil)
err = netlink.LinkSetUp(veth5)
c.Assert(err, check.IsNil)
return nil
})
c.Assert(err, check.IsNil)

prevRoutingMode := option.Config.RoutingMode
defer func() { option.Config.RoutingMode = prevRoutingMode }()
option.Config.RoutingMode = option.RoutingModeNative
Expand All @@ -2272,7 +2324,7 @@ func (s *linuxPrivilegedIPv6OnlyTestSuite) TestArpPingHandlingForMultiDevice(c *
option.Config.DirectRoutingDevice = "veth0"
prevDevices := option.Config.GetDevices()
defer func() { option.Config.SetDevices(prevDevices) }()
option.Config.SetDevices([]string{"veth0", "veth2"})
option.Config.SetDevices([]string{"veth0", "veth2", "veth4"})
prevNP := option.Config.EnableNodePort
defer func() { option.Config.EnableNodePort = prevNP }()
option.Config.EnableNodePort = true
Expand Down Expand Up @@ -2320,7 +2372,7 @@ func (s *linuxPrivilegedIPv6OnlyTestSuite) TestArpPingHandlingForMultiDevice(c *
Name: "node1",
IPAddresses: []nodeTypes.Address{{
Type: nodeaddressing.NodeInternalIP,
IP: net.ParseIP("fe80::1"),
IP: node1Addr.IP,
}},
}
now := time.Now()
Expand Down Expand Up @@ -2373,6 +2425,27 @@ refetch2:
}
c.Assert(found, check.Equals, true)

// Check whether we don't install the neighbor entries to nodes on the device where the actual route isn't.
// "Consistently(<check>, 5sec, 1sec)"
start := time.Now()
for {
if time.Since(start) > 5*time.Second {
break
}

neighs, err = netlink.NeighList(veth4.Attrs().Index, netlink.FAMILY_V6)
c.Assert(err, check.IsNil)
found = false
for _, n := range neighs {
if n.IP.Equal(v3IP1) || n.IP.Equal(node1Addr.IP) {
found = true
}
}
c.Assert(found, check.Equals, false)

time.Sleep(1 * time.Second)
}

// Swap MAC addresses of veth0 and veth1, veth2 and veth3 to ensure the MAC address of veth1 changed.
// Trigger neighbor refresh on veth0 and check whether the arp entry was updated.
var veth0HwAddr, veth1HwAddr, veth2HwAddr, veth3HwAddr, updatedHwAddrFromArpEntry net.HardwareAddr
Expand Down Expand Up @@ -3319,9 +3392,10 @@ func (s *linuxPrivilegedIPv4OnlyTestSuite) TestArpPingHandlingForMultiDevice(c *
c.Assert(err, check.IsNil)
defer func() { sysctl.Write(mcastNumIPv4, mcastNumOld) }()

// 1. Test whether another node in the same L2 subnet can be arpinged.
// 1. Test whether another node with multiple paths can be arpinged.
// Each node has two devices and the other node in the different netns
// is reachable via either pair.
// Neighbor entries are not installed on devices where no route exists.
//
// +--------------+ +--------------+
// | host netns | | netns1 |
Expand All @@ -3334,7 +3408,17 @@ func (s *linuxPrivilegedIPv4OnlyTestSuite) TestArpPingHandlingForMultiDevice(c *
// | veth2+-----+veth3 |
// | | | | | |
// | 8.8.8.249/29 | | 8.8.8.250/29 |
// +--------------+ +--------------+
// | | | |
// | 7.7.7.249/29 | | |
// | | | | |
// | veth4 | | |
// +-+------------+ +--------------+
// |
// +-+---------------------------------+
// | veth5 other netns |
// | | |
// | 7.7.7.250/29 |
// +-----------------------------------+

// Setup
vethPair01 := &netlink.Veth{
Expand Down Expand Up @@ -3364,14 +3448,14 @@ func (s *linuxPrivilegedIPv4OnlyTestSuite) TestArpPingHandlingForMultiDevice(c *
defer netns0.Close()
err = netlink.LinkSetNsFd(veth1, int(netns0.Fd()))
c.Assert(err, check.IsNil)
node1Addr, err := netlink.ParseAddr("10.0.0.1/32")
c.Assert(err, check.IsNil)
err = netns0.Do(func(ns.NetNS) error {
lo, err := netlink.LinkByName("lo")
c.Assert(err, check.IsNil)
err = netlink.LinkSetUp(lo)
c.Assert(err, check.IsNil)
addr, err := netlink.ParseAddr("10.0.0.1/32")
c.Assert(err, check.IsNil)
err = netlink.AddrAdd(lo, addr)
err = netlink.AddrAdd(lo, node1Addr)
c.Assert(err, check.IsNil)

veth1, err := netlink.LinkByName("veth1")
Expand Down Expand Up @@ -3432,7 +3516,7 @@ func (s *linuxPrivilegedIPv4OnlyTestSuite) TestArpPingHandlingForMultiDevice(c *
c.Assert(err, check.IsNil)

r := &netlink.Route{
Dst: netlink.NewIPNet(net.ParseIP("10.0.0.1")),
Dst: netlink.NewIPNet(node1Addr.IP),
MultiPath: []*netlink.NexthopInfo{
{
LinkIndex: veth0.Attrs().Index,
Expand All @@ -3447,6 +3531,47 @@ func (s *linuxPrivilegedIPv4OnlyTestSuite) TestArpPingHandlingForMultiDevice(c *
c.Assert(err, check.IsNil)
defer netlink.RouteDel(r)

// Setup another veth pair that doesn't have a route to node
vethPair45 := &netlink.Veth{
LinkAttrs: netlink.LinkAttrs{Name: "veth4"},
PeerName: "veth5",
}
err = netlink.LinkAdd(vethPair45)
c.Assert(err, check.IsNil)
defer netlink.LinkDel(vethPair45)
veth4, err := netlink.LinkByName("veth4")
c.Assert(err, check.IsNil)
veth5, err := netlink.LinkByName("veth5")
c.Assert(err, check.IsNil)
_, ipnet, _ = net.ParseCIDR("7.7.7.252/29")
v3IP0 := net.ParseIP("7.7.7.249")
v3IP1 := net.ParseIP("7.7.7.250")
ipnet.IP = v3IP0
addr = &netlink.Addr{IPNet: ipnet}
err = netlink.AddrAdd(veth4, addr)
c.Assert(err, check.IsNil)
err = netlink.LinkSetUp(veth4)
c.Assert(err, check.IsNil)

netns1, err := netns.ReplaceNetNSWithName("test-arping-netns1")
c.Assert(err, check.IsNil)
defer netns1.Close()

err = netlink.LinkSetNsFd(veth5, int(netns1.Fd()))
c.Assert(err, check.IsNil)
err = netns1.Do(func(ns.NetNS) error {
veth5, err := netlink.LinkByName("veth5")
c.Assert(err, check.IsNil)
ipnet.IP = v3IP1
addr = &netlink.Addr{IPNet: ipnet}
err = netlink.AddrAdd(veth5, addr)
c.Assert(err, check.IsNil)
err = netlink.LinkSetUp(veth5)
c.Assert(err, check.IsNil)
return nil
})
c.Assert(err, check.IsNil)

prevRoutingMode := option.Config.RoutingMode
defer func() { option.Config.RoutingMode = prevRoutingMode }()
option.Config.RoutingMode = option.RoutingModeNative
Expand All @@ -3455,7 +3580,7 @@ func (s *linuxPrivilegedIPv4OnlyTestSuite) TestArpPingHandlingForMultiDevice(c *
option.Config.DirectRoutingDevice = "veth0"
prevDevices := option.Config.GetDevices()
defer func() { option.Config.SetDevices(prevDevices) }()
option.Config.SetDevices([]string{"veth0", "veth2"})
option.Config.SetDevices([]string{"veth0", "veth2", "veth4"})
prevNP := option.Config.EnableNodePort
defer func() { option.Config.EnableNodePort = prevNP }()
option.Config.EnableNodePort = true
Expand Down Expand Up @@ -3503,7 +3628,7 @@ func (s *linuxPrivilegedIPv4OnlyTestSuite) TestArpPingHandlingForMultiDevice(c *
Name: "node1",
IPAddresses: []nodeTypes.Address{{
Type: nodeaddressing.NodeInternalIP,
IP: net.ParseIP("10.0.0.1"),
IP: node1Addr.IP,
}},
}
now := time.Now()
Expand Down Expand Up @@ -3556,6 +3681,27 @@ refetch2:
}
c.Assert(found, check.Equals, true)

// Check whether we don't install the neighbor entries to nodes on the device where the actual route isn't.
// "Consistently(<check>, 5sec, 1sec)"
start := time.Now()
for {
if time.Since(start) > 5*time.Second {
break
}

neighs, err = netlink.NeighList(veth4.Attrs().Index, netlink.FAMILY_V4)
c.Assert(err, check.IsNil)
found = false
for _, n := range neighs {
if n.IP.Equal(v3IP1) || n.IP.Equal(node1Addr.IP) {
found = true
}
}
c.Assert(found, check.Equals, false)

time.Sleep(1 * time.Second)
}

// Swap MAC addresses of veth0 and veth1, veth2 and veth3 to ensure the MAC address of veth1 changed.
// Trigger neighbor refresh on veth0 and check whether the arp entry was updated.
var veth0HwAddr, veth1HwAddr, veth2HwAddr, veth3HwAddr, updatedHwAddrFromArpEntry net.HardwareAddr
Expand Down

0 comments on commit 2058ed6

Please sign in to comment.