Skip to content

Commit

Permalink
Fix the issue GARP not being sent (#796)
Browse files Browse the repository at this point in the history
On Linux, to flush stale ARP cache of network, antrea-agent is supposed
to send gratuitous ARP packets when configuring network for a Pod.
However, the routine was broken when refactoring the
"advertiseContainerAddr" method for Windows support. The action was
executed in a new spawned goroutine within the context of "NetNS.Do",
then it was actually executed after NetNS had switched to the target
container NS and had switched back to the host NS.

This patch fixes it by executing the whole routine in a goroutine, and
fixes the ticker leak problem. It also adds an e2e test to verify there
are at least 3 ARP packets received by OVS after a Pod is up to avoid
this being broken again as not sending GARP may not lead to connectivity
test failure stably if there are no IP reuse before ARP cache expiry.
  • Loading branch information
tnqn authored and antoninbas committed Jun 5, 2020
1 parent be00b83 commit 6311e75
Show file tree
Hide file tree
Showing 2 changed files with 82 additions and 36 deletions.
67 changes: 31 additions & 36 deletions pkg/agent/cniserver/interface_configuration_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -98,46 +98,41 @@ func configureContainerAddr(netns ns.NetNS, containerInterface *current.Interfac
// installed async, and the gratuitous ARP could be sent out after the Openflow entries are installed. Using another
// goroutine to ensure the processing of CNI ADD request is not blocked.
func (ic *ifConfigurator) advertiseContainerAddr(containerNetNS string, containerIfaceName string, result *current.Result) error {
netns, err := ns.GetNS(containerNetNS)
if err != nil {
klog.Errorf("Failed to open netns with %s: %v", containerNetNS, err)
return err
if err := ns.IsNSorErr(containerNetNS); err != nil {
return fmt.Errorf("%s is not a valid network namespace: %v", containerNetNS, err)
}
defer netns.Close()
if err := netns.Do(func(containerNs ns.NetNS) error {
go func() {
// The container veth must exist when this function is called, and do not check error here.
containerVeth, err := net.InterfaceByName(containerIfaceName)
if err != nil {
klog.Errorf("Failed to find container interface %s in ns %s", containerIfaceName, netns.Path())
return
}
var targetIP net.IP
for _, ipc := range result.IPs {
if ipc.Version == "4" {
targetIP = ipc.Address.IP
arping.GratuitousArpOverIface(ipc.Address.IP, *containerVeth)
}
}
if targetIP == nil {
klog.Warning("Failed to find a valid IP address for Gratuitous ARP, not send GARP")
return
// Sending Gratuitous ARP is a best-effort action and is unlikely to fail as we have ensured the netns is valid.
go ns.WithNetNSPath(containerNetNS, func(_ ns.NetNS) error {
iface, err := net.InterfaceByName(containerIfaceName)
if err != nil {
klog.Errorf("Failed to find container interface %s in ns %s: %v", containerIfaceName, containerNetNS, err)
return nil
}
var targetIP net.IP
for _, ipc := range result.IPs {
if ipc.Version == "4" {
targetIP = ipc.Address.IP
}
count := 0
for count < 3 {
select {
case <-time.Tick(50 * time.Millisecond):
// Send gratuitous ARP to network in case of stale mappings for this IP address
// (e.g. if a previous - deleted - Pod was using the same IP).
arping.GratuitousArpOverIface(targetIP, *containerVeth)
}
count++
}
if targetIP == nil {
klog.Warning("Failed to find an IPv4 address, skipped sending Gratuitous ARP")
return nil
}
ticker := time.NewTicker(50 * time.Millisecond)
defer ticker.Stop()
count := 0
for {
// Send gratuitous ARP to network in case of stale mappings for this IP address
// (e.g. if a previous - deleted - Pod was using the same IP).
arping.GratuitousArpOverIface(targetIP, *iface)
count++
if count == 3 {
break
}
}()
<-ticker.C
}
return nil
}); err != nil {
return err
}
})
return nil
}

Expand Down
51 changes: 51 additions & 0 deletions test/e2e/basic_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -593,3 +593,54 @@ func TestDeletePreviousRoundFlowsOnStartup(t *testing.T) {
t.Errorf("Flow was still present after timeout")
}
}

// TestGratuitousARP verifies that we receive 3 GARP packets after a Pod is up.
// There might be ARP packets other than GARP sent if there is any unintentional
// traffic. So we just check the number of ARP packets is greater than 3.
func TestGratuitousARP(t *testing.T) {
data, err := setupTest(t)
if err != nil {
t.Fatalf("Error when setting up test: %v", err)
}
defer teardownTest(t, data)

podName := randName("test-pod-")
nodeName := workerNodeName(1)

t.Logf("Creating Pod '%s' on '%s'", podName, nodeName)
if err := data.createBusyboxPodOnNode(podName, nodeName); err != nil {
t.Fatalf("Error when creating Pod '%s': %v", podName, err)
}
defer deletePodWrapper(t, data, podName)

antreaPodName, err := data.getAntreaPodOnNode(nodeName)
if err != nil {
t.Fatalf("Error when retrieving the name of the Antrea Pod running on Node '%s': %v", nodeName, err)
}

podIP, err := data.podWaitForIP(defaultTimeout, podName, testNamespace)
if err != nil {
t.Fatalf("Error when waiting for IP for Pod '%s': %v", podName, err)
}

// Sending GARP is an asynchronous operation. The last GARP is supposed to
// be sent 100ms after processing CNI ADD request.
time.Sleep(100 * time.Millisecond)

cmd := []string{"ovs-ofctl", "dump-flows", defaultBridgeName, fmt.Sprintf("table=10,arp,arp_spa=%s", podIP)}
stdout, _, err := data.runCommandFromPod(antreaNamespace, antreaPodName, ovsContainerName, cmd)
if err != nil {
t.Fatalf("Error when querying openflow: %v", err)
}

re := regexp.MustCompile(`n_packets=([0-9]+)`)
matches := re.FindStringSubmatch(stdout)
if len(matches) == 0 {
t.Fatalf("cannot retrieve n_packets, unexpected output: %s", stdout)
}
arpPackets, _ := strconv.ParseUint(matches[1], 10, 32)
if arpPackets < 3 {
t.Errorf("Expected at least 3 ARP packets, got %d", arpPackets)
}
t.Logf("Got %d ARP packets after Pod was up", arpPackets)
}

0 comments on commit 6311e75

Please sign in to comment.