Skip to content

Commit

Permalink
feat(ec2): support multi NIC/IP setups (canonical#4799)
Browse files Browse the repository at this point in the history
For EC2 instances with multiple NICs, policy-based routing will be
configured on secondary NICs / secondary IPs to ensure outgoing packets
are routed via the correct interface.

Without this extra routing config, traffic coming via secondary NICs
was routed using the main routing table, which can only contain one
default route and the kernel only takes the destination IP address into
account when selecting a route.  Packets for destinations
beyond local networks were always routed through the default route, the
one associated with the primary NIC.  If traffic based on specific
source IP addresses is associated with another NIC, without these
routing policies, this traffic would flow over the default route and the
connection couldn't be established.

References:

[1] https://bootstack.canonical.com/cases/00336928
[2] https://bootstack.canonical.com/cases/00377150
  • Loading branch information
aciba90 committed Jan 29, 2024
1 parent 180da9b commit 3326610
Show file tree
Hide file tree
Showing 7 changed files with 344 additions and 23 deletions.
102 changes: 98 additions & 4 deletions cloudinit/sources/DataSourceEc2.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from cloudinit import url_helper as uhelp
from cloudinit import util, warnings
from cloudinit.event import EventScope, EventType
from cloudinit.net import dhcp
from cloudinit.net.dhcp import NoDHCPLeaseError
from cloudinit.net.ephemeral import EphemeralIPNetwork
from cloudinit.sources.helpers import ec2
Expand Down Expand Up @@ -55,7 +56,6 @@ def skip_404_tag_errors(exception):


class DataSourceEc2(sources.DataSource):

dsname = "Ec2"
# Default metadata urls that will be used if none are provided
# They will be checked for 'resolveability' and some of the
Expand Down Expand Up @@ -402,7 +402,7 @@ def device_name_to_device(self, name):
LOG.debug("block-device-mapping not a dictionary: '%s'", bdm)
return None

for (entname, device) in bdm.items():
for entname, device in bdm.items():
if entname == name:
found = device
break
Expand Down Expand Up @@ -508,6 +508,7 @@ def network_config(self):
# behavior on those releases.
result = convert_ec2_metadata_network_config(
net_md,
self.distro,
fallback_nic=iface,
full_network_config=util.get_cfg_option_bool(
self.ds_cfg, "apply_full_imds_network_config", True
Expand Down Expand Up @@ -885,14 +886,19 @@ def _collect_platform_data():


def convert_ec2_metadata_network_config(
network_md, macs_to_nics=None, fallback_nic=None, full_network_config=True
network_md,
distro,
macs_to_nics=None,
fallback_nic=None,
full_network_config=True,
):
"""Convert ec2 metadata to network config version 2 data dict.
@param: network_md: 'network' portion of EC2 metadata.
generally formed as {"interfaces": {"macs": {}} where
'macs' is a dictionary with mac address as key and contents like:
{"device-number": "0", "interface-id": "...", "local-ipv4s": ...}
@param: distro: instance of Distro.
@param: macs_to_nics: Optional dict of mac addresses and nic names. If
not provided, get_interfaces_by_mac is called to get it from the OS.
@param: fallback_nic: Optionally provide the primary nic interface name.
Expand Down Expand Up @@ -927,6 +933,7 @@ def convert_ec2_metadata_network_config(
return netcfg
# Apply network config for all nics and any secondary IPv4/v6 addresses
nic_idx = 0
table = 100
for mac, nic_name in sorted(macs_to_nics.items()):
nic_metadata = macs_metadata.get(mac)
if not nic_metadata:
Expand All @@ -942,18 +949,105 @@ def convert_ec2_metadata_network_config(
"match": {"macaddress": mac.lower()},
"set-name": nic_name,
}
# Configure policy-based routing on secondary NICs / secondary IPs to
# ensure outgoing packets are routed via the correct interface.
#
# If device-number is not present (AliYun or other ec2-like platforms),
# do not configure source-routing as we cannot determine which is the
# primary NIC.
if nic_metadata.get("device-number") and nic_idx > 1:
dhcp_override["use-routes"] = True
cur_table = table + nic_idx - 1
dev_config["routes"] = []
try:
client = dhcp.select_dhcp_client(distro)
leases = client.dhcp_discovery(nic_name, distro=distro)
gateway = leases[-1]["routers"]
except Exception as e:
LOG.warning(
"Could not perform dhcp discovery on %s to find its "
"gateway. Not adding default route via the gateway. "
"Error: %s",
nic_name,
e,
)
else:
# Add default route via the NIC's gateway
dev_config["routes"].append(
{
"to": "0.0.0.0/0",
"via": gateway,
"table": cur_table,
},
)
subnet_prefix_routes = nic_metadata["subnet-ipv4-cidr-block"]
subnet_prefix_routes = (
[subnet_prefix_routes]
if isinstance(subnet_prefix_routes, str)
else subnet_prefix_routes
)
for prefix_route in subnet_prefix_routes:
dev_config["routes"].append(
{
"to": prefix_route,
"table": cur_table,
},
)

dev_config["routing-policy"] = []
# Packets coming from any IPv4 associated with the current NIC
# will be routed using `cur_table` routing table
ipv4s = nic_metadata["local-ipv4s"]
ipv4s = [ipv4s] if isinstance(ipv4s, str) else ipv4s
for ipv4 in ipv4s:
dev_config["routing-policy"].append(
{
"from": ipv4,
"table": cur_table,
},
)
if nic_metadata.get("ipv6s"): # Any IPv6 addresses configured
dev_config["dhcp6"] = True
dev_config["dhcp6-overrides"] = dhcp_override
if nic_metadata.get("device-number") and nic_idx > 1:
cur_table = table + nic_idx - 1
subnet_prefix_routes = nic_metadata["subnet-ipv6-cidr-block"]
subnet_prefix_routes = (
[subnet_prefix_routes]
if isinstance(subnet_prefix_routes, str)
else subnet_prefix_routes
)
for prefix_route in subnet_prefix_routes:
dev_config["routes"].append(
{
"to": prefix_route,
"table": cur_table,
},
)

dev_config["routing-policy"] = []
ipv6s = nic_metadata["ipv6s"]
ipv6s = [ipv6s] if isinstance(ipv6s, str) else ipv6s
for ipv6 in ipv6s:
dev_config["routing-policy"].append(
{
"from": ipv6,
"table": cur_table,
},
)
dev_config["addresses"] = get_secondary_addresses(nic_metadata, mac)
if not dev_config["addresses"]:
dev_config.pop("addresses") # Since we found none configured

netcfg["ethernets"][nic_name] = dev_config
# Remove route-metric dhcp overrides if only one nic configured
# Remove route-metric dhcp overrides and routes / routing-policy if only
# one nic configured
if len(netcfg["ethernets"]) == 1:
for nic_name in netcfg["ethernets"].keys():
netcfg["ethernets"][nic_name].pop("dhcp4-overrides")
netcfg["ethernets"][nic_name].pop("dhcp6-overrides", None)
netcfg["ethernets"][nic_name].pop("routes", None)
netcfg["ethernets"][nic_name].pop("routing-policy", None)
return netcfg


Expand Down
7 changes: 7 additions & 0 deletions doc/rtd/reference/datasources/ec2.rst
Original file line number Diff line number Diff line change
Expand Up @@ -150,4 +150,11 @@ Notes
For example: the primary NIC will have a DHCP route-metric of 100,
the next NIC will have 200.

* For EC2 instances with multiple NICs, policy-based routing will be
configured on secondary NICs / secondary IPs to ensure outgoing packets
are routed via the correct interface.
At the moment of writing, this network configuration is applied at first
boot only but it can be configured to be applied on every boot and when
NICs are hotplugged, see :ref:`events`.

.. _EC2 tags user guide: https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/Using_Tags.html#work-with-tags-in-IMDS
88 changes: 88 additions & 0 deletions tests/integration_tests/modules/test_hotplug.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,16 @@
import contextlib
import time
from collections import namedtuple

import pytest
import yaml

from cloudinit.subp import subp
from tests.integration_tests.clouds import IntegrationCloud
from tests.integration_tests.instances import IntegrationInstance
from tests.integration_tests.integration_settings import PLATFORM
from tests.integration_tests.releases import CURRENT_RELEASE, FOCAL
from tests.integration_tests.util import verify_clean_log

USER_DATA = """\
#cloud-config
Expand Down Expand Up @@ -124,3 +128,87 @@ def test_no_hotplug_in_userdata(client: IntegrationInstance):
assert "disabled" == client.execute(
"cloud-init devel hotplug-hook -s net query"
)


@pytest.mark.skipif(PLATFORM != "ec2", reason="test is ec2 specific")
def test_multi_nic_hotplug(setup_image, session_cloud: IntegrationCloud):
    """Tests that additional secondary NICs are routable from non-local
    networks after the hotplug hook is executed when network updates
    are configured on the HOTPLUG event.

    The test attaches a secondary NIC, binds an Elastic IP to it, checks
    the rendered netplan config for the expected routing-policy and then
    verifies that port 22 is reachable through both public IPs. The
    Elastic IP is always released at the end, even if the association
    step never succeeded.
    """
    ec2 = session_cloud.cloud_instance.client
    with session_cloud.launch(launch_kwargs={}, user_data=USER_DATA) as client:
        ips_before = _get_ip_addr(client)
        instance_pub_ip = client.instance.ip
        secondary_priv_ip = client.instance.add_network_interface()
        response = ec2.describe_network_interfaces(
            Filters=[
                {
                    "Name": "private-ip-address",
                    "Values": [secondary_priv_ip],
                },
            ],
        )
        nic_id = response["NetworkInterfaces"][0]["NetworkInterfaceId"]

        # Create Elastic IP
        allocation = ec2.allocate_address(Domain="vpc")
        # Stays None until the association call returns, so cleanup can
        # tell "never associated" apart from "associated".
        association = None
        try:
            secondary_pub_ip = allocation["PublicIp"]
            association = ec2.associate_address(
                AllocationId=allocation["AllocationId"],
                NetworkInterfaceId=nic_id,
            )
            assert association["ResponseMetadata"]["HTTPStatusCode"] == 200

            _wait_till_hotplug_complete(client)
            ips_after_add = _get_ip_addr(client)

            netplan_cfg = client.read_from_file(
                "/etc/netplan/50-cloud-init.yaml"
            )
            config = yaml.safe_load(netplan_cfg)
            matching_ips = [
                ip for ip in ips_after_add if ip.ip4 == secondary_priv_ip
            ]
            # Fail with a readable message instead of a bare IndexError
            assert matching_ips, (
                "No interface found with address %s" % secondary_priv_ip
            )
            new_addition = matching_ips[0]
            assert new_addition.interface in config["network"]["ethernets"]
            new_nic_cfg = config["network"]["ethernets"][
                new_addition.interface
            ]
            assert "routing-policy" in new_nic_cfg
            assert [{"from": secondary_priv_ip, "table": 101}] == new_nic_cfg[
                "routing-policy"
            ]

            assert len(ips_after_add) == len(ips_before) + 1

            # SSH over primary NIC works
            subp("nc -w 5 -zv " + instance_pub_ip + " 22", shell=True)

            # THE TEST: SSH over secondary NIC works
            subp("nc -w 5 -zv " + secondary_pub_ip + " 22", shell=True)

            # Remove new NIC
            client.instance.remove_network_interface(secondary_priv_ip)
            _wait_till_hotplug_complete(client, expected_runs=2)

            # SSH over primary NIC works
            subp("nc -w 1 -zv " + instance_pub_ip + " 22", shell=True)

            ips_after_remove = _get_ip_addr(client)
            assert len(ips_after_remove) == len(ips_before)
            assert secondary_priv_ip not in [ip.ip4 for ip in ips_after_remove]

            netplan_cfg = client.read_from_file(
                "/etc/netplan/50-cloud-init.yaml"
            )
            config = yaml.safe_load(netplan_cfg)
            assert new_addition.interface not in config["network"]["ethernets"]

            log_content = client.read_from_file("/var/log/cloud-init.log")
            verify_clean_log(log_content)
        finally:
            # Each cleanup step is suppressed on its own so that a failed
            # (or never performed) disassociation cannot prevent the
            # Elastic IP from being released.
            if association is not None:
                with contextlib.suppress(Exception):
                    ec2.disassociate_address(
                        AssociationId=association["AssociationId"]
                    )
            with contextlib.suppress(Exception):
                ec2.release_address(AllocationId=allocation["AllocationId"])
77 changes: 77 additions & 0 deletions tests/integration_tests/test_networking.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,16 @@
"""Networking-related tests."""
import contextlib

import pytest
import yaml

from cloudinit.subp import subp
from tests.integration_tests import random_mac_address
from tests.integration_tests.clouds import IntegrationCloud
from tests.integration_tests.instances import IntegrationInstance
from tests.integration_tests.integration_settings import PLATFORM
from tests.integration_tests.releases import CURRENT_RELEASE, NOBLE
from tests.integration_tests.util import verify_clean_log


def _add_dummy_bridge_to_netplan(client: IntegrationInstance):
Expand Down Expand Up @@ -254,3 +258,76 @@ def test_invalid_network_v2_netplan(session_cloud: IntegrationCloud):
"# E1: Invalid netplan schema. Error in network definition:"
" invalid boolean value 'badval" in annotate_out
)


@pytest.mark.skipif(PLATFORM != "ec2", reason="test is ec2 specific")
def test_ec2_multi_nic_reboot(setup_image, session_cloud: IntegrationCloud):
    """Tests that additional secondary NICs and secondary IPs on them are
    routable from non-local networks after a reboot event when network updates
    are configured on every boot.

    The test attaches a secondary NIC, assigns it an extra private IP,
    binds one Elastic IP to each private IP, reboots and then verifies
    that port 22 is reachable through every public IP. Every Elastic IP
    that was allocated is released at the end, regardless of which step
    failed.
    """
    ec2 = session_cloud.cloud_instance.client
    with session_cloud.launch(launch_kwargs={}, user_data=USER_DATA) as client:
        # Add secondary NIC
        secondary_priv_ip_0 = client.instance.add_network_interface()
        response = ec2.describe_network_interfaces(
            Filters=[
                {
                    "Name": "private-ip-address",
                    "Values": [secondary_priv_ip_0],
                },
            ],
        )
        nic_id = response["NetworkInterfaces"][0]["NetworkInterfaceId"]
        # Add secondary IP to secondary NIC
        assignment = ec2.assign_private_ip_addresses(
            NetworkInterfaceId=nic_id, SecondaryPrivateIpAddressCount=1
        )
        assert assignment["ResponseMetadata"]["HTTPStatusCode"] == 200
        secondary_priv_ip_1 = assignment["AssignedPrivateIpAddresses"][0][
            "PrivateIpAddress"
        ]

        # Track what was actually created so cleanup only touches real
        # resources and never skips one because an earlier step failed.
        allocations = []
        associations = []
        try:
            # Assign elastic IPs, one per private IP on the secondary NIC
            for priv_ip in (secondary_priv_ip_0, secondary_priv_ip_1):
                allocation = ec2.allocate_address(Domain="vpc")
                allocations.append(allocation)
                association = ec2.associate_address(
                    AllocationId=allocation["AllocationId"],
                    NetworkInterfaceId=nic_id,
                    PrivateIpAddress=priv_ip,
                )
                # Record before asserting so a bad status still gets
                # cleaned up.
                associations.append(association)
                assert (
                    association["ResponseMetadata"]["HTTPStatusCode"] == 200
                )
            secondary_pub_ip_0 = allocations[0]["PublicIp"]
            secondary_pub_ip_1 = allocations[1]["PublicIp"]

            # Reboot to update network config
            client.execute("cloud-init clean --logs")
            client.restart()

            # SSH over primary NIC works
            instance_pub_ip = client.instance.ip
            subp("nc -w 5 -zv " + instance_pub_ip + " 22", shell=True)

            # SSH over secondary NIC works
            subp("nc -w 5 -zv " + secondary_pub_ip_0 + " 22", shell=True)
            subp("nc -w 5 -zv " + secondary_pub_ip_1 + " 22", shell=True)

            log_content = client.read_from_file("/var/log/cloud-init.log")
            verify_clean_log(log_content)
        finally:
            # Suppress errors per call: one failing cleanup step must not
            # prevent the remaining Elastic IPs from being released.
            for association in associations:
                with contextlib.suppress(Exception):
                    ec2.disassociate_address(
                        AssociationId=association["AssociationId"]
                    )
            for allocation in allocations:
                with contextlib.suppress(Exception):
                    ec2.release_address(
                        AllocationId=allocation["AllocationId"]
                    )
4 changes: 4 additions & 0 deletions tests/integration_tests/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,10 @@ def verify_clean_log(log: str, ignore_deprecations: bool = True):
# Old Ubuntu cloud-images contain /etc/apt/sources.list
"WARNING]: Removing /etc/apt/sources.list to favor deb822 source"
" format",
# https://bugs.launchpad.net/ubuntu/+source/netplan.io/+bug/2041727
"WARNING]: Running ['netplan', 'apply'] resulted in stderr output: "
"WARNING:root:Cannot call Open vSwitch: ovsdb-server.service is not "
"running.",
]
traceback_texts = []
if "install canonical-livepatch" in log:
Expand Down

0 comments on commit 3326610

Please sign in to comment.