Skip to content

Commit

Permalink
feat(ec2): support multi NIC/IP setups (#4799)
Browse files Browse the repository at this point in the history
For EC2 instances with multiple NICs, policy-based routing will be
configured on secondary NICs / secondary IPs to ensure outgoing packets
are routed via the correct interface.

Without this extra routing config, traffic coming via secondary NICs
was routed using the main routing table, which can only contain one
default route, and the kernel only takes the destination IP address into
account when selecting a route.  Packets for destinations
beyond local networks were always routed through the default route, the
one associated with the primary NIC.  If traffic from specific
source IP addresses is associated with another NIC, then without these
routing policies this traffic would flow over the default route and the
connection couldn't be established.

References:

[1] https://bootstack.canonical.com/cases/00336928
[2] https://bootstack.canonical.com/cases/00377150
  • Loading branch information
aciba90 authored and blackboxsw committed Feb 16, 2024
1 parent 9f602d5 commit 717f6e1
Show file tree
Hide file tree
Showing 7 changed files with 356 additions and 19 deletions.
122 changes: 114 additions & 8 deletions cloudinit/sources/DataSourceEc2.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from cloudinit import url_helper as uhelp
from cloudinit import util, warnings
from cloudinit.event import EventScope, EventType
from cloudinit.net import activators
from cloudinit.net.dhcp import NoDHCPLeaseError
from cloudinit.net.ephemeral import EphemeralIPNetwork
from cloudinit.sources.helpers import ec2
Expand Down Expand Up @@ -55,7 +56,6 @@ def skip_404_tag_errors(exception):


class DataSourceEc2(sources.DataSource):

dsname = "Ec2"
# Default metadata urls that will be used if none are provided
# They will be checked for 'resolveability' and some of the
Expand Down Expand Up @@ -402,7 +402,7 @@ def device_name_to_device(self, name):
LOG.debug("block-device-mapping not a dictionary: '%s'", bdm)
return None

for (entname, device) in bdm.items():
for entname, device in bdm.items():
if entname == name:
found = device
break
Expand Down Expand Up @@ -508,6 +508,7 @@ def network_config(self):
# behavior on those releases.
result = convert_ec2_metadata_network_config(
net_md,
self.distro,
fallback_nic=iface,
full_network_config=util.get_cfg_option_bool(
self.ds_cfg, "apply_full_imds_network_config", True
Expand Down Expand Up @@ -872,14 +873,19 @@ def _collect_platform_data():


def convert_ec2_metadata_network_config(
network_md, macs_to_nics=None, fallback_nic=None, full_network_config=True
network_md,
distro,
macs_to_nics=None,
fallback_nic=None,
full_network_config=True,
):
"""Convert ec2 metadata to network config version 2 data dict.
@param: network_md: 'network' portion of EC2 metadata.
generally formed as {"interfaces": {"macs": {}} where
'macs' is a dictionary with mac address as key and contents like:
{"device-number": "0", "interface-id": "...", "local-ipv4s": ...}
@param: distro: instance of Distro.
@param: macs_to_nics: Optional dict of mac addresses and nic names. If
not provided, get_interfaces_by_mac is called to get it from the OS.
@param: fallback_nic: Optionally provide the primary nic interface name.
Expand Down Expand Up @@ -913,34 +919,134 @@ def convert_ec2_metadata_network_config(
netcfg["ethernets"][nic_name] = dev_config
return netcfg
# Apply network config for all nics and any secondary IPv4/v6 addresses
is_netplan = distro.network_activator == activators.NetplanActivator
nic_idx = 0
for mac, nic_name in sorted(macs_to_nics.items()):
nic_metadata = macs_metadata.get(mac)
if not nic_metadata:
continue # Not a physical nic represented in metadata
# device-number is zero-indexed, we want it 1-indexed for the
# multiplication on the following line
nic_idx = int(nic_metadata.get("device-number", nic_idx)) + 1
dhcp_override = {"route-metric": nic_idx * 100}
nic_idx = int(nic_metadata.get("device-number", nic_idx))
# nic_idx + 1 to start route_metric at 100
dhcp_override = {"route-metric": (nic_idx + 1) * 100}
dev_config = {
"dhcp4": True,
"dhcp4-overrides": dhcp_override,
"dhcp6": False,
"match": {"macaddress": mac.lower()},
"set-name": nic_name,
}
# Configure policy-based routing on secondary NICs / secondary IPs to
# ensure outgoing packets are routed via the correct interface.
#
# This config only works on systems using Netplan because Networking
# config V2 does not support `routing-policy`, but this config is
# passed through on systems using Netplan.
#
# If device-number is not present (AliYun or other ec2-like platforms),
# do not configure source-routing as we cannot determine which is the
# primary NIC.
if is_netplan and nic_metadata.get("device-number") and nic_idx > 0:
dhcp_override["use-routes"] = True
table = 100 + nic_idx
dev_config["routes"] = []
try:
lease = distro.dhcp_client.dhcp_discovery(
nic_name, distro=distro
)
gateway = lease["routers"]
except NoDHCPLeaseError as e:
LOG.warning(
"Could not perform dhcp discovery on %s to find its "
"gateway. Not adding default route via the gateway. "
"Error: %s",
nic_name,
e,
)
else:
# Add default route via the NIC's gateway
dev_config["routes"].append(
{
"to": "0.0.0.0/0",
"via": gateway,
"table": table,
},
)
subnet_prefix_routes = nic_metadata["subnet-ipv4-cidr-block"]
subnet_prefix_routes = (
[subnet_prefix_routes]
if isinstance(subnet_prefix_routes, str)
else subnet_prefix_routes
)
for prefix_route in subnet_prefix_routes:
dev_config["routes"].append(
{
"to": prefix_route,
"table": table,
},
)

dev_config["routing-policy"] = []
# Packets coming from any IPv4 associated with the current NIC
# will be routed using `table` routing table
ipv4s = nic_metadata["local-ipv4s"]
ipv4s = [ipv4s] if isinstance(ipv4s, str) else ipv4s
for ipv4 in ipv4s:
dev_config["routing-policy"].append(
{
"from": ipv4,
"table": table,
},
)
if nic_metadata.get("ipv6s"): # Any IPv6 addresses configured
dev_config["dhcp6"] = True
dev_config["dhcp6-overrides"] = dhcp_override
if (
is_netplan
and nic_metadata.get("device-number")
and nic_idx > 0
):
table = 100 + nic_idx
subnet_prefix_routes = nic_metadata["subnet-ipv6-cidr-block"]
subnet_prefix_routes = (
[subnet_prefix_routes]
if isinstance(subnet_prefix_routes, str)
else subnet_prefix_routes
)
for prefix_route in subnet_prefix_routes:
dev_config["routes"].append(
{
"to": prefix_route,
"table": table,
},
)

dev_config["routing-policy"] = []
ipv6s = nic_metadata["ipv6s"]
ipv6s = [ipv6s] if isinstance(ipv6s, str) else ipv6s
for ipv6 in ipv6s:
dev_config["routing-policy"].append(
{
"from": ipv6,
"table": table,
},
)
dev_config["addresses"] = get_secondary_addresses(nic_metadata, mac)
if not dev_config["addresses"]:
dev_config.pop("addresses") # Since we found none configured

netcfg["ethernets"][nic_name] = dev_config
# Remove route-metric dhcp overrides if only one nic configured

# Advance nic_idx on platforms without device-number
if not nic_metadata.get("device-number"):
nic_idx += 1
# Remove route-metric dhcp overrides and routes / routing-policy if only
# one nic configured
if len(netcfg["ethernets"]) == 1:
for nic_name in netcfg["ethernets"].keys():
netcfg["ethernets"][nic_name].pop("dhcp4-overrides")
netcfg["ethernets"][nic_name].pop("dhcp6-overrides", None)
netcfg["ethernets"][nic_name].pop("routes", None)
netcfg["ethernets"][nic_name].pop("routing-policy", None)
return netcfg


Expand Down
7 changes: 7 additions & 0 deletions doc/rtd/reference/datasources/ec2.rst
Original file line number Diff line number Diff line change
Expand Up @@ -150,4 +150,11 @@ Notes
For example: the primary NIC will have a DHCP route-metric of 100,
the next NIC will have 200.

* For EC2 instances with multiple NICs, policy-based routing will be
configured on secondary NICs / secondary IPs to ensure outgoing packets
are routed via the correct interface.
This network configuration is applied only on distros using Netplan and,
by default, only at first boot. It can be configured to be applied on
every boot and when NICs are hotplugged, see :ref:`events`.

.. _EC2 tags user guide: https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/Using_Tags.html#work-with-tags-in-IMDS
95 changes: 95 additions & 0 deletions tests/integration_tests/modules/test_hotplug.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,16 @@
import contextlib
import time
from collections import namedtuple

import pytest
import yaml

from cloudinit.subp import subp
from tests.integration_tests.clouds import IntegrationCloud
from tests.integration_tests.instances import IntegrationInstance
from tests.integration_tests.integration_settings import PLATFORM
from tests.integration_tests.releases import CURRENT_RELEASE, FOCAL
from tests.integration_tests.util import verify_clean_log

USER_DATA = """\
#cloud-config
Expand Down Expand Up @@ -124,3 +128,94 @@ def test_no_hotplug_in_userdata(client: IntegrationInstance):
assert "disabled" == client.execute(
"cloud-init devel hotplug-hook -s net query"
)


@pytest.mark.skipif(PLATFORM != "ec2", reason="test is ec2 specific")
def test_multi_nic_hotplug(setup_image, session_cloud: IntegrationCloud):
    """Verify a hotplugged secondary NIC is reachable from non-local networks.

    Launches an instance with ``USER_DATA``, attaches a secondary NIC,
    binds an Elastic IP to it and, once the hotplug hook has completed,
    checks that:

    * the rendered netplan config holds a ``routing-policy`` entry that
      sends traffic from the NIC's private IP to table 101,
    * port 22 answers on both the primary and the secondary public IP,
    * removing the NIC drops it from the address list and from the
      netplan config, while the primary NIC stays reachable.

    Cleanup of the Elastic IP (disassociate + release) is best-effort.
    """
    ec2 = session_cloud.cloud_instance.client
    with session_cloud.launch(launch_kwargs={}, user_data=USER_DATA) as client:
        ips_before = _get_ip_addr(client)
        instance_pub_ip = client.instance.ip
        secondary_priv_ip = client.instance.add_network_interface()
        response = ec2.describe_network_interfaces(
            Filters=[
                {
                    "Name": "private-ip-address",
                    "Values": [secondary_priv_ip],
                },
            ],
        )
        nic_id = response["NetworkInterfaces"][0]["NetworkInterfaceId"]

        # Create Elastic IP
        # Refactor after https://github.com/canonical/pycloudlib/issues/337 is
        # completed
        allocation = ec2.allocate_address(Domain="vpc")
        # Stays None if associate_address() fails, so the cleanup below can
        # tell "nothing to disassociate" apart from a real cleanup error.
        association = None
        try:
            secondary_pub_ip = allocation["PublicIp"]
            association = ec2.associate_address(
                AllocationId=allocation["AllocationId"],
                NetworkInterfaceId=nic_id,
            )
            assert association["ResponseMetadata"]["HTTPStatusCode"] == 200

            _wait_till_hotplug_complete(client)

            log_content = client.read_from_file("/var/log/cloud-init.log")
            verify_clean_log(log_content)

            ips_after_add = _get_ip_addr(client)

            netplan_cfg = client.read_from_file(
                "/etc/netplan/50-cloud-init.yaml"
            )
            config = yaml.safe_load(netplan_cfg)
            matches = [
                ip for ip in ips_after_add if ip.ip4 == secondary_priv_ip
            ]
            # Fail with a readable message rather than a bare IndexError
            assert matches, (
                f"{secondary_priv_ip} not found on any interface after "
                "hotplug"
            )
            new_addition = matches[0]
            assert new_addition.interface in config["network"]["ethernets"]
            new_nic_cfg = config["network"]["ethernets"][
                new_addition.interface
            ]
            assert "routing-policy" in new_nic_cfg
            assert [{"from": secondary_priv_ip, "table": 101}] == new_nic_cfg[
                "routing-policy"
            ]

            assert len(ips_after_add) == len(ips_before) + 1

            # Probes are passed as argv lists so that no shell is involved
            # in building the command line.

            # SSH over primary NIC works
            subp(["nc", "-w", "5", "-zv", instance_pub_ip, "22"])

            # THE TEST: SSH over secondary NIC works
            subp(["nc", "-w", "5", "-zv", secondary_pub_ip, "22"])

            # Remove new NIC
            client.instance.remove_network_interface(secondary_priv_ip)
            _wait_till_hotplug_complete(client, expected_runs=2)

            # SSH over primary NIC works
            subp(["nc", "-w", "1", "-zv", instance_pub_ip, "22"])

            ips_after_remove = _get_ip_addr(client)
            assert len(ips_after_remove) == len(ips_before)
            assert secondary_priv_ip not in [ip.ip4 for ip in ips_after_remove]

            netplan_cfg = client.read_from_file(
                "/etc/netplan/50-cloud-init.yaml"
            )
            config = yaml.safe_load(netplan_cfg)
            assert new_addition.interface not in config["network"]["ethernets"]

            log_content = client.read_from_file("/var/log/cloud-init.log")
            verify_clean_log(log_content)
        finally:
            # Best-effort cleanup: errors here must not mask a test failure
            if association is not None:
                with contextlib.suppress(Exception):
                    ec2.disassociate_address(
                        AssociationId=association["AssociationId"]
                    )
            with contextlib.suppress(Exception):
                ec2.release_address(AllocationId=allocation["AllocationId"])

0 comments on commit 717f6e1

Please sign in to comment.