Skip to content

Commit

Permalink
feat(ec2): support instances with repeated device-number (#4799)
Browse files Browse the repository at this point in the history
Some instances, as p5 instances, can have multiple network cards and
repeated device-numbers within them, see [0,1].

Add support to properly render the network configuration associated with
them.

References:

[0] https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-eni.html#network-cards
[1] https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/p5-efa.html
  • Loading branch information
aciba90 authored and blackboxsw committed Feb 16, 2024
1 parent 19d9e07 commit aa98d4c
Show file tree
Hide file tree
Showing 4 changed files with 365 additions and 23 deletions.
82 changes: 71 additions & 11 deletions cloudinit/sources/DataSourceEc2.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
import logging
import os
import time
from typing import List
from typing import Dict, List

from cloudinit import dmi, net, sources
from cloudinit import url_helper as uhelp
Expand Down Expand Up @@ -885,6 +885,63 @@ def _collect_platform_data():
return data


def _build_nic_order(
macs_metadata: Dict[str, Dict], macs: List[str]
) -> Dict[str, int]:
"""
Builds a dictionary containing macs as keys nad nic orders as values,
taking into account `network-card` and `device-number` if present.
Note that the first NIC will be the primary NIC as it will be the one with
[network-card] == 0 and device-number == 0 if present.
@param macs_metadata: dictionary with mac address as key and contents like:
{"device-number": "0", "interface-id": "...", "local-ipv4s": ...}
@macs: list of macs to consider
@return: Dictionary with macs as keys and nic orders as values.
"""
nic_order: Dict[str, int] = {}
if len(macs) == 0 or len(macs_metadata) == 0:
return nic_order

valid_macs_metadata = filter(
# filter out nics without metadata (not a physical nic)
lambda mmd: mmd[1] is not None,
# filter by macs
map(lambda mac: (mac, macs_metadata.get(mac)), macs),
)

def _get_key_as_int_or(dikt, key, alt_value):
value = dikt.get(key, None)
if value is not None:
return int(value)
return alt_value

# Sort by (network_card, device_index) as some instances could have
# multiple network cards with repeated device indexes.
#
# On platforms where network-card and device-number are not present,
# as AliYun, the order will be by mac, as before the introduction of this
# function.
return {
mac: i
for i, (mac, _mac_metadata) in enumerate(
sorted(
valid_macs_metadata,
key=lambda mmd: (
_get_key_as_int_or(
mmd[1], "network-card", float("infinity")
),
_get_key_as_int_or(
mmd[1], "device-number", float("infinity")
),
),
)
)
}


def convert_ec2_metadata_network_config(
network_md,
distro,
Expand Down Expand Up @@ -933,13 +990,16 @@ def convert_ec2_metadata_network_config(
return netcfg
# Apply network config for all nics and any secondary IPv4/v6 addresses
is_netplan = distro.network_activator == activators.NetplanActivator
nic_idx = 0
for mac, nic_name in sorted(macs_to_nics.items()):
macs = sorted(macs_to_nics.keys())
nic_order = _build_nic_order(macs_metadata, macs)
for mac in macs:
nic_name = macs_to_nics[mac]
nic_metadata = macs_metadata.get(mac)
if not nic_metadata:
continue # Not a physical nic represented in metadata
nic_idx = int(nic_metadata.get("device-number", nic_idx))
# nic_idx + 1 to start route_metric at 100
nic_idx = nic_order[mac]
is_primary_nic = nic_idx == 0
# nic_idx + 1 to start route_metric at 100 (nic_idx is 0-indexed)
dhcp_override = {"route-metric": (nic_idx + 1) * 100}
dev_config = {
"dhcp4": True,
Expand All @@ -958,7 +1018,11 @@ def convert_ec2_metadata_network_config(
# If device-number is not present (AliYun or other ec2-like platforms),
# do not configure source-routing as we cannot determine which is the
# primary NIC.
if is_netplan and nic_metadata.get("device-number") and nic_idx > 0:
if (
is_netplan
and nic_metadata.get("device-number")
and not is_primary_nic
):
dhcp_override["use-routes"] = True
table = 100 + nic_idx
dev_config["routes"] = []
Expand Down Expand Up @@ -1016,7 +1080,7 @@ def convert_ec2_metadata_network_config(
if (
is_netplan
and nic_metadata.get("device-number")
and nic_idx > 0
and not is_primary_nic
):
table = 100 + nic_idx
subnet_prefix_routes = nic_metadata["subnet-ipv6-cidr-block"]
Expand Down Expand Up @@ -1048,10 +1112,6 @@ def convert_ec2_metadata_network_config(
dev_config.pop("addresses") # Since we found none configured

netcfg["ethernets"][nic_name] = dev_config

# Advance nic_idx on platforms without device-number
if not nic_metadata.get("device-number"):
nic_idx += 1
# Remove route-metric dhcp overrides and routes / routing-policy if only
# one nic configured
if len(netcfg["ethernets"]) == 1:
Expand Down
28 changes: 16 additions & 12 deletions tests/integration_tests/clouds.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,20 +209,24 @@ def _get_initial_image(self, **kwargs) -> str:
)

def _perform_launch(
self, *, launch_kwargs, wait=True, **kwargs
self, *, launch_kwargs, wait=True, enable_ipv6=True, **kwargs
) -> EC2Instance:
"""Use a dual-stack VPC for cloud-init integration testing."""
if "vpc" not in launch_kwargs:
launch_kwargs["vpc"] = self.cloud_instance.get_or_create_vpc(
name="ec2-cloud-init-integration"
)
# Enable IPv6 metadata at http://[fd00:ec2::254]
if "Ipv6AddressCount" not in launch_kwargs:
launch_kwargs["Ipv6AddressCount"] = 1
if "MetadataOptions" not in launch_kwargs:
launch_kwargs["MetadataOptions"] = {}
if "HttpProtocolIpv6" not in launch_kwargs["MetadataOptions"]:
launch_kwargs["MetadataOptions"] = {"HttpProtocolIpv6": "enabled"}
if enable_ipv6:
if "vpc" not in launch_kwargs:
launch_kwargs["vpc"] = self.cloud_instance.get_or_create_vpc(
name="ec2-cloud-init-integration"
)

# Enable IPv6 metadata at http://[fd00:ec2::254]
if "Ipv6AddressCount" not in launch_kwargs:
launch_kwargs["Ipv6AddressCount"] = 1
if "MetadataOptions" not in launch_kwargs:
launch_kwargs["MetadataOptions"] = {}
if "HttpProtocolIpv6" not in launch_kwargs["MetadataOptions"]:
launch_kwargs["MetadataOptions"] = {
"HttpProtocolIpv6": "enabled"
}

pycloudlib_instance = self.cloud_instance.launch(**launch_kwargs)
self._maybe_wait(pycloudlib_instance, wait)
Expand Down
140 changes: 140 additions & 0 deletions tests/integration_tests/test_networking.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Networking-related tests."""
import contextlib
import json

import pytest
import yaml
Expand Down Expand Up @@ -364,3 +365,142 @@ def test_ec2_multi_nic_reboot(setup_image, session_cloud: IntegrationCloud):
)
with contextlib.suppress(Exception):
ec2.release_address(AllocationId=allocation_1["AllocationId"])


@pytest.mark.adhoc # costly instance not available in all regions / azs
@pytest.mark.skipif(PLATFORM != "ec2", reason="test is ec2 specific")
def test_ec2_multi_network_cards(setup_image, session_cloud: IntegrationCloud):
"""
Tests that with an interface type with multiple network cards (non unique
device indexes).
https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-eni.html
https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/p5-efa.html
"""
ec2 = session_cloud.cloud_instance.client

vpc = session_cloud.cloud_instance.get_or_create_vpc(
name="ec2-cloud-init-integration"
)
[subnet_id] = [s.id for s in vpc.vpc.subnets.all()]
security_group_ids = [sg.id for sg in vpc.vpc.security_groups.all()]

launch_kwargs = {
"InstanceType": "p5.48xlarge",
"NetworkInterfaces": [
{
"NetworkCardIndex": 0,
"DeviceIndex": 0,
"InterfaceType": "efa",
"DeleteOnTermination": True,
"Groups": security_group_ids,
"SubnetId": subnet_id,
},
{
"NetworkCardIndex": 1,
"DeviceIndex": 1,
"InterfaceType": "efa",
"DeleteOnTermination": True,
"Groups": security_group_ids,
"SubnetId": subnet_id,
},
{
"NetworkCardIndex": 2,
"DeviceIndex": 1,
"InterfaceType": "efa",
"DeleteOnTermination": True,
"Groups": security_group_ids,
"SubnetId": subnet_id,
},
],
}
# Instances with this network setups do not get a public ip.
# Do not wait until we associate one to the primary interface so that we
# can interact with it.
with session_cloud.launch(
launch_kwargs=launch_kwargs,
user_data=USER_DATA,
enable_ipv6=False,
wait=False,
) as client:
client.instance._instance.wait_until_running(
Filters=[
{
"Name": "instance-id",
"Values": [client.instance.id],
}
]
)

network_interfaces = iter(
ec2.describe_network_interfaces(
Filters=[
{
"Name": "attachment.instance-id",
"Values": [client.instance.id],
}
]
)["NetworkInterfaces"]
)
nic_id_0 = next(network_interfaces)["NetworkInterfaceId"]

try:
allocation_0 = ec2.allocate_address(Domain="vpc")
association_0 = ec2.associate_address(
AllocationId=allocation_0["AllocationId"],
NetworkInterfaceId=nic_id_0,
)
assert association_0["ResponseMetadata"]["HTTPStatusCode"] == 200

result = client.execute(
"cloud-init query ds.meta-data.network.interfaces.macs"
)
assert result.ok, result.stderr
for _macs, net_metadata in json.load(result.stdout):
assert "network-card" in net_metadata

nic_id_1 = next(network_interfaces)["NetworkInterfaceId"]
allocation_1 = ec2.allocate_address(Domain="vpc")
association_1 = ec2.associate_address(
AllocationId=allocation_1["AllocationId"],
NetworkInterfaceId=nic_id_1,
)
assert association_1["ResponseMetadata"]["HTTPStatusCode"] == 200

nic_id_2 = next(network_interfaces)["NetworkInterfaceId"]
allocation_2 = ec2.allocate_address(Domain="vpc")
association_2 = ec2.associate_address(
AllocationId=allocation_2["AllocationId"],
NetworkInterfaceId=nic_id_2,
)
assert association_2["ResponseMetadata"]["HTTPStatusCode"] == 200

# Reboot to update network config
client.execute("cloud-init clean --logs")
client.restart()

log_content = client.read_from_file("/var/log/cloud-init.log")
verify_clean_log(log_content)

# SSH over secondary NICs works
subp("nc -w 5 -zv " + allocation_1["PublicIp"] + " 22", shell=True)
subp("nc -w 5 -zv " + allocation_2["PublicIp"] + " 22", shell=True)
finally:
with contextlib.suppress(Exception):
ec2.disassociate_address(
AssociationId=association_0["AssociationId"]
)
with contextlib.suppress(Exception):
ec2.release_address(AllocationId=allocation_0["AllocationId"])
with contextlib.suppress(Exception):
ec2.disassociate_address(
AssociationId=association_1["AssociationId"]
)
with contextlib.suppress(Exception):
ec2.release_address(AllocationId=allocation_1["AllocationId"])
with contextlib.suppress(Exception):
ec2.disassociate_address(
AssociationId=association_2["AssociationId"]
)
with contextlib.suppress(Exception):
ec2.release_address(AllocationId=allocation_2["AllocationId"])

0 comments on commit aa98d4c

Please sign in to comment.