77 changes: 50 additions & 27 deletions cli/src/pcluster/templates/queues_stack.py
@@ -13,6 +13,7 @@
DEFAULT_EPHEMERAL_DIR,
NODE_BOOTSTRAP_TIMEOUT,
OS_MAPPING,
P6E_GB200,
PCLUSTER_COMPUTE_RESOURCE_NAME_TAG,
PCLUSTER_QUEUE_NAME_TAG,
PCLUSTER_S3_ARTIFACTS_DICT,
@@ -150,33 +151,7 @@ def _add_compute_resource_launch_template(
instance_profiles,
is_detailed_monitoring_enabled,
):
# LT network interfaces
compute_lt_nw_interfaces = [
ec2.CfnLaunchTemplate.NetworkInterfaceProperty(
device_index=0,
network_card_index=0,
associate_public_ip_address=queue.networking.assign_public_ip,
interface_type="efa" if compute_resource.efa and compute_resource.efa.enabled else None,
groups=queue_lt_security_groups,
subnet_id=(
queue.networking.subnet_ids[0] if isinstance(compute_resource, SlurmComputeResource) else None
),
)
]

for network_card in compute_resource.network_cards_list[1:]:
compute_lt_nw_interfaces.append(
ec2.CfnLaunchTemplate.NetworkInterfaceProperty(
device_index=0 if network_card.maximum_network_interfaces() == 1 else 1,
network_card_index=network_card.network_card_index(),
associate_public_ip_address=False,
interface_type="efa" if compute_resource.efa and compute_resource.efa.enabled else None,
groups=queue_lt_security_groups,
subnet_id=(
queue.networking.subnet_ids[0] if isinstance(compute_resource, SlurmComputeResource) else None
),
)
)
compute_lt_nw_interfaces = add_network_interfaces(queue, compute_resource, queue_lt_security_groups)

conditional_template_properties = {}
if compute_resource.is_ebs_optimized:
@@ -385,3 +360,51 @@ def _add_compute_resource_launch_template(
)

return launch_template


def add_network_interfaces(
queue,
compute_resource,
queue_lt_security_groups,
):
"""Generate launch template network interfaces list."""
is_gb200 = compute_resource.instance_types[0].split(".")[0] == P6E_GB200
efa_enabled = compute_resource.efa and compute_resource.efa.enabled
interface_type = "efa" if efa_enabled and not is_gb200 else None
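    # On GB200 the primary interface (network card 0) stays a plain ENI even when EFA
    # is enabled; efa and efa-only types are assigned per card in the loop below.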

compute_lt_nw_interfaces = [
ec2.CfnLaunchTemplate.NetworkInterfaceProperty(
device_index=0,
network_card_index=0,
associate_public_ip_address=queue.networking.assign_public_ip,
interface_type=interface_type,
groups=queue_lt_security_groups,
subnet_id=(queue.networking.subnet_ids[0] if isinstance(compute_resource, SlurmComputeResource) else None),
)
]

for network_card in compute_resource.network_cards_list[1:]:
even = network_card.network_card_index() % 2 == 0
        # If EFA is disabled on a GB200 instance, skip the odd-numbered card indexes,
        # since they only support the efa-only interface type.
if is_gb200 and not efa_enabled and not even:
continue

interface_type = "efa" if efa_enabled else None
        # If EFA is enabled on a GB200 instance, even card indexes are configured as
        # efa and odd ones as efa-only.
if is_gb200 and efa_enabled:
interface_type = "efa" if even else "efa-only"

compute_lt_nw_interfaces.append(
ec2.CfnLaunchTemplate.NetworkInterfaceProperty(
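                # Cards that expose a single network interface host it at device_index 0;
                # cards that expose two (even-indexed GB200 cards) use device_index 1.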
device_index=0 if network_card.maximum_network_interfaces() == 1 else 1,

Contributor:
Is this true for p4d's too? Or for GB200? The maximum network interfaces of GB200 is 17, so the device index would be 1.

Contributor Author:
MaximumNetworkInterfaces for a network card is 1 for odd card indexes and 2 for even ones; the maximum number of network cards is 17. This logic also works for all other instance types.

network_card_index=network_card.network_card_index(),
associate_public_ip_address=False,
interface_type=interface_type,
groups=queue_lt_security_groups,
subnet_id=(
queue.networking.subnet_ids[0] if isinstance(compute_resource, SlurmComputeResource) else None
),
)
)
return compute_lt_nw_interfaces
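
For reference, the GB200 branching above reduces to a small pure mapping from card index
and EFA flag to interface type. A minimal sketch (the gb200_interface_type helper is
illustrative only, not part of this PR), matching the parametrized expectations in the
test below:

def gb200_interface_type(card_index, efa_enabled):
    """Interface type that add_network_interfaces assigns to one GB200 network card."""
    if card_index == 0:
        # The primary GB200 interface is always a plain ENI, even with EFA enabled.
        return None
    if not efa_enabled:
        # Odd-indexed cards support only efa-only; without EFA they get no interface.
        return "skipped" if card_index % 2 else None
    return "efa" if card_index % 2 == 0 else "efa-only"

With EFA enabled, cards 0 through 4 map to None, efa-only, efa, efa-only, efa; with EFA
disabled, odd cards are skipped and the remaining even cards are plain ENIs.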
93 changes: 93 additions & 0 deletions cli/tests/pcluster/templates/test_queues_stack.py
@@ -1,11 +1,13 @@
import json
from unittest.mock import MagicMock

import pytest
from assertpy import assert_that
from freezegun import freeze_time

from pcluster.schemas.cluster_schema import ClusterSchema
from pcluster.templates.cdk_builder import CDKTemplateBuilder
from pcluster.templates.queues_stack import add_network_interfaces
from pcluster.utils import load_json_dict, load_yaml_dict
from tests.pcluster.aws.dummy_aws_api import mock_aws_api
from tests.pcluster.models.dummy_s3_bucket import dummy_cluster_bucket, mock_bucket_object_utils
@@ -152,6 +154,97 @@ def test_compute_nodes_dna_json(
assert_that(json.loads(compute_node_extra_json)).is_equal_to(expected_compute_node_extra_json)


class NetworkCard:
    """Minimal test double for the network card objects consumed by add_network_interfaces."""

def __init__(self, index, max_interfaces=1):
self._index = index
self._max_interfaces = max_interfaces

def network_card_index(self):
return self._index

def maximum_network_interfaces(self):
return self._max_interfaces


@pytest.mark.parametrize(
"efa_enabled, instance_type, network_cards_list, expected_interfaces",
[
(
True,
"p6e-gb200.WHATEVER_SIZE",
[NetworkCard(0), NetworkCard(1), NetworkCard(2, 2), NetworkCard(3), NetworkCard(4, 2)],
[
{"network_card_index": 0, "interface_type": None, "device_index": 0},
{"network_card_index": 1, "interface_type": "efa-only", "device_index": 0},
{"network_card_index": 2, "interface_type": "efa", "device_index": 1},
{"network_card_index": 3, "interface_type": "efa-only", "device_index": 0},
{"network_card_index": 4, "interface_type": "efa", "device_index": 1},
],
),
(
False,
"p6e-gb200.WHATEVER_SIZE",
[NetworkCard(0), NetworkCard(1), NetworkCard(2, 2), NetworkCard(3), NetworkCard(4, 2)],
[
{"network_card_index": 0, "interface_type": None, "device_index": 0},
{"network_card_index": 2, "interface_type": None, "device_index": 1},
{"network_card_index": 4, "interface_type": None, "device_index": 1},
],
),
(
True,
"NOTp6e-gb200.WHATEVER_SIZE",
[NetworkCard(0), NetworkCard(1, 2), NetworkCard(2, 2)],
[
{"network_card_index": 0, "interface_type": "efa", "device_index": 0},
{"network_card_index": 1, "interface_type": "efa", "device_index": 1},
{"network_card_index": 2, "interface_type": "efa", "device_index": 1},
],
),
(
True,
"NOTp6e-gb200.WHATEVER_SIZE",
[NetworkCard(0), NetworkCard(1), NetworkCard(2)],
[
{"network_card_index": 0, "interface_type": "efa", "device_index": 0},
{"network_card_index": 1, "interface_type": "efa", "device_index": 0},
{"network_card_index": 2, "interface_type": "efa", "device_index": 0},
],
),
(
False,
"NOTp6e-gb200.WHATEVER_SIZE",
[NetworkCard(0), NetworkCard(1, 2), NetworkCard(2, 2)],
[
{"network_card_index": 0, "interface_type": None, "device_index": 0},
{"network_card_index": 1, "interface_type": None, "device_index": 1},
{"network_card_index": 2, "interface_type": None, "device_index": 1},
],
),
],
)
def test_add_network_interfaces(
mocker, efa_enabled, instance_type, test_datadir, network_cards_list, expected_interfaces
):
mock_compute_resource = MagicMock()
mock_compute_resource.name = "test-compute-resource"
mock_compute_resource.instance_types = [instance_type]
mock_compute_resource.efa.enabled = efa_enabled
mock_compute_resource.network_cards_list = network_cards_list

mock_queue = MagicMock()
mock_queue.name = "test-queue"

network_interfaces = add_network_interfaces(mock_queue, mock_compute_resource, ["sg-12345"])

    assert_that(network_interfaces).is_length(len(expected_interfaces))

    for actual, expected in zip(network_interfaces, expected_interfaces):
        assert_that(actual.network_card_index).is_equal_to(expected["network_card_index"])
        assert_that(actual.interface_type).is_equal_to(expected["interface_type"])
        assert_that(actual.device_index).is_equal_to(expected["device_index"])
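
One subtlety in the mock setup above: an auto-created MagicMock attribute such as
mock_compute_resource.efa is always truthy, so the compute_resource.efa and
compute_resource.efa.enabled check inside add_network_interfaces is decided entirely by
the parametrized efa_enabled flag. A quick standalone illustration:

from unittest.mock import MagicMock

resource = MagicMock()
resource.efa.enabled = False
assert bool(resource.efa)                           # attribute mocks are truthy
assert not (resource.efa and resource.efa.enabled)  # the branch hinges on .enabled alone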


def render_join(elem: dict):
sep = str(elem[0])
body = elem[1]