-
Notifications
You must be signed in to change notification settings - Fork 315
Change network interface setup logic to account for gb200 #6930
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
a59f9c9
a3df6c4
11c2917
30dcfef
1110ee8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -13,6 +13,7 @@ | |
DEFAULT_EPHEMERAL_DIR, | ||
NODE_BOOTSTRAP_TIMEOUT, | ||
OS_MAPPING, | ||
P6E_GB200, | ||
PCLUSTER_COMPUTE_RESOURCE_NAME_TAG, | ||
PCLUSTER_QUEUE_NAME_TAG, | ||
PCLUSTER_S3_ARTIFACTS_DICT, | ||
|
@@ -150,33 +151,7 @@ def _add_compute_resource_launch_template( | |
instance_profiles, | ||
is_detailed_monitoring_enabled, | ||
): | ||
# LT network interfaces | ||
compute_lt_nw_interfaces = [ | ||
ec2.CfnLaunchTemplate.NetworkInterfaceProperty( | ||
device_index=0, | ||
network_card_index=0, | ||
associate_public_ip_address=queue.networking.assign_public_ip, | ||
interface_type="efa" if compute_resource.efa and compute_resource.efa.enabled else None, | ||
groups=queue_lt_security_groups, | ||
subnet_id=( | ||
queue.networking.subnet_ids[0] if isinstance(compute_resource, SlurmComputeResource) else None | ||
), | ||
) | ||
] | ||
|
||
for network_card in compute_resource.network_cards_list[1:]: | ||
compute_lt_nw_interfaces.append( | ||
ec2.CfnLaunchTemplate.NetworkInterfaceProperty( | ||
device_index=0 if network_card.maximum_network_interfaces() == 1 else 1, | ||
network_card_index=network_card.network_card_index(), | ||
associate_public_ip_address=False, | ||
interface_type="efa" if compute_resource.efa and compute_resource.efa.enabled else None, | ||
groups=queue_lt_security_groups, | ||
subnet_id=( | ||
queue.networking.subnet_ids[0] if isinstance(compute_resource, SlurmComputeResource) else None | ||
), | ||
) | ||
) | ||
compute_lt_nw_interfaces = add_network_interfaces(queue, compute_resource, queue_lt_security_groups) | ||
|
||
conditional_template_properties = {} | ||
if compute_resource.is_ebs_optimized: | ||
|
@@ -385,3 +360,51 @@ def _add_compute_resource_launch_template( | |
) | ||
|
||
return launch_template | ||
|
||
|
||
def add_network_interfaces( | ||
queue, | ||
compute_resource, | ||
queue_lt_security_groups, | ||
): | ||
"""Generate launch template network interfaces list.""" | ||
is_gb200 = compute_resource.instance_types[0].split(".")[0] == P6E_GB200 | ||
efa_enabled = compute_resource.efa and compute_resource.efa.enabled | ||
interface_type = "efa" if efa_enabled and not is_gb200 else None | ||
|
||
compute_lt_nw_interfaces = [ | ||
ec2.CfnLaunchTemplate.NetworkInterfaceProperty( | ||
device_index=0, | ||
network_card_index=0, | ||
associate_public_ip_address=queue.networking.assign_public_ip, | ||
interface_type=interface_type, | ||
groups=queue_lt_security_groups, | ||
subnet_id=(queue.networking.subnet_ids[0] if isinstance(compute_resource, SlurmComputeResource) else None), | ||
) | ||
] | ||
|
||
for network_card in compute_resource.network_cards_list[1:]: | ||
even = network_card.network_card_index() % 2 == 0 | ||
# if efa is disabled, and we have a gb200 instance we skip configuring odd numbered indexes because they only | ||
# support efa-only interface type | ||
if is_gb200 and not efa_enabled and not even: | ||
continue | ||
|
||
interface_type = "efa" if efa_enabled else None | ||
# if efa is enabled with a gb200 instance, even indexes are configured as efa and the odd as efa-only | ||
if is_gb200 and efa_enabled: | ||
interface_type = "efa" if even else "efa-only" | ||
|
||
compute_lt_nw_interfaces.append( | ||
ec2.CfnLaunchTemplate.NetworkInterfaceProperty( | ||
device_index=0 if network_card.maximum_network_interfaces() == 1 else 1, | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is this true for p4d's too? Or for gb200? The maximum number of network interfaces on GB200 is 17, so the device index would be 1. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
This logic also works for all other instance types. |
||
network_card_index=network_card.network_card_index(), | ||
associate_public_ip_address=False, | ||
interface_type=interface_type, | ||
groups=queue_lt_security_groups, | ||
subnet_id=( | ||
queue.networking.subnet_ids[0] if isinstance(compute_resource, SlurmComputeResource) else None | ||
), | ||
) | ||
) | ||
return compute_lt_nw_interfaces |
Uh oh!
There was an error while loading. Please reload this page.