diff --git a/cli/src/pcluster/aws/aws_resources.py b/cli/src/pcluster/aws/aws_resources.py
index 13de2a6f9c..92a6ab5759 100644
--- a/cli/src/pcluster/aws/aws_resources.py
+++ b/cli/src/pcluster/aws/aws_resources.py
@@ -571,6 +571,10 @@ def instance_type(self):
         """Return the instance type associated to the Capacity Reservation."""
         return self.capacity_reservation_data.get("InstanceType")
 
+    def instance_platform(self):
+        """Return the instance platform associated to the Capacity Reservation."""
+        return self.capacity_reservation_data.get("InstancePlatform")
+
     def availability_zone(self):
         """Return the availability zone associated to the Capacity Reservation."""
         return self.capacity_reservation_data.get("AvailabilityZone")
diff --git a/cli/src/pcluster/config/cluster_config.py b/cli/src/pcluster/config/cluster_config.py
index b96cdbb8db..cdac31d589 100644
--- a/cli/src/pcluster/config/cluster_config.py
+++ b/cli/src/pcluster/config/cluster_config.py
@@ -3113,6 +3113,7 @@ def _register_validators(self, context: ValidatorContext = None):  # noqa: C901
                             is_flexible=compute_resource.is_flexible(),
                             subnet=queue.networking.subnet_ids[0],
                             capacity_type=queue.capacity_type,
+                            os=self.image.os,
                         )
                         self._register_validator(
                             CapacityReservationResourceGroupValidator,
diff --git a/cli/src/pcluster/constants.py b/cli/src/pcluster/constants.py
index 7eec8e01a9..aefdc983bd 100644
--- a/cli/src/pcluster/constants.py
+++ b/cli/src/pcluster/constants.py
@@ -49,6 +49,25 @@
 SLURM = "slurm"
 AWSBATCH = "awsbatch"
 
+
+# Capacity Reservation Platform types we support.
+CR_PLATFORM_LINUX_UNIX = "Linux/UNIX"
+CR_PLATFORM_UBUNTU_PRO = "Ubuntu Pro"
+CR_PLATFORM_RHEL = "Red Hat Enterprise Linux"
+
+
+CAPACITY_RESERVATION_OS_MAP = {
+    "alinux2": CR_PLATFORM_LINUX_UNIX,
+    "alinux2023": CR_PLATFORM_LINUX_UNIX,
+    "ubuntu2004": CR_PLATFORM_UBUNTU_PRO,
+    "ubuntu2204": CR_PLATFORM_UBUNTU_PRO,
+    "ubuntu2404": CR_PLATFORM_UBUNTU_PRO,
+    "rhel8": CR_PLATFORM_RHEL,
+    "rocky8": CR_PLATFORM_RHEL,
+    "rhel9": CR_PLATFORM_RHEL,
+    "rocky9": CR_PLATFORM_RHEL,
+}
+
 OS_MAPPING = {
     "alinux2": {"user": "ec2-user"},
     "alinux2023": {"user": "ec2-user"},
diff --git a/cli/src/pcluster/validators/ec2_validators.py b/cli/src/pcluster/validators/ec2_validators.py
index aa945f7cb0..068e080a03 100644
--- a/cli/src/pcluster/validators/ec2_validators.py
+++ b/cli/src/pcluster/validators/ec2_validators.py
@@ -19,7 +19,11 @@
 from pcluster.aws.aws_resources import CapacityReservationInfo
 from pcluster.aws.common import AWSClientError
 from pcluster.config.common import CapacityType
-from pcluster.constants import NVIDIA_OPENRM_UNSUPPORTED_INSTANCE_TYPES, UNSUPPORTED_OSES_FOR_MICRO_NANO
+from pcluster.constants import (
+    CAPACITY_RESERVATION_OS_MAP,
+    NVIDIA_OPENRM_UNSUPPORTED_INSTANCE_TYPES,
+    UNSUPPORTED_OSES_FOR_MICRO_NANO,
+)
 from pcluster.utils import get_resource_name_from_resource_arn
 from pcluster.validators.common import FailureLevel, Validator
 
@@ -331,9 +335,21 @@ def _validate(
         is_flexible: bool,
         subnet: str,
         capacity_type: CapacityType,
+        os: str,
     ):
         if capacity_reservation_id:
             capacity_reservation = AWSApi.instance().ec2.describe_capacity_reservations([capacity_reservation_id])[0]
+            cr_platform = capacity_reservation.instance_platform()
+            expected_platform = CAPACITY_RESERVATION_OS_MAP.get(os)
+            # An OS without a platform mapping cannot be checked: skip it rather than
+            # fail with a misleading "platform None" message.
+            if expected_platform is not None and cr_platform != expected_platform:
+                self._add_failure(
+                    f"Capacity reservation {capacity_reservation_id} has platform {cr_platform},"
+                    f" which is not compatible with the cluster OS {os}. "
+                    f"Please use a reservation with platform {expected_platform}.",
+                    FailureLevel.ERROR,
+                )
 
             if not instance_types:
                 # If the instance type doesn't exist, this is an invalid config,
diff --git a/cli/tests/pcluster/validators/test_all_validators.py b/cli/tests/pcluster/validators/test_all_validators.py
index 0c0b462398..9e5c413bab 100644
--- a/cli/tests/pcluster/validators/test_all_validators.py
+++ b/cli/tests/pcluster/validators/test_all_validators.py
@@ -398,6 +398,7 @@ def test_slurm_validators_are_called_with_correct_argument(test_datadir, mocker)
             call(
                 capacity_reservation_id="cr-34567",
                 instance_types=["t3.large"],
+                os="alinux2",
                 is_flexible=True,
                 subnet="subnet-23456789",
                 capacity_type=CapacityType.ONDEMAND,
@@ -405,6 +406,7 @@ def test_slurm_validators_are_called_with_correct_argument(test_datadir, mocker)
             call(
                 capacity_reservation_id="cr-12345",
                 instance_types=["t3.xlarge"],
+                os="alinux2",
                 is_flexible=True,
                 subnet="subnet-23456789",
                 capacity_type=CapacityType.CAPACITY_BLOCK,
@@ -412,6 +414,7 @@ def test_slurm_validators_are_called_with_correct_argument(test_datadir, mocker)
             call(
                 capacity_reservation_id="cr-23456",
                 instance_types=["t3.xlarge"],
+                os="alinux2",
                 is_flexible=False,
                 subnet="subnet-23456789",
                 capacity_type=CapacityType.CAPACITY_BLOCK,
diff --git a/cli/tests/pcluster/validators/test_ec2_validators.py b/cli/tests/pcluster/validators/test_ec2_validators.py
index 57cb47077f..46c5ccc701 100644
--- a/cli/tests/pcluster/validators/test_ec2_validators.py
+++ b/cli/tests/pcluster/validators/test_ec2_validators.py
@@ -708,62 +708,93 @@ def test_placement_group_validator(
         "is_flexible",
         "subnet_availability_zone",
         "capacity_type",
+        "os",
         "expected_messages",
     ),
     [
         (
-            CapacityReservationInfo({"InstanceType": "c5.xlarge", "AvailabilityZone": "us-east-1a"}),
+            CapacityReservationInfo(
+                {"InstanceType": "c5.xlarge", "AvailabilityZone": "us-east-1a", "InstancePlatform": "Linux/UNIX"}
+            ),
             ["c5.xlarge"],
             False,
             "us-east-1a",
             None,
+            "alinux2",
             [],
         ),
         # Wrong instance type
         (
-            CapacityReservationInfo({"InstanceType": "m5.xlarge", "AvailabilityZone": "us-east-1a"}),
+            CapacityReservationInfo(
+                {"InstanceType": "m5.xlarge", "AvailabilityZone": "us-east-1a", "InstancePlatform": "Linux/UNIX"}
+            ),
             ["c5.xlarge"],
             False,
             "us-east-1a",
             CapacityType.ONDEMAND,
+            "alinux2023",
             ["Capacity reservation .* must have the same instance type as c5.xlarge."],
         ),
         # Wrong availability zone
         (
-            CapacityReservationInfo({"InstanceType": "c5.xlarge", "AvailabilityZone": "us-east-1b"}),
+            CapacityReservationInfo(
+                {"InstanceType": "c5.xlarge", "AvailabilityZone": "us-east-1b", "InstancePlatform": "Linux/UNIX"}
+            ),
             ["c5.xlarge"],
             False,
             "us-east-1a",
             CapacityType.SPOT,
-            ["Capacity reservation .* must use the same availability zone as subnet"],
+            "ubuntu2404",
+            [
+                "Capacity reservation .* has platform Linux/UNIX, which is not compatible with "
+                + "the cluster OS ubuntu2404. Please use a reservation with platform Ubuntu Pro.",
+                "Capacity reservation .* must use the same availability zone as subnet",
+            ],
         ),
         # Both instance type and availability zone are wrong
         (
-            CapacityReservationInfo({"InstanceType": "m5.xlarge", "AvailabilityZone": "us-east-1b"}),
+            CapacityReservationInfo(
+                {"InstanceType": "m5.xlarge", "AvailabilityZone": "us-east-1b", "InstancePlatform": "Ubuntu Pro"}
+            ),
             ["c5.xlarge"],
             False,
             "us-east-1a",
             CapacityType.ONDEMAND,
+            "ubuntu2204",
             [
                 "Capacity reservation .* must have the same instance type as c5.xlarge.",
                 "Capacity reservation .* must use the same availability zone as subnet",
             ],
         ),
         (
-            CapacityReservationInfo({"InstanceType": "m5.xlarge", "AvailabilityZone": "us-east-1b"}),
+            CapacityReservationInfo(
+                {
+                    "InstanceType": "m5.xlarge",
+                    "AvailabilityZone": "us-east-1b",
+                    "InstancePlatform": "Red Hat Enterprise Linux",
+                }
+            ),
             ["c5.xlarge"],
             False,
             "us-east-1a",
             CapacityType.SPOT,
-            ["Capacity reservation .* must use the same availability zone as subnet"],
+            "ubuntu2004",
+            [
+                "Capacity reservation .* has platform Red Hat Enterprise Linux, which is not "
+                + "compatible with the cluster OS ubuntu2004. Please use a reservation with platform Ubuntu Pro.",
+                "Capacity reservation .* must use the same availability zone as subnet",
+            ],
         ),
         # empty instance type, this should not happen because instance type is automatically retrieved when usinc cr-id
         (
-            CapacityReservationInfo({"InstanceType": "m5.xlarge", "AvailabilityZone": "us-east-1b"}),
+            CapacityReservationInfo(
+                {"InstanceType": "m5.xlarge", "AvailabilityZone": "us-east-1b", "InstancePlatform": "Linux/UNIX"}
+            ),
             None,
             False,
             "us-east-1a",
             CapacityType.ONDEMAND,
+            "alinux2",
             [
                 "Unexpected failure. InstanceType parameter cannot be empty when using CapacityReservationId",
                 "Capacity reservation .* must use the same availability zone as subnet",
@@ -771,11 +802,18 @@ def test_placement_group_validator(
         ),
         # empty instance type, this should not happen because instance type is automatically retrieved when usinc cr-id
         (
-            CapacityReservationInfo({"InstanceType": "m5.xlarge", "AvailabilityZone": "us-east-1b"}),
+            CapacityReservationInfo(
+                {
+                    "InstanceType": "m5.xlarge",
+                    "AvailabilityZone": "us-east-1b",
+                    "InstancePlatform": "Red Hat Enterprise Linux",
+                }
+            ),
             "",
             False,
             "us-east-1a",
             CapacityType.SPOT,
+            "rocky9",
             [
                 "Unexpected failure. InstanceType parameter cannot be empty when using CapacityReservationId",
                 "Capacity reservation .* must use the same availability zone as subnet",
@@ -783,11 +821,18 @@ def test_placement_group_validator(
         ),
         # wrong capacity type
         (
-            CapacityReservationInfo({"InstanceType": "c5.xlarge", "AvailabilityZone": "us-east-1a"}),
+            CapacityReservationInfo(
+                {
+                    "InstanceType": "c5.xlarge",
+                    "AvailabilityZone": "us-east-1a",
+                    "InstancePlatform": "Red Hat Enterprise Linux",
+                }
+            ),
             ["c5.xlarge"],
             False,
             "us-east-1a",
             CapacityType.CAPACITY_BLOCK,
+            "rocky8",
             [
                 "Capacity reservation cr-123 is not a Capacity Block reservation. "
                 "It cannot be used when specifying CapacityType: CAPACITY_BLOCK."
@@ -795,22 +840,34 @@ def test_placement_group_validator(
         ),
         (
             CapacityReservationInfo(
-                {"InstanceType": "c5.xlarge", "AvailabilityZone": "us-east-1a", "ReservationType": "capacity-block"}
+                {
+                    "InstanceType": "c5.xlarge",
+                    "AvailabilityZone": "us-east-1a",
+                    "ReservationType": "capacity-block",
+                    "InstancePlatform": "Ubuntu Pro",
+                }
             ),
             ["c5.xlarge"],
             False,
             "us-east-1a",
             CapacityType.ONDEMAND,
+            "ubuntu2004",
             [],  # Do not check Ondemand capacity type
         ),
         (
             CapacityReservationInfo(
-                {"InstanceType": "c5.xlarge", "AvailabilityZone": "us-east-1a", "ReservationType": "ondemand"}
+                {
+                    "InstanceType": "c5.xlarge",
+                    "AvailabilityZone": "us-east-1a",
+                    "ReservationType": "ondemand",
+                    "InstancePlatform": "Linux/UNIX",
+                }
             ),
             ["c5.xlarge"],
             False,
             "us-east-1a",
             CapacityType.CAPACITY_BLOCK,
+            "alinux2",
             [
                 "Capacity reservation cr-123 is not a Capacity Block reservation. "
                 "It cannot be used when specifying CapacityType: CAPACITY_BLOCK."
@@ -819,43 +876,88 @@ def test_placement_group_validator(
         # right capacity type
         (
             CapacityReservationInfo(
-                {"InstanceType": "c5.xlarge", "AvailabilityZone": "us-east-1a", "ReservationType": "ondemand"}
+                {
+                    "InstanceType": "c5.xlarge",
+                    "AvailabilityZone": "us-east-1a",
+                    "ReservationType": "ondemand",
+                    "InstancePlatform": "Linux/UNIX",
+                }
             ),
             ["c5.xlarge"],
             False,
             "us-east-1a",
             CapacityType.ONDEMAND,
+            "alinux2023",
             [],
         ),
         (
             CapacityReservationInfo(
-                {"InstanceType": "c5.xlarge", "AvailabilityZone": "us-east-1a", "ReservationType": "capacity-block"}
+                {
+                    "InstanceType": "c5.xlarge",
+                    "AvailabilityZone": "us-east-1a",
+                    "ReservationType": "capacity-block",
+                    "InstancePlatform": "Linux/UNIX",
+                }
             ),
             ["c5.xlarge"],
             False,
             "us-east-1a",
             CapacityType.CAPACITY_BLOCK,
+            "alinux2",
             [],
         ),
+        (
+            CapacityReservationInfo(
+                {"InstanceType": "c5.xlarge", "AvailabilityZone": "us-east-1a", "InstancePlatform": "SUSE Linux"}
+            ),
+            ["c5.xlarge"],
+            False,
+            "us-east-1a",
+            CapacityType.ONDEMAND,
+            "alinux2",
+            [
+                "Capacity reservation .* has platform SUSE Linux, which is not compatible"
+                + " with the cluster OS alinux2. Please use a reservation with platform Linux/UNIX."
+            ],
+        ),
         # Flexible instance type, with a single instance and capacity_reservation_id
         (
-            CapacityReservationInfo({"InstanceType": "c5.xlarge", "AvailabilityZone": "us-east-1a"}),
+            CapacityReservationInfo(
+                {
+                    "InstanceType": "c5.xlarge",
+                    "AvailabilityZone": "us-east-1a",
+                    "InstancePlatform": "Linux/UNIX with HA",
+                }
+            ),
             ["c5.xlarge"],
             True,
             "us-east-1a",
             None,
-            ["CapacityReservationId parameter cannot be used with Instances parameter."],
+            "alinux2023",
+            [
+                "Capacity reservation .* has platform Linux/UNIX with HA, which is not compatible "
+                + "with the cluster OS alinux2023. Please use a reservation with platform Linux/UNIX.",
+                "CapacityReservationId parameter cannot be used with Instances parameter.",
+            ],
         ),
         # Flexible instance type with multiple instance types and capacity_reservation_id
         (
             CapacityReservationInfo(
-                {"InstanceType": "c5.xlarge", "AvailabilityZone": "us-east-1a", "ReservationType": "ondemand"}
+                {
+                    "InstanceType": "c5.xlarge",
+                    "AvailabilityZone": "us-east-1a",
+                    "ReservationType": "ondemand",
+                    "InstancePlatform": "Ubuntu Pro",
+                }
             ),
             ["c5.xlarge", "m5.2xlarge"],
             True,
             "us-east-1a",
             CapacityType.ONDEMAND,
-            ["CapacityReservationId parameter cannot be used with Instances parameter."],
+            "ubuntu2004",
+            [
+                "CapacityReservationId parameter cannot be used with Instances parameter.",
+            ],
         ),
     ],
 )
@@ -866,6 +968,7 @@ def test_capacity_reservation_validator(
     subnet_availability_zone,
     is_flexible,
     capacity_type,
+    os,
     expected_messages,
 ):
     mock_aws_api(mocker)
@@ -877,6 +980,7 @@ def test_capacity_reservation_validator(
         is_flexible=is_flexible,
         subnet="subnet-123",
         capacity_type=capacity_type,
+        os=os,
     )
     assert_failure_messages(actual_failures, expected_messages)
 