From d5fe1b85b2dffb02d75b828baa4ee972c7c44ba8 Mon Sep 17 00:00:00 2001
From: Hanwen
Date: Fri, 13 Dec 2024 08:35:14 -0800
Subject: [PATCH] Add validators to check head node instance type and shared
 storage type w.r.t cluster size

The requirements set in these validators are minimum.
Users should leave more safety margin considering their workloads.

Signed-off-by: Hanwen
---
 cli/src/pcluster/config/cluster_config.py     | 16 ++++++++
 .../pcluster/validators/cluster_validators.py | 39 +++++++++++++++++++
 2 files changed, 55 insertions(+)

diff --git a/cli/src/pcluster/config/cluster_config.py b/cli/src/pcluster/config/cluster_config.py
index 42c7cfc16f..2053eb7a31 100644
--- a/cli/src/pcluster/config/cluster_config.py
+++ b/cli/src/pcluster/config/cluster_config.py
@@ -88,6 +88,7 @@
     FsxArchitectureOsValidator,
     HeadNodeImdsValidator,
     HeadNodeLaunchTemplateValidator,
+    HeadNodeMemorySizeValidator,
     HostedZoneValidator,
     InstanceArchitectureCompatibilityValidator,
     IntelHpcArchitectureValidator,
@@ -108,6 +109,7 @@
     SchedulerDisableSudoAccessForDefaultUserValidator,
     SchedulerOsValidator,
     SchedulerValidator,
+    SharedEbsPerformanceBottleNeckValidator,
     SharedFileCacheNotHomeValidator,
     SharedStorageMountDirValidator,
     SharedStorageNameValidator,
@@ -3030,6 +3032,7 @@ def _register_validators(self, context: ValidatorContext = None): # noqa: C901
         self._register_validator(MultiNetworkInterfacesInstancesValidator, queues=self.scheduling.queues)
         checked_images = []
         capacity_reservation_id_max_count_map = {}
+        total_max_compute_nodes = 0
         for index, queue in enumerate(self.scheduling.queues):
             queue_image = self.image_dict[queue.name]
             if index == 0:
@@ -3064,6 +3067,7 @@ def _register_validators(self, context: ValidatorContext = None): # noqa: C901
                 self._register_validator(AmiOsCompatibleValidator, os=self.image.os, image_id=queue_image)
 
             for compute_resource in queue.compute_resources:
+                total_max_compute_nodes += compute_resource.max_count
                 self._register_validator(
                     InstanceArchitectureCompatibilityValidator,
                     instance_type_info_list=list(compute_resource.instance_type_info_map.values()),
@@ -3180,6 +3184,18 @@ def _register_validators(self, context: ValidatorContext = None): # noqa: C901
                     compute_resource_tags=compute_resource.get_tags(),
                 )
 
+        self._register_validator(
+            HeadNodeMemorySizeValidator,
+            head_node_instance_type=self.head_node.instance_type,
+            total_max_compute_nodes=total_max_compute_nodes,
+        )
+        if self.shared_storage:
+            for storage in self.shared_storage:
+                if isinstance(storage, SharedEbs):
+                    self._register_validator(
+                        SharedEbsPerformanceBottleNeckValidator,
+                        total_max_compute_nodes=total_max_compute_nodes,
+                    )
         for capacity_reservation_id, num_of_instances in capacity_reservation_id_max_count_map.items():
             self._register_validator(
                 CapacityReservationSizeValidator,
diff --git a/cli/src/pcluster/validators/cluster_validators.py b/cli/src/pcluster/validators/cluster_validators.py
index 548db963c4..e65921c9e5 100644
--- a/cli/src/pcluster/validators/cluster_validators.py
+++ b/cli/src/pcluster/validators/cluster_validators.py
@@ -1311,6 +1311,45 @@ def _validate(self, imds_secured: bool, scheduler: str):
             )
 
 
+class HeadNodeMemorySizeValidator(Validator):
+    """
+    Head Node Memory Size Validator.
+
+    Verify if the Head Node has enough memory to manage compute nodes.
+    The minimum is 0.6 GB for the OS plus 1 GB every 25 compute nodes, capped at 16 GB.
+    It does not account for the workload running on the head node.
+    """
+
+    def _validate(self, head_node_instance_type: str, total_max_compute_nodes: int):
+        head_node_memory = (
+            AWSApi.instance().ec2.get_instance_type_info(head_node_instance_type).ec2memory_size_in_mib() / 1024
+        )
+        # Assume OS takes up 0.6GB memory. Only check up to 16GB memory to prevent usage of small instance types.
+        required_memory = min(total_max_compute_nodes / 25 + 0.6, 16)
+        if head_node_memory < required_memory:
+            # Round the reported values so binary floating-point noise does not leak into the message.
+            self._add_failure(
+                f"Head node instance type {head_node_instance_type} has {round(head_node_memory, 2)} GB of memory. "
+                f"Please choose a head node instance type with at least {round(required_memory, 2)} GB of memory"
+                f" to manage {total_max_compute_nodes} compute nodes.",
+                FailureLevel.ERROR,
+            )
+
+
+class SharedEbsPerformanceBottleNeckValidator(Validator):
+    """Warn potential performance bottleneck of using Shared EBS."""
+
+    def _validate(self, total_max_compute_nodes: int):
+        if total_max_compute_nodes > 100:
+            self._add_failure(
+                "EBS shared storage is mounted on the head node and shared to the compute nodes. "
+                "Therefore, the head node network bandwidth is a network performance bottle neck "
+                "if the compute nodes rely on this shared storage. "
+                "Please use FSx and EFS for better performance.",
+                FailureLevel.WARNING,
+            )
+
+
 class ComputeResourceLaunchTemplateValidator(_LaunchTemplateValidator):
     """Try to launch the requested instances (in dry-run mode) to verify configuration parameters."""
 