diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6bdb4e352e..949e90f97f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,9 @@ CHANGELOG
 
 **CHANGES**
 
+- Upgrade Slurm to version 20.11.4.
+  - Add a new SlurmctldParameters entry, power_save_min_interval=30, so that power actions are processed every 30 seconds.
+  - Specify the instance GPU model as the GRES GPU Type in gres.conf, instead of the previous value hardcoded for all GPUs, Type=tesla.
 - Make `key_name` parameter optional to support cluster configurations without a key pair.
 - Remove support for Python 3.4
 - Root volume size increased from 25GB to 35GB on all AMIs. Minimum root volume size is now 35GB.
diff --git a/cli/src/pcluster/config/json_param_types.py b/cli/src/pcluster/config/json_param_types.py
index 097d941183..db4fc3f0bd 100644
--- a/cli/src/pcluster/config/json_param_types.py
+++ b/cli/src/pcluster/config/json_param_types.py
@@ -303,6 +303,7 @@ def refresh_compute_resource(self, compute_resource_section):
         # Set gpus according to instance features
         gpus = instance_type_info.gpu_count()
         compute_resource_section.get_param("gpus").value = gpus
+        compute_resource_section.get_param("gpu_type").value = instance_type_info.gpu_type()
 
         # Set enable_efa according to queues' enable_efa and instance features
         # Instance type must support EFA
diff --git a/cli/src/pcluster/config/mappings.py b/cli/src/pcluster/config/mappings.py
index 4ae7a510f6..60e9d44a59 100644
--- a/cli/src/pcluster/config/mappings.py
+++ b/cli/src/pcluster/config/mappings.py
@@ -681,6 +681,13 @@
             "visibility": Visibility.PRIVATE,
             "default": 0
         }),
+        ("gpu_type", {
+            "type": JsonParam,
+            # This param is managed automatically
+            "update_policy": UpdatePolicy.IGNORED,
+            "visibility": Visibility.PRIVATE,
+            "default": "no_gpu_type"
+        }),
         ("network_interfaces", {
             "type": IntJsonParam,
             # This param is managed automatically
diff --git a/cli/src/pcluster/utils.py b/cli/src/pcluster/utils.py
index d7844d509f..1be368aeda 100644
--- a/cli/src/pcluster/utils.py
+++ b/cli/src/pcluster/utils.py
@@ -1286,6 +1286,12 @@ def gpu_count(self):
 
         return gpu_count
 
+    def gpu_type(self):
+        """Return the name or type of the GPU for the instance."""
+        gpu_info = self.instance_type_data.get("GpuInfo", None)
+        # Remove spaces and lowercase the name
+        return "no_gpu_type" if not gpu_info else gpu_info.get("Gpus")[0].get("Name").replace(" ", "").lower()
+
     def max_network_interface_count(self):
         """Max number of NICs for the instance."""
         needed_interfaces = int(self.instance_type_data.get("NetworkInfo").get("MaximumNetworkCards", 1))
diff --git a/cli/tests/pcluster/config/test_json_param_types/s3_config.json b/cli/tests/pcluster/config/test_json_param_types/s3_config.json
index db0971e034..9e36d3adfe 100644
--- a/cli/tests/pcluster/config/test_json_param_types/s3_config.json
+++ b/cli/tests/pcluster/config/test_json_param_types/s3_config.json
@@ -19,6 +19,7 @@
             "spot_price": 0,
             "vcpus": 2,
             "gpus": 0,
+            "gpu_type": "no_gpu_type",
             "enable_efa": false,
             "disable_hyperthreading": true,
             "disable_hyperthreading_via_cpu_options": true,
@@ -33,6 +34,7 @@
             "spot_price": 0,
             "vcpus": 48,
             "gpus": 8,
+            "gpu_type": "t4",
             "enable_efa": true,
             "disable_hyperthreading": true,
             "disable_hyperthreading_via_cpu_options": false,
@@ -47,6 +49,7 @@
             "spot_price": 0,
             "vcpus": 48,
             "gpus": 0,
+            "gpu_type": "no_gpu_type",
             "enable_efa": true,
             "disable_hyperthreading": true,
             "disable_hyperthreading_via_cpu_options": true,
@@ -61,6 +64,7 @@
             "spot_price": 0,
             "vcpus": 4,
             "gpus": 0,
+            "gpu_type": "no_gpu_type",
             "enable_efa": false,
"disable_hyperthreading": false, "disable_hyperthreading_via_cpu_options": false, @@ -75,6 +79,7 @@ "spot_price": 0, "vcpus": 4, "gpus": 0, + "gpu_type": "no_gpu_type", "enable_efa": false, "disable_hyperthreading": false, "disable_hyperthreading_via_cpu_options": false, @@ -98,6 +103,7 @@ "spot_price": 0.4, "vcpus": 4, "gpus": 0, + "gpu_type": "no_gpu_type", "enable_efa": false, "disable_hyperthreading": false, "disable_hyperthreading_via_cpu_options": false, @@ -112,6 +118,7 @@ "spot_price": 0.5, "vcpus": 96, "gpus": 8, + "gpu_type": "t4", "enable_efa": false, "disable_hyperthreading": false, "disable_hyperthreading_via_cpu_options": false, @@ -126,6 +133,7 @@ "spot_price": 0.6, "vcpus": 96, "gpus": 0, + "gpu_type": "no_gpu_type", "enable_efa": false, "disable_hyperthreading": false, "disable_hyperthreading_via_cpu_options": false, @@ -140,6 +148,7 @@ "spot_price": 0, "vcpus": 4, "gpus": 0, + "gpu_type": "no_gpu_type", "enable_efa": false, "disable_hyperthreading": false, "disable_hyperthreading_via_cpu_options": false, @@ -154,6 +163,7 @@ "spot_price": 0, "vcpus": 4, "gpus": 0, + "gpu_type": "no_gpu_type", "enable_efa": false, "disable_hyperthreading": false, "disable_hyperthreading_via_cpu_options": false, @@ -177,6 +187,7 @@ "spot_price": 0.4, "vcpus": 96, "gpus": 0, + "gpu_type": "no_gpu_type", "enable_efa": true, "disable_hyperthreading": false, "disable_hyperthreading_via_cpu_options": false, @@ -191,6 +202,7 @@ "spot_price": 0, "vcpus": 96, "gpus": 8, + "gpu_type": "a100", "enable_efa": true, "disable_hyperthreading": false, "disable_hyperthreading_via_cpu_options": false, diff --git a/tests/integration-tests/tests/common/assertions.py b/tests/integration-tests/tests/common/assertions.py index 59fe1aaeff..c61e6a8f91 100644 --- a/tests/integration-tests/tests/common/assertions.py +++ b/tests/integration-tests/tests/common/assertions.py @@ -57,18 +57,28 @@ def assert_no_errors_in_logs(remote_command_executor, scheduler): log_files = [] for log_file in log_files: - log = remote_command_executor.run_remote_command("cat {0}".format(log_file), hide=True).stdout + log = remote_command_executor.run_remote_command("sudo cat {0}".format(log_file), hide=True).stdout for error_level in ["CRITICAL", "ERROR"]: assert_that(log).does_not_contain(error_level) +def assert_no_msg_in_logs(remote_command_executor, log_files, log_msg): + """Assert log msgs are not in logs.""" + __tracebackhide__ = True + log = "" + for log_file in log_files: + log += remote_command_executor.run_remote_command("sudo cat {0}".format(log_file), hide=True).stdout + for message in log_msg: + assert_that(log).does_not_contain(message) + + def assert_errors_in_logs(remote_command_executor, log_files, expected_errors): # assert every expected error exists in at least one of the log files __tracebackhide__ = True log = "" for log_file in log_files: - log += remote_command_executor.run_remote_command("cat {0}".format(log_file), hide=True).stdout + log += remote_command_executor.run_remote_command("sudo cat {0}".format(log_file), hide=True).stdout for message in expected_errors: assert_that(log).matches(message) diff --git a/tests/integration-tests/tests/common/schedulers_common.py b/tests/integration-tests/tests/common/schedulers_common.py index 25c889a2c4..1003e4156c 100644 --- a/tests/integration-tests/tests/common/schedulers_common.py +++ b/tests/integration-tests/tests/common/schedulers_common.py @@ -471,6 +471,12 @@ def get_nodes_status(self, filter_by_nodes=None): else current_node_states ) + def get_node_addr_host(self): 
+ """Return a list of nodename, nodeaddr, nodehostname entries.""" + return self._remote_command_executor.run_remote_command( + "/opt/slurm/bin/sinfo -O NodeList:' ',NodeAddr:' ',NodeHost:' ' -N -h | awk '{print$1, $2, $3}'" + ).stdout.splitlines() + def submit_command_and_assert_job_accepted(self, submit_command_args): """Submit a command and assert the job is accepted by scheduler.""" result = self.submit_command(**submit_command_args) diff --git a/tests/integration-tests/tests/schedulers/test_slurm.py b/tests/integration-tests/tests/schedulers/test_slurm.py index a1dfea402a..3b7504e7a0 100644 --- a/tests/integration-tests/tests/schedulers/test_slurm.py +++ b/tests/integration-tests/tests/schedulers/test_slurm.py @@ -24,6 +24,7 @@ from tests.common.assertions import ( assert_errors_in_logs, assert_no_errors_in_logs, + assert_no_msg_in_logs, assert_no_node_in_ec2, assert_num_instances_constant, assert_num_instances_in_cluster, @@ -80,7 +81,12 @@ def test_slurm(region, pcluster_config_reader, clusters_factory, test_datadir, a slurm_commands, partition="ondemand", instance_type="c5.xlarge", max_count=5, cpu_per_instance=4 ) _test_cluster_gpu_limits( - slurm_commands, partition="gpu", instance_type="g3.8xlarge", max_count=5, gpu_per_instance=2 + slurm_commands, + partition="gpu", + instance_type="g3.8xlarge", + max_count=5, + gpu_per_instance=2, + gpu_type="m60", ) # Test torque command wrapper _test_torque_job_submit(remote_command_executor, test_datadir) @@ -182,16 +188,25 @@ def test_error_handling(scheduler, region, instance, pcluster_config_reader, clu scheduler_commands = get_scheduler_commands(scheduler, remote_command_executor) _assert_cluster_initial_conditions(scheduler_commands, instance, 10, 10, 1, 1) - _test_status_check_replacement( + _test_cloud_node_health_check( remote_command_executor, scheduler_commands, cluster.cfn_name, region, partition="ondemand1", num_static_nodes=1, + # Test only works with num_dynamic = 1 num_dynamic_nodes=1, dynamic_instance_type=instance, ) + _test_ec2_status_check_replacement( + remote_command_executor, + scheduler_commands, + cluster.cfn_name, + region, + partition="ondemand1", + num_static_nodes=1, + ) # Next test will introduce error in logs, assert no error now assert_no_errors_in_logs(remote_command_executor, scheduler) _test_clustermgtd_down_logic( @@ -370,7 +385,7 @@ def _test_keep_or_replace_suspended_nodes( assert_num_instances_in_cluster(cluster_name, region, len(static_nodes)) -def _test_status_check_replacement( +def _test_cloud_node_health_check( remote_command_executor, scheduler_commands, cluster_name, @@ -380,8 +395,12 @@ def _test_status_check_replacement( num_dynamic_nodes, dynamic_instance_type, ): - """Test nodes failing static check are correctly replaced.""" - logging.info("Testing that nodes failing static check are correctly replaced") + """ + Test nodes with networking failure are correctly replaced. + + This will test if slurm is performing health check on CLOUD nodes correctly. 
+ """ + logging.info("Testing that nodes with networking failure fails slurm health check and replaced") job_id = submit_initial_job( scheduler_commands, "sleep 500", @@ -393,33 +412,66 @@ def _test_status_check_replacement( static_nodes, dynamic_nodes = assert_initial_conditions( scheduler_commands, num_static_nodes, num_dynamic_nodes, partition, job_id ) - # Get network interface name from Head node, assuming Head node and Compute are of the same instance type - interface_name = remote_command_executor.run_remote_command( - "nmcli device status | grep ether | awk '{print $1}'" - ).stdout - logging.info("Detaching network interface {} on Compute nodes".format(interface_name)) - # Only use dynamic nodes to test status check behavior - # Because static nodes will likely be placed in DOWN before status check failure shows in EC2 - # Submit job that will detach network interface on all dynamic nodes, this will cause EC2 status check to fail - kill_job_id = scheduler_commands.submit_command_and_assert_job_accepted( - submit_command_args={ - "command": "sudo ifconfig {} down && sleep 600".format(interface_name), - "partition": partition, - "constraint": "dynamic", - "other_options": "-a 1-{} --exclusive --no-requeue".format(num_dynamic_nodes), - } + # Assert that the default SlurmdTimeout=180 is in effect + _assert_slurmd_timeout(remote_command_executor, timeout=180) + # Nodes with networking failures should fail slurm health check before failing ec2_status_check + # Test on freshly launched dynamic nodes + kill_job_id = _submit_kill_networking_job( + remote_command_executor, scheduler_commands, partition, node_type="dynamic", num_nodes=num_dynamic_nodes + ) + # Sleep for a bit so the command to detach network interface can be run + time.sleep(15) + # Job will hang, cancel it manually to avoid waiting for job failing + scheduler_commands.cancel_job(kill_job_id) + # Assert nodes are put into DOWN for not responding + # TO-DO: this test only works with num_dynamic = 1 because slurm will record this error in nodelist format + # i.e. 
+    # To support multiple nodes, the list of nodes would need to be converted into a nodelist-format string
+    retry(wait_fixed=seconds(20), stop_max_delay=minutes(5))(assert_errors_in_logs)(
+        remote_command_executor,
+        ["/var/log/slurmctld.log"],
+        ["Nodes {} not responding, setting DOWN".format(",".join(dynamic_nodes))],
     )
-    # Assert that dynamic nodes with failing status check are put into DRAIN
-    # Can take up to 15 mins for status check to show
+    # Assert dynamic nodes are reset
+    _wait_for_node_reset(scheduler_commands, static_nodes=[], dynamic_nodes=dynamic_nodes)
+    assert_num_instances_in_cluster(cluster_name, region, len(static_nodes))
+    # Assert ec2_status_check code path is not triggered
+    assert_no_msg_in_logs(
+        remote_command_executor,
+        ["/var/log/parallelcluster/clustermgtd"],
+        ["Setting nodes failing health check type ec2_health_check to DRAIN"],
+    )
+
+
+def _test_ec2_status_check_replacement(
+    remote_command_executor,
+    scheduler_commands,
+    cluster_name,
+    region,
+    partition,
+    num_static_nodes,
+):
+    """Test that nodes with failing ec2 status checks are correctly replaced."""
+    logging.info("Testing that nodes with failing ec2 status checks are correctly replaced")
+    static_nodes, _ = assert_initial_conditions(scheduler_commands, num_static_nodes, 0, partition)
+    # Can take up to 15 mins for the ec2_status_check failure to show
+    # Increase SlurmdTimeout to avoid the slurm health check and trigger the ec2_status_check code path
+    _set_slurmd_timeout(remote_command_executor, timeout=10000)
+    kill_job_id = _submit_kill_networking_job(
+        remote_command_executor, scheduler_commands, partition, node_type="static", num_nodes=num_static_nodes
+    )
+    # Assert ec2_status_check code path is triggered
     retry(wait_fixed=seconds(20), stop_max_delay=minutes(15))(assert_errors_in_logs)(
         remote_command_executor,
         ["/var/log/parallelcluster/clustermgtd"],
         ["Setting nodes failing health check type ec2_health_check to DRAIN"],
     )
     scheduler_commands.cancel_job(kill_job_id)
-    # Assert dynamic nodes are reset
-    _wait_for_node_reset(scheduler_commands, static_nodes=[], dynamic_nodes=dynamic_nodes)
+    # Assert static nodes are reset
+    _wait_for_node_reset(scheduler_commands, static_nodes=static_nodes, dynamic_nodes=[])
     assert_num_instances_in_cluster(cluster_name, region, len(static_nodes))
+    # Reset SlurmdTimeout to 180s
+    _set_slurmd_timeout(remote_command_executor, timeout=180)
 
 
 def _test_clustermgtd_down_logic(
@@ -500,6 +552,14 @@ def _wait_for_node_reset(scheduler_commands, static_nodes, dynamic_nodes):
     if dynamic_nodes:
         logging.info("Assert dynamic nodes are power saved")
         _wait_for_compute_nodes_states(scheduler_commands, dynamic_nodes, expected_states=["idle~"])
+        node_addr_host = scheduler_commands.get_node_addr_host()
+        _assert_node_addr_host_reset(node_addr_host, dynamic_nodes)
+
+
+def _assert_node_addr_host_reset(addr_host_list, nodes):
+    """Assert that NodeAddr and NodeHostname are reset."""
+    for nodename in nodes:
+        assert_that(addr_host_list).contains("{0} {0} {0}".format(nodename))
 
 
 def _assert_nodes_not_terminated(scheduler_commands, nodes, timeout=5):
@@ -589,7 +649,7 @@ def _check_mpi_process(remote_command_executor, slurm_commands, test_datadir, nu
     assert_that(proc_track_result.stdout).contains("IMB-MPI1")
 
 
-def _test_cluster_gpu_limits(slurm_commands, partition, instance_type, max_count, gpu_per_instance):
+def _test_cluster_gpu_limits(slurm_commands, partition, instance_type, max_count, gpu_per_instance, gpu_type):
     """Test edge cases regarding the number of GPUs."""
     logging.info("Testing scheduler does not accept jobs when requesting for more GPUs than available")
     # Expect commands below to fail with exit 1
@@ -642,7 +702,7 @@ def _test_cluster_gpu_limits(slurm_commands, partition, instance_type, max_count
             "partition": partition,
             "constraint": instance_type,
             "slots": gpu_per_instance,
-            "other_options": "-G {0} --gpus-per-task=1".format(gpu_per_instance),
+            "other_options": "-G {0}:{1} --gpus-per-task={0}:1".format(gpu_type, gpu_per_instance),
         }
     )
     slurm_commands.submit_command_and_assert_job_accepted(
@@ -650,7 +710,7 @@
             "command": "sleep 1",
             "partition": partition,
             "constraint": instance_type,
-            "other_options": "--gres=gpu:{0}".format(gpu_per_instance),
+            "other_options": "--gres=gpu:{0}:{1}".format(gpu_type, gpu_per_instance),
         }
     )
     # Submit job without '-N' option(nodes=-1)
@@ -729,7 +789,7 @@ def _gpu_resource_check(slurm_commands, partition, instance_type):
 def _test_slurm_version(remote_command_executor):
     logging.info("Testing Slurm Version")
     version = remote_command_executor.run_remote_command("sinfo -V").stdout
-    assert_that(version).is_equal_to("slurm 20.02.4")
+    assert_that(version).is_equal_to("slurm 20.11.4")
 
 
 def _test_job_dependencies(slurm_commands, region, stack_name, scaledown_idletime):
@@ -813,3 +873,38 @@ def _test_torque_job_submit(remote_command_executor, test_datadir):
     torque_commands = TorqueCommands(remote_command_executor)
     result = torque_commands.submit_script(str(test_datadir / "torque_job.sh"))
     torque_commands.assert_job_submitted(result.stdout)
+
+
+def _submit_kill_networking_job(remote_command_executor, scheduler_commands, partition, node_type, num_nodes):
+    """Submit a job that detaches the network interface on compute nodes."""
+    # Get the network interface name from the head node, assuming head and compute nodes use the same instance type
+    interface_name = remote_command_executor.run_remote_command(
+        "nmcli device status | grep ether | awk '{print $1}'"
+    ).stdout
+    logging.info("Detaching network interface {} on {} compute nodes".format(interface_name, node_type))
+    # Submit a job that detaches the network interface on all nodes of the requested type
+    return scheduler_commands.submit_command_and_assert_job_accepted(
+        submit_command_args={
+            "command": "sudo ifconfig {} down && sleep 600".format(interface_name),
+            "partition": partition,
+            "constraint": "{}".format(node_type),
+            "other_options": "-a 1-{} --exclusive --no-requeue".format(num_nodes),
+        }
+    )
+
+
+def _set_slurmd_timeout(remote_command_executor, timeout):
+    """Set SlurmdTimeout in slurm.conf and apply it with scontrol reconfigure."""
+    remote_command_executor.run_remote_command(
+        "sudo sed -i '/SlurmdTimeout/s/=.*/={0}/' /opt/slurm/etc/slurm.conf".format(timeout)
+    )
+    remote_command_executor.run_remote_command("sudo /opt/slurm/bin/scontrol reconfigure")
+    _assert_slurmd_timeout(remote_command_executor, timeout)
+
+
+def _assert_slurmd_timeout(remote_command_executor, timeout):
+    """Assert that SlurmdTimeout is correctly set."""
+    configured_timeout = remote_command_executor.run_remote_command(
+        'scontrol show config | grep -oP "^SlurmdTimeout\\s*\\=\\s*\\K(.+)"'
+    ).stdout
+    assert_that(configured_timeout).is_equal_to("{0} sec".format(timeout))