aws · rexcsn · Mar 3, 2021 · Feb 22, 2021 · Mar 2, 2021 · Mar 3, 2021
@@ -7,6 +7,9 @@ CHANGELOG
 
 **CHANGES**
 
+- Upgrade Slurm to version 20.11.4.
+  - Add the new `SlurmctldParameters` option `power_save_min_interval=30`, so that power actions are processed every 30 seconds.
+  - Specify the instance GPU model as the GRES GPU `Type` in `gres.conf`, instead of the previously hardcoded `Type=tesla` used for all GPUs.
 - Make `key_name` parameter optional to support cluster configurations without a key pair. 
 - Remove support for Python 3.4
 - Root volume size increased from 25GB to 35GB on all AMIs. Minimum root volume size is now 35GB.

@@ -303,6 +303,7 @@ def refresh_compute_resource(self, compute_resource_section):
             # Set gpus according to instance features
             gpus = instance_type_info.gpu_count()
             compute_resource_section.get_param("gpus").value = gpus
+            compute_resource_section.get_param("gpu_type").value = instance_type_info.gpu_type()
 
             # Set enable_efa according to queues' enable_efa and instance features
             # Instance type must support EFA

@@ -681,6 +681,13 @@
             "visibility": Visibility.PRIVATE,
             "default": 0
         }),
+        ("gpu_type", {
+            "type": JsonParam,
+            # This param is managed automatically
+            "update_policy": UpdatePolicy.IGNORED,
+            "visibility": Visibility.PRIVATE,
+            "default": "no_gpu_type"
+        }),
         ("network_interfaces", {
             "type": IntJsonParam,
             # This param is managed automatically

@@ -1286,6 +1286,12 @@ def gpu_count(self):
 
         return gpu_count
 
+    def gpu_type(self):
+        """
+        Return the normalized name of the first GPU listed for the instance type.
+
+        The name is read from the first entry of the "Gpus" list inside the "GpuInfo" section of the
+        instance type data, with all spaces removed and converted to lowercase.
+
+        :return: the normalized GPU name, or "no_gpu_type" when "GpuInfo" is missing or empty
+        """
+        gpu_info = self.instance_type_data.get("GpuInfo", None)
+        # Remove space and change to all lowercase for name
+        # NOTE(review): assumes "Gpus" is a non-empty list whose first entry has a "Name" whenever
+        # "GpuInfo" is set; otherwise the indexing/attribute access below raises - confirm.
+        return "no_gpu_type" if not gpu_info else gpu_info.get("Gpus")[0].get("Name").replace(" ", "").lower()
+
     def max_network_interface_count(self):
         """Max number of NICs for the instance."""
         needed_interfaces = int(self.instance_type_data.get("NetworkInfo").get("MaximumNetworkCards", 1))

@@ -19,6 +19,7 @@
             "spot_price": 0,
             "vcpus": 2,
             "gpus": 0,
+            "gpu_type": "no_gpu_type",
             "enable_efa": false,
             "disable_hyperthreading": true,
             "disable_hyperthreading_via_cpu_options": true,
@@ -33,6 +34,7 @@
             "spot_price": 0,
             "vcpus": 48,
             "gpus": 8,
+            "gpu_type": "t4",
             "enable_efa": true,
             "disable_hyperthreading": true,
             "disable_hyperthreading_via_cpu_options": false,
@@ -47,6 +49,7 @@
             "spot_price": 0,
             "vcpus": 48,
             "gpus": 0,
+            "gpu_type": "no_gpu_type",
             "enable_efa": true,
             "disable_hyperthreading": true,
             "disable_hyperthreading_via_cpu_options": true,
@@ -61,6 +64,7 @@
             "spot_price": 0,
             "vcpus": 4,
             "gpus": 0,
+            "gpu_type": "no_gpu_type",
             "enable_efa": false,
             "disable_hyperthreading": false,
             "disable_hyperthreading_via_cpu_options": false,
@@ -75,6 +79,7 @@
             "spot_price": 0,
             "vcpus": 4,
             "gpus": 0,
+            "gpu_type": "no_gpu_type",
             "enable_efa": false,
             "disable_hyperthreading": false,
             "disable_hyperthreading_via_cpu_options": false,
@@ -98,6 +103,7 @@
             "spot_price": 0.4,
             "vcpus": 4,
             "gpus": 0,
+            "gpu_type": "no_gpu_type",
             "enable_efa": false,
             "disable_hyperthreading": false,
             "disable_hyperthreading_via_cpu_options": false,
@@ -112,6 +118,7 @@
             "spot_price": 0.5,
             "vcpus": 96,
             "gpus": 8,
+            "gpu_type": "t4",
             "enable_efa": false,
             "disable_hyperthreading": false,
             "disable_hyperthreading_via_cpu_options": false,
@@ -126,6 +133,7 @@
             "spot_price": 0.6,
             "vcpus": 96,
             "gpus": 0,
+            "gpu_type": "no_gpu_type",
             "enable_efa": false,
             "disable_hyperthreading": false,
             "disable_hyperthreading_via_cpu_options": false,
@@ -140,6 +148,7 @@
             "spot_price": 0,
             "vcpus": 4,
             "gpus": 0,
+            "gpu_type": "no_gpu_type",
             "enable_efa": false,
             "disable_hyperthreading": false,
             "disable_hyperthreading_via_cpu_options": false,
@@ -154,6 +163,7 @@
             "spot_price": 0,
             "vcpus": 4,
             "gpus": 0,
+            "gpu_type": "no_gpu_type",
             "enable_efa": false,
             "disable_hyperthreading": false,
             "disable_hyperthreading_via_cpu_options": false,
@@ -177,6 +187,7 @@
             "spot_price": 0.4,
             "vcpus": 96,
             "gpus": 0,
+            "gpu_type": "no_gpu_type",
             "enable_efa": true,
             "disable_hyperthreading": false,
             "disable_hyperthreading_via_cpu_options": false,
@@ -191,6 +202,7 @@
             "spot_price": 0,
             "vcpus": 96,
             "gpus": 8,
+            "gpu_type": "a100",
             "enable_efa": true,
             "disable_hyperthreading": false,
             "disable_hyperthreading_via_cpu_options": false,

@@ -57,18 +57,28 @@ def assert_no_errors_in_logs(remote_command_executor, scheduler):
         log_files = []
 
     for log_file in log_files:
-        log = remote_command_executor.run_remote_command("cat {0}".format(log_file), hide=True).stdout
+        log = remote_command_executor.run_remote_command("sudo cat {0}".format(log_file), hide=True).stdout
         for error_level in ["CRITICAL", "ERROR"]:
             assert_that(log).does_not_contain(error_level)
 
 
+def assert_no_msg_in_logs(remote_command_executor, log_files, log_msg):
+    """
+    Assert that none of the given messages appears in any of the given remote log files.
+
+    The content of all log files is read with "sudo cat" and concatenated before being checked.
+
+    :param remote_command_executor: executor exposing run_remote_command, used to read the log files
+    :param log_files: iterable of paths of the remote log files to inspect
+    :param log_msg: iterable of messages that must not appear in the logs
+    """
+    # pytest convention: hide this helper's frame from assertion failure tracebacks.
+    __tracebackhide__ = True
+    log = ""
+    for log_file in log_files:
+        log += remote_command_executor.run_remote_command("sudo cat {0}".format(log_file), hide=True).stdout
+    # NOTE(review): log_msg is iterated, so a plain string would be checked character by character;
+    # callers presumably pass a list of messages - confirm.
+    for message in log_msg:
+        assert_that(log).does_not_contain(message)
+
+
 def assert_errors_in_logs(remote_command_executor, log_files, expected_errors):
+    """
+    Assert that every expected error is found in the concatenated content of the given remote log files.
+
+    :param remote_command_executor: executor exposing run_remote_command, used to read the log files
+    :param log_files: iterable of paths of the remote log files to inspect
+    :param expected_errors: iterable of patterns passed to the "matches" assertion
+        (presumably regular expressions - confirm against the assertion library in use)
+    """
     # assert every expected error exists in at least one of the log files
     __tracebackhide__ = True
 
     log = ""
     for log_file in log_files:
-        log += remote_command_executor.run_remote_command("cat {0}".format(log_file), hide=True).stdout
+        log += remote_command_executor.run_remote_command("sudo cat {0}".format(log_file), hide=True).stdout
     for message in expected_errors:
         assert_that(log).matches(message)
 

@@ -471,6 +471,12 @@ def get_nodes_status(self, filter_by_nodes=None):
             else current_node_states
         )
 
+    def get_node_addr_host(self):
+        """
+        Return a list of nodename, nodeaddr, nodehostname entries.
+
+        The entries come from running sinfo on the remote host through the remote command executor.
+
+        :return: list of strings, one per sinfo output line, each holding the first three
+            whitespace-separated fields re-joined by awk with single spaces
+        """
+        # sinfo prints the NodeList, NodeAddr and NodeHost fields; awk then normalizes the separators.
+        return self._remote_command_executor.run_remote_command(
+            "/opt/slurm/bin/sinfo -O NodeList:' ',NodeAddr:' ',NodeHost:' ' -N -h | awk '{print$1, $2, $3}'"
+        ).stdout.splitlines()
+
     def submit_command_and_assert_job_accepted(self, submit_command_args):
         """Submit a command and assert the job is accepted by scheduler."""
         result = self.submit_command(**submit_command_args)