Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ CHANGELOG

**CHANGES**

- Upgrade Slurm to version 20.11.4.
- Add `power_save_min_interval=30` to SlurmctldParameters, so that power actions are processed every 30 seconds.
- Specify the instance GPU model as the GRES GPU Type in gres.conf, instead of the previously hardcoded value `Type=tesla` used for all GPUs.
- Make `key_name` parameter optional to support cluster configurations without a key pair.
- Remove support for Python 3.4
- Root volume size increased from 25GB to 35GB on all AMIs. Minimum root volume size is now 35GB.
Expand Down
1 change: 1 addition & 0 deletions cli/src/pcluster/config/json_param_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,7 @@ def refresh_compute_resource(self, compute_resource_section):
# Set gpus according to instance features
gpus = instance_type_info.gpu_count()
compute_resource_section.get_param("gpus").value = gpus
compute_resource_section.get_param("gpu_type").value = instance_type_info.gpu_type()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what if we just skip adding the gpu_type entry if there is no gpu rather than using the no_gpu_type?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The parameter will still be there in the Json because it is defined in mappings.py? Are you saying we should use an empty string "" instead of "no_gpu_type" as the default?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

either empty "" or None so that it corresponds to a false value in Python. But feel free to leave it as is if you prefer.


# Set enable_efa according to queues' enable_efa and instance features
# Instance type must support EFA
Expand Down
7 changes: 7 additions & 0 deletions cli/src/pcluster/config/mappings.py
Original file line number Diff line number Diff line change
Expand Up @@ -681,6 +681,13 @@
"visibility": Visibility.PRIVATE,
"default": 0
}),
("gpu_type", {
"type": JsonParam,
# This param is managed automatically
"update_policy": UpdatePolicy.IGNORED,
"visibility": Visibility.PRIVATE,
"default": "no_gpu_type"
}),
("network_interfaces", {
"type": IntJsonParam,
# This param is managed automatically
Expand Down
6 changes: 6 additions & 0 deletions cli/src/pcluster/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1286,6 +1286,12 @@ def gpu_count(self):

return gpu_count

def gpu_type(self):
    """
    Return name or type of the GPU for the instance.

    The name is taken from the first entry of ``GpuInfo["Gpus"]`` in the instance type data,
    with spaces removed and all letters lowercased (e.g. "Radeon Pro V520" -> "radeonprov520").

    :return: the normalized GPU name, or "no_gpu_type" when the instance type data has no usable
        GPU name (no "GpuInfo", a missing or empty "Gpus" list, or a first entry without "Name").
    """
    gpu_info = self.instance_type_data.get("GpuInfo") or {}
    gpus = gpu_info.get("Gpus") or []
    # Guard every level of the lookup so incomplete GPU data falls back to the sentinel
    # instead of raising IndexError/AttributeError.
    gpu_name = gpus[0].get("Name") if gpus else None
    if not gpu_name:
        return "no_gpu_type"
    # Remove space and change to all lowercase for name
    return gpu_name.replace(" ", "").lower()

def max_network_interface_count(self):
"""Max number of NICs for the instance."""
needed_interfaces = int(self.instance_type_data.get("NetworkInfo").get("MaximumNetworkCards", 1))
Expand Down
12 changes: 12 additions & 0 deletions cli/tests/pcluster/config/test_json_param_types/s3_config.json
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
"spot_price": 0,
"vcpus": 2,
"gpus": 0,
"gpu_type": "no_gpu_type",
"enable_efa": false,
"disable_hyperthreading": true,
"disable_hyperthreading_via_cpu_options": true,
Expand All @@ -33,6 +34,7 @@
"spot_price": 0,
"vcpus": 48,
"gpus": 8,
"gpu_type": "t4",
"enable_efa": true,
"disable_hyperthreading": true,
"disable_hyperthreading_via_cpu_options": false,
Expand All @@ -47,6 +49,7 @@
"spot_price": 0,
"vcpus": 48,
"gpus": 0,
"gpu_type": "no_gpu_type",
"enable_efa": true,
"disable_hyperthreading": true,
"disable_hyperthreading_via_cpu_options": true,
Expand All @@ -61,6 +64,7 @@
"spot_price": 0,
"vcpus": 4,
"gpus": 0,
"gpu_type": "no_gpu_type",
"enable_efa": false,
"disable_hyperthreading": false,
"disable_hyperthreading_via_cpu_options": false,
Expand All @@ -75,6 +79,7 @@
"spot_price": 0,
"vcpus": 4,
"gpus": 0,
"gpu_type": "no_gpu_type",
"enable_efa": false,
"disable_hyperthreading": false,
"disable_hyperthreading_via_cpu_options": false,
Expand All @@ -98,6 +103,7 @@
"spot_price": 0.4,
"vcpus": 4,
"gpus": 0,
"gpu_type": "no_gpu_type",
"enable_efa": false,
"disable_hyperthreading": false,
"disable_hyperthreading_via_cpu_options": false,
Expand All @@ -112,6 +118,7 @@
"spot_price": 0.5,
"vcpus": 96,
"gpus": 8,
"gpu_type": "t4",
"enable_efa": false,
"disable_hyperthreading": false,
"disable_hyperthreading_via_cpu_options": false,
Expand All @@ -126,6 +133,7 @@
"spot_price": 0.6,
"vcpus": 96,
"gpus": 0,
"gpu_type": "no_gpu_type",
"enable_efa": false,
"disable_hyperthreading": false,
"disable_hyperthreading_via_cpu_options": false,
Expand All @@ -140,6 +148,7 @@
"spot_price": 0,
"vcpus": 4,
"gpus": 0,
"gpu_type": "no_gpu_type",
"enable_efa": false,
"disable_hyperthreading": false,
"disable_hyperthreading_via_cpu_options": false,
Expand All @@ -154,6 +163,7 @@
"spot_price": 0,
"vcpus": 4,
"gpus": 0,
"gpu_type": "no_gpu_type",
"enable_efa": false,
"disable_hyperthreading": false,
"disable_hyperthreading_via_cpu_options": false,
Expand All @@ -177,6 +187,7 @@
"spot_price": 0.4,
"vcpus": 96,
"gpus": 0,
"gpu_type": "no_gpu_type",
"enable_efa": true,
"disable_hyperthreading": false,
"disable_hyperthreading_via_cpu_options": false,
Expand All @@ -191,6 +202,7 @@
"spot_price": 0,
"vcpus": 96,
"gpus": 8,
"gpu_type": "a100",
"enable_efa": true,
"disable_hyperthreading": false,
"disable_hyperthreading_via_cpu_options": false,
Expand Down
14 changes: 12 additions & 2 deletions tests/integration-tests/tests/common/assertions.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,18 +57,28 @@ def assert_no_errors_in_logs(remote_command_executor, scheduler):
log_files = []

for log_file in log_files:
log = remote_command_executor.run_remote_command("cat {0}".format(log_file), hide=True).stdout
log = remote_command_executor.run_remote_command("sudo cat {0}".format(log_file), hide=True).stdout
for error_level in ["CRITICAL", "ERROR"]:
assert_that(log).does_not_contain(error_level)


def assert_no_msg_in_logs(remote_command_executor, log_files, log_msg):
    """Assert that none of the messages in log_msg appears in the concatenated content of log_files."""
    __tracebackhide__ = True
    # Every file is read through sudo on the remote host; the outputs are glued together in order.
    outputs = [
        remote_command_executor.run_remote_command("sudo cat {0}".format(path), hide=True).stdout
        for path in log_files
    ]
    combined_log = "".join(outputs)
    for unwanted in log_msg:
        assert_that(combined_log).does_not_contain(unwanted)


def assert_errors_in_logs(remote_command_executor, log_files, expected_errors):
    """
    Assert every expected error exists in at least one of the log files.

    :param remote_command_executor: object exposing run_remote_command(command, hide=...) whose result
        has a ``stdout`` attribute
    :param log_files: paths of the remote log files to inspect
    :param expected_errors: entries handed to ``assert_that(log).matches(...)``; presumably regex
        patterns - confirm against the assertion library in use
    """
    __tracebackhide__ = True

    log = ""
    for log_file in log_files:
        # Read each file exactly once, through sudo. The earlier plain `cat` read is dropped:
        # it duplicated the content in `log` and issued a second, non-privileged remote command.
        log += remote_command_executor.run_remote_command("sudo cat {0}".format(log_file), hide=True).stdout
    for message in expected_errors:
        assert_that(log).matches(message)

Expand Down
6 changes: 6 additions & 0 deletions tests/integration-tests/tests/common/schedulers_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -471,6 +471,12 @@ def get_nodes_status(self, filter_by_nodes=None):
else current_node_states
)

def get_node_addr_host(self):
    """Return the sinfo output as a list of "nodename nodeaddr nodehostname" lines, one per node."""
    sinfo_command = "/opt/slurm/bin/sinfo -O NodeList:' ',NodeAddr:' ',NodeHost:' ' -N -h | awk '{print$1, $2, $3}'"
    result = self._remote_command_executor.run_remote_command(sinfo_command)
    return result.stdout.splitlines()

def submit_command_and_assert_job_accepted(self, submit_command_args):
"""Submit a command and assert the job is accepted by scheduler."""
result = self.submit_command(**submit_command_args)
Expand Down
Loading