Merged
7 changes: 7 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,12 @@
CHANGELOG
=========

3.4.1
-----

**BUG FIXES**
- Fix an issue with the Slurm scheduler that might incorrectly apply updates to its internal registry of compute nodes. This might result in EC2 instances becoming inaccessible or being backed by an incorrect instance type.

3.4.0
-----

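The underlying Slurm behavior (exercised by the regression test added below) is that `scontrol update node` sorts the nodename list it is given but not the matching nodeaddr list, so names and addresses can be paired incorrectly. A minimal illustrative sketch in Python — the node names are hypothetical, loosely following ParallelCluster's queue naming, and this is not ParallelCluster or Slurm source code:

# Illustrative only: Slurm 22.05 sorts the nodename list passed to
# `scontrol update node` but leaves the nodeaddr list in user-provided order
# (https://bugs.schedmd.com/show_bug.cgi?id=15731).
nodenames = ["queue2-dy-resource2-1", "queue1-dy-resource1-1"]  # hypothetical names, user order
nodeaddrs = ["queue2-dy-resource2-1", "queue1-dy-resource1-1"]  # addresses in the same user order

buggy_pairing = dict(zip(sorted(nodenames), nodeaddrs))  # only the names get sorted
print(buggy_pairing)
# {'queue1-dy-resource1-1': 'queue2-dy-resource2-1',
#  'queue2-dy-resource2-1': 'queue1-dy-resource1-1'}
# Each node is now registered with the other node's address.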
6 changes: 6 additions & 0 deletions tests/integration-tests/configs/common/common.yaml
@@ -511,6 +511,12 @@ schedulers:
instances: {{ common.INSTANCES_DEFAULT_X86 }}
oss: ["ubuntu2004"]
schedulers: ["slurm"]
test_slurm.py::test_scontrol_update_nodelist_sorting:
dimensions:
- regions: ["ca-central-2"]
instances: {{ common.INSTANCES_DEFAULT_X86 }}
oss: ["alinux2"]
schedulers: ["slurm"]
test_slurm_accounting.py::test_slurm_accounting:
dimensions:
- regions: ["us-east-1", "ap-south-1"]
50 changes: 50 additions & 0 deletions tests/integration-tests/tests/schedulers/test_slurm.py
@@ -560,6 +560,56 @@ def test_update_slurm_reconfigure_race_condition(
)


@pytest.mark.usefixtures("region", "os", "instance", "scheduler")
def test_scontrol_update_nodelist_sorting(
pcluster_config_reader,
clusters_factory,
test_datadir,
scheduler_commands_factory,
):
"""
Test that scontrol update node follows the order of the nodelist provided by the user.

In Slurm 22.05 the scontrol update node logic was modified: a sorting routine was
introduced that changes the order of the nodes in the nodelist. When
`scontrol update node nodename=nodelist nodeaddr=nodeaddrlist` is called, only the
nodelist is sorted (not the nodeaddrlist), causing mismatches between the Slurm
nodenames and the assigned addresses.

See https://bugs.schedmd.com/show_bug.cgi?id=15731
"""

max_count_cr1 = max_count_cr2 = 4

cluster_config = pcluster_config_reader(
config_file="pcluster.config.yaml",
output_file="pcluster.config.initial.yaml",
max_count_cr1=max_count_cr1,
max_count_cr2=max_count_cr2,
)
cluster = clusters_factory(cluster_config)
remote_command_executor = RemoteCommandExecutor(cluster)
slurm_commands = scheduler_commands_factory(remote_command_executor)

assert_compute_node_states(slurm_commands, compute_nodes=None, expected_states=["idle~"])

nodes_in_queue1 = slurm_commands.get_compute_nodes("queue1", all_nodes=True)
nodes_in_queue2 = slurm_commands.get_compute_nodes("queue2", all_nodes=True)

# Create an unsorted list of nodes to be updated (queue2 is alphabetically after queue1)
nodelist = f"{nodes_in_queue2[0]},{nodes_in_queue1[0]}"

# Stop clustermgtd (managed by supervisord on the head node), since it may fix the
# situation under the hood if it calls scontrol update with a sorted list of nodes
remote_command_executor.run_remote_command("sudo systemctl stop supervisord")

# Run scontrol update with unsorted list of nodes
remote_command_executor.run_remote_command(f"sudo -i scontrol update nodename={nodelist} nodeaddr={nodelist}")

assert_that(slurm_commands.get_node_attribute(nodes_in_queue1[0], "NodeAddr")).is_equal_to(nodes_in_queue1[0])
assert_that(slurm_commands.get_node_attribute(nodes_in_queue2[0], "NodeAddr")).is_equal_to(nodes_in_queue2[0])


@pytest.mark.usefixtures("region", "os", "instance", "scheduler")
def test_slurm_overrides(
scheduler,
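The assertions in test_scontrol_update_nodelist_sorting read the NodeAddr attribute through the framework's scheduler commands. Outside the test framework, the same check can be sketched by parsing plain scontrol output; the helper below is a hypothetical stand-in, not part of the test suite:

import subprocess

def node_addr(nodename):
    # `scontrol -o show node` prints the node record as a single "Key=Value ..." line.
    out = subprocess.run(
        ["scontrol", "-o", "show", "node", nodename],
        capture_output=True, text=True, check=True,
    ).stdout
    for token in out.split():
        if token.startswith("NodeAddr="):
            return token.split("=", 1)[1]
    return None

# After a correct update, each node's NodeAddr equals the address supplied
# for that node (in this test, the node's own name), e.g.:
# assert node_addr(nodes_in_queue1[0]) == nodes_in_queue1[0]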
@@ -0,0 +1,31 @@
Image:
Os: {{ os }}
HeadNode:
InstanceType: {{ instance }}
Networking:
SubnetId: {{ public_subnet_id }}
Ssh:
KeyName: {{ key_name }}
Scheduling:
Scheduler: slurm
SlurmQueues:
- Name: queue1
Networking:
SubnetIds:
- {{ private_subnet_id }}
ComputeResources:
- Name: resource1
Instances:
- InstanceType: {{ instance }}
MinCount: 0
MaxCount: {{ max_count_cr1 }}
- Name: queue2
Networking:
SubnetIds:
- {{ private_subnet_id }}
ComputeResources:
- Name: resource2
Instances:
- InstanceType: {{ instance }}
MinCount: 0
MaxCount: {{ max_count_cr2 }}
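The {{ ... }} placeholders in this config are Jinja expressions that the pcluster_config_reader fixture fills in, including the max_count_cr1/max_count_cr2 values passed by the test. Two queues are defined so the test can build a node list that is out of alphabetical order (a queue2 node listed before a queue1 node). A rough standalone illustration of the same substitution with plain jinja2 — all values here are made up:

from jinja2 import Template

# Render the template with sample values; the real fixture supplies these
# from the test environment.
with open("pcluster.config.yaml") as f:
    template = Template(f.read())

print(template.render(
    os="alinux2",
    instance="c5.xlarge",
    public_subnet_id="subnet-0123456789abcdef0",
    private_subnet_id="subnet-0fedcba9876543210",
    key_name="my-key",
    max_count_cr1=4,
    max_count_cr2=4,
))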