Skip to content

Commit fbb995d

Browse files
hanwen-cluster authored and hanwen-pcluste committed
Set the default root volume size according to AMIs being used
1. Set the default root volume size of each node to the size of the snapshot in the AMI used for that node. The old behavior was a constant 35GB default.
2. Users can override the defaults with the `Size` parameter under `RootVolume`.
3. Add a validator to make sure the `Size` specified by the user is equal to or greater than the size of the snapshot in the AMI.

For example:
```
Region: us-east-1
Image:
  Os: alinux2
HeadNode:
  ...
Scheduling:
  Scheduler: slurm
  SlurmQueues:
    - Name: queue1
      ...
    - Name: queue2
      Image:
        CustomAmi: ami-00f25fb4d0adb70a5
      ...
```
where `ami-00f25fb4d0adb70a5` is a deep learning AMI. The head node and the compute nodes in queue1 will have a 35GB root volume, the size of the official ParallelCluster AMI. The compute nodes in queue2 will have a 167GB root volume, the size of the deep learning AMI.

Signed-off-by: Hanwen <hanwenli@amazon.com>
1 parent 70573a4 commit fbb995d

File tree

8 files changed

+55
-23
lines changed

8 files changed

+55
-23
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ x.x.x
3030
- Add `parallelcluster:cluster-name` tag to all resources created by ParallelCluster.
3131
- Do not allow setting `PlacementGroup/Id` when `PlacementGroup/Enabled` is explicitly set to `false`.
3232
- Restrict IPv6 access to IMDS to root and cluster admin users only.
33+
- Change the default root volume size from 35 GiB to the size of the root volume snapshot of the AMI in use. The default can be overridden in the cluster configuration file.
3334

3435
**BUG FIXES**
3536
- Fix default for disable validate and test components when building custom AMI. The default was to disable those components, but it wasn't effective.

cli/src/pcluster/config/cluster_config.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,7 @@
9090
NumberOfStorageValidator,
9191
OverlappingMountDirValidator,
9292
RegionValidator,
93+
RootVolumeSizeValidator,
9394
SchedulableMemoryValidator,
9495
SchedulerOsValidator,
9596
SharedStorageMountDirValidator,
@@ -159,15 +160,13 @@ class Ebs(Resource):
159160

160161
def __init__(
161162
self,
162-
size: int = None,
163163
encrypted: bool = None,
164164
volume_type: str = None,
165165
iops: int = None,
166166
throughput: int = None,
167167
**kwargs,
168168
):
169169
super().__init__(**kwargs)
170-
self.size = Resource.init_param(size, default=EBS_VOLUME_SIZE_DEFAULT)
171170
self.encrypted = Resource.init_param(encrypted, default=True)
172171
self.volume_type = Resource.init_param(volume_type, default=EBS_VOLUME_TYPE_DEFAULT)
173172
self.iops = Resource.init_param(iops, default=EBS_VOLUME_TYPE_IOPS_DEFAULT.get(self.volume_type))
@@ -192,8 +191,9 @@ def _register_validators(self):
192191
class RootVolume(Ebs):
193192
"""Represent the root volume configuration."""
194193

195-
def __init__(self, delete_on_termination: bool = None, **kwargs):
194+
def __init__(self, size: int = None, delete_on_termination: bool = None, **kwargs):
196195
super().__init__(**kwargs)
196+
self.size = Resource.init_param(size)
197197
# The default delete_on_termination takes effect both on head and compute nodes.
198198
# If the default of the head node is to be changed, please separate this class for different defaults.
199199
self.delete_on_termination = Resource.init_param(delete_on_termination, default=True)
@@ -241,6 +241,7 @@ def __init__(
241241
self,
242242
mount_dir: str,
243243
name: str,
244+
size: int = None,
244245
kms_key_id: str = None,
245246
snapshot_id: str = None,
246247
volume_id: str = None,
@@ -249,6 +250,7 @@ def __init__(
249250
**kwargs,
250251
):
251252
super().__init__(**kwargs)
253+
self.size = Resource.init_param(size, default=EBS_VOLUME_SIZE_DEFAULT)
252254
self.kms_key_id = Resource.init_param(kms_key_id)
253255
self.mount_dir = Resource.init_param(mount_dir)
254256
self.name = Resource.init_param(name)
@@ -1178,6 +1180,11 @@ def _register_validators(self):
11781180
self._register_validator(
11791181
HeadNodeImdsValidator, imds_secured=self.head_node.imds.secured, scheduler=self.scheduling.scheduler
11801182
)
1183+
self._register_validator(
1184+
RootVolumeSizeValidator,
1185+
root_volume_size=self.head_node.local_storage.root_volume.size,
1186+
ami_id=self.head_node_ami,
1187+
)
11811188

11821189
def _register_storage_validators(self):
11831190
if self.shared_storage:

cli/src/pcluster/schemas/cluster_schema.py

Lines changed: 0 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,6 @@
108108
from pcluster.constants import (
109109
DELETION_POLICIES,
110110
DELETION_POLICIES_WITH_SNAPSHOT,
111-
EBS_VOLUME_SIZE_DEFAULT,
112111
FSX_LUSTRE,
113112
FSX_ONTAP,
114113
FSX_OPENZFS,
@@ -169,14 +168,6 @@ def make_resource(self, data, **kwargs):
169168
"""Generate resource."""
170169
return RootVolume(**data)
171170

172-
@validates("size")
173-
def validate_size(self, value):
174-
"""Validate the size of root volume."""
175-
if value < EBS_VOLUME_SIZE_DEFAULT:
176-
raise ValidationError(
177-
f"Root volume size {value} is invalid. It must be at least {EBS_VOLUME_SIZE_DEFAULT}."
178-
)
179-
180171

181172
class QueueRootVolumeSchema(BaseSchema):
182173
"""Represent the RootVolume schema for the queue."""
@@ -192,14 +183,6 @@ def make_resource(self, data, **kwargs):
192183
"""Generate resource."""
193184
return RootVolume(**data)
194185

195-
@validates("size")
196-
def validate_size(self, value):
197-
"""Validate the size of root volume."""
198-
if value < EBS_VOLUME_SIZE_DEFAULT:
199-
raise ValidationError(
200-
f"Root volume size {value} is invalid. It must be at least {EBS_VOLUME_SIZE_DEFAULT}."
201-
)
202-
203186

204187
class RaidSchema(BaseSchema):
205188
"""Represent the schema of the parameters specific to Raid. It is a child of EBS schema."""

cli/src/pcluster/validators/cluster_validators.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1078,6 +1078,20 @@ def _test_compute_resource(
10781078
)
10791079

10801080

1081+
class RootVolumeSizeValidator(Validator):
1082+
"""Verify the root volume size is equal or greater to the size of the snapshot of the AMI."""
1083+
1084+
def _validate(self, root_volume_size, ami_id):
1085+
if root_volume_size:
1086+
ami_volume_size = AWSApi.instance().ec2.describe_image(ami_id).volume_size
1087+
if root_volume_size < ami_volume_size:
1088+
self._add_failure(
1089+
f"Root volume size {root_volume_size} GiB must be equal or greater than the volume size of "
1090+
f"the AMI {ami_id}: {ami_volume_size} GiB.",
1091+
FailureLevel.ERROR,
1092+
)
1093+
1094+
10811095
class HostedZoneValidator(Validator):
10821096
"""Validate custom private domain in the same VPC as head node."""
10831097

cli/src/pcluster/validators/ebs_validators.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ class EbsVolumeTypeSizeValidator(Validator):
4545
"""
4646

4747
def _validate(self, volume_type: str, volume_size: int):
48-
if volume_type in EBS_VOLUME_TYPE_TO_VOLUME_SIZE_BOUNDS:
48+
if volume_size is not None and volume_type in EBS_VOLUME_TYPE_TO_VOLUME_SIZE_BOUNDS:
4949
min_size, max_size = EBS_VOLUME_TYPE_TO_VOLUME_SIZE_BOUNDS.get(volume_type)
5050
if volume_size > max_size:
5151
self._add_failure(

cli/tests/pcluster/schemas/test_schema_validators.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,6 @@ def test_mount_dir_validator(mount_dir, expected_message):
7171
("", "Not a valid integer"),
7272
("NONE", "Not a valid integer"),
7373
("wrong_value", "Not a valid integer"),
74-
(19, "must be at least 35"),
7574
(36, None),
7675
],
7776
)

cli/tests/pcluster/validators/test_cluster_validators.py

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
from assertpy import assert_that
1313
from munch import DefaultMunch
1414

15-
from pcluster.aws.aws_resources import InstanceTypeInfo
15+
from pcluster.aws.aws_resources import ImageInfo, InstanceTypeInfo
1616
from pcluster.config.cluster_config import PlacementGroup, Tag
1717
from pcluster.constants import PCLUSTER_NAME_MAX_LENGTH
1818
from pcluster.validators.cluster_validators import (
@@ -41,6 +41,7 @@
4141
NumberOfStorageValidator,
4242
OverlappingMountDirValidator,
4343
RegionValidator,
44+
RootVolumeSizeValidator,
4445
SchedulableMemoryValidator,
4546
SchedulerOsValidator,
4647
SharedStorageMountDirValidator,
@@ -1137,3 +1138,24 @@ def test_mixed_security_group_overwrite_validator(head_node_security_groups, que
11371138
)
11381139
expected_message = "make sure.*cluster nodes are reachable" if expect_warning else None
11391140
assert_failure_messages(actual_failures, expected_message)
1141+
1142+
1143+
@pytest.mark.parametrize(
1144+
"root_volume_size, ami_size, expected_message",
1145+
[
1146+
(65, 50, None),
1147+
(
1148+
25,
1149+
50,
1150+
"Root volume size 25 GiB must be equal or greater than .* 50 GiB.",
1151+
),
1152+
],
1153+
)
1154+
def test_root_volume_size_validator(mocker, root_volume_size, ami_size, expected_message):
1155+
mock_aws_api(mocker)
1156+
mocker.patch(
1157+
"pcluster.aws.ec2.Ec2Client.describe_image",
1158+
return_value=ImageInfo({"BlockDeviceMappings": [{"Ebs": {"VolumeSize": ami_size}}]}),
1159+
)
1160+
actual_failures = RootVolumeSizeValidator().execute(root_volume_size, "ami-123456789a8a37250")
1161+
assert_failure_messages(actual_failures, expected_message)

cli/tests/pcluster/validators/test_ebs_validators.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,24 +94,30 @@ def test_ebs_volume_iops_validators(volume_type, volume_size, volume_iops, expec
9494
("standard", 15, None),
9595
("standard", 0, "The size of standard volumes must be at least 1 GiB"),
9696
("standard", 1025, "The size of standard volumes can not exceed 1024 GiB"),
97+
("standard", None, None),
9798
("io1", 15, None),
9899
("io1", 3, "The size of io1 volumes must be at least 4 GiB"),
99100
("io1", 16385, "The size of io1 volumes can not exceed 16384 GiB"),
101+
("io1", None, None),
100102
("io2", 15, None),
101103
("io2", 3, "The size of io2 volumes must be at least 4 GiB"),
102104
("io2", 65537, "The size of io2 volumes can not exceed 65536 GiB"),
105+
("io2", None, None),
103106
("gp2", 15, None),
104107
("gp2", 0, "The size of gp2 volumes must be at least 1 GiB"),
105108
("gp2", 16385, "The size of gp2 volumes can not exceed 16384 GiB"),
109+
("gp2", None, None),
106110
("gp3", 15, None),
107111
("gp3", 0, "The size of gp3 volumes must be at least 1 GiB"),
108112
("gp3", 16385, "The size of gp3 volumes can not exceed 16384 GiB"),
113+
("gp3", None, None),
109114
("st1", 500, None),
110115
("st1", 20, "The size of st1 volumes must be at least 500 GiB"),
111116
("st1", 16385, "The size of st1 volumes can not exceed 16384 GiB"),
112117
("sc1", 500, None),
113118
("sc1", 20, "The size of sc1 volumes must be at least 500 GiB"),
114119
("sc1", 16385, "The size of sc1 volumes can not exceed 16384 GiB"),
120+
("sc1", None, None),
115121
],
116122
)
117123
def test_ebs_volume_type_size_validator(volume_type, volume_size, expected_message):

0 commit comments

Comments
 (0)